diff --git a/source/lib/sysdep/arch/ia32/ia32.cpp b/source/lib/sysdep/arch/ia32/ia32.cpp
index af54b9183e..7710951956 100644
--- a/source/lib/sysdep/arch/ia32/ia32.cpp
+++ b/source/lib/sysdep/arch/ia32/ia32.cpp
@@ -158,6 +158,11 @@ bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value)
 	return ia32_asm_CAS(location, expected, new_value);
 }
 
+bool cpu_CAS64(volatile u64* location, u64 expected, u64 new_value)
+{
+	return ia32_asm_CAS64(location, expected, new_value);
+}
+
 
 void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size)
 {
diff --git a/source/lib/sysdep/arch/ia32/ia32_asm.asm b/source/lib/sysdep/arch/ia32/ia32_asm.asm
index a0b218b2cf..1f5addb997 100644
--- a/source/lib/sysdep/arch/ia32/ia32_asm.asm
+++ b/source/lib/sysdep/arch/ia32/ia32_asm.asm
@@ -92,6 +92,25 @@ db	0xf0	; LOCK prefix
 	ret
 
 
+; extern bool CALL_CONV ia32_asm_CAS64(volatile u64* location, u64 expected, u64 new_value);
+global sym(ia32_asm_CAS64)
+sym(ia32_asm_CAS64):
+	push	ebx
+	push	esi
+	mov	esi, [esp+8+4]	; location
+	mov	eax, [esp+8+8]
+	mov	edx, [esp+8+12]	; edx:eax = expected
+	mov	ebx, [esp+8+16]
+	mov	ecx, [esp+8+20]	; ecx:ebx = new_value
+db	0xf0	; LOCK prefix
+	cmpxchg8b	[esi]
+	sete	al
+	movzx	eax, al
+	pop	esi
+	pop	ebx
+	ret
+
+
 ;-------------------------------------------------------------------------------
 ; FPU
 ;-------------------------------------------------------------------------------
diff --git a/source/lib/sysdep/arch/ia32/ia32_asm.h b/source/lib/sysdep/arch/ia32/ia32_asm.h
index 78b15731a4..2ac7c18e15 100644
--- a/source/lib/sysdep/arch/ia32/ia32_asm.h
+++ b/source/lib/sysdep/arch/ia32/ia32_asm.h
@@ -36,6 +36,7 @@ extern void CALL_CONV ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
 
 extern intptr_t CALL_CONV ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
 extern bool CALL_CONV ia32_asm_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value);
+extern bool CALL_CONV ia32_asm_CAS64(volatile u64* location, u64 expected, u64 new_value);
 
 /// control87
 // FPU control word
diff --git a/source/lib/sysdep/arch/x86_x64/tests/test_topology.h b/source/lib/sysdep/arch/x86_x64/tests/test_topology.h
index 9993238446..e118e11fb2 100644
--- a/source/lib/sysdep/arch/x86_x64/tests/test_topology.h
+++ b/source/lib/sysdep/arch/x86_x64/tests/test_topology.h
@@ -29,8 +29,8 @@ class TestTopology : public CxxTest::TestSuite
 public:
 	void test_run()
 	{
-		// Just run the function, ignoring the return value, so
-		// Valgrind can check it's not doing anything very bad
-		cpu_topology_Detect();
+		TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_NumPackages());
+		TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_CoresPerPackage());
+		TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_LogicalPerCore());
 	}
 };
diff --git a/source/lib/sysdep/arch/x86_x64/topology.cpp b/source/lib/sysdep/arch/x86_x64/topology.cpp
index e14a2dea95..4e3d475555 100644
--- a/source/lib/sysdep/arch/x86_x64/topology.cpp
+++ b/source/lib/sysdep/arch/x86_x64/topology.cpp
@@ -105,9 +105,9 @@ static size_t MaxLogicalPerCore()
 
 static size_t MaxLogicalPerCache()
 {
-	const x86_x64_Cache* const dcache = x86_x64_DCache();
-	if(dcache->levels >= 2)
-		return dcache->parameters[1].sharedBy;
+	const x86_x64_Caches* const dcaches = x86_x64_DCaches();
+	if(dcaches->numLevels >= 2)
+		return dcaches->levels[1].sharedBy;
 	else
 		return 1;	// default
 }
@@ -204,8 +204,9 @@ static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t nu
 }
 
 
-static size_t NumPackages(const u8* apicIds)
+size_t cpu_topology_NumPackages()
 {
+	const u8* apicIds = ApicIds();
 	if(apicIds)
 	{
 		const size_t offset = ceil_log2(MaxCoresPerPackage()) + ceil_log2(MaxLogicalPerCore());
@@ -236,8 +237,9 @@ static size_t NumPackages(const u8* apicIds)
 }
 
 
-static size_t CoresPerPackage(const u8* apicIds)
+size_t cpu_topology_CoresPerPackage()
 {
+	const u8* apicIds = ApicIds();
 	if(apicIds)
 	{
 		const size_t offset = ceil_log2(MaxLogicalPerCore());
@@ -251,8 +253,9 @@ static size_t CoresPerPackage(const u8* apicIds)
 }
 
 
-static size_t LogicalPerCore(const u8* apicIds)
+size_t cpu_topology_LogicalPerCore()
 {
+	const u8* apicIds = ApicIds();
 	if(apicIds)
 	{
 		const size_t offset = 0;
@@ -266,49 +269,6 @@ static size_t LogicalPerCore(const u8* apicIds)
 }
 
 
-//-----------------------------------------------------------------------------
-// CPU topology interface
-
-struct CpuTopology	// POD
-{
-	size_t numPackages;
-	size_t coresPerPackage;
-	size_t logicalPerCore;
-};
-static CpuTopology cpuTopology;
-
-static LibError InitCpuTopology()
-{
-	const u8* apicIds = ApicIds();
-	cpuTopology.numPackages = NumPackages(apicIds);
-	cpuTopology.coresPerPackage = CoresPerPackage(apicIds);
-	cpuTopology.logicalPerCore = LogicalPerCore(apicIds);
-	return INFO::OK;
-}
-
-const CpuTopology* cpu_topology_Detect()
-{
-	static ModuleInitState initState;
-	ModuleInit(&initState, InitCpuTopology);
-	return &cpuTopology;
-}
-
-size_t cpu_topology_NumPackages(const CpuTopology* topology)
-{
-	return topology->numPackages;
-}
-
-size_t cpu_topology_CoresPerPackage(const CpuTopology* topology)
-{
-	return topology->coresPerPackage;
-}
-
-size_t cpu_topology_LogicalPerCore(const CpuTopology* topology)
-{
-	return topology->logicalPerCore;
-}
-
-
 //-----------------------------------------------------------------------------
 // cache topology
 
@@ -451,6 +411,7 @@ struct CacheTopology	// POD
 	uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
 };
 static CacheTopology cacheTopology;
+static ModuleInitState cacheInitState;
 
 static LibError InitCacheTopology()
 {
@@ -460,26 +421,22 @@ static LibError InitCacheTopology()
 	return INFO::OK;
 }
 
-const CacheTopology* cache_topology_Detect()
+size_t cache_topology_NumCaches()
 {
-	static ModuleInitState initState;
-	ModuleInit(&initState, InitCacheTopology);
-	return &cacheTopology;
+	ModuleInit(&cacheInitState, InitCacheTopology);
+	return cacheTopology.numCaches;
 }
 
-size_t cache_topology_NumCaches(const CacheTopology* topology)
-{
-	return topology->numCaches;
-}
-
-size_t cache_topology_CacheFromProcessor(const CacheTopology* topology, size_t processor)
+size_t cache_topology_CacheFromProcessor(size_t processor)
 {
+	ModuleInit(&cacheInitState, InitCacheTopology);
 	debug_assert(processor < os_cpu_NumProcessors());
-	return topology->processorsCache[processor];
+	return cacheTopology.processorsCache[processor];
 }
 
-uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology* topology, size_t cache)
+uintptr_t cache_topology_ProcessorMaskFromCache(size_t cache)
 {
-	debug_assert(cache < topology->numCaches);
-	return topology->cachesProcessorMask[cache];
+	ModuleInit(&cacheInitState, InitCacheTopology);
+	debug_assert(cache < cacheTopology.numCaches);
+	return cacheTopology.cachesProcessorMask[cache];
 }
diff --git a/source/lib/sysdep/arch/x86_x64/topology.h b/source/lib/sysdep/arch/x86_x64/topology.h
index 077e561c4e..4e231b64d4 100644
--- a/source/lib/sysdep/arch/x86_x64/topology.h
+++ b/source/lib/sysdep/arch/x86_x64/topology.h
@@ -21,22 +21,13 @@
  */
 
 /*
- * detection of CPU and cache topology
+ * detection of CPU and cache topology.
+ * thread-safe, no explicit initialization is required.
  */
 
 #ifndef INCLUDED_TOPOLOGY
 #define INCLUDED_TOPOLOGY
 
-// interface rationale:
-// - explicit initialization avoids the difficulty and overhead of
-//   thread-safe lazy initialization checks.
-// - requiring an opaque struct to be passed in ensures users call the
-//   init function before using the accessors.
-// - delegating responsibility for thread-safety to the caller of the
-//   first *_Detect invocation avoids overhead and keeps us independent of
-//   the various threading packages (Boost, OpenMP, POSIX, Win32, ..)
-
-
 /**
  * @return a pointer to array (up to os_cpu_MaxProcessors entries;
  * os_cpu_NumProcessors() of them are valid) of the processors'
@@ -49,76 +40,54 @@ LIB_API const u8* ApicIds();
 //-----------------------------------------------------------------------------
 // cpu
 
-/**
- * stores CPU topology, i.e. how many packages, cores and SMT units are
- * actually present and enabled. this is useful for detecting SMP systems,
- * predicting performance and dimensioning thread pools.
- *
- * note: OS abstractions usually only mention "processors", which could be
- * any mix of the above.
- **/
-struct CpuTopology;
-
-/**
- * initialize static storage from which topology can be retrieved by
- * means of the following functions.
- * @return const pointer to a shared instance.
- **/
-LIB_API const CpuTopology* cpu_topology_Detect();
+// the CPU topology, i.e. how many packages, cores and SMT units are
+// actually present and enabled, is useful for detecting SMP systems,
+// predicting performance and dimensioning thread pools.
+//
+// note: OS abstractions usually only mention "processors", which could be
+// any mix of the above.
 
 /**
  * @return number of *enabled* CPU packages / sockets.
 **/
-LIB_API size_t cpu_topology_NumPackages(const CpuTopology*);
+LIB_API size_t cpu_topology_NumPackages();
 
 /**
  * @return number of *enabled* CPU cores per package.
 * (2 on dual-core systems)
 **/
-LIB_API size_t cpu_topology_CoresPerPackage(const CpuTopology*);
+LIB_API size_t cpu_topology_CoresPerPackage();
 
 /**
  * @return number of *enabled* hyperthreading units per core.
 * (2 on P4 EE)
 **/
-LIB_API size_t cpu_topology_LogicalPerCore(const CpuTopology*);
+LIB_API size_t cpu_topology_LogicalPerCore();
 
 
 //-----------------------------------------------------------------------------
 // L2 cache
 
-/**
- * stores L2 cache topology, i.e. the mapping between processor and caches.
- * this allows cores sharing a cache to work together on the same dataset,
- * which may reduce contention and increase effective capacity.
- *
- * example: Intel Core2 micro-architectures (e.g. Intel Core2) feature
- * partitioned L2 caches shared by two cores.
- **/
-struct CacheTopology;
+// knowledge of the cache topology, i.e. which processors share which caches,
+// can be used to reduce contention and increase effective capacity by
+// assigning the partner processors to work on the same dataset.
+//
+// example: Intel Core2 micro-architectures feature L2 caches shared by
+// two cores.
 
 /**
- * initialize static storage from which topology can be retrieved by
- * means of the following functions.
- * @return const pointer to a shared instance.
- *
- * WARNING: this function must not be reentered before it has returned once.
+ * @return number of distinct L2 caches.
 **/
-LIB_API const CacheTopology* cache_topology_Detect();
-
-/**
- * @return number of distinct L2 caches
- **/
-LIB_API size_t cache_topology_NumCaches(const CacheTopology*);
+LIB_API size_t cache_topology_NumCaches();
 
 /**
 * @return L2 cache number (zero-based) to which <processor> belongs.
 **/
-LIB_API size_t cache_topology_CacheFromProcessor(const CacheTopology*, size_t processor);
+LIB_API size_t cache_topology_CacheFromProcessor(size_t processor);
 
 /**
 * @return bit-mask of all processors sharing <cache>.
 **/
-LIB_API uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology*, size_t cache);
+LIB_API uintptr_t cache_topology_ProcessorMaskFromCache(size_t cache);
 
 #endif	// #ifndef INCLUDED_TOPOLOGY
diff --git a/source/lib/sysdep/arch/x86_x64/x86_x64.cpp b/source/lib/sysdep/arch/x86_x64/x86_x64.cpp
index a41362acc4..df8a268ada 100644
--- a/source/lib/sysdep/arch/x86_x64/x86_x64.cpp
+++ b/source/lib/sysdep/arch/x86_x64/x86_x64.cpp
@@ -285,57 +285,67 @@ size_t x86_x64_Generation()
 //-----------------------------------------------------------------------------
 // cache
 
-static const size_t maxCacheParams = 3;
-static x86_x64_CacheParameters cacheParametersStorage[maxCacheParams*2];
-static x86_x64_Cache dcache = { 0, cacheParametersStorage };
-static x86_x64_Cache icache = { 0, cacheParametersStorage+maxCacheParams };
+static const size_t maxCacheLevels = 3;
+static x86_x64_Cache cacheStorage[maxCacheLevels*2];
+static x86_x64_Caches dcaches = { 0, cacheStorage };
+static x86_x64_Caches icaches = { 0, cacheStorage+maxCacheLevels };
 
-static const size_t maxTLBParams = 15;
-static x86_x64_TLBParameters tlbParametersStorage[maxTLBParams*2];
-static x86_x64_TLB dtlb = { 0, tlbParametersStorage };
-static x86_x64_TLB itlb = { 0, tlbParametersStorage+maxTLBParams };
+static const size_t maxTLBLevels = 15;
+static x86_x64_TLB tlbStorage[maxTLBLevels*2];
+static x86_x64_TLBs dtlbs = { 0, tlbStorage };
+static x86_x64_TLBs itlbs = { 0, tlbStorage+maxTLBLevels };
 
-static void AddTLBParameters(const x86_x64_TLBParameters& params)
+static bool IsData(x86_x64_CacheType type)
 {
-	if(params.type == X86_X64_CACHE_TYPE_INSTRUCTION || params.type == X86_X64_CACHE_TYPE_UNIFIED)
+	return (type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED);
+}
+
+static bool IsInstruction(x86_x64_CacheType type)
+{
+	return (type == X86_X64_CACHE_TYPE_INSTRUCTION || type == X86_X64_CACHE_TYPE_UNIFIED);
+}
+
+static void AddTLB(const x86_x64_TLB& tlb)
+{
+	if(IsInstruction(tlb.type))
 	{
-		if(itlb.numParameters < maxTLBParams)
-			itlb.parameters[itlb.numParameters++] = params;
+		if(itlbs.numLevels < maxTLBLevels)
+			itlbs.levels[itlbs.numLevels++] = tlb;
 		else
 			debug_assert(0);
 	}
-	if(params.type == X86_X64_CACHE_TYPE_DATA || params.type == X86_X64_CACHE_TYPE_UNIFIED)
+	if(IsData(tlb.type))
 	{
-		if(dtlb.numParameters < maxTLBParams)
-			dtlb.parameters[dtlb.numParameters++] = params;
+		if(dtlbs.numLevels < maxTLBLevels)
+			dtlbs.levels[dtlbs.numLevels++] = tlb;
 		else
 			debug_assert(0);
 	}
 
 	// large page TLBs have N 2M entries or N/2 4M entries; we generate a
 	// second set of parameters for the latter from the former.
-	if(params.pageSize == 2*MiB)
+	if(tlb.pageSize == 2*MiB)
 	{
-		x86_x64_TLBParameters params4M = params;
-		params4M.pageSize = 4*MiB;
-		params4M.entries = params.entries/2;
-		AddTLBParameters(params4M);
+		x86_x64_TLB tlb4M = tlb;
+		tlb4M.pageSize = 4*MiB;
+		tlb4M.entries = tlb.entries/2;
+		AddTLB(tlb4M);
 	}
 }
 
 
 namespace AMD {
 
-static x86_x64_CacheParameters L1Parameters(u32 reg, x86_x64_CacheType type)
+static x86_x64_Cache L1Cache(u32 reg, x86_x64_CacheType type)
 {
-	x86_x64_CacheParameters params;
-	params.type = type;
-	params.level = 1;
-	params.associativity = bits(reg, 16, 23);
-	params.lineSize = bits(reg, 0, 7);
-	params.sharedBy = 1;
-	params.totalSize = bits(reg, 24, 31)*KiB;
-	return params;
+	x86_x64_Cache cache;
+	cache.type = type;
+	cache.level = 1;
+	cache.associativity = bits(reg, 16, 23);
+	cache.lineSize = bits(reg, 0, 7);
+	cache.sharedBy = 1;
+	cache.totalSize = bits(reg, 24, 31)*KiB;
+	return cache;
 }
 
 // applies to L2, L3 and TLB2
@@ -345,85 +355,85 @@
 	16, 0, 32, 48, 64, 96, 128, x86_x64_fullyAssociative
 };
 
-static x86_x64_CacheParameters L2Parameters(u32 reg, x86_x64_CacheType type)
+static x86_x64_Cache L2Cache(u32 reg, x86_x64_CacheType type)
 {
-	x86_x64_CacheParameters params;
+	x86_x64_Cache cache;
 	const size_t associativityIndex = bits(reg, 12, 15);
 	if(associativityIndex == 0)	// disabled
 	{
-		params.type = X86_X64_CACHE_TYPE_NULL;
-		params.associativity = 0;
+		cache.type = X86_X64_CACHE_TYPE_NULL;
+		cache.associativity = 0;
 	}
 	else
 	{
-		params.type = type;
-		params.associativity = associativities[associativityIndex];
-		debug_assert(params.associativity != 0);	// else: encoding is "reserved"
+		cache.type = type;
+		cache.associativity = associativities[associativityIndex];
+		debug_assert(cache.associativity != 0);	// else: encoding is "reserved"
 	}
-	params.level = 2;
-	params.lineSize = bits(reg, 0, 7);
-	params.sharedBy = 1;
-	params.totalSize = bits(reg, 16, 31)*KiB;
-	return params;
+	cache.level = 2;
+	cache.lineSize = bits(reg, 0, 7);
+	cache.sharedBy = 1;
+	cache.totalSize = bits(reg, 16, 31)*KiB;
+	return cache;
 }
 
 // (same as L2 except for the totalSize encoding)
-static x86_x64_CacheParameters L3Parameters(u32 reg, x86_x64_CacheType type)
+static x86_x64_Cache L3Cache(u32 reg, x86_x64_CacheType type)
 {
-	x86_x64_CacheParameters params = L2Parameters(reg, type);
-	params.level = 3;
-	params.totalSize = bits(reg, 18, 31)*512*KiB;	// (rounded down)
-	return params;
+	x86_x64_Cache cache = L2Cache(reg, type);
+	cache.level = 3;
+	cache.totalSize = bits(reg, 18, 31)*512*KiB;	// (rounded down)
+	return cache;
 }
 
-static x86_x64_TLBParameters TLB1Parameters(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
+static x86_x64_TLB TLB1(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
 {
-	x86_x64_TLBParameters params;
-	params.type = type;
-	params.level = 1;
-	params.associativity = bits(reg, bitOffset+8, bitOffset+15);
-	params.pageSize = pageSize;
-	params.entries = bits(reg, bitOffset, bitOffset+7);
-	return params;
+	x86_x64_TLB tlb;
+	tlb.type = type;
+	tlb.level = 1;
+	tlb.associativity = bits(reg, bitOffset+8, bitOffset+15);
+	tlb.pageSize = pageSize;
+	tlb.entries = bits(reg, bitOffset, bitOffset+7);
+	return tlb;
 }
 
-static void AddTLB1Parameters(const x86_x64_CpuidRegs& regs)
+static void AddTLB1(const x86_x64_CpuidRegs& regs)
 {
-	AddTLBParameters(TLB1Parameters(regs.eax, 0, 2*MiB, X86_X64_CACHE_TYPE_INSTRUCTION));
-	AddTLBParameters(TLB1Parameters(regs.eax, 16, 2*MiB, X86_X64_CACHE_TYPE_DATA));
-	AddTLBParameters(TLB1Parameters(regs.ebx, 0, 4*KiB, X86_X64_CACHE_TYPE_INSTRUCTION));
-	AddTLBParameters(TLB1Parameters(regs.ebx, 16, 4*KiB, X86_X64_CACHE_TYPE_DATA));
+	AddTLB(TLB1(regs.eax, 0, 2*MiB, X86_X64_CACHE_TYPE_INSTRUCTION));
+	AddTLB(TLB1(regs.eax, 16, 2*MiB, X86_X64_CACHE_TYPE_DATA));
+	AddTLB(TLB1(regs.ebx, 0, 4*KiB, X86_X64_CACHE_TYPE_INSTRUCTION));
+	AddTLB(TLB1(regs.ebx, 16, 4*KiB, X86_X64_CACHE_TYPE_DATA));
 }
 
-static x86_x64_TLBParameters TLB2Parameters(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
+static x86_x64_TLB TLB2(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
 {
-	x86_x64_TLBParameters params;
+	x86_x64_TLB tlb;
 	const size_t associativityIndex = bits(reg, bitOffset+12, bitOffset+15);
 	if(associativityIndex == 0)	// disabled
 	{
-		params.type = X86_X64_CACHE_TYPE_NULL;
-		params.associativity = 0;
+		tlb.type = X86_X64_CACHE_TYPE_NULL;
+		tlb.associativity = 0;
 	}
 	else
 	{
-		params.type = type;
-		params.associativity = associativities[associativityIndex];
+		tlb.type = type;
+		tlb.associativity = associativities[associativityIndex];
 	}
-	params.level = 2;
-	params.pageSize = pageSize;
-	params.entries = bits(reg, bitOffset, bitOffset+11);
-	return params;
+	tlb.level = 2;
+	tlb.pageSize = pageSize;
+	tlb.entries = bits(reg, bitOffset, bitOffset+11);
+	return tlb;
 }
 
-static void AddTLB2ParameterPair(u32 reg, size_t pageSize)
+static void AddTLB2Pair(u32 reg, size_t pageSize)
 {
 	x86_x64_CacheType type = X86_X64_CACHE_TYPE_UNIFIED;
 	if(bits(reg, 16, 31) != 0)	// not unified
 	{
-		AddTLBParameters(TLB2Parameters(reg, 16, pageSize, X86_X64_CACHE_TYPE_DATA));
+		AddTLB(TLB2(reg, 16, pageSize, X86_X64_CACHE_TYPE_DATA));
 		type = X86_X64_CACHE_TYPE_INSTRUCTION;
 	}
-	AddTLBParameters(TLB2Parameters(reg, 0, pageSize, type));
+	AddTLB(TLB2(reg, 0, pageSize, type));
 }
 
 // AMD reports maxCpuidIdFunction > 4 but consider functions 2..4 to be
@@ -435,24 +445,24 @@ static void DetectCacheAndTLB()
 	regs.eax = 0x80000005;
 	if(x86_x64_cpuid(&regs))
 	{
-		AddTLB1Parameters(regs);
+		AddTLB1(regs);
 
-		dcache.levels = icache.levels = 1;
-		dcache.parameters[0] = L1Parameters(regs.ecx, X86_X64_CACHE_TYPE_DATA);
-		icache.parameters[0] = L1Parameters(regs.edx, X86_X64_CACHE_TYPE_INSTRUCTION);
+		dcaches.numLevels = icaches.numLevels = 1;
+		dcaches.levels[0] = L1Cache(regs.ecx, X86_X64_CACHE_TYPE_DATA);
+		icaches.levels[0] = L1Cache(regs.edx, X86_X64_CACHE_TYPE_INSTRUCTION);
 	}
 
 	regs.eax = 0x80000006;
 	if(x86_x64_cpuid(&regs))
 	{
-		AddTLB2ParameterPair(regs.eax, 2*MiB);
-		AddTLB2ParameterPair(regs.ebx, 4*KiB);
+		AddTLB2Pair(regs.eax, 2*MiB);
+		AddTLB2Pair(regs.ebx, 4*KiB);
 
-		icache.levels = dcache.levels = 2;
-		icache.parameters[1] = dcache.parameters[1] = L2Parameters(regs.ecx, X86_X64_CACHE_TYPE_UNIFIED);
+		icaches.numLevels = dcaches.numLevels = 2;
+		icaches.levels[1] = dcaches.levels[1] = L2Cache(regs.ecx, X86_X64_CACHE_TYPE_UNIFIED);
 
-		icache.levels = dcache.levels = 3;
-		icache.parameters[2] = dcache.parameters[2] = L3Parameters(regs.edx, X86_X64_CACHE_TYPE_UNIFIED);
+		icaches.numLevels = dcaches.numLevels = 3;
+		icaches.levels[2] = dcaches.levels[2] = L3Cache(regs.edx, X86_X64_CACHE_TYPE_UNIFIED);
 	}
 }
 
@@ -480,27 +490,27 @@ static void DetectCache_CPUID4()
 		if(type == X86_X64_CACHE_TYPE_NULL)	// no more remaining
 			break;
 
-		x86_x64_CacheParameters params;
-		params.type = type;
-		params.level = level;
-		params.associativity = (size_t)bits(regs.ebx, 22, 31)+1;
-		params.lineSize = (size_t)bits(regs.ebx, 0, 11)+1;	// (yes, this also uses +1 encoding)
-		params.sharedBy = (size_t)bits(regs.eax, 14, 25)+1;
+		x86_x64_Cache cache;
+		cache.type = type;
+		cache.level = level;
+		cache.associativity = (size_t)bits(regs.ebx, 22, 31)+1;
+		cache.lineSize = (size_t)bits(regs.ebx, 0, 11)+1;	// (yes, this also uses +1 encoding)
+		cache.sharedBy = (size_t)bits(regs.eax, 14, 25)+1;
 		{
 			const size_t partitions = (size_t)bits(regs.ebx, 12, 21)+1;
 			const size_t sets = (size_t)bits(regs.ecx, 0, 31)+1;
-			params.totalSize = params.associativity * partitions * params.lineSize * sets;
+			cache.totalSize = cache.associativity * partitions * cache.lineSize * sets;
 		}
 
-		if(type == X86_X64_CACHE_TYPE_INSTRUCTION || type == X86_X64_CACHE_TYPE_UNIFIED)
+		if(IsInstruction(type))
 		{
-			icache.levels = std::max(icache.levels, level);
-			icache.parameters[level-1] = params;
+			icaches.numLevels = std::max(icaches.numLevels, level);
+			icaches.levels[level-1] = cache;
 		}
-		if(type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED)
+		if(IsData(type))
 		{
-			dcache.levels = std::max(dcache.levels, level);
-			dcache.parameters[level-1] = params;
+			dcaches.numLevels = std::max(dcaches.numLevels, level);
+			dcaches.levels[level-1] = cache;
 		}
 	}
 }
@@ -624,24 +634,24 @@ static void DecodeDescriptor(u8 descriptor)
 	else
 		debug_assert(0);
 
-	x86_x64_TLBParameters params;
-	params.type = type;
-	params.level = level;
-	params.associativity = properties.associativity;
-	params.pageSize = pageSize;
-	params.entries = properties.entries;
+	x86_x64_TLB tlb;
+	tlb.type = type;
+	tlb.level = level;
+	tlb.associativity = properties.associativity;
+	tlb.pageSize = pageSize;
+	tlb.entries = properties.entries;
 
-	if(type == X86_X64_CACHE_TYPE_INSTRUCTION || type == X86_X64_CACHE_TYPE_UNIFIED)
+	if(IsInstruction(type))
 	{
-		if(itlb.numParameters < maxTLBParams)
-			itlb.parameters[itlb.numParameters++] = params;
+		if(itlbs.numLevels < maxTLBLevels)
+			itlbs.levels[itlbs.numLevels++] = tlb;
 		else
 			debug_assert(0);
 	}
-	if(type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED)
+	if(IsData(type))
 	{
-		if(dtlb.numParameters < maxTLBParams)
-			dtlb.parameters[dtlb.numParameters++] = params;
+		if(dtlbs.numLevels < maxTLBLevels)
+			dtlbs.levels[dtlbs.numLevels++] = tlb;
 		else
 			debug_assert(0);
 	}
@@ -694,71 +704,71 @@ static LibError DetectCacheAndTLB()
 	}
 
 	// sanity check: cache type must match that of the data structure
-	for(size_t i = 0; i < dcache.levels; i++)
-		debug_assert(dcache.parameters[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
-	for(size_t i = 0; i < icache.levels; i++)
-		debug_assert(icache.parameters[i].type != X86_X64_CACHE_TYPE_DATA);
-	for(size_t i = 0; i < dtlb.numParameters; i++)
-		debug_assert(dtlb.parameters[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
-	for(size_t i = 0; i < itlb.numParameters; i++)
-		debug_assert(itlb.parameters[i].type != X86_X64_CACHE_TYPE_DATA);
+	for(size_t i = 0; i < dcaches.numLevels; i++)
+		debug_assert(dcaches.levels[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
+	for(size_t i = 0; i < icaches.numLevels; i++)
+		debug_assert(icaches.levels[i].type != X86_X64_CACHE_TYPE_DATA);
+	for(size_t i = 0; i < dtlbs.numLevels; i++)
+		debug_assert(dtlbs.levels[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
+	for(size_t i = 0; i < itlbs.numLevels; i++)
+		debug_assert(itlbs.levels[i].type != X86_X64_CACHE_TYPE_DATA);
 
 	// ensure x86_x64_L1CacheLineSize and x86_x64_L2CacheLineSize will work
-	debug_assert(dcache.levels >= 2);
-	debug_assert(dcache.parameters[0].lineSize != 0);
-	debug_assert(dcache.parameters[1].lineSize != 0);
+	debug_assert(dcaches.numLevels >= 2);
+	debug_assert(dcaches.levels[0].lineSize != 0);
+	debug_assert(dcaches.levels[1].lineSize != 0);
 
 	return INFO::OK;
 }
 
-const x86_x64_Cache* x86_x64_ICache()
+const x86_x64_Caches* x86_x64_ICaches()
 {
 	ModuleInit(&cacheInitState, DetectCacheAndTLB);
-	return &icache;
+	return &icaches;
 }
 
-const x86_x64_Cache* x86_x64_DCache()
+const x86_x64_Caches* x86_x64_DCaches()
 {
 	ModuleInit(&cacheInitState, DetectCacheAndTLB);
-	return &dcache;
+	return &dcaches;
 }
 
 size_t x86_x64_L1CacheLineSize()
 {
-	return x86_x64_DCache()->parameters[0].lineSize;
+	return x86_x64_DCaches()->levels[0].lineSize;
 }
 
 size_t x86_x64_L2CacheLineSize()
 {
-	return x86_x64_DCache()->parameters[1].lineSize;
+	return x86_x64_DCaches()->levels[1].lineSize;
 }
 
-const x86_x64_TLB* x86_x64_ITLB()
+const x86_x64_TLBs* x86_x64_ITLBs()
 {
 	ModuleInit(&cacheInitState, DetectCacheAndTLB);
-	return &itlb;
+	return &itlbs;
 }
 
-const x86_x64_TLB* x86_x64_DTLB()
+const x86_x64_TLBs* x86_x64_DTLBs()
 {
 	ModuleInit(&cacheInitState, DetectCacheAndTLB);
-	return &dtlb;
+	return &dtlbs;
 }
 
-size_t x86_x64_TLBCoverage(const x86_x64_TLB* tlb)
+size_t x86_x64_TLBCoverage(const x86_x64_TLBs* tlbs)
 {
 	// note: receiving a TLB pointer means DetectCacheAndTLB was called.
 	const u64 pageSize = 4*KiB;
-	const u64 largePageSize = 4*MiB;	// TODO: find out if we're using 2MB or 4MB
+	const u64 largePageSize = os_cpu_LargePageSize();
 	u64 totalSize = 0;	// [bytes]
-	for(size_t i = 0; i < tlb->numParameters; i++)
+	for(size_t i = 0; i < tlbs->numLevels; i++)
 	{
-		const x86_x64_TLBParameters& params = tlb->parameters[i];
-		if(params.pageSize == pageSize)
-			totalSize += pageSize * params.entries;
-		if(params.pageSize == largePageSize)
-			totalSize += largePageSize * params.entries;
+		const x86_x64_TLB& tlb = tlbs->levels[i];
+		if(tlb.pageSize == pageSize)
+			totalSize += pageSize * tlb.entries;
+		if(tlb.pageSize == largePageSize)
+			totalSize += largePageSize * tlb.entries;
 	}
 
 	return size_t(totalSize / MiB);
@@ -1036,8 +1046,8 @@ double x86_x64_ClockFrequency()
 	// note: don't just take the lowest value! it could conceivably be
 	// too low, if background processing delays reading c1 (see above).
 	double sum = 0.0;
-	const int lo = numSamples/4, hi = 3*numSamples/4;
-	for(int i = lo; i < hi; i++)
+	const size_t lo = numSamples/4, hi = 3*numSamples/4;
+	for(size_t i = lo; i < hi; i++)
 		sum += samples[i];
 	const double clockFrequency = sum / (hi-lo);
 
diff --git a/source/lib/sysdep/arch/x86_x64/x86_x64.h b/source/lib/sysdep/arch/x86_x64/x86_x64.h
index 704bdc5739..3b525759dc 100644
--- a/source/lib/sysdep/arch/x86_x64/x86_x64.h
+++ b/source/lib/sysdep/arch/x86_x64/x86_x64.h
@@ -134,10 +134,7 @@ enum x86_x64_CacheType
 
 const u8 x86_x64_fullyAssociative = 0xFF;
 
-/**
- * describes a level of one of the caches.
- **/
-struct x86_x64_CacheParameters
+struct x86_x64_Cache
 {
 	/**
 	 * (used to determine if this cache is unified or disabled)
@@ -155,34 +152,29 @@ struct x86_x64_CacheParameters
 * instruction and data caches are returned separately by the corresponding
 * accessor function; unified cache levels are reported by both.
 **/
-struct x86_x64_Cache
+struct x86_x64_Caches
 {
-	/**
-	 * total number of levels, each of which is described by
-	 * an entry in parameters[].
-	 **/
-	size_t levels;
-
-	x86_x64_CacheParameters* parameters;
+	size_t numLevels;
+	x86_x64_Cache* levels;
 };
 
 /**
- * @return pointer to a static x86_x64_Cache describing the instruction cache.
+ * @return pointer to a static x86_x64_Caches describing the instruction caches.
 **/
-LIB_API const x86_x64_Cache* x86_x64_ICache();
+LIB_API const x86_x64_Caches* x86_x64_ICaches();
 
 /**
- * @return pointer to a static x86_x64_Cache describing the data cache.
+ * @return pointer to a static x86_x64_Caches describing the data caches.
 **/
-LIB_API const x86_x64_Cache* x86_x64_DCache();
+LIB_API const x86_x64_Caches* x86_x64_DCaches();
 
 LIB_API size_t x86_x64_L1CacheLineSize();
 LIB_API size_t x86_x64_L2CacheLineSize();
 
 /**
- * describes part of a Translation Lookaside Buffer.
+ * Translation Lookaside Buffer.
 **/
-struct x86_x64_TLBParameters
+struct x86_x64_TLB
 {
 	x86_x64_CacheType type;
 	size_t level;
@@ -192,32 +184,28 @@ struct x86_x64_TLBParameters
 };
 
 /**
- * describes all parts of a Translation Lookaside Buffer
+ * describes all levels of a TLB.
 **/
-struct x86_x64_TLB
+struct x86_x64_TLBs
 {
-	/**
-	 * total number of parts, each of which is described by
-	 * an entry in parameters[]
-	 **/
-	size_t numParameters;
-	x86_x64_TLBParameters* parameters;
+	size_t numLevels;
+	x86_x64_TLB* levels;
 };
 
 /**
- * @return pointer to a static x86_x64_TLB describing the instruction TLB.
+ * @return pointer to a static x86_x64_TLB describing the instruction TLBs.
 **/
-LIB_API const x86_x64_TLB* x86_x64_ITLB();
+LIB_API const x86_x64_TLBs* x86_x64_ITLBs();
 
 /**
 * @return pointer to a static x86_x64_TLB describing the data TLB.
 **/
-LIB_API const x86_x64_TLB* x86_x64_DTLB();
+LIB_API const x86_x64_TLBs* x86_x64_DTLBs();
 
 /**
- * @return coverage, i.e. total size [MiB] of the given TLB
+ * @return coverage, i.e. total size [MiB] of the given TLBs
 **/
-LIB_API size_t x86_x64_TLBCoverage(const x86_x64_TLB* tlb);
+LIB_API size_t x86_x64_TLBCoverage(const x86_x64_TLBs* tlb);
 
 
 //-----------------------------------------------------------------------------
diff --git a/source/lib/sysdep/cpu.cpp b/source/lib/sysdep/cpu.cpp
index 679104b17e..6e9e3cc22e 100644
--- a/source/lib/sysdep/cpu.cpp
+++ b/source/lib/sysdep/cpu.cpp
@@ -31,10 +31,24 @@ ERROR_ASSOCIATE(ERR::CPU_FEATURE_MISSING, L"This CPU doesn't support a required
 ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_OPCODE, L"Disassembly failed", -1);
 ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_VENDOR, L"CPU vendor unknown", -1);
 
-void cpu_TestAtomicAdd()
+
+static void TestCAS64()
+{
+	volatile u64 var = 1;
+	cpu_CAS64(&var, 1ull, 2ull);
+	debug_assert(var == 2ull);
+}
+
+static void TestAtomicAdd()
 {
 	volatile intptr_t i1 = 1;
 	intptr_t prev = cpu_AtomicAdd(&i1, 1);
 	debug_assert(prev == 1);
 	debug_assert(i1 == 2);
 }
+
+void cpu_Test()
+{
+	TestCAS64();
+	TestAtomicAdd();
+}
diff --git a/source/lib/sysdep/cpu.h b/source/lib/sysdep/cpu.h
index 53f5293540..6fbc669067 100644
--- a/source/lib/sysdep/cpu.h
+++ b/source/lib/sysdep/cpu.h
@@ -90,6 +90,13 @@ bool cpu_CAS(volatile T* location, T expected, T new_value)
 	return cpu_CAS((volatile intptr_t*)location, (intptr_t)expected, (intptr_t)new_value);
 }
 
+#if ARCH_AMD64
+# define cpu_CAS64 cpu_CAS
+#else
+LIB_API bool cpu_CAS64(volatile u64* location, u64 expected, u64 newValue);
+#endif
+
+
 /**
 * add a signed value to a variable without the possibility of interference
 * from other threads/CPUs.
@@ -98,7 +105,7 @@ bool cpu_CAS(volatile T* location, T expected, T new_value)
 **/
 LIB_API intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
 
-LIB_API void cpu_TestAtomicAdd();
+LIB_API void cpu_Test();
 
 /**
 * enforce strict instruction ordering in the CPU pipeline.
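[Reviewer note, not part of the patch] cpu_CAS64, declared above for 32-bit targets and mapped to the cpu_CAS template on AMD64, is intended to be consumed via a retry loop, as the timer.h changes below do. A minimal sketch of such a caller, assuming only the declarations added by this patch; AtomicAdd64 is a hypothetical helper name, not something the patch introduces:

	#include "lib/sysdep/cpu.h"

	// adds a value to a 64-bit counter without locks: keep retrying the
	// compare-and-swap until no other thread modified the value in between.
	// returns the previous value, mirroring cpu_AtomicAdd.
	static u64 AtomicAdd64(volatile u64* location, u64 increment)
	{
		for(;;)
		{
			const u64 expected = *location;
			if(cpu_CAS64(location, expected, expected + increment))
				return expected;
		}
	}
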
diff --git a/source/lib/sysdep/os/win/whrt/tsc.cpp b/source/lib/sysdep/os/win/whrt/tsc.cpp
index 89e085f2a0..d834df9e87 100644
--- a/source/lib/sysdep/os/win/whrt/tsc.cpp
+++ b/source/lib/sysdep/os/win/whrt/tsc.cpp
@@ -45,45 +45,34 @@ static bool IsUniprocessor()
 {
-	const CpuTopology* topology = cpu_topology_Detect();
-	if(cpu_topology_NumPackages(topology) != 1)
+	if(cpu_topology_NumPackages() != 1)
 		return false;
-	if(cpu_topology_CoresPerPackage(topology) != 1)
+	if(cpu_topology_CoresPerPackage() != 1)
 		return false;
 	return true;
 }
 
 
-enum AmdPowerNowFlags
-{
-	PN_FREQ_ID_CTRL = BIT(1),
-	PN_HW_THERMAL_CTRL = BIT(4),
-	PN_SW_THERMAL_CTRL = BIT(5),
-	PN_INVARIANT_TSC = BIT(8)
-};
-
 static bool IsInvariantTSC()
 {
 #if ARCH_X86_X64
+	// (we no longer need to check x86_x64_Vendor - Intel and AMD
+	// agreed on the definition of this feature check)
 	x86_x64_CpuidRegs regs = { 0 };
-	switch(x86_x64_Vendor())
+	regs.eax = 0x80000007;
+	if(x86_x64_cpuid(&regs))
 	{
-	case X86_X64_VENDOR_AMD:
-		regs.eax = 0x80000007;
-		if(x86_x64_cpuid(&regs))
-		{
-			// TSC is invariant across P-state, C-state and
-			// stop grant transitions (e.g. STPCLK)
-			if(regs.edx & PN_INVARIANT_TSC)
-				return true;
-		}
-		break;
+		// TSC is invariant across P-state, C-state, turbo, and
+		// stop grant transitions (e.g. STPCLK)
+		if(regs.edx & BIT(8))
+			return true;
 	}
 #endif
 	return false;
 }
 
+
 static bool IsThrottlingPossible()
 {
 #if ARCH_X86_X64
@@ -99,6 +88,12 @@ static bool IsThrottlingPossible()
 	regs.eax = 0x80000007;
 	if(x86_x64_cpuid(&regs))
 	{
+		enum AmdPowerNowFlags
+		{
+			PN_FREQ_ID_CTRL = BIT(1),
+			PN_HW_THERMAL_CTRL = BIT(4),
+			PN_SW_THERMAL_CTRL = BIT(5)
+		};
 		if(regs.edx & (PN_FREQ_ID_CTRL|PN_HW_THERMAL_CTRL|PN_SW_THERMAL_CTRL))
 			return true;
 	}
diff --git a/source/lib/sysdep/os/win/whrt/whrt.cpp b/source/lib/sysdep/os/win/whrt/whrt.cpp
index edd0a1ec77..513f9040b1 100644
--- a/source/lib/sysdep/os/win/whrt/whrt.cpp
+++ b/source/lib/sysdep/os/win/whrt/whrt.cpp
@@ -173,6 +173,8 @@ struct TimerState
 	// (this enables calibration, which is currently not implemented,
 	// but leaving open the possibility costs nothing)
 	double time;
+
+	u8 padding[48];
 };
 
 // how do we detect when the old TimerState is no longer in use and can be
@@ -181,10 +183,10 @@ struct TimerState
 // entered critical sections (the latching of TimerState fields) will have
 // been exited before the next update comes around; if not, TimerState.time
 // changes, the critical section notices and re-reads the new values.
-static TimerState timerStates[2];
+static __declspec(align(64)) TimerState timerStates[2];
 // note: exchanging pointers is easier than XORing an index.
-static TimerState* volatile ts = &timerStates[0];
-static TimerState* volatile ts2 = &timerStates[1];
+static volatile TimerState* volatile ts = &timerStates[0];
+static volatile TimerState* volatile ts2 = &timerStates[1];
 
 static void UpdateTimerState()
 {
@@ -201,7 +203,7 @@ static void UpdateTimerState()
 	const u64 deltaTicks = CounterDelta(ts->counter, counter);
 	ts2->counter = counter;
 	ts2->time = ts->time + deltaTicks/nominalFrequency;
-	ts = (TimerState*)InterlockedExchangePointer((volatile PVOID*)&ts2, ts);
+	ts = (volatile TimerState*)InterlockedExchangePointer((volatile PVOID*)&ts2, (PVOID)ts);
 }
 
 double whrt_Time()
 {
@@ -209,6 +211,7 @@ double whrt_Time()
 retry:
 	// latch timer state (counter and time must be from the same update)
 	const double time = ts->time;
+	cpu_MemoryBarrier();
 	const u64 counter = ts->counter;
 	// ts changed after reading time. note: don't compare counter because
 	// it _might_ have the same value after two updates.
diff --git a/source/lib/sysdep/os_cpu.h b/source/lib/sysdep/os_cpu.h
index d53d4eb4b0..c68df3124c 100644
--- a/source/lib/sysdep/os_cpu.h
+++ b/source/lib/sysdep/os_cpu.h
@@ -115,6 +115,24 @@ LIB_API size_t os_cpu_MemoryAvailable();
 **/
 LIB_API uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask);
 
+class os_cpu_ScopedSetThreadAffinityMask
+{
+public:
+	os_cpu_ScopedSetThreadAffinityMask(uintptr_t processorMask)
+		: m_previousProcessorMask(os_cpu_SetThreadAffinityMask(processorMask))
+	{
+	}
+
+	~os_cpu_ScopedSetThreadAffinityMask()
+	{
+		(void)os_cpu_SetThreadAffinityMask(m_previousProcessorMask);
+	}
+
+private:
+	uintptr_t m_previousProcessorMask;
+};
+
+
 /**
 * called by os_cpu_CallByEachCPU.
 * @param processor ID of processor running the current thread for the
diff --git a/source/lib/timer.cpp b/source/lib/timer.cpp
index fc3a76b5a1..c5c4c582be 100644
--- a/source/lib/timer.cpp
+++ b/source/lib/timer.cpp
@@ -144,7 +144,7 @@ double timer_Resolution()
 //
 // do not use std::list et al. for this! we must be callable at any time,
 // especially before NLSO ctors run or before heap init.
-static size_t num_clients;
+static size_t numClients;
 static TimerClient* clients;
 
 
@@ -157,31 +157,24 @@ TimerClient* timer_AddClient(TimerClient* tc, const wchar_t* description)
 	// insert at front of list
 	tc->next = clients;
 	clients = tc;
-	num_clients++;
+	numClients++;
 	return tc;
 }
 
-void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1)
-{
-	tc->sum.AddDifference(t0, t1);
-	tc->num_calls++;
-}
-
-
 void timer_DisplayClientTotals()
 {
-	debug_printf(L"TIMER TOTALS (%lu clients)\n", (unsigned long)num_clients);
+	debug_printf(L"TIMER TOTALS (%lu clients)\n", (unsigned long)numClients);
 	debug_printf(L"-----------------------------------------------------\n");
 
 	while(clients)
 	{
 		// (make sure list and count are consistent)
-		debug_assert(num_clients != 0);
+		debug_assert(numClients != 0);
 
 		TimerClient* tc = clients;
 		clients = tc->next;
-		num_clients--;
+		numClients--;
 
 		const std::wstring duration = tc->sum.ToString();
 		debug_printf(L"  %ls: %ls (%lux)\n", tc->description, duration.c_str(), (unsigned long)tc->num_calls);
diff --git a/source/lib/timer.h b/source/lib/timer.h
index d8430d6cd1..f77df574de 100644
--- a/source/lib/timer.h
+++ b/source/lib/timer.h
@@ -28,6 +28,7 @@
 #define INCLUDED_TIMER
 
 #include "lib/config2.h"	// CONFIG2_TIMER_ALLOW_RDTSC
+#include "lib/sysdep/cpu.h"	// cpu_AtomicAdd
 #if ARCH_X86_X64 && CONFIG2_TIMER_ALLOW_RDTSC
 # include "lib/sysdep/arch/x86_x64/x86_x64.h"	// x86_x64_rdtsc
 # include "lib/sysdep/os_cpu.h"	// os_cpu_ClockFrequency
@@ -172,6 +173,18 @@ public:
 		m_ticks += t1.m_ticks - t0.m_ticks;
 	}
 
+	void AddDifferenceAtomic(TimerUnit t0, TimerUnit t1)
+	{
+		const u64 delta = t1.m_ticks - t0.m_ticks;
+#if ARCH_AMD64
+		cpu_AtomicAdd((volatile intptr_t*)&m_ticks, (intptr_t)delta);
+#else
+retry:
+		if(!cpu_CAS64(&m_ticks, m_ticks, m_ticks+delta))
+			goto retry;
+#endif
+	}
+
 	void Subtract(TimerUnit t)
 	{
 		m_ticks -= t.m_ticks;
@@ -226,6 +239,20 @@ public:
 		m_seconds += t1.m_seconds - t0.m_seconds;
 	}
 
+	void AddDifferenceAtomic(TimerUnit t0, TimerUnit t1)
+	{
+retry:
+		u64 oldRepresentation;
+		memcpy(&oldRepresentation, &m_seconds, sizeof(oldRepresentation));
+
+		const double seconds = m_seconds + t1.m_seconds - t0.m_seconds;
+		u64 newRepresentation;
+		memcpy(&newRepresentation, &seconds, sizeof(newRepresentation));
+
+		if(!cpu_CAS64((volatile u64*)&m_seconds, oldRepresentation, newRepresentation))
+			goto retry;
+	}
+
 	void Subtract(TimerUnit t)
 	{
 		m_seconds -= t.m_seconds;
@@ -274,7 +301,7 @@ struct TimerClient
 
 	// how often timer_BillClient was called (helps measure relative
 	// performance of something that is done indeterminately often).
-	size_t num_calls;
+	intptr_t num_calls;
 };
 
 /**
@@ -304,7 +331,21 @@ LIB_API TimerClient* timer_AddClient(TimerClient* tc, const wchar_t* description
 /**
 * bill the difference between t0 and t1 to the client's total.
 **/
-LIB_API void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1);
+inline void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1)
+{
+	tc->sum.AddDifference(t0, t1);
+	tc->num_calls++;
+}
+
+/**
+ * thread-safe version of timer_BillClient
+ * (not used by default due to its higher overhead)
+ **/
+inline void timer_BillClientAtomic(TimerClient* tc, TimerUnit t0, TimerUnit t1)
+{
+	tc->sum.AddDifferenceAtomic(t0, t1);
+	cpu_AtomicAdd(&tc->num_calls, +1);
+}
 
 /**
 * display all clients' totals; does not reset them.
@@ -335,6 +376,28 @@ private:
 	TimerClient* m_tc;
 };
 
+class ScopeTimerAccrueAtomic
+{
+	NONCOPYABLE(ScopeTimerAccrueAtomic);
+public:
+	ScopeTimerAccrueAtomic(TimerClient* tc)
+		: m_tc(tc)
+	{
+		m_t0.SetFromTimer();
+	}
+
+	~ScopeTimerAccrueAtomic()
+	{
+		TimerUnit t1;
+		t1.SetFromTimer();
+		timer_BillClientAtomic(m_tc, m_t0, t1);
+	}
+
+private:
+	TimerUnit m_t0;
+	TimerClient* m_tc;
+};
+
 /**
 * Measure the time taken to execute code up until end of the current scope;
 * bill it to the given TimerClient object. Can safely be nested.
@@ -356,5 +419,6 @@ private:
 * timer_DisplayClientTotals();
 **/
 #define TIMER_ACCRUE(client) ScopeTimerAccrue UID__(client)
+#define TIMER_ACCRUE_ATOMIC(client) ScopeTimerAccrueAtomic UID__(client)
 
 #endif	// #ifndef INCLUDED_TIMER
diff --git a/source/ps/Util.cpp b/source/ps/Util.cpp
index d1e92216ca..ab35d2f5dc 100644
--- a/source/ps/Util.cpp
+++ b/source/ps/Util.cpp
@@ -99,8 +99,7 @@ void WriteSystemInfo()
 	fprintf(f, "OS : %s %s (%s)\n", un.sysname, un.release, un.version);
 
 	// CPU
-	const CpuTopology* topology = cpu_topology_Detect();
-	fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), (int)cpu_topology_NumPackages(topology), (int)cpu_topology_CoresPerPackage(topology), (int)cpu_topology_LogicalPerCore(topology));
+	fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), (int)cpu_topology_NumPackages(), (int)cpu_topology_CoresPerPackage(), (int)cpu_topology_LogicalPerCore());
 	const double cpu_freq = os_cpu_ClockFrequency();
 	if(cpu_freq != 0.0f)
 	{
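[Reviewer note, not part of the patch] With the opaque CpuTopology/CacheTopology objects gone, the topology accessors can now be called directly and initialize themselves on first use, as Util.cpp above does. A short sketch of the thread-pool dimensioning that topology.h's comment alludes to; the function name and the one-worker-per-core policy are illustrative only:

	#include "lib/sysdep/arch/x86_x64/topology.h"

	// one worker per enabled core; SMT units are deliberately not counted.
	static size_t ChooseWorkerCount()
	{
		return cpu_topology_NumPackages() * cpu_topology_CoresPerPackage();
	}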
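[Reviewer note, not part of the patch] The new TIMER_ACCRUE_ATOMIC macro is used exactly like the existing TIMER_ACCRUE, but is safe when the guarded code runs on several threads at once. A minimal usage sketch assuming the timer.h additions above; the client and function names are illustrative, not part of the codebase:

	#include "lib/timer.h"

	// a single client shared by all threads; timer_AddClient links it into
	// the global list so timer_DisplayClientTotals() will report it.
	static TimerClient tc_worker;
	static TimerClient* s_worker = timer_AddClient(&tc_worker, L"worker");

	void DoChunkOfWork()
	{
		// bills this scope's elapsed time to s_worker via the atomic
		// AddDifferenceAtomic / cpu_AtomicAdd path.
		TIMER_ACCRUE_ATOMIC(s_worker);
		// ... actual work ...
	}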