from work: add thread-safe timer; add CAS64 (required for thread-safe timer); remove topology init requirements, no longer cache results (simplifies GroupPolicy_PerCache); add scoped affinitizer; whrt: fix race condition reported by parallel inspector. also refactor cache and TLB detection.
This was SVN commit r7785.
This commit is contained in:
parent
2b1541ba0b
commit
aa44bac652
@ -158,6 +158,11 @@ bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value)
|
||||
return ia32_asm_CAS(location, expected, new_value);
|
||||
}
|
||||
|
||||
// 64-bit compare-and-swap; forwards to the ia32 assembly routine
// ia32_asm_CAS64 and returns its result unchanged.
// @param location address of the 64-bit value to update.
// @param expected value that *location must currently hold for the swap
//   to take place.
// @param new_value value to store if the comparison succeeds.
// @return whatever ia32_asm_CAS64 reports (its flag is derived from the
//   comparison result of the locked cmpxchg8b, i.e. true if swapped).
bool cpu_CAS64(volatile u64* location, u64 expected, u64 new_value)
|
||||
{
|
||||
return ia32_asm_CAS64(location, expected, new_value);
|
||||
}
|
||||
|
||||
|
||||
void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size)
|
||||
{
|
||||
|
@ -92,6 +92,25 @@ db 0xf0 ; LOCK prefix
|
||||
ret
|
||||
|
||||
|
||||
; extern bool CALL_CONV ia32_asm_CAS64(volatile u64* location, u64 expected, u64 new_value);
;
; atomic 64-bit compare-and-swap built on LOCK CMPXCHG8B.
; returns (in eax) 1 if *location equalled <expected> and was replaced by
; <new_value>, otherwise 0.
; NOTE(review): the [esp+..] loads below assume CALL_CONV passes all
; arguments on the stack - confirm against the CALL_CONV definition.
|
||||
global sym(ia32_asm_CAS64)
|
||||
sym(ia32_asm_CAS64):
|
||||
push ebx ; saved here, restored before ret (holds low half of new_value)
|
||||
push esi ; saved here, restored before ret (holds location)
|
||||
; argument offsets: +8 skips the two registers pushed above,
; the next +4 skips the return address.
mov esi, [esp+8+4] ; location
|
||||
mov eax, [esp+8+8] ; low dword of expected
|
||||
mov edx, [esp+8+12] ; edx:eax = expected
|
||||
mov ebx, [esp+8+16] ; low dword of new_value
|
||||
mov ecx, [esp+8+20] ; ecx:ebx = new_value
|
||||
db 0xf0 ; LOCK prefix
|
||||
cmpxchg8b [esi] ; if [esi] == edx:eax then [esi] = ecx:ebx and ZF=1, else ZF=0
|
||||
sete al ; al = 1 if the comparison succeeded (ZF set)
|
||||
movzx eax, al ; widen the flag to a full 32-bit return value
|
||||
pop esi
|
||||
pop ebx
|
||||
ret
|
||||
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
; FPU
|
||||
;-------------------------------------------------------------------------------
|
||||
|
@ -36,6 +36,7 @@ extern void CALL_CONV ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
|
||||
|
||||
extern intptr_t CALL_CONV ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
|
||||
extern bool CALL_CONV ia32_asm_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value);
|
||||
extern bool CALL_CONV ia32_asm_CAS64(volatile u64* location, u64 expected, u64 new_value);
|
||||
|
||||
/// control87
|
||||
// FPU control word
|
||||
|
@ -29,8 +29,8 @@ class TestTopology : public CxxTest::TestSuite
|
||||
public:
|
||||
void test_run()
|
||||
{
|
||||
// Just run the function, ignoring the return value, so
|
||||
// Valgrind can check it's not doing anything very bad
|
||||
cpu_topology_Detect();
|
||||
TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_NumPackages());
|
||||
TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_CoresPerPackage());
|
||||
TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_LogicalPerCore());
|
||||
}
|
||||
};
|
||||
|
@ -105,9 +105,9 @@ static size_t MaxLogicalPerCore()
|
||||
|
||||
static size_t MaxLogicalPerCache()
|
||||
{
|
||||
const x86_x64_Cache* const dcache = x86_x64_DCache();
|
||||
if(dcache->levels >= 2)
|
||||
return dcache->parameters[1].sharedBy;
|
||||
const x86_x64_Caches* const dcaches = x86_x64_DCaches();
|
||||
if(dcaches->numLevels >= 2)
|
||||
return dcaches->levels[1].sharedBy;
|
||||
else
|
||||
return 1; // default
|
||||
}
|
||||
@ -204,8 +204,9 @@ static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t nu
|
||||
}
|
||||
|
||||
|
||||
static size_t NumPackages(const u8* apicIds)
|
||||
size_t cpu_topology_NumPackages()
|
||||
{
|
||||
const u8* apicIds = ApicIds();
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = ceil_log2(MaxCoresPerPackage()) + ceil_log2(MaxLogicalPerCore());
|
||||
@ -236,8 +237,9 @@ static size_t NumPackages(const u8* apicIds)
|
||||
}
|
||||
|
||||
|
||||
static size_t CoresPerPackage(const u8* apicIds)
|
||||
size_t cpu_topology_CoresPerPackage()
|
||||
{
|
||||
const u8* apicIds = ApicIds();
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = ceil_log2(MaxLogicalPerCore());
|
||||
@ -251,8 +253,9 @@ static size_t CoresPerPackage(const u8* apicIds)
|
||||
}
|
||||
|
||||
|
||||
static size_t LogicalPerCore(const u8* apicIds)
|
||||
size_t cpu_topology_LogicalPerCore()
|
||||
{
|
||||
const u8* apicIds = ApicIds();
|
||||
if(apicIds)
|
||||
{
|
||||
const size_t offset = 0;
|
||||
@ -266,49 +269,6 @@ static size_t LogicalPerCore(const u8* apicIds)
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU topology interface
|
||||
|
||||
struct CpuTopology // POD
|
||||
{
|
||||
size_t numPackages;
|
||||
size_t coresPerPackage;
|
||||
size_t logicalPerCore;
|
||||
};
|
||||
static CpuTopology cpuTopology;
|
||||
|
||||
static LibError InitCpuTopology()
|
||||
{
|
||||
const u8* apicIds = ApicIds();
|
||||
cpuTopology.numPackages = NumPackages(apicIds);
|
||||
cpuTopology.coresPerPackage = CoresPerPackage(apicIds);
|
||||
cpuTopology.logicalPerCore = LogicalPerCore(apicIds);
|
||||
return INFO::OK;
|
||||
}
|
||||
|
||||
const CpuTopology* cpu_topology_Detect()
|
||||
{
|
||||
static ModuleInitState initState;
|
||||
ModuleInit(&initState, InitCpuTopology);
|
||||
return &cpuTopology;
|
||||
}
|
||||
|
||||
size_t cpu_topology_NumPackages(const CpuTopology* topology)
|
||||
{
|
||||
return topology->numPackages;
|
||||
}
|
||||
|
||||
size_t cpu_topology_CoresPerPackage(const CpuTopology* topology)
|
||||
{
|
||||
return topology->coresPerPackage;
|
||||
}
|
||||
|
||||
size_t cpu_topology_LogicalPerCore(const CpuTopology* topology)
|
||||
{
|
||||
return topology->logicalPerCore;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// cache topology
|
||||
|
||||
@ -451,6 +411,7 @@ struct CacheTopology // POD
|
||||
uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
|
||||
};
|
||||
static CacheTopology cacheTopology;
|
||||
static ModuleInitState cacheInitState;
|
||||
|
||||
static LibError InitCacheTopology()
|
||||
{
|
||||
@ -460,26 +421,22 @@ static LibError InitCacheTopology()
|
||||
return INFO::OK;
|
||||
}
|
||||
|
||||
const CacheTopology* cache_topology_Detect()
|
||||
size_t cache_topology_NumCaches()
|
||||
{
|
||||
static ModuleInitState initState;
|
||||
ModuleInit(&initState, InitCacheTopology);
|
||||
return &cacheTopology;
|
||||
ModuleInit(&cacheInitState, InitCacheTopology);
|
||||
return cacheTopology.numCaches;
|
||||
}
|
||||
|
||||
size_t cache_topology_NumCaches(const CacheTopology* topology)
|
||||
{
|
||||
return topology->numCaches;
|
||||
}
|
||||
|
||||
size_t cache_topology_CacheFromProcessor(const CacheTopology* topology, size_t processor)
|
||||
size_t cache_topology_CacheFromProcessor(size_t processor)
|
||||
{
|
||||
ModuleInit(&cacheInitState, InitCacheTopology);
|
||||
debug_assert(processor < os_cpu_NumProcessors());
|
||||
return topology->processorsCache[processor];
|
||||
return cacheTopology.processorsCache[processor];
|
||||
}
|
||||
|
||||
uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology* topology, size_t cache)
|
||||
uintptr_t cache_topology_ProcessorMaskFromCache(size_t cache)
|
||||
{
|
||||
debug_assert(cache < topology->numCaches);
|
||||
return topology->cachesProcessorMask[cache];
|
||||
ModuleInit(&cacheInitState, InitCacheTopology);
|
||||
debug_assert(cache < cacheTopology.numCaches);
|
||||
return cacheTopology.cachesProcessorMask[cache];
|
||||
}
|
||||
|
@ -21,22 +21,13 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
* detection of CPU and cache topology
|
||||
* detection of CPU and cache topology.
|
||||
* thread-safe, no explicit initialization is required.
|
||||
*/
|
||||
|
||||
#ifndef INCLUDED_TOPOLOGY
|
||||
#define INCLUDED_TOPOLOGY
|
||||
|
||||
// interface rationale:
|
||||
// - explicit initialization avoids the difficulty and overhead of
|
||||
// thread-safe lazy initialization checks.
|
||||
// - requiring an opaque struct to be passed in ensures users call the
|
||||
// init function before using the accessors.
|
||||
// - delegating responsibility for thread-safety to the caller of the
|
||||
// first *_Detect invocation avoids overhead and keeps us independent of
|
||||
// the various threading packages (Boost, OpenMP, POSIX, Win32, ..)
|
||||
|
||||
|
||||
/**
|
||||
* @return a pointer to array (up to os_cpu_MaxProcessors entries;
|
||||
* os_cpu_NumProcessors() of them are valid) of the processors'
|
||||
@ -49,76 +40,54 @@ LIB_API const u8* ApicIds();
|
||||
//-----------------------------------------------------------------------------
|
||||
// cpu
|
||||
|
||||
/**
|
||||
* stores CPU topology, i.e. how many packages, cores and SMT units are
|
||||
* actually present and enabled. this is useful for detecting SMP systems,
|
||||
* predicting performance and dimensioning thread pools.
|
||||
*
|
||||
* note: OS abstractions usually only mention "processors", which could be
|
||||
* any mix of the above.
|
||||
**/
|
||||
struct CpuTopology;
|
||||
|
||||
/**
|
||||
* initialize static storage from which topology can be retrieved by
|
||||
* means of the following functions.
|
||||
* @return const pointer to a shared instance.
|
||||
**/
|
||||
LIB_API const CpuTopology* cpu_topology_Detect();
|
||||
// the CPU topology, i.e. how many packages, cores and SMT units are
|
||||
// actually present and enabled, is useful for detecting SMP systems,
|
||||
// predicting performance and dimensioning thread pools.
|
||||
//
|
||||
// note: OS abstractions usually only mention "processors", which could be
|
||||
// any mix of the above.
|
||||
|
||||
/**
|
||||
* @return number of *enabled* CPU packages / sockets.
|
||||
**/
|
||||
LIB_API size_t cpu_topology_NumPackages(const CpuTopology*);
|
||||
LIB_API size_t cpu_topology_NumPackages();
|
||||
|
||||
/**
|
||||
* @return number of *enabled* CPU cores per package.
|
||||
* (2 on dual-core systems)
|
||||
**/
|
||||
LIB_API size_t cpu_topology_CoresPerPackage(const CpuTopology*);
|
||||
LIB_API size_t cpu_topology_CoresPerPackage();
|
||||
|
||||
/**
|
||||
* @return number of *enabled* hyperthreading units per core.
|
||||
* (2 on P4 EE)
|
||||
**/
|
||||
LIB_API size_t cpu_topology_LogicalPerCore(const CpuTopology*);
|
||||
LIB_API size_t cpu_topology_LogicalPerCore();
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// L2 cache
|
||||
|
||||
/**
|
||||
* stores L2 cache topology, i.e. the mapping between processor and caches.
|
||||
* this allows cores sharing a cache to work together on the same dataset,
|
||||
* which may reduce contention and increase effective capacity.
|
||||
*
|
||||
* example: Intel Core2 micro-architectures (e.g. Intel Core2) feature
|
||||
* partitioned L2 caches shared by two cores.
|
||||
**/
|
||||
struct CacheTopology;
|
||||
// knowledge of the cache topology, i.e. which processors share which caches,
|
||||
// can be used to reduce contention and increase effective capacity by
|
||||
// assigning the partner processors to work on the same dataset.
|
||||
//
|
||||
// example: Intel Core2 micro-architectures feature L2 caches shared by
|
||||
// two cores.
|
||||
|
||||
/**
|
||||
* initialize static storage from which topology can be retrieved by
|
||||
* means of the following functions.
|
||||
* @return const pointer to a shared instance.
|
||||
*
|
||||
* WARNING: this function must not be reentered before it has returned once.
|
||||
* @return number of distinct L2 caches.
|
||||
**/
|
||||
LIB_API const CacheTopology* cache_topology_Detect();
|
||||
|
||||
/**
|
||||
* @return number of distinct L2 caches
|
||||
**/
|
||||
LIB_API size_t cache_topology_NumCaches(const CacheTopology*);
|
||||
LIB_API size_t cache_topology_NumCaches();
|
||||
|
||||
/**
|
||||
* @return L2 cache number (zero-based) to which <processor> belongs.
|
||||
**/
|
||||
LIB_API size_t cache_topology_CacheFromProcessor(const CacheTopology*, size_t processor);
|
||||
LIB_API size_t cache_topology_CacheFromProcessor(size_t processor);
|
||||
|
||||
/**
|
||||
* @return bit-mask of all processors sharing <cache>.
|
||||
**/
|
||||
LIB_API uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology*, size_t cache);
|
||||
LIB_API uintptr_t cache_topology_ProcessorMaskFromCache(size_t cache);
|
||||
|
||||
#endif // #ifndef INCLUDED_TOPOLOGY
|
||||
|
@ -285,57 +285,67 @@ size_t x86_x64_Generation()
|
||||
//-----------------------------------------------------------------------------
|
||||
// cache
|
||||
|
||||
static const size_t maxCacheParams = 3;
|
||||
static x86_x64_CacheParameters cacheParametersStorage[maxCacheParams*2];
|
||||
static x86_x64_Cache dcache = { 0, cacheParametersStorage };
|
||||
static x86_x64_Cache icache = { 0, cacheParametersStorage+maxCacheParams };
|
||||
static const size_t maxCacheLevels = 3;
|
||||
static x86_x64_Cache cacheStorage[maxCacheLevels*2];
|
||||
static x86_x64_Caches dcaches = { 0, cacheStorage };
|
||||
static x86_x64_Caches icaches = { 0, cacheStorage+maxCacheLevels };
|
||||
|
||||
static const size_t maxTLBParams = 15;
|
||||
static x86_x64_TLBParameters tlbParametersStorage[maxTLBParams*2];
|
||||
static x86_x64_TLB dtlb = { 0, tlbParametersStorage };
|
||||
static x86_x64_TLB itlb = { 0, tlbParametersStorage+maxTLBParams };
|
||||
static const size_t maxTLBLevels = 15;
|
||||
static x86_x64_TLB tlbStorage[maxTLBLevels*2];
|
||||
static x86_x64_TLBs dtlbs = { 0, tlbStorage };
|
||||
static x86_x64_TLBs itlbs = { 0, tlbStorage+maxTLBLevels };
|
||||
|
||||
static void AddTLBParameters(const x86_x64_TLBParameters& params)
|
||||
static bool IsData(x86_x64_CacheType type)
|
||||
{
|
||||
if(params.type == X86_X64_CACHE_TYPE_INSTRUCTION || params.type == X86_X64_CACHE_TYPE_UNIFIED)
|
||||
return (type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED);
|
||||
}
|
||||
|
||||
// @return true if a cache/TLB of the given type holds instructions,
// i.e. it is either a dedicated instruction cache or a unified one.
static bool IsInstruction(x86_x64_CacheType type)
{
	const bool isInstructionOnly = (type == X86_X64_CACHE_TYPE_INSTRUCTION);
	const bool isUnified = (type == X86_X64_CACHE_TYPE_UNIFIED);
	return isInstructionOnly || isUnified;
}
|
||||
|
||||
static void AddTLB(const x86_x64_TLB& tlb)
|
||||
{
|
||||
if(IsInstruction(tlb.type))
|
||||
{
|
||||
if(itlb.numParameters < maxTLBParams)
|
||||
itlb.parameters[itlb.numParameters++] = params;
|
||||
if(itlbs.numLevels < maxTLBLevels)
|
||||
itlbs.levels[itlbs.numLevels++] = tlb;
|
||||
else
|
||||
debug_assert(0);
|
||||
}
|
||||
if(params.type == X86_X64_CACHE_TYPE_DATA || params.type == X86_X64_CACHE_TYPE_UNIFIED)
|
||||
if(IsData(tlb.type))
|
||||
{
|
||||
if(dtlb.numParameters < maxTLBParams)
|
||||
dtlb.parameters[dtlb.numParameters++] = params;
|
||||
if(dtlbs.numLevels < maxTLBLevels)
|
||||
dtlbs.levels[dtlbs.numLevels++] = tlb;
|
||||
else
|
||||
debug_assert(0);
|
||||
}
|
||||
|
||||
// large page TLBs have N 2M entries or N/2 4M entries; we generate a
|
||||
// second set of parameters for the latter from the former.
|
||||
if(params.pageSize == 2*MiB)
|
||||
if(tlb.pageSize == 2*MiB)
|
||||
{
|
||||
x86_x64_TLBParameters params4M = params;
|
||||
params4M.pageSize = 4*MiB;
|
||||
params4M.entries = params.entries/2;
|
||||
AddTLBParameters(params4M);
|
||||
x86_x64_TLB tlb4M = tlb;
|
||||
tlb4M.pageSize = 4*MiB;
|
||||
tlb4M.entries = tlb.entries/2;
|
||||
AddTLB(tlb4M);
|
||||
}
|
||||
}
|
||||
|
||||
namespace AMD
|
||||
{
|
||||
|
||||
static x86_x64_CacheParameters L1Parameters(u32 reg, x86_x64_CacheType type)
|
||||
static x86_x64_Cache L1Cache(u32 reg, x86_x64_CacheType type)
|
||||
{
|
||||
x86_x64_CacheParameters params;
|
||||
params.type = type;
|
||||
params.level = 1;
|
||||
params.associativity = bits(reg, 16, 23);
|
||||
params.lineSize = bits(reg, 0, 7);
|
||||
params.sharedBy = 1;
|
||||
params.totalSize = bits(reg, 24, 31)*KiB;
|
||||
return params;
|
||||
x86_x64_Cache cache;
|
||||
cache.type = type;
|
||||
cache.level = 1;
|
||||
cache.associativity = bits(reg, 16, 23);
|
||||
cache.lineSize = bits(reg, 0, 7);
|
||||
cache.sharedBy = 1;
|
||||
cache.totalSize = bits(reg, 24, 31)*KiB;
|
||||
return cache;
|
||||
}
|
||||
|
||||
// applies to L2, L3 and TLB2
|
||||
@ -345,85 +355,85 @@ static const size_t associativities[16] =
|
||||
16, 0, 32, 48, 64, 96, 128, x86_x64_fullyAssociative
|
||||
};
|
||||
|
||||
static x86_x64_CacheParameters L2Parameters(u32 reg, x86_x64_CacheType type)
|
||||
static x86_x64_Cache L2Cache(u32 reg, x86_x64_CacheType type)
|
||||
{
|
||||
x86_x64_CacheParameters params;
|
||||
x86_x64_Cache cache;
|
||||
const size_t associativityIndex = bits(reg, 12, 15);
|
||||
if(associativityIndex == 0) // disabled
|
||||
{
|
||||
params.type = X86_X64_CACHE_TYPE_NULL;
|
||||
params.associativity = 0;
|
||||
cache.type = X86_X64_CACHE_TYPE_NULL;
|
||||
cache.associativity = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
params.type = type;
|
||||
params.associativity = associativities[associativityIndex];
|
||||
debug_assert(params.associativity != 0); // else: encoding is "reserved"
|
||||
cache.type = type;
|
||||
cache.associativity = associativities[associativityIndex];
|
||||
debug_assert(cache.associativity != 0); // else: encoding is "reserved"
|
||||
}
|
||||
params.level = 2;
|
||||
params.lineSize = bits(reg, 0, 7);
|
||||
params.sharedBy = 1;
|
||||
params.totalSize = bits(reg, 16, 31)*KiB;
|
||||
return params;
|
||||
cache.level = 2;
|
||||
cache.lineSize = bits(reg, 0, 7);
|
||||
cache.sharedBy = 1;
|
||||
cache.totalSize = bits(reg, 16, 31)*KiB;
|
||||
return cache;
|
||||
}
|
||||
|
||||
// (same as L2 except for the totalSize encoding)
|
||||
static x86_x64_CacheParameters L3Parameters(u32 reg, x86_x64_CacheType type)
|
||||
static x86_x64_Cache L3Cache(u32 reg, x86_x64_CacheType type)
|
||||
{
|
||||
x86_x64_CacheParameters params = L2Parameters(reg, type);
|
||||
params.level = 3;
|
||||
params.totalSize = bits(reg, 18, 31)*512*KiB; // (rounded down)
|
||||
return params;
|
||||
x86_x64_Cache cache = L2Cache(reg, type);
|
||||
cache.level = 3;
|
||||
cache.totalSize = bits(reg, 18, 31)*512*KiB; // (rounded down)
|
||||
return cache;
|
||||
}
|
||||
|
||||
static x86_x64_TLBParameters TLB1Parameters(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
|
||||
static x86_x64_TLB TLB1(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
|
||||
{
|
||||
x86_x64_TLBParameters params;
|
||||
params.type = type;
|
||||
params.level = 1;
|
||||
params.associativity = bits(reg, bitOffset+8, bitOffset+15);
|
||||
params.pageSize = pageSize;
|
||||
params.entries = bits(reg, bitOffset, bitOffset+7);
|
||||
return params;
|
||||
x86_x64_TLB tlb;
|
||||
tlb.type = type;
|
||||
tlb.level = 1;
|
||||
tlb.associativity = bits(reg, bitOffset+8, bitOffset+15);
|
||||
tlb.pageSize = pageSize;
|
||||
tlb.entries = bits(reg, bitOffset, bitOffset+7);
|
||||
return tlb;
|
||||
}
|
||||
|
||||
static void AddTLB1Parameters(const x86_x64_CpuidRegs& regs)
|
||||
static void AddTLB1(const x86_x64_CpuidRegs& regs)
|
||||
{
|
||||
AddTLBParameters(TLB1Parameters(regs.eax, 0, 2*MiB, X86_X64_CACHE_TYPE_INSTRUCTION));
|
||||
AddTLBParameters(TLB1Parameters(regs.eax, 16, 2*MiB, X86_X64_CACHE_TYPE_DATA));
|
||||
AddTLBParameters(TLB1Parameters(regs.ebx, 0, 4*KiB, X86_X64_CACHE_TYPE_INSTRUCTION));
|
||||
AddTLBParameters(TLB1Parameters(regs.ebx, 16, 4*KiB, X86_X64_CACHE_TYPE_DATA));
|
||||
AddTLB(TLB1(regs.eax, 0, 2*MiB, X86_X64_CACHE_TYPE_INSTRUCTION));
|
||||
AddTLB(TLB1(regs.eax, 16, 2*MiB, X86_X64_CACHE_TYPE_DATA));
|
||||
AddTLB(TLB1(regs.ebx, 0, 4*KiB, X86_X64_CACHE_TYPE_INSTRUCTION));
|
||||
AddTLB(TLB1(regs.ebx, 16, 4*KiB, X86_X64_CACHE_TYPE_DATA));
|
||||
}
|
||||
|
||||
static x86_x64_TLBParameters TLB2Parameters(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
|
||||
static x86_x64_TLB TLB2(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
|
||||
{
|
||||
x86_x64_TLBParameters params;
|
||||
x86_x64_TLB tlb;
|
||||
const size_t associativityIndex = bits(reg, bitOffset+12, bitOffset+15);
|
||||
if(associativityIndex == 0) // disabled
|
||||
{
|
||||
params.type = X86_X64_CACHE_TYPE_NULL;
|
||||
params.associativity = 0;
|
||||
tlb.type = X86_X64_CACHE_TYPE_NULL;
|
||||
tlb.associativity = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
params.type = type;
|
||||
params.associativity = associativities[associativityIndex];
|
||||
tlb.type = type;
|
||||
tlb.associativity = associativities[associativityIndex];
|
||||
}
|
||||
params.level = 2;
|
||||
params.pageSize = pageSize;
|
||||
params.entries = bits(reg, bitOffset, bitOffset+11);
|
||||
return params;
|
||||
tlb.level = 2;
|
||||
tlb.pageSize = pageSize;
|
||||
tlb.entries = bits(reg, bitOffset, bitOffset+11);
|
||||
return tlb;
|
||||
}
|
||||
|
||||
static void AddTLB2ParameterPair(u32 reg, size_t pageSize)
|
||||
static void AddTLB2Pair(u32 reg, size_t pageSize)
|
||||
{
|
||||
x86_x64_CacheType type = X86_X64_CACHE_TYPE_UNIFIED;
|
||||
if(bits(reg, 16, 31) != 0) // not unified
|
||||
{
|
||||
AddTLBParameters(TLB2Parameters(reg, 16, pageSize, X86_X64_CACHE_TYPE_DATA));
|
||||
AddTLB(TLB2(reg, 16, pageSize, X86_X64_CACHE_TYPE_DATA));
|
||||
type = X86_X64_CACHE_TYPE_INSTRUCTION;
|
||||
}
|
||||
AddTLBParameters(TLB2Parameters(reg, 0, pageSize, type));
|
||||
AddTLB(TLB2(reg, 0, pageSize, type));
|
||||
}
|
||||
|
||||
// AMD reports maxCpuidIdFunction > 4 but considers functions 2..4 to be
|
||||
@ -435,24 +445,24 @@ static void DetectCacheAndTLB()
|
||||
regs.eax = 0x80000005;
|
||||
if(x86_x64_cpuid(®s))
|
||||
{
|
||||
AddTLB1Parameters(regs);
|
||||
AddTLB1(regs);
|
||||
|
||||
dcache.levels = icache.levels = 1;
|
||||
dcache.parameters[0] = L1Parameters(regs.ecx, X86_X64_CACHE_TYPE_DATA);
|
||||
icache.parameters[0] = L1Parameters(regs.edx, X86_X64_CACHE_TYPE_INSTRUCTION);
|
||||
dcaches.numLevels = icaches.numLevels = 1;
|
||||
dcaches.levels[0] = L1Cache(regs.ecx, X86_X64_CACHE_TYPE_DATA);
|
||||
icaches.levels[0] = L1Cache(regs.edx, X86_X64_CACHE_TYPE_INSTRUCTION);
|
||||
}
|
||||
|
||||
regs.eax = 0x80000006;
|
||||
if(x86_x64_cpuid(®s))
|
||||
{
|
||||
AddTLB2ParameterPair(regs.eax, 2*MiB);
|
||||
AddTLB2ParameterPair(regs.ebx, 4*KiB);
|
||||
AddTLB2Pair(regs.eax, 2*MiB);
|
||||
AddTLB2Pair(regs.ebx, 4*KiB);
|
||||
|
||||
icache.levels = dcache.levels = 2;
|
||||
icache.parameters[1] = dcache.parameters[1] = L2Parameters(regs.ecx, X86_X64_CACHE_TYPE_UNIFIED);
|
||||
icaches.numLevels = dcaches.numLevels = 2;
|
||||
icaches.levels[1] = dcaches.levels[1] = L2Cache(regs.ecx, X86_X64_CACHE_TYPE_UNIFIED);
|
||||
|
||||
icache.levels = dcache.levels = 3;
|
||||
icache.parameters[2] = dcache.parameters[2] = L3Parameters(regs.edx, X86_X64_CACHE_TYPE_UNIFIED);
|
||||
icaches.numLevels = dcaches.numLevels = 3;
|
||||
icaches.levels[2] = dcaches.levels[2] = L3Cache(regs.edx, X86_X64_CACHE_TYPE_UNIFIED);
|
||||
}
|
||||
}
|
||||
|
||||
@ -480,27 +490,27 @@ static void DetectCache_CPUID4()
|
||||
if(type == X86_X64_CACHE_TYPE_NULL) // no more remaining
|
||||
break;
|
||||
|
||||
x86_x64_CacheParameters params;
|
||||
params.type = type;
|
||||
params.level = level;
|
||||
params.associativity = (size_t)bits(regs.ebx, 22, 31)+1;
|
||||
params.lineSize = (size_t)bits(regs.ebx, 0, 11)+1; // (yes, this also uses +1 encoding)
|
||||
params.sharedBy = (size_t)bits(regs.eax, 14, 25)+1;
|
||||
x86_x64_Cache cache;
|
||||
cache.type = type;
|
||||
cache.level = level;
|
||||
cache.associativity = (size_t)bits(regs.ebx, 22, 31)+1;
|
||||
cache.lineSize = (size_t)bits(regs.ebx, 0, 11)+1; // (yes, this also uses +1 encoding)
|
||||
cache.sharedBy = (size_t)bits(regs.eax, 14, 25)+1;
|
||||
{
|
||||
const size_t partitions = (size_t)bits(regs.ebx, 12, 21)+1;
|
||||
const size_t sets = (size_t)bits(regs.ecx, 0, 31)+1;
|
||||
params.totalSize = params.associativity * partitions * params.lineSize * sets;
|
||||
cache.totalSize = cache.associativity * partitions * cache.lineSize * sets;
|
||||
}
|
||||
|
||||
if(type == X86_X64_CACHE_TYPE_INSTRUCTION || type == X86_X64_CACHE_TYPE_UNIFIED)
|
||||
if(IsInstruction(type))
|
||||
{
|
||||
icache.levels = std::max(icache.levels, level);
|
||||
icache.parameters[level-1] = params;
|
||||
icaches.numLevels = std::max(icaches.numLevels, level);
|
||||
icaches.levels[level-1] = cache;
|
||||
}
|
||||
if(type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED)
|
||||
if(IsData(type))
|
||||
{
|
||||
dcache.levels = std::max(dcache.levels, level);
|
||||
dcache.parameters[level-1] = params;
|
||||
dcaches.numLevels = std::max(dcaches.numLevels, level);
|
||||
dcaches.levels[level-1] = cache;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -624,24 +634,24 @@ static void DecodeDescriptor(u8 descriptor)
|
||||
else
|
||||
debug_assert(0);
|
||||
|
||||
x86_x64_TLBParameters params;
|
||||
params.type = type;
|
||||
params.level = level;
|
||||
params.associativity = properties.associativity;
|
||||
params.pageSize = pageSize;
|
||||
params.entries = properties.entries;
|
||||
x86_x64_TLB tlb;
|
||||
tlb.type = type;
|
||||
tlb.level = level;
|
||||
tlb.associativity = properties.associativity;
|
||||
tlb.pageSize = pageSize;
|
||||
tlb.entries = properties.entries;
|
||||
|
||||
if(type == X86_X64_CACHE_TYPE_INSTRUCTION || type == X86_X64_CACHE_TYPE_UNIFIED)
|
||||
if(IsInstruction(type))
|
||||
{
|
||||
if(itlb.numParameters < maxTLBParams)
|
||||
itlb.parameters[itlb.numParameters++] = params;
|
||||
if(itlbs.numLevels < maxTLBLevels)
|
||||
itlbs.levels[itlbs.numLevels++] = tlb;
|
||||
else
|
||||
debug_assert(0);
|
||||
}
|
||||
if(type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED)
|
||||
if(IsData(type))
|
||||
{
|
||||
if(dtlb.numParameters < maxTLBParams)
|
||||
dtlb.parameters[dtlb.numParameters++] = params;
|
||||
if(dtlbs.numLevels < maxTLBLevels)
|
||||
dtlbs.levels[dtlbs.numLevels++] = tlb;
|
||||
else
|
||||
debug_assert(0);
|
||||
}
|
||||
@ -694,71 +704,71 @@ static LibError DetectCacheAndTLB()
|
||||
}
|
||||
|
||||
// sanity check: cache type must match that of the data structure
|
||||
for(size_t i = 0; i < dcache.levels; i++)
|
||||
debug_assert(dcache.parameters[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
|
||||
for(size_t i = 0; i < icache.levels; i++)
|
||||
debug_assert(icache.parameters[i].type != X86_X64_CACHE_TYPE_DATA);
|
||||
for(size_t i = 0; i < dtlb.numParameters; i++)
|
||||
debug_assert(dtlb.parameters[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
|
||||
for(size_t i = 0; i < itlb.numParameters; i++)
|
||||
debug_assert(itlb.parameters[i].type != X86_X64_CACHE_TYPE_DATA);
|
||||
for(size_t i = 0; i < dcaches.numLevels; i++)
|
||||
debug_assert(dcaches.levels[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
|
||||
for(size_t i = 0; i < icaches.numLevels; i++)
|
||||
debug_assert(icaches.levels[i].type != X86_X64_CACHE_TYPE_DATA);
|
||||
for(size_t i = 0; i < dtlbs.numLevels; i++)
|
||||
debug_assert(dtlbs.levels[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
|
||||
for(size_t i = 0; i < itlbs.numLevels; i++)
|
||||
debug_assert(itlbs.levels[i].type != X86_X64_CACHE_TYPE_DATA);
|
||||
|
||||
// ensure x86_x64_L1CacheLineSize and x86_x64_L2CacheLineSize will work
|
||||
debug_assert(dcache.levels >= 2);
|
||||
debug_assert(dcache.parameters[0].lineSize != 0);
|
||||
debug_assert(dcache.parameters[1].lineSize != 0);
|
||||
debug_assert(dcaches.numLevels >= 2);
|
||||
debug_assert(dcaches.levels[0].lineSize != 0);
|
||||
debug_assert(dcaches.levels[1].lineSize != 0);
|
||||
|
||||
return INFO::OK;
|
||||
}
|
||||
|
||||
const x86_x64_Cache* x86_x64_ICache()
|
||||
const x86_x64_Caches* x86_x64_ICaches()
|
||||
{
|
||||
ModuleInit(&cacheInitState, DetectCacheAndTLB);
|
||||
return &icache;
|
||||
return &icaches;
|
||||
}
|
||||
|
||||
const x86_x64_Cache* x86_x64_DCache()
|
||||
const x86_x64_Caches* x86_x64_DCaches()
|
||||
{
|
||||
ModuleInit(&cacheInitState, DetectCacheAndTLB);
|
||||
return &dcache;
|
||||
return &dcaches;
|
||||
}
|
||||
|
||||
size_t x86_x64_L1CacheLineSize()
|
||||
{
|
||||
return x86_x64_DCache()->parameters[0].lineSize;
|
||||
return x86_x64_DCaches()->levels[0].lineSize;
|
||||
}
|
||||
|
||||
size_t x86_x64_L2CacheLineSize()
|
||||
{
|
||||
return x86_x64_DCache()->parameters[1].lineSize;
|
||||
return x86_x64_DCaches()->levels[1].lineSize;
|
||||
}
|
||||
|
||||
const x86_x64_TLB* x86_x64_ITLB()
|
||||
const x86_x64_TLBs* x86_x64_ITLBs()
|
||||
{
|
||||
ModuleInit(&cacheInitState, DetectCacheAndTLB);
|
||||
return &itlb;
|
||||
return &itlbs;
|
||||
}
|
||||
|
||||
const x86_x64_TLB* x86_x64_DTLB()
|
||||
const x86_x64_TLBs* x86_x64_DTLBs()
|
||||
{
|
||||
ModuleInit(&cacheInitState, DetectCacheAndTLB);
|
||||
return &dtlb;
|
||||
return &dtlbs;
|
||||
}
|
||||
|
||||
size_t x86_x64_TLBCoverage(const x86_x64_TLB* tlb)
|
||||
size_t x86_x64_TLBCoverage(const x86_x64_TLBs* tlbs)
|
||||
{
|
||||
// note: receiving a TLB pointer means DetectCacheAndTLB was called.
|
||||
|
||||
const u64 pageSize = 4*KiB;
|
||||
const u64 largePageSize = 4*MiB; // TODO: find out if we're using 2MB or 4MB
|
||||
const u64 largePageSize = os_cpu_LargePageSize();
|
||||
u64 totalSize = 0; // [bytes]
|
||||
for(size_t i = 0; i < tlb->numParameters; i++)
|
||||
for(size_t i = 0; i < tlbs->numLevels; i++)
|
||||
{
|
||||
const x86_x64_TLBParameters& params = tlb->parameters[i];
|
||||
if(params.pageSize == pageSize)
|
||||
totalSize += pageSize * params.entries;
|
||||
if(params.pageSize == largePageSize)
|
||||
totalSize += largePageSize * params.entries;
|
||||
const x86_x64_TLB& tlb = tlbs->levels[i];
|
||||
if(tlb.pageSize == pageSize)
|
||||
totalSize += pageSize * tlb.entries;
|
||||
if(tlb.pageSize == largePageSize)
|
||||
totalSize += largePageSize * tlb.entries;
|
||||
}
|
||||
|
||||
return size_t(totalSize / MiB);
|
||||
@ -1036,8 +1046,8 @@ double x86_x64_ClockFrequency()
|
||||
// note: don't just take the lowest value! it could conceivably be
|
||||
// too low, if background processing delays reading c1 (see above).
|
||||
double sum = 0.0;
|
||||
const int lo = numSamples/4, hi = 3*numSamples/4;
|
||||
for(int i = lo; i < hi; i++)
|
||||
const size_t lo = numSamples/4, hi = 3*numSamples/4;
|
||||
for(size_t i = lo; i < hi; i++)
|
||||
sum += samples[i];
|
||||
|
||||
const double clockFrequency = sum / (hi-lo);
|
||||
|
@ -134,10 +134,7 @@ enum x86_x64_CacheType
|
||||
|
||||
const u8 x86_x64_fullyAssociative = 0xFF;
|
||||
|
||||
/**
|
||||
* describes a level of one of the caches.
|
||||
**/
|
||||
struct x86_x64_CacheParameters
|
||||
struct x86_x64_Cache
|
||||
{
|
||||
/**
|
||||
* (used to determine if this cache is unified or disabled)
|
||||
@ -155,34 +152,29 @@ struct x86_x64_CacheParameters
|
||||
* instruction and data caches are returned separately by the corresponding
|
||||
* accessor function; unified cache levels are reported by both.
|
||||
**/
|
||||
struct x86_x64_Cache
|
||||
struct x86_x64_Caches
|
||||
{
|
||||
/**
|
||||
* total number of levels, each of which is described by
|
||||
* an entry in parameters[].
|
||||
**/
|
||||
size_t levels;
|
||||
|
||||
x86_x64_CacheParameters* parameters;
|
||||
size_t numLevels;
|
||||
x86_x64_Cache* levels;
|
||||
};
|
||||
|
||||
/**
|
||||
* @return pointer to a static x86_x64_Cache describing the instruction cache.
|
||||
* @return pointer to a static x86_x64_Caches describing the instruction caches.
|
||||
**/
|
||||
LIB_API const x86_x64_Cache* x86_x64_ICache();
|
||||
LIB_API const x86_x64_Caches* x86_x64_ICaches();
|
||||
|
||||
/**
|
||||
* @return pointer to a static x86_x64_Cache describing the data cache.
|
||||
* @return pointer to a static x86_x64_Caches describing the data caches.
|
||||
**/
|
||||
LIB_API const x86_x64_Cache* x86_x64_DCache();
|
||||
LIB_API const x86_x64_Caches* x86_x64_DCaches();
|
||||
|
||||
LIB_API size_t x86_x64_L1CacheLineSize();
|
||||
LIB_API size_t x86_x64_L2CacheLineSize();
|
||||
|
||||
/**
|
||||
* describes part of a Translation Lookaside Buffer.
|
||||
* Translation Lookaside Buffer.
|
||||
**/
|
||||
struct x86_x64_TLBParameters
|
||||
struct x86_x64_TLB
|
||||
{
|
||||
x86_x64_CacheType type;
|
||||
size_t level;
|
||||
@ -192,32 +184,28 @@ struct x86_x64_TLBParameters
|
||||
};
|
||||
|
||||
/**
|
||||
* describes all parts of a Translation Lookaside Buffer
|
||||
* describes all levels of a TLB.
|
||||
**/
|
||||
struct x86_x64_TLB
|
||||
struct x86_x64_TLBs
|
||||
{
|
||||
/**
|
||||
* total number of parts, each of which is described by
|
||||
* an entry in parameters[]
|
||||
**/
|
||||
size_t numParameters;
|
||||
x86_x64_TLBParameters* parameters;
|
||||
size_t numLevels;
|
||||
x86_x64_TLB* levels;
|
||||
};
|
||||
|
||||
/**
|
||||
* @return pointer to a static x86_x64_TLB describing the instruction TLB.
|
||||
* @return pointer to a static x86_x64_TLBs describing the instruction TLBs.
|
||||
**/
|
||||
LIB_API const x86_x64_TLB* x86_x64_ITLB();
|
||||
LIB_API const x86_x64_TLBs* x86_x64_ITLBs();
|
||||
|
||||
/**
|
||||
* @return pointer to a static x86_x64_TLBs describing the data TLBs.
|
||||
**/
|
||||
LIB_API const x86_x64_TLB* x86_x64_DTLB();
|
||||
LIB_API const x86_x64_TLBs* x86_x64_DTLBs();
|
||||
|
||||
/**
|
||||
* @return coverage, i.e. total size [MiB] of the given TLB
|
||||
* @return coverage, i.e. total size [MiB] of the given TLBs
|
||||
**/
|
||||
LIB_API size_t x86_x64_TLBCoverage(const x86_x64_TLB* tlb);
|
||||
LIB_API size_t x86_x64_TLBCoverage(const x86_x64_TLBs* tlb);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
@ -31,10 +31,24 @@ ERROR_ASSOCIATE(ERR::CPU_FEATURE_MISSING, L"This CPU doesn't support a required
|
||||
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_OPCODE, L"Disassembly failed", -1);
|
||||
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_VENDOR, L"CPU vendor unknown", -1);
|
||||
|
||||
void cpu_TestAtomicAdd()
|
||||
|
||||
static void TestCAS64()
|
||||
{
|
||||
volatile u64 var = 1;
|
||||
cpu_CAS64(&var, 1ull, 2ull);
|
||||
debug_assert(var == 2ull);
|
||||
}
|
||||
|
||||
static void TestAtomicAdd()
|
||||
{
|
||||
volatile intptr_t i1 = 1;
|
||||
intptr_t prev = cpu_AtomicAdd(&i1, 1);
|
||||
debug_assert(prev == 1);
|
||||
debug_assert(i1 == 2);
|
||||
}
|
||||
|
||||
void cpu_Test()
|
||||
{
|
||||
TestCAS64();
|
||||
TestAtomicAdd();
|
||||
}
|
||||
|
@ -90,6 +90,13 @@ bool cpu_CAS(volatile T* location, T expected, T new_value)
|
||||
return cpu_CAS((volatile intptr_t*)location, (intptr_t)expected, (intptr_t)new_value);
|
||||
}
|
||||
|
||||
#if ARCH_AMD64
|
||||
# define cpu_CAS64 cpu_CAS
|
||||
#else
|
||||
LIB_API bool cpu_CAS64(volatile u64* location, u64 expected, u64 newValue);
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* add a signed value to a variable without the possibility of interference
|
||||
* from other threads/CPUs.
|
||||
@ -98,7 +105,7 @@ bool cpu_CAS(volatile T* location, T expected, T new_value)
|
||||
**/
|
||||
LIB_API intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
|
||||
|
||||
LIB_API void cpu_TestAtomicAdd();
|
||||
LIB_API void cpu_Test();
|
||||
|
||||
/**
|
||||
* enforce strict instruction ordering in the CPU pipeline.
|
||||
|
@ -45,45 +45,34 @@
|
||||
|
||||
static bool IsUniprocessor()
|
||||
{
|
||||
const CpuTopology* topology = cpu_topology_Detect();
|
||||
if(cpu_topology_NumPackages(topology) != 1)
|
||||
if(cpu_topology_NumPackages() != 1)
|
||||
return false;
|
||||
if(cpu_topology_CoresPerPackage(topology) != 1)
|
||||
if(cpu_topology_CoresPerPackage() != 1)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
enum AmdPowerNowFlags
|
||||
{
|
||||
PN_FREQ_ID_CTRL = BIT(1),
|
||||
PN_HW_THERMAL_CTRL = BIT(4),
|
||||
PN_SW_THERMAL_CTRL = BIT(5),
|
||||
PN_INVARIANT_TSC = BIT(8)
|
||||
};
|
||||
|
||||
static bool IsInvariantTSC()
|
||||
{
|
||||
#if ARCH_X86_X64
|
||||
// (we no longer need to check x86_x64_Vendor - Intel and AMD
|
||||
// agreed on the definition of this feature check)
|
||||
x86_x64_CpuidRegs regs = { 0 };
|
||||
switch(x86_x64_Vendor())
|
||||
regs.eax = 0x80000007;
|
||||
if(x86_x64_cpuid(®s))
|
||||
{
|
||||
case X86_X64_VENDOR_AMD:
|
||||
regs.eax = 0x80000007;
|
||||
if(x86_x64_cpuid(®s))
|
||||
{
|
||||
// TSC is invariant across P-state, C-state and
|
||||
// stop grant transitions (e.g. STPCLK)
|
||||
if(regs.edx & PN_INVARIANT_TSC)
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
// TSC is invariant across P-state, C-state, turbo, and
|
||||
// stop grant transitions (e.g. STPCLK)
|
||||
if(regs.edx & BIT(8))
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
static bool IsThrottlingPossible()
|
||||
{
|
||||
#if ARCH_X86_X64
|
||||
@ -99,6 +88,12 @@ static bool IsThrottlingPossible()
|
||||
regs.eax = 0x80000007;
|
||||
if(x86_x64_cpuid(®s))
|
||||
{
|
||||
enum AmdPowerNowFlags
|
||||
{
|
||||
PN_FREQ_ID_CTRL = BIT(1),
|
||||
PN_HW_THERMAL_CTRL = BIT(4),
|
||||
PN_SW_THERMAL_CTRL = BIT(5)
|
||||
};
|
||||
if(regs.edx & (PN_FREQ_ID_CTRL|PN_HW_THERMAL_CTRL|PN_SW_THERMAL_CTRL))
|
||||
return true;
|
||||
}
|
||||
|
@ -173,6 +173,8 @@ struct TimerState
|
||||
// (this enables calibration, which is currently not implemented,
|
||||
// but leaving open the possibility costs nothing)
|
||||
double time;
|
||||
|
||||
u8 padding[48];
|
||||
};
|
||||
|
||||
// how do we detect when the old TimerState is no longer in use and can be
|
||||
@ -181,10 +183,10 @@ struct TimerState
|
||||
// entered critical sections (the latching of TimerState fields) will have
|
||||
// been exited before the next update comes around; if not, TimerState.time
|
||||
// changes, the critical section notices and re-reads the new values.
|
||||
static TimerState timerStates[2];
|
||||
static __declspec(align(64)) TimerState timerStates[2];
|
||||
// note: exchanging pointers is easier than XORing an index.
|
||||
static TimerState* volatile ts = &timerStates[0];
|
||||
static TimerState* volatile ts2 = &timerStates[1];
|
||||
static volatile TimerState* volatile ts = &timerStates[0];
|
||||
static volatile TimerState* volatile ts2 = &timerStates[1];
|
||||
|
||||
static void UpdateTimerState()
|
||||
{
|
||||
@ -201,7 +203,7 @@ static void UpdateTimerState()
|
||||
const u64 deltaTicks = CounterDelta(ts->counter, counter);
|
||||
ts2->counter = counter;
|
||||
ts2->time = ts->time + deltaTicks/nominalFrequency;
|
||||
ts = (TimerState*)InterlockedExchangePointer((volatile PVOID*)&ts2, ts);
|
||||
ts = (volatile TimerState*)InterlockedExchangePointer((volatile PVOID*)&ts2, (PVOID)ts);
|
||||
}
|
||||
|
||||
double whrt_Time()
|
||||
@ -209,6 +211,7 @@ double whrt_Time()
|
||||
retry:
|
||||
// latch timer state (counter and time must be from the same update)
|
||||
const double time = ts->time;
|
||||
cpu_MemoryBarrier();
|
||||
const u64 counter = ts->counter;
|
||||
// ts changed after reading time. note: don't compare counter because
|
||||
// it _might_ have the same value after two updates.
|
||||
|
@ -115,6 +115,24 @@ LIB_API size_t os_cpu_MemoryAvailable();
|
||||
**/
|
||||
LIB_API uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask);
|
||||
|
||||
class os_cpu_ScopedSetThreadAffinityMask
|
||||
{
|
||||
public:
|
||||
os_cpu_ScopedSetThreadAffinityMask(uintptr_t processorMask)
|
||||
: m_previousProcessorMask(os_cpu_SetThreadAffinityMask(processorMask))
|
||||
{
|
||||
}
|
||||
|
||||
~os_cpu_ScopedSetThreadAffinityMask()
|
||||
{
|
||||
(void)os_cpu_SetThreadAffinityMask(m_previousProcessorMask);
|
||||
}
|
||||
|
||||
private:
|
||||
uintptr_t m_previousProcessorMask;
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* called by os_cpu_CallByEachCPU.
|
||||
* @param processor ID of processor running the current thread for the
|
||||
|
@ -144,7 +144,7 @@ double timer_Resolution()
|
||||
//
|
||||
// do not use std::list et al. for this! we must be callable at any time,
|
||||
// especially before NLSO ctors run or before heap init.
|
||||
static size_t num_clients;
|
||||
static size_t numClients;
|
||||
static TimerClient* clients;
|
||||
|
||||
|
||||
@ -157,31 +157,24 @@ TimerClient* timer_AddClient(TimerClient* tc, const wchar_t* description)
|
||||
// insert at front of list
|
||||
tc->next = clients;
|
||||
clients = tc;
|
||||
num_clients++;
|
||||
numClients++;
|
||||
|
||||
return tc;
|
||||
}
|
||||
|
||||
|
||||
void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1)
|
||||
{
|
||||
tc->sum.AddDifference(t0, t1);
|
||||
tc->num_calls++;
|
||||
}
|
||||
|
||||
|
||||
void timer_DisplayClientTotals()
|
||||
{
|
||||
debug_printf(L"TIMER TOTALS (%lu clients)\n", (unsigned long)num_clients);
|
||||
debug_printf(L"TIMER TOTALS (%lu clients)\n", (unsigned long)numClients);
|
||||
debug_printf(L"-----------------------------------------------------\n");
|
||||
|
||||
while(clients)
|
||||
{
|
||||
// (make sure list and count are consistent)
|
||||
debug_assert(num_clients != 0);
|
||||
debug_assert(numClients != 0);
|
||||
TimerClient* tc = clients;
|
||||
clients = tc->next;
|
||||
num_clients--;
|
||||
numClients--;
|
||||
|
||||
const std::wstring duration = tc->sum.ToString();
|
||||
debug_printf(L" %ls: %ls (%lux)\n", tc->description, duration.c_str(), (unsigned long)tc->num_calls);
|
||||
|
@ -28,6 +28,7 @@
|
||||
#define INCLUDED_TIMER
|
||||
|
||||
#include "lib/config2.h" // CONFIG2_TIMER_ALLOW_RDTSC
|
||||
#include "lib/sysdep/cpu.h" // cpu_AtomicAdd
|
||||
#if ARCH_X86_X64 && CONFIG2_TIMER_ALLOW_RDTSC
|
||||
# include "lib/sysdep/arch/x86_x64/x86_x64.h" // x86_x64_rdtsc
|
||||
# include "lib/sysdep/os_cpu.h" // os_cpu_ClockFrequency
|
||||
@ -172,6 +173,18 @@ public:
|
||||
m_ticks += t1.m_ticks - t0.m_ticks;
|
||||
}
|
||||
|
||||
/**
 * thread-safe variant of AddDifference: atomically adds (t1 - t0)
 * to the accumulated tick count.
 **/
void AddDifferenceAtomic(TimerUnit t0, TimerUnit t1)
{
	const u64 delta = t1.m_ticks - t0.m_ticks;
#if ARCH_AMD64
	// 64-bit build: intptr_t is as wide as m_ticks, so one atomic add suffices.
	cpu_AtomicAdd((volatile intptr_t*)&m_ticks, (intptr_t)delta);
#else
	// no 64-bit atomic add available here => compare-and-swap loop.
	for(;;)
	{
		// take ONE snapshot per attempt and derive both the expected and
		// the new value from it. reading m_ticks separately for each
		// argument (whose evaluation order is unspecified) could pair a
		// fresh 'expected' with a sum based on an older value and thereby
		// discard another thread's update.
		const u64 oldTicks = m_ticks;
		if(cpu_CAS64(&m_ticks, oldTicks, oldTicks+delta))
			break;
	}
#endif
}
|
||||
|
||||
void Subtract(TimerUnit t)
|
||||
{
|
||||
m_ticks -= t.m_ticks;
|
||||
@ -226,6 +239,20 @@ public:
|
||||
m_seconds += t1.m_seconds - t0.m_seconds;
|
||||
}
|
||||
|
||||
/**
 * thread-safe variant of AddDifference: atomically adds (t1 - t0)
 * to the accumulated seconds via a 64-bit compare-and-swap on the
 * bit pattern of m_seconds.
 *
 * note: relies on sizeof(double) == sizeof(u64).
 **/
void AddDifferenceAtomic(TimerUnit t0, TimerUnit t1)
{
	for(;;)
	{
		// snapshot the current bit pattern. this must be a full 64-bit
		// integer: intptr_t is only 32 bits wide on 32-bit builds and would
		// capture just half of the double.
		u64 oldRepresentation;
		memcpy(&oldRepresentation, &m_seconds, sizeof(oldRepresentation));

		// compute the new total from the snapshot (not from a second read
		// of m_seconds), so that the expected and new values handed to the
		// CAS always belong together.
		double seconds;
		memcpy(&seconds, &oldRepresentation, sizeof(seconds));
		seconds = seconds + t1.m_seconds - t0.m_seconds;

		u64 newRepresentation;
		memcpy(&newRepresentation, &seconds, sizeof(newRepresentation));

		// succeeds only if no other thread changed m_seconds in the meantime.
		if(cpu_CAS64((volatile u64*)&m_seconds, oldRepresentation, newRepresentation))
			break;
	}
}
|
||||
|
||||
void Subtract(TimerUnit t)
|
||||
{
|
||||
m_seconds -= t.m_seconds;
|
||||
@ -274,7 +301,7 @@ struct TimerClient
|
||||
|
||||
// how often timer_BillClient was called (helps measure relative
|
||||
// performance of something that is done indeterminately often).
|
||||
size_t num_calls;
|
||||
intptr_t num_calls;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -304,7 +331,21 @@ LIB_API TimerClient* timer_AddClient(TimerClient* tc, const wchar_t* description
|
||||
/**
|
||||
* bill the difference between t0 and t1 to the client's total.
|
||||
**/
|
||||
LIB_API void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1);
|
||||
// adds the span between t0 and t1 to the client's sum and counts the call.
// (not synchronized; see timer_BillClientAtomic for use from several threads)
inline void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1)
{
	tc->sum.AddDifference(t0, t1);
	++tc->num_calls;
}
|
||||
|
||||
/**
|
||||
* thread-safe version of timer_BillClient
|
||||
* (not used by default due to its higher overhead)
|
||||
**/
|
||||
// adds the span between t0 and t1 to the client's sum and counts the call,
// using the atomic accumulation routines for both updates.
inline void timer_BillClientAtomic(TimerClient* tc, TimerUnit t0, TimerUnit t1)
{
	tc->sum.AddDifferenceAtomic(t0, t1);
	cpu_AtomicAdd(&tc->num_calls, 1);
}
|
||||
|
||||
/**
|
||||
* display all clients' totals; does not reset them.
|
||||
@ -335,6 +376,28 @@ private:
|
||||
TimerClient* m_tc;
|
||||
};
|
||||
|
||||
class ScopeTimerAccrueAtomic
|
||||
{
|
||||
NONCOPYABLE(ScopeTimerAccrueAtomic);
|
||||
public:
|
||||
ScopeTimerAccrueAtomic(TimerClient* tc)
|
||||
: m_tc(tc)
|
||||
{
|
||||
m_t0.SetFromTimer();
|
||||
}
|
||||
|
||||
~ScopeTimerAccrueAtomic()
|
||||
{
|
||||
TimerUnit t1;
|
||||
t1.SetFromTimer();
|
||||
timer_BillClientAtomic(m_tc, m_t0, t1);
|
||||
}
|
||||
|
||||
private:
|
||||
TimerUnit m_t0;
|
||||
TimerClient* m_tc;
|
||||
};
|
||||
|
||||
/**
|
||||
* Measure the time taken to execute code up until end of the current scope;
|
||||
* bill it to the given TimerClient object. Can safely be nested.
|
||||
@ -356,5 +419,6 @@ private:
|
||||
* timer_DisplayClientTotals();
|
||||
**/
|
||||
#define TIMER_ACCRUE(client) ScopeTimerAccrue UID__(client)
|
||||
#define TIMER_ACCRUE_ATOMIC(client) ScopeTimerAccrueAtomic UID__(client)
|
||||
|
||||
#endif // #ifndef INCLUDED_TIMER
|
||||
|
@ -99,8 +99,7 @@ void WriteSystemInfo()
|
||||
fprintf(f, "OS : %s %s (%s)\n", un.sysname, un.release, un.version);
|
||||
|
||||
// CPU
|
||||
const CpuTopology* topology = cpu_topology_Detect();
|
||||
fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), (int)cpu_topology_NumPackages(topology), (int)cpu_topology_CoresPerPackage(topology), (int)cpu_topology_LogicalPerCore(topology));
|
||||
fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), (int)cpu_topology_NumPackages(), (int)cpu_topology_CoresPerPackage(), (int)cpu_topology_LogicalPerCore());
|
||||
const double cpu_freq = os_cpu_ClockFrequency();
|
||||
if(cpu_freq != 0.0f)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user