forked from 0ad/0ad

from work: add thread-safe timer; add CAS64 (required for the thread-safe timer); remove topology init requirements and no longer cache results (simplifies GroupPolicy_PerCache); add scoped affinitizer; whrt: fix race condition reported by Parallel Inspector. Also refactor cache and TLB detection.

This was SVN commit r7785.
janwas 2010-07-22 16:17:33 +00:00
parent 2b1541ba0b
commit aa44bac652
16 changed files with 373 additions and 331 deletions

View File

@ -158,6 +158,11 @@ bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value)
return ia32_asm_CAS(location, expected, new_value);
}
bool cpu_CAS64(volatile u64* location, u64 expected, u64 new_value)
{
return ia32_asm_CAS64(location, expected, new_value);
}
void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size)
{

View File

@ -92,6 +92,25 @@ db 0xf0 ; LOCK prefix
ret
; extern bool CALL_CONV ia32_asm_CAS64(volatile u64* location, u64 expected, u64 new_value);
global sym(ia32_asm_CAS64)
sym(ia32_asm_CAS64):
push ebx
push esi
mov esi, [esp+8+4] ; location
mov eax, [esp+8+8]
mov edx, [esp+8+12] ; edx:eax = expected
mov ebx, [esp+8+16]
mov ecx, [esp+8+20] ; ecx:ebx = new_value
db 0xf0 ; LOCK prefix
cmpxchg8b [esi]
sete al
movzx eax, al
pop esi
pop ebx
ret
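(Editor's illustration, not part of the commit: the routine above implements a 64-bit compare-and-swap via LOCK CMPXCHG8B - edx:eax holds the expected value, ecx:ebx the new value, and SETE AL converts the zero flag into the bool result. In C++ terms it atomically performs the following; the sketch itself is of course not atomic and uses the project's u64 typedef.)

// non-atomic C++ sketch of the semantics of ia32_asm_CAS64 (illustration only)
static bool CAS64_semantics(volatile u64* location, u64 expected, u64 new_value)
{
	if(*location != expected)	// CMPXCHG8B compares [esi] with edx:eax ...
		return false;		// ... ZF clear => SETE AL yields 0
	*location = new_value;		// equal: ecx:ebx is stored to [esi]
	return true;			// ZF set => SETE AL yields 1
}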
;-------------------------------------------------------------------------------
; FPU
;-------------------------------------------------------------------------------

View File

@ -36,6 +36,7 @@ extern void CALL_CONV ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
extern intptr_t CALL_CONV ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
extern bool CALL_CONV ia32_asm_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value);
extern bool CALL_CONV ia32_asm_CAS64(volatile u64* location, u64 expected, u64 new_value);
/// control87
// FPU control word

View File

@ -29,8 +29,8 @@ class TestTopology : public CxxTest::TestSuite
public:
void test_run()
{
// Just run the function, ignoring the return value, so
// Valgrind can check it's not doing anything very bad
cpu_topology_Detect();
TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_NumPackages());
TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_CoresPerPackage());
TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_LogicalPerCore());
}
};

View File

@ -105,9 +105,9 @@ static size_t MaxLogicalPerCore()
static size_t MaxLogicalPerCache()
{
const x86_x64_Cache* const dcache = x86_x64_DCache();
if(dcache->levels >= 2)
return dcache->parameters[1].sharedBy;
const x86_x64_Caches* const dcaches = x86_x64_DCaches();
if(dcaches->numLevels >= 2)
return dcaches->levels[1].sharedBy;
else
return 1; // default
}
@ -204,8 +204,9 @@ static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t nu
}
static size_t NumPackages(const u8* apicIds)
size_t cpu_topology_NumPackages()
{
const u8* apicIds = ApicIds();
if(apicIds)
{
const size_t offset = ceil_log2(MaxCoresPerPackage()) + ceil_log2(MaxLogicalPerCore());
@ -236,8 +237,9 @@ static size_t NumPackages(const u8* apicIds)
}
static size_t CoresPerPackage(const u8* apicIds)
size_t cpu_topology_CoresPerPackage()
{
const u8* apicIds = ApicIds();
if(apicIds)
{
const size_t offset = ceil_log2(MaxLogicalPerCore());
@ -251,8 +253,9 @@ static size_t CoresPerPackage(const u8* apicIds)
}
static size_t LogicalPerCore(const u8* apicIds)
size_t cpu_topology_LogicalPerCore()
{
const u8* apicIds = ApicIds();
if(apicIds)
{
const size_t offset = 0;
@ -266,49 +269,6 @@ static size_t LogicalPerCore(const u8* apicIds)
}
//-----------------------------------------------------------------------------
// CPU topology interface
struct CpuTopology // POD
{
size_t numPackages;
size_t coresPerPackage;
size_t logicalPerCore;
};
static CpuTopology cpuTopology;
static LibError InitCpuTopology()
{
const u8* apicIds = ApicIds();
cpuTopology.numPackages = NumPackages(apicIds);
cpuTopology.coresPerPackage = CoresPerPackage(apicIds);
cpuTopology.logicalPerCore = LogicalPerCore(apicIds);
return INFO::OK;
}
const CpuTopology* cpu_topology_Detect()
{
static ModuleInitState initState;
ModuleInit(&initState, InitCpuTopology);
return &cpuTopology;
}
size_t cpu_topology_NumPackages(const CpuTopology* topology)
{
return topology->numPackages;
}
size_t cpu_topology_CoresPerPackage(const CpuTopology* topology)
{
return topology->coresPerPackage;
}
size_t cpu_topology_LogicalPerCore(const CpuTopology* topology)
{
return topology->logicalPerCore;
}
//-----------------------------------------------------------------------------
// cache topology
@ -451,6 +411,7 @@ struct CacheTopology // POD
uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
};
static CacheTopology cacheTopology;
static ModuleInitState cacheInitState;
static LibError InitCacheTopology()
{
@ -460,26 +421,22 @@ static LibError InitCacheTopology()
return INFO::OK;
}
const CacheTopology* cache_topology_Detect()
size_t cache_topology_NumCaches()
{
static ModuleInitState initState;
ModuleInit(&initState, InitCacheTopology);
return &cacheTopology;
ModuleInit(&cacheInitState, InitCacheTopology);
return cacheTopology.numCaches;
}
size_t cache_topology_NumCaches(const CacheTopology* topology)
{
return topology->numCaches;
}
size_t cache_topology_CacheFromProcessor(const CacheTopology* topology, size_t processor)
size_t cache_topology_CacheFromProcessor(size_t processor)
{
ModuleInit(&cacheInitState, InitCacheTopology);
debug_assert(processor < os_cpu_NumProcessors());
return topology->processorsCache[processor];
return cacheTopology.processorsCache[processor];
}
uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology* topology, size_t cache)
uintptr_t cache_topology_ProcessorMaskFromCache(size_t cache)
{
debug_assert(cache < topology->numCaches);
return topology->cachesProcessorMask[cache];
ModuleInit(&cacheInitState, InitCacheTopology);
debug_assert(cache < cacheTopology.numCaches);
return cacheTopology.cachesProcessorMask[cache];
}

View File

@ -21,22 +21,13 @@
*/
/*
* detection of CPU and cache topology
* detection of CPU and cache topology.
* thread-safe, no explicit initialization is required.
*/
#ifndef INCLUDED_TOPOLOGY
#define INCLUDED_TOPOLOGY
// interface rationale:
// - explicit initialization avoids the difficulty and overhead of
// thread-safe lazy initialization checks.
// - requiring an opaque struct to be passed in ensures users call the
// init function before using the accessors.
// - delegating responsibility for thread-safety to the caller of the
// first *_Detect invocation avoids overhead and keeps us independent of
// the various threading packages (Boost, OpenMP, POSIX, Win32, ..)
/**
* @return a pointer to array (up to os_cpu_MaxProcessors entries;
* os_cpu_NumProcessors() of them are valid) of the processors'
@ -49,76 +40,54 @@ LIB_API const u8* ApicIds();
//-----------------------------------------------------------------------------
// cpu
/**
* stores CPU topology, i.e. how many packages, cores and SMT units are
* actually present and enabled. this is useful for detecting SMP systems,
* predicting performance and dimensioning thread pools.
*
* note: OS abstractions usually only mention "processors", which could be
* any mix of the above.
**/
struct CpuTopology;
/**
* initialize static storage from which topology can be retrieved by
* means of the following functions.
* @return const pointer to a shared instance.
**/
LIB_API const CpuTopology* cpu_topology_Detect();
// the CPU topology, i.e. how many packages, cores and SMT units are
// actually present and enabled, is useful for detecting SMP systems,
// predicting performance and dimensioning thread pools.
//
// note: OS abstractions usually only mention "processors", which could be
// any mix of the above.
/**
* @return number of *enabled* CPU packages / sockets.
**/
LIB_API size_t cpu_topology_NumPackages(const CpuTopology*);
LIB_API size_t cpu_topology_NumPackages();
/**
* @return number of *enabled* CPU cores per package.
* (2 on dual-core systems)
**/
LIB_API size_t cpu_topology_CoresPerPackage(const CpuTopology*);
LIB_API size_t cpu_topology_CoresPerPackage();
/**
* @return number of *enabled* hyperthreading units per core.
* (2 on P4 EE)
**/
LIB_API size_t cpu_topology_LogicalPerCore(const CpuTopology*);
LIB_API size_t cpu_topology_LogicalPerCore();
//-----------------------------------------------------------------------------
// L2 cache
/**
* stores L2 cache topology, i.e. the mapping between processor and caches.
* this allows cores sharing a cache to work together on the same dataset,
* which may reduce contention and increase effective capacity.
*
* example: Intel Core2 micro-architectures (e.g. Intel Core2) feature
* partitioned L2 caches shared by two cores.
**/
struct CacheTopology;
// knowledge of the cache topology, i.e. which processors share which caches,
// can be used to reduce contention and increase effective capacity by
// assigning the partner processors to work on the same dataset.
//
// example: Intel Core2 micro-architectures feature L2 caches shared by
// two cores.
/**
* initialize static storage from which topology can be retrieved by
* means of the following functions.
* @return const pointer to a shared instance.
*
* WARNING: this function must not be reentered before it has returned once.
* @return number of distinct L2 caches.
**/
LIB_API const CacheTopology* cache_topology_Detect();
/**
* @return number of distinct L2 caches
**/
LIB_API size_t cache_topology_NumCaches(const CacheTopology*);
LIB_API size_t cache_topology_NumCaches();
/**
* @return L2 cache number (zero-based) to which <processor> belongs.
**/
LIB_API size_t cache_topology_CacheFromProcessor(const CacheTopology*, size_t processor);
LIB_API size_t cache_topology_CacheFromProcessor(size_t processor);
/**
* @return bit-mask of all processors sharing <cache>.
**/
LIB_API uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology*, size_t cache);
LIB_API uintptr_t cache_topology_ProcessorMaskFromCache(size_t cache);
#endif // #ifndef INCLUDED_TOPOLOGY
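(Editor's addition for illustration: a minimal sketch of how the now parameter-less, lazily-initializing accessors declared above might be used. The function names SuggestedWorkerCount/PrintCacheGroups and the thread-pool sizing heuristic are invented for the example; only the cpu_topology_* / cache_topology_* calls come from this header.)

// sketch only - assumes this header (topology.h) and <cstdio> are included
static size_t SuggestedWorkerCount()
{
	// no *_Detect call is needed anymore; the accessors initialize on first use
	return cpu_topology_NumPackages() * cpu_topology_CoresPerPackage() * cpu_topology_LogicalPerCore();
}

static void PrintCacheGroups()
{
	for(size_t cache = 0; cache < cache_topology_NumCaches(); cache++)
	{
		const uintptr_t mask = cache_topology_ProcessorMaskFromCache(cache);
		printf("L2 cache %lu is shared by processors with mask %lx\n", (unsigned long)cache, (unsigned long)mask);
	}
}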

View File

@ -285,57 +285,67 @@ size_t x86_x64_Generation()
//-----------------------------------------------------------------------------
// cache
static const size_t maxCacheParams = 3;
static x86_x64_CacheParameters cacheParametersStorage[maxCacheParams*2];
static x86_x64_Cache dcache = { 0, cacheParametersStorage };
static x86_x64_Cache icache = { 0, cacheParametersStorage+maxCacheParams };
static const size_t maxCacheLevels = 3;
static x86_x64_Cache cacheStorage[maxCacheLevels*2];
static x86_x64_Caches dcaches = { 0, cacheStorage };
static x86_x64_Caches icaches = { 0, cacheStorage+maxCacheLevels };
static const size_t maxTLBParams = 15;
static x86_x64_TLBParameters tlbParametersStorage[maxTLBParams*2];
static x86_x64_TLB dtlb = { 0, tlbParametersStorage };
static x86_x64_TLB itlb = { 0, tlbParametersStorage+maxTLBParams };
static const size_t maxTLBLevels = 15;
static x86_x64_TLB tlbStorage[maxTLBLevels*2];
static x86_x64_TLBs dtlbs = { 0, tlbStorage };
static x86_x64_TLBs itlbs = { 0, tlbStorage+maxTLBLevels };
static void AddTLBParameters(const x86_x64_TLBParameters& params)
static bool IsData(x86_x64_CacheType type)
{
if(params.type == X86_X64_CACHE_TYPE_INSTRUCTION || params.type == X86_X64_CACHE_TYPE_UNIFIED)
return (type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED);
}
static bool IsInstruction(x86_x64_CacheType type)
{
return (type == X86_X64_CACHE_TYPE_INSTRUCTION || type == X86_X64_CACHE_TYPE_UNIFIED);
}
static void AddTLB(const x86_x64_TLB& tlb)
{
if(IsInstruction(tlb.type))
{
if(itlb.numParameters < maxTLBParams)
itlb.parameters[itlb.numParameters++] = params;
if(itlbs.numLevels < maxTLBLevels)
itlbs.levels[itlbs.numLevels++] = tlb;
else
debug_assert(0);
}
if(params.type == X86_X64_CACHE_TYPE_DATA || params.type == X86_X64_CACHE_TYPE_UNIFIED)
if(IsData(tlb.type))
{
if(dtlb.numParameters < maxTLBParams)
dtlb.parameters[dtlb.numParameters++] = params;
if(dtlbs.numLevels < maxTLBLevels)
dtlbs.levels[dtlbs.numLevels++] = tlb;
else
debug_assert(0);
}
// large page TLBs have N 2M entries or N/2 4M entries; we generate a
// second set of parameters for the latter from the former.
if(params.pageSize == 2*MiB)
if(tlb.pageSize == 2*MiB)
{
x86_x64_TLBParameters params4M = params;
params4M.pageSize = 4*MiB;
params4M.entries = params.entries/2;
AddTLBParameters(params4M);
x86_x64_TLB tlb4M = tlb;
tlb4M.pageSize = 4*MiB;
tlb4M.entries = tlb.entries/2;
AddTLB(tlb4M);
}
}
namespace AMD
{
static x86_x64_CacheParameters L1Parameters(u32 reg, x86_x64_CacheType type)
static x86_x64_Cache L1Cache(u32 reg, x86_x64_CacheType type)
{
x86_x64_CacheParameters params;
params.type = type;
params.level = 1;
params.associativity = bits(reg, 16, 23);
params.lineSize = bits(reg, 0, 7);
params.sharedBy = 1;
params.totalSize = bits(reg, 24, 31)*KiB;
return params;
x86_x64_Cache cache;
cache.type = type;
cache.level = 1;
cache.associativity = bits(reg, 16, 23);
cache.lineSize = bits(reg, 0, 7);
cache.sharedBy = 1;
cache.totalSize = bits(reg, 24, 31)*KiB;
return cache;
}
// applies to L2, L3 and TLB2
@ -345,85 +355,85 @@ static const size_t associativities[16] =
16, 0, 32, 48, 64, 96, 128, x86_x64_fullyAssociative
};
static x86_x64_CacheParameters L2Parameters(u32 reg, x86_x64_CacheType type)
static x86_x64_Cache L2Cache(u32 reg, x86_x64_CacheType type)
{
x86_x64_CacheParameters params;
x86_x64_Cache cache;
const size_t associativityIndex = bits(reg, 12, 15);
if(associativityIndex == 0) // disabled
{
params.type = X86_X64_CACHE_TYPE_NULL;
params.associativity = 0;
cache.type = X86_X64_CACHE_TYPE_NULL;
cache.associativity = 0;
}
else
{
params.type = type;
params.associativity = associativities[associativityIndex];
debug_assert(params.associativity != 0); // else: encoding is "reserved"
cache.type = type;
cache.associativity = associativities[associativityIndex];
debug_assert(cache.associativity != 0); // else: encoding is "reserved"
}
params.level = 2;
params.lineSize = bits(reg, 0, 7);
params.sharedBy = 1;
params.totalSize = bits(reg, 16, 31)*KiB;
return params;
cache.level = 2;
cache.lineSize = bits(reg, 0, 7);
cache.sharedBy = 1;
cache.totalSize = bits(reg, 16, 31)*KiB;
return cache;
}
// (same as L2 except for the totalSize encoding)
static x86_x64_CacheParameters L3Parameters(u32 reg, x86_x64_CacheType type)
static x86_x64_Cache L3Cache(u32 reg, x86_x64_CacheType type)
{
x86_x64_CacheParameters params = L2Parameters(reg, type);
params.level = 3;
params.totalSize = bits(reg, 18, 31)*512*KiB; // (rounded down)
return params;
x86_x64_Cache cache = L2Cache(reg, type);
cache.level = 3;
cache.totalSize = bits(reg, 18, 31)*512*KiB; // (rounded down)
return cache;
}
static x86_x64_TLBParameters TLB1Parameters(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
static x86_x64_TLB TLB1(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
{
x86_x64_TLBParameters params;
params.type = type;
params.level = 1;
params.associativity = bits(reg, bitOffset+8, bitOffset+15);
params.pageSize = pageSize;
params.entries = bits(reg, bitOffset, bitOffset+7);
return params;
x86_x64_TLB tlb;
tlb.type = type;
tlb.level = 1;
tlb.associativity = bits(reg, bitOffset+8, bitOffset+15);
tlb.pageSize = pageSize;
tlb.entries = bits(reg, bitOffset, bitOffset+7);
return tlb;
}
static void AddTLB1Parameters(const x86_x64_CpuidRegs& regs)
static void AddTLB1(const x86_x64_CpuidRegs& regs)
{
AddTLBParameters(TLB1Parameters(regs.eax, 0, 2*MiB, X86_X64_CACHE_TYPE_INSTRUCTION));
AddTLBParameters(TLB1Parameters(regs.eax, 16, 2*MiB, X86_X64_CACHE_TYPE_DATA));
AddTLBParameters(TLB1Parameters(regs.ebx, 0, 4*KiB, X86_X64_CACHE_TYPE_INSTRUCTION));
AddTLBParameters(TLB1Parameters(regs.ebx, 16, 4*KiB, X86_X64_CACHE_TYPE_DATA));
AddTLB(TLB1(regs.eax, 0, 2*MiB, X86_X64_CACHE_TYPE_INSTRUCTION));
AddTLB(TLB1(regs.eax, 16, 2*MiB, X86_X64_CACHE_TYPE_DATA));
AddTLB(TLB1(regs.ebx, 0, 4*KiB, X86_X64_CACHE_TYPE_INSTRUCTION));
AddTLB(TLB1(regs.ebx, 16, 4*KiB, X86_X64_CACHE_TYPE_DATA));
}
static x86_x64_TLBParameters TLB2Parameters(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
static x86_x64_TLB TLB2(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
{
x86_x64_TLBParameters params;
x86_x64_TLB tlb;
const size_t associativityIndex = bits(reg, bitOffset+12, bitOffset+15);
if(associativityIndex == 0) // disabled
{
params.type = X86_X64_CACHE_TYPE_NULL;
params.associativity = 0;
tlb.type = X86_X64_CACHE_TYPE_NULL;
tlb.associativity = 0;
}
else
{
params.type = type;
params.associativity = associativities[associativityIndex];
tlb.type = type;
tlb.associativity = associativities[associativityIndex];
}
params.level = 2;
params.pageSize = pageSize;
params.entries = bits(reg, bitOffset, bitOffset+11);
return params;
tlb.level = 2;
tlb.pageSize = pageSize;
tlb.entries = bits(reg, bitOffset, bitOffset+11);
return tlb;
}
static void AddTLB2ParameterPair(u32 reg, size_t pageSize)
static void AddTLB2Pair(u32 reg, size_t pageSize)
{
x86_x64_CacheType type = X86_X64_CACHE_TYPE_UNIFIED;
if(bits(reg, 16, 31) != 0) // not unified
{
AddTLBParameters(TLB2Parameters(reg, 16, pageSize, X86_X64_CACHE_TYPE_DATA));
AddTLB(TLB2(reg, 16, pageSize, X86_X64_CACHE_TYPE_DATA));
type = X86_X64_CACHE_TYPE_INSTRUCTION;
}
AddTLBParameters(TLB2Parameters(reg, 0, pageSize, type));
AddTLB(TLB2(reg, 0, pageSize, type));
}
// AMD reports maxCpuidIdFunction > 4 but consider functions 2..4 to be
@ -435,24 +445,24 @@ static void DetectCacheAndTLB()
regs.eax = 0x80000005;
if(x86_x64_cpuid(&regs))
{
AddTLB1Parameters(regs);
AddTLB1(regs);
dcache.levels = icache.levels = 1;
dcache.parameters[0] = L1Parameters(regs.ecx, X86_X64_CACHE_TYPE_DATA);
icache.parameters[0] = L1Parameters(regs.edx, X86_X64_CACHE_TYPE_INSTRUCTION);
dcaches.numLevels = icaches.numLevels = 1;
dcaches.levels[0] = L1Cache(regs.ecx, X86_X64_CACHE_TYPE_DATA);
icaches.levels[0] = L1Cache(regs.edx, X86_X64_CACHE_TYPE_INSTRUCTION);
}
regs.eax = 0x80000006;
if(x86_x64_cpuid(&regs))
{
AddTLB2ParameterPair(regs.eax, 2*MiB);
AddTLB2ParameterPair(regs.ebx, 4*KiB);
AddTLB2Pair(regs.eax, 2*MiB);
AddTLB2Pair(regs.ebx, 4*KiB);
icache.levels = dcache.levels = 2;
icache.parameters[1] = dcache.parameters[1] = L2Parameters(regs.ecx, X86_X64_CACHE_TYPE_UNIFIED);
icaches.numLevels = dcaches.numLevels = 2;
icaches.levels[1] = dcaches.levels[1] = L2Cache(regs.ecx, X86_X64_CACHE_TYPE_UNIFIED);
icache.levels = dcache.levels = 3;
icache.parameters[2] = dcache.parameters[2] = L3Parameters(regs.edx, X86_X64_CACHE_TYPE_UNIFIED);
icaches.numLevels = dcaches.numLevels = 3;
icaches.levels[2] = dcaches.levels[2] = L3Cache(regs.edx, X86_X64_CACHE_TYPE_UNIFIED);
}
}
@ -480,27 +490,27 @@ static void DetectCache_CPUID4()
if(type == X86_X64_CACHE_TYPE_NULL) // no more remaining
break;
x86_x64_CacheParameters params;
params.type = type;
params.level = level;
params.associativity = (size_t)bits(regs.ebx, 22, 31)+1;
params.lineSize = (size_t)bits(regs.ebx, 0, 11)+1; // (yes, this also uses +1 encoding)
params.sharedBy = (size_t)bits(regs.eax, 14, 25)+1;
x86_x64_Cache cache;
cache.type = type;
cache.level = level;
cache.associativity = (size_t)bits(regs.ebx, 22, 31)+1;
cache.lineSize = (size_t)bits(regs.ebx, 0, 11)+1; // (yes, this also uses +1 encoding)
cache.sharedBy = (size_t)bits(regs.eax, 14, 25)+1;
{
const size_t partitions = (size_t)bits(regs.ebx, 12, 21)+1;
const size_t sets = (size_t)bits(regs.ecx, 0, 31)+1;
params.totalSize = params.associativity * partitions * params.lineSize * sets;
cache.totalSize = cache.associativity * partitions * cache.lineSize * sets;
}
if(type == X86_X64_CACHE_TYPE_INSTRUCTION || type == X86_X64_CACHE_TYPE_UNIFIED)
if(IsInstruction(type))
{
icache.levels = std::max(icache.levels, level);
icache.parameters[level-1] = params;
icaches.numLevels = std::max(icaches.numLevels, level);
icaches.levels[level-1] = cache;
}
if(type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED)
if(IsData(type))
{
dcache.levels = std::max(dcache.levels, level);
dcache.parameters[level-1] = params;
dcaches.numLevels = std::max(dcaches.numLevels, level);
dcaches.levels[level-1] = cache;
}
}
}
@ -624,24 +634,24 @@ static void DecodeDescriptor(u8 descriptor)
else
debug_assert(0);
x86_x64_TLBParameters params;
params.type = type;
params.level = level;
params.associativity = properties.associativity;
params.pageSize = pageSize;
params.entries = properties.entries;
x86_x64_TLB tlb;
tlb.type = type;
tlb.level = level;
tlb.associativity = properties.associativity;
tlb.pageSize = pageSize;
tlb.entries = properties.entries;
if(type == X86_X64_CACHE_TYPE_INSTRUCTION || type == X86_X64_CACHE_TYPE_UNIFIED)
if(IsInstruction(type))
{
if(itlb.numParameters < maxTLBParams)
itlb.parameters[itlb.numParameters++] = params;
if(itlbs.numLevels < maxTLBLevels)
itlbs.levels[itlbs.numLevels++] = tlb;
else
debug_assert(0);
}
if(type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED)
if(IsData(type))
{
if(dtlb.numParameters < maxTLBParams)
dtlb.parameters[dtlb.numParameters++] = params;
if(dtlbs.numLevels < maxTLBLevels)
dtlbs.levels[dtlbs.numLevels++] = tlb;
else
debug_assert(0);
}
@ -694,71 +704,71 @@ static LibError DetectCacheAndTLB()
}
// sanity check: cache type must match that of the data structure
for(size_t i = 0; i < dcache.levels; i++)
debug_assert(dcache.parameters[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
for(size_t i = 0; i < icache.levels; i++)
debug_assert(icache.parameters[i].type != X86_X64_CACHE_TYPE_DATA);
for(size_t i = 0; i < dtlb.numParameters; i++)
debug_assert(dtlb.parameters[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
for(size_t i = 0; i < itlb.numParameters; i++)
debug_assert(itlb.parameters[i].type != X86_X64_CACHE_TYPE_DATA);
for(size_t i = 0; i < dcaches.numLevels; i++)
debug_assert(dcaches.levels[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
for(size_t i = 0; i < icaches.numLevels; i++)
debug_assert(icaches.levels[i].type != X86_X64_CACHE_TYPE_DATA);
for(size_t i = 0; i < dtlbs.numLevels; i++)
debug_assert(dtlbs.levels[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
for(size_t i = 0; i < itlbs.numLevels; i++)
debug_assert(itlbs.levels[i].type != X86_X64_CACHE_TYPE_DATA);
// ensure x86_x64_L1CacheLineSize and x86_x64_L2CacheLineSize will work
debug_assert(dcache.levels >= 2);
debug_assert(dcache.parameters[0].lineSize != 0);
debug_assert(dcache.parameters[1].lineSize != 0);
debug_assert(dcaches.numLevels >= 2);
debug_assert(dcaches.levels[0].lineSize != 0);
debug_assert(dcaches.levels[1].lineSize != 0);
return INFO::OK;
}
const x86_x64_Cache* x86_x64_ICache()
const x86_x64_Caches* x86_x64_ICaches()
{
ModuleInit(&cacheInitState, DetectCacheAndTLB);
return &icache;
return &icaches;
}
const x86_x64_Cache* x86_x64_DCache()
const x86_x64_Caches* x86_x64_DCaches()
{
ModuleInit(&cacheInitState, DetectCacheAndTLB);
return &dcache;
return &dcaches;
}
size_t x86_x64_L1CacheLineSize()
{
return x86_x64_DCache()->parameters[0].lineSize;
return x86_x64_DCaches()->levels[0].lineSize;
}
size_t x86_x64_L2CacheLineSize()
{
return x86_x64_DCache()->parameters[1].lineSize;
return x86_x64_DCaches()->levels[1].lineSize;
}
const x86_x64_TLB* x86_x64_ITLB()
const x86_x64_TLBs* x86_x64_ITLBs()
{
ModuleInit(&cacheInitState, DetectCacheAndTLB);
return &itlb;
return &itlbs;
}
const x86_x64_TLB* x86_x64_DTLB()
const x86_x64_TLBs* x86_x64_DTLBs()
{
ModuleInit(&cacheInitState, DetectCacheAndTLB);
return &dtlb;
return &dtlbs;
}
size_t x86_x64_TLBCoverage(const x86_x64_TLB* tlb)
size_t x86_x64_TLBCoverage(const x86_x64_TLBs* tlbs)
{
// note: receiving a TLB pointer means DetectCacheAndTLB was called.
const u64 pageSize = 4*KiB;
const u64 largePageSize = 4*MiB; // TODO: find out if we're using 2MB or 4MB
const u64 largePageSize = os_cpu_LargePageSize();
u64 totalSize = 0; // [bytes]
for(size_t i = 0; i < tlb->numParameters; i++)
for(size_t i = 0; i < tlbs->numLevels; i++)
{
const x86_x64_TLBParameters& params = tlb->parameters[i];
if(params.pageSize == pageSize)
totalSize += pageSize * params.entries;
if(params.pageSize == largePageSize)
totalSize += largePageSize * params.entries;
const x86_x64_TLB& tlb = tlbs->levels[i];
if(tlb.pageSize == pageSize)
totalSize += pageSize * tlb.entries;
if(tlb.pageSize == largePageSize)
totalSize += largePageSize * tlb.entries;
}
return size_t(totalSize / MiB);
@ -1036,8 +1046,8 @@ double x86_x64_ClockFrequency()
// note: don't just take the lowest value! it could conceivably be
// too low, if background processing delays reading c1 (see above).
double sum = 0.0;
const int lo = numSamples/4, hi = 3*numSamples/4;
for(int i = lo; i < hi; i++)
const size_t lo = numSamples/4, hi = 3*numSamples/4;
for(size_t i = lo; i < hi; i++)
sum += samples[i];
const double clockFrequency = sum / (hi-lo);

View File

@ -134,10 +134,7 @@ enum x86_x64_CacheType
const u8 x86_x64_fullyAssociative = 0xFF;
/**
* describes a level of one of the caches.
**/
struct x86_x64_CacheParameters
struct x86_x64_Cache
{
/**
* (used to determine if this cache is unified or disabled)
@ -155,34 +152,29 @@ struct x86_x64_CacheParameters
* instruction and data caches are returned separately by the corresponding
* accessor function; unified cache levels are reported by both.
**/
struct x86_x64_Cache
struct x86_x64_Caches
{
/**
* total number of levels, each of which is described by
* an entry in parameters[].
**/
size_t levels;
x86_x64_CacheParameters* parameters;
size_t numLevels;
x86_x64_Cache* levels;
};
/**
* @return pointer to a static x86_x64_Cache describing the instruction cache.
* @return pointer to a static x86_x64_Caches describing the instruction caches.
**/
LIB_API const x86_x64_Cache* x86_x64_ICache();
LIB_API const x86_x64_Caches* x86_x64_ICaches();
/**
* @return pointer to a static x86_x64_Cache describing the data cache.
* @return pointer to a static x86_x64_Caches describing the data caches.
**/
LIB_API const x86_x64_Cache* x86_x64_DCache();
LIB_API const x86_x64_Caches* x86_x64_DCaches();
LIB_API size_t x86_x64_L1CacheLineSize();
LIB_API size_t x86_x64_L2CacheLineSize();
/**
* describes part of a Translation Lookaside Buffer.
* Translation Lookaside Buffer.
**/
struct x86_x64_TLBParameters
struct x86_x64_TLB
{
x86_x64_CacheType type;
size_t level;
@ -192,32 +184,28 @@ struct x86_x64_TLBParameters
};
/**
* describes all parts of a Translation Lookaside Buffer
* describes all levels of a TLB.
**/
struct x86_x64_TLB
struct x86_x64_TLBs
{
/**
* total number of parts, each of which is described by
* an entry in parameters[]
**/
size_t numParameters;
x86_x64_TLBParameters* parameters;
size_t numLevels;
x86_x64_TLB* levels;
};
/**
* @return pointer to a static x86_x64_TLB describing the instruction TLB.
* @return pointer to a static x86_x64_TLB describing the instruction TLBs.
**/
LIB_API const x86_x64_TLB* x86_x64_ITLB();
LIB_API const x86_x64_TLBs* x86_x64_ITLBs();
/**
* @return pointer to a static x86_x64_TLB describing the data TLB.
**/
LIB_API const x86_x64_TLB* x86_x64_DTLB();
LIB_API const x86_x64_TLBs* x86_x64_DTLBs();
/**
* @return coverage, i.e. total size [MiB] of the given TLB
* @return coverage, i.e. total size [MiB] of the given TLBs
**/
LIB_API size_t x86_x64_TLBCoverage(const x86_x64_TLB* tlb);
LIB_API size_t x86_x64_TLBCoverage(const x86_x64_TLBs* tlb);
//-----------------------------------------------------------------------------

View File

@ -31,10 +31,24 @@ ERROR_ASSOCIATE(ERR::CPU_FEATURE_MISSING, L"This CPU doesn't support a required
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_OPCODE, L"Disassembly failed", -1);
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_VENDOR, L"CPU vendor unknown", -1);
void cpu_TestAtomicAdd()
static void TestCAS64()
{
volatile u64 var = 1;
cpu_CAS64(&var, 1ull, 2ull);
debug_assert(var == 2ull);
}
static void TestAtomicAdd()
{
volatile intptr_t i1 = 1;
intptr_t prev = cpu_AtomicAdd(&i1, 1);
debug_assert(prev == 1);
debug_assert(i1 == 2);
}
void cpu_Test()
{
TestCAS64();
TestAtomicAdd();
}

View File

@ -90,6 +90,13 @@ bool cpu_CAS(volatile T* location, T expected, T new_value)
return cpu_CAS((volatile intptr_t*)location, (intptr_t)expected, (intptr_t)new_value);
}
#if ARCH_AMD64
# define cpu_CAS64 cpu_CAS
#else
LIB_API bool cpu_CAS64(volatile u64* location, u64 expected, u64 newValue);
#endif
/**
* add a signed value to a variable without the possibility of interference
* from other threads/CPUs.
@ -98,7 +105,7 @@ bool cpu_CAS(volatile T* location, T expected, T new_value)
**/
LIB_API intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
LIB_API void cpu_TestAtomicAdd();
LIB_API void cpu_Test();
/**
* enforce strict instruction ordering in the CPU pipeline.
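(Editor's sketch, not from the commit: the intended usage pattern for cpu_CAS64 - a retry loop that atomically adds to a 64-bit total even on 32-bit builds. The AddU64 helper name is made up; only cpu_CAS64 comes from this header, and on x86_64 it simply maps to cpu_CAS via the #define above.)

// hedged sketch; assumes cpu.h is included and uses the project's u64 typedef
static void AddU64(volatile u64* total, u64 delta)
{
	for(;;)
	{
		const u64 expected = *total;	// snapshot the current value
		if(cpu_CAS64(total, expected, expected + delta))	// succeeds only if *total is unchanged
			return;
		// another thread modified *total in the meantime - retry with the fresh value
	}
}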

View File

@ -45,45 +45,34 @@
static bool IsUniprocessor()
{
const CpuTopology* topology = cpu_topology_Detect();
if(cpu_topology_NumPackages(topology) != 1)
if(cpu_topology_NumPackages() != 1)
return false;
if(cpu_topology_CoresPerPackage(topology) != 1)
if(cpu_topology_CoresPerPackage() != 1)
return false;
return true;
}
enum AmdPowerNowFlags
{
PN_FREQ_ID_CTRL = BIT(1),
PN_HW_THERMAL_CTRL = BIT(4),
PN_SW_THERMAL_CTRL = BIT(5),
PN_INVARIANT_TSC = BIT(8)
};
static bool IsInvariantTSC()
{
#if ARCH_X86_X64
// (we no longer need to check x86_x64_Vendor - Intel and AMD
// agreed on the definition of this feature check)
x86_x64_CpuidRegs regs = { 0 };
switch(x86_x64_Vendor())
regs.eax = 0x80000007;
if(x86_x64_cpuid(&regs))
{
case X86_X64_VENDOR_AMD:
regs.eax = 0x80000007;
if(x86_x64_cpuid(&regs))
{
// TSC is invariant across P-state, C-state and
// stop grant transitions (e.g. STPCLK)
if(regs.edx & PN_INVARIANT_TSC)
return true;
}
break;
// TSC is invariant across P-state, C-state, turbo, and
// stop grant transitions (e.g. STPCLK)
if(regs.edx & BIT(8))
return true;
}
#endif
return false;
}
static bool IsThrottlingPossible()
{
#if ARCH_X86_X64
@ -99,6 +88,12 @@ static bool IsThrottlingPossible()
regs.eax = 0x80000007;
if(x86_x64_cpuid(&regs))
{
enum AmdPowerNowFlags
{
PN_FREQ_ID_CTRL = BIT(1),
PN_HW_THERMAL_CTRL = BIT(4),
PN_SW_THERMAL_CTRL = BIT(5)
};
if(regs.edx & (PN_FREQ_ID_CTRL|PN_HW_THERMAL_CTRL|PN_SW_THERMAL_CTRL))
return true;
}

View File

@ -173,6 +173,8 @@ struct TimerState
// (this enables calibration, which is currently not implemented,
// but leaving open the possibility costs nothing)
double time;
u8 padding[48];
};
// how do we detect when the old TimerState is no longer in use and can be
@ -181,10 +183,10 @@ struct TimerState
// entered critical sections (the latching of TimerState fields) will have
// been exited before the next update comes around; if not, TimerState.time
// changes, the critical section notices and re-reads the new values.
static TimerState timerStates[2];
static __declspec(align(64)) TimerState timerStates[2];
// note: exchanging pointers is easier than XORing an index.
static TimerState* volatile ts = &timerStates[0];
static TimerState* volatile ts2 = &timerStates[1];
static volatile TimerState* volatile ts = &timerStates[0];
static volatile TimerState* volatile ts2 = &timerStates[1];
static void UpdateTimerState()
{
@ -201,7 +203,7 @@ static void UpdateTimerState()
const u64 deltaTicks = CounterDelta(ts->counter, counter);
ts2->counter = counter;
ts2->time = ts->time + deltaTicks/nominalFrequency;
ts = (TimerState*)InterlockedExchangePointer((volatile PVOID*)&ts2, ts);
ts = (volatile TimerState*)InterlockedExchangePointer((volatile PVOID*)&ts2, (PVOID)ts);
}
double whrt_Time()
@ -209,6 +211,7 @@ double whrt_Time()
retry:
// latch timer state (counter and time must be from the same update)
const double time = ts->time;
cpu_MemoryBarrier();
const u64 counter = ts->counter;
// ts changed after reading time. note: don't compare counter because
// it _might_ have the same value after two updates.
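(Editor's reconstruction for clarity, not copied from the commit: the complete reader-side latch that this hunk modifies. The barrier keeps the two loads in order, and the time comparison detects an UpdateTimerState that ran in between; Counter() and the division by nominalFrequency are assumptions based on the writer-side code shown above.)

// sketch of the reader side of the double-buffered TimerState (names partly assumed)
double ReadTimeSketch()
{
retry:
	const double time = ts->time;		// latch 1: time of the most recent update
	cpu_MemoryBarrier();			// forbid reordering of the two loads
	const u64 counter = ts->counter;	// latch 2: counter value at that update
	if(time != ts->time)			// an update slipped in between - re-latch
		goto retry;			// (counters alone might coincide across two updates)
	const u64 deltaTicks = CounterDelta(counter, Counter());	// ticks elapsed since the update
	return time + deltaTicks/nominalFrequency;			// convert to seconds and add
}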

View File

@ -115,6 +115,24 @@ LIB_API size_t os_cpu_MemoryAvailable();
**/
LIB_API uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask);
class os_cpu_ScopedSetThreadAffinityMask
{
public:
os_cpu_ScopedSetThreadAffinityMask(uintptr_t processorMask)
: m_previousProcessorMask(os_cpu_SetThreadAffinityMask(processorMask))
{
}
~os_cpu_ScopedSetThreadAffinityMask()
{
(void)os_cpu_SetThreadAffinityMask(m_previousProcessorMask);
}
private:
uintptr_t m_previousProcessorMask;
};
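(Editor's usage sketch for the scoped affinitizer above; processor index 0 and MeasureSomething are placeholders.)

// pin the current thread to processor 0 for the duration of a measurement (sketch)
{
	const uintptr_t processorMask = uintptr_t(1) << 0;	// bit i corresponds to processor i
	os_cpu_ScopedSetThreadAffinityMask affinitizer(processorMask);
	MeasureSomething();	// hypothetical work that must not migrate between CPUs
}	// destructor restores the previous affinity mask here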
/**
* called by os_cpu_CallByEachCPU.
* @param processor ID of processor running the current thread for the

View File

@ -144,7 +144,7 @@ double timer_Resolution()
//
// do not use std::list et al. for this! we must be callable at any time,
// especially before NLSO ctors run or before heap init.
static size_t num_clients;
static size_t numClients;
static TimerClient* clients;
@ -157,31 +157,24 @@ TimerClient* timer_AddClient(TimerClient* tc, const wchar_t* description)
// insert at front of list
tc->next = clients;
clients = tc;
num_clients++;
numClients++;
return tc;
}
void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1)
{
tc->sum.AddDifference(t0, t1);
tc->num_calls++;
}
void timer_DisplayClientTotals()
{
debug_printf(L"TIMER TOTALS (%lu clients)\n", (unsigned long)num_clients);
debug_printf(L"TIMER TOTALS (%lu clients)\n", (unsigned long)numClients);
debug_printf(L"-----------------------------------------------------\n");
while(clients)
{
// (make sure list and count are consistent)
debug_assert(num_clients != 0);
debug_assert(numClients != 0);
TimerClient* tc = clients;
clients = tc->next;
num_clients--;
numClients--;
const std::wstring duration = tc->sum.ToString();
debug_printf(L" %ls: %ls (%lux)\n", tc->description, duration.c_str(), (unsigned long)tc->num_calls);

View File

@ -28,6 +28,7 @@
#define INCLUDED_TIMER
#include "lib/config2.h" // CONFIG2_TIMER_ALLOW_RDTSC
#include "lib/sysdep/cpu.h" // cpu_AtomicAdd
#if ARCH_X86_X64 && CONFIG2_TIMER_ALLOW_RDTSC
# include "lib/sysdep/arch/x86_x64/x86_x64.h" // x86_x64_rdtsc
# include "lib/sysdep/os_cpu.h" // os_cpu_ClockFrequency
@ -172,6 +173,18 @@ public:
m_ticks += t1.m_ticks - t0.m_ticks;
}
void AddDifferenceAtomic(TimerUnit t0, TimerUnit t1)
{
const u64 delta = t1.m_ticks - t0.m_ticks;
#if ARCH_AMD64
cpu_AtomicAdd((volatile intptr_t*)&m_ticks, (intptr_t)delta);
#else
retry:
if(!cpu_CAS64(&m_ticks, m_ticks, m_ticks+delta))
goto retry;
#endif
}
void Subtract(TimerUnit t)
{
m_ticks -= t.m_ticks;
@ -226,6 +239,20 @@ public:
m_seconds += t1.m_seconds - t0.m_seconds;
}
void AddDifferenceAtomic(TimerUnit t0, TimerUnit t1)
{
retry:
intptr_t oldRepresentation;
memcpy(&oldRepresentation, &m_seconds, sizeof(oldRepresentation));
const double seconds = m_seconds + t1.m_seconds - t0.m_seconds;
intptr_t newRepresentation;
memcpy(&newRepresentation, &seconds, sizeof(newRepresentation));
if(!cpu_CAS64((volatile intptr_t*)&m_seconds, oldRepresentation, newRepresentation))
goto retry;
}
void Subtract(TimerUnit t)
{
m_seconds -= t.m_seconds;
@ -274,7 +301,7 @@ struct TimerClient
// how often timer_BillClient was called (helps measure relative
// performance of something that is done indeterminately often).
size_t num_calls;
intptr_t num_calls;
};
/**
@ -304,7 +331,21 @@ LIB_API TimerClient* timer_AddClient(TimerClient* tc, const wchar_t* description
/**
* bill the difference between t0 and t1 to the client's total.
**/
LIB_API void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1);
inline void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1)
{
tc->sum.AddDifference(t0, t1);
tc->num_calls++;
}
/**
* thread-safe version of timer_BillClient
* (not used by default due to its higher overhead)
**/
inline void timer_BillClientAtomic(TimerClient* tc, TimerUnit t0, TimerUnit t1)
{
tc->sum.AddDifferenceAtomic(t0, t1);
cpu_AtomicAdd(&tc->num_calls, +1);
}
/**
* display all clients' totals; does not reset them.
@ -335,6 +376,28 @@ private:
TimerClient* m_tc;
};
class ScopeTimerAccrueAtomic
{
NONCOPYABLE(ScopeTimerAccrueAtomic);
public:
ScopeTimerAccrueAtomic(TimerClient* tc)
: m_tc(tc)
{
m_t0.SetFromTimer();
}
~ScopeTimerAccrueAtomic()
{
TimerUnit t1;
t1.SetFromTimer();
timer_BillClientAtomic(m_tc, m_t0, t1);
}
private:
TimerUnit m_t0;
TimerClient* m_tc;
};
/**
* Measure the time taken to execute code up until end of the current scope;
* bill it to the given TimerClient object. Can safely be nested.
@ -356,5 +419,6 @@ private:
* timer_DisplayClientTotals();
**/
#define TIMER_ACCRUE(client) ScopeTimerAccrue UID__(client)
#define TIMER_ACCRUE_ATOMIC(client) ScopeTimerAccrueAtomic UID__(client)
#endif // #ifndef INCLUDED_TIMER
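(Editor's usage sketch for the new thread-safe accrual path; ProcessChunk is a placeholder, and registering the client via a file-scope timer_AddClient call is an assumption - the code base may provide a convenience macro for this.)

// sketch: bill time spent in a function that several threads call concurrently
static TimerClient tc_worker;	// zero-initialized static storage
static TimerClient* worker_timer = timer_AddClient(&tc_worker, L"worker");

static void ProcessChunk()	// hypothetical multi-threaded workload
{
	TIMER_ACCRUE_ATOMIC(worker_timer);	// thread-safe counterpart of TIMER_ACCRUE
	// ... work measured until the end of this scope ...
}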

View File

@ -99,8 +99,7 @@ void WriteSystemInfo()
fprintf(f, "OS : %s %s (%s)\n", un.sysname, un.release, un.version);
// CPU
const CpuTopology* topology = cpu_topology_Detect();
fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), (int)cpu_topology_NumPackages(topology), (int)cpu_topology_CoresPerPackage(topology), (int)cpu_topology_LogicalPerCore(topology));
fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), (int)cpu_topology_NumPackages(), (int)cpu_topology_CoresPerPackage(), (int)cpu_topology_LogicalPerCore());
const double cpu_freq = os_cpu_ClockFrequency();
if(cpu_freq != 0.0f)
{