major refactor of topology.cpp: each piece of information is detected separately on-demand (reduces coupling and static data)

fix documentation of ceil_log2 regarding input=0
remove os_cpu_SetThreadAffinity (redundant)
wcpu: make max # processors available via os_cpu.h; remove warning if
process affinity is restricted

This was SVN commit r5951.
janwas 2008-05-13 05:51:25 +00:00
parent 6e46b897c9
commit d1a9348b91
7 changed files with 341 additions and 350 deletions
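
The refactor replaces the single up-front DetectProcessorTopology() pass with per-query detection: each value is computed the first time it is asked for and cached in a function-local static. A minimal standalone sketch of that pattern (ComputeCoresPerPackage is a hypothetical stand-in for the CPUID query in the real code):

#include <cstddef>

static size_t ComputeCoresPerPackage()
{
    return 4;   // the real code issues a CPUID query here
}

static size_t CoresPerPackage()
{
    static size_t coresPerPackage = 0;  // 0 means "not yet detected"
    if(!coresPerPackage)
        coresPerPackage = ComputeCoresPerPackage();
    return coresPerPackage;
}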

View File

@@ -101,9 +101,9 @@ bool is_pow2(T n)
}
/**
* ceil(log2(n))
* ceil(log2(x))
*
* @param n (integer) input; MUST be > 0, else results are undefined.
* @param x (unsigned integer)
* @return ceiling of the base-2 logarithm (i.e. rounded up).
**/
template<typename T>

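// Illustrative values for the function documented above; ceil_log2_sketch is a
// hypothetical stand-in, not the project's implementation (the template body is
// not shown in this hunk), and the behaviour for x == 0 addressed by the commit
// message is likewise not visible here, so only positive inputs are shown.
#include <cassert>
#include <cstddef>

static size_t ceil_log2_sketch(size_t x)    // assumes x > 0
{
    size_t log2 = 0;
    size_t value = 1;
    while(value < x)
    {
        log2++;
        value *= 2;
    }
    return log2;
}

int main()
{
    assert(ceil_log2_sketch(1) == 0);
    assert(ceil_log2_sketch(2) == 1);
    assert(ceil_log2_sketch(3) == 2);
    assert(ceil_log2_sketch(8) == 3);
    assert(ceil_log2_sketch(9) == 4);
    return 0;
}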
View File

@@ -115,13 +115,6 @@ uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask)
}
void os_cpu_SetThreadAffinity(size_t processor)
{
const uintptr_t processorMask = uintptr_t(1) << processor;
(void)os_cpu_SetThreadAffinityMask(processorMask);
}
LibError cpu_CallByEachCPU(OsCpuCallback cb, uintptr_t cbData)
{
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)

View File

@@ -26,6 +26,13 @@ namespace ERR
// rationale: this spares users from having to deal with noncontiguous IDs,
// e.g. when administrative tools are used to restrict process affinity.
/**
* maximum number of processors supported by the OS (determined by the
* number of bits in an affinity mask)
**/
static const size_t os_cpu_MaxProcessors = sizeof(uintptr_t)*CHAR_BIT;
/**
* @return bit mask of processors that exist and are available to
* this process.
@@ -84,7 +91,7 @@ LIB_API size_t os_cpu_MemoryAvailable();
/**
* restrict the current thread to a set of processors.
* it will not be rescheduled until a subsequent os_cpu_SetThreadAffinity*.
* it will not be rescheduled until affinity is again changed.
*
* @param processorMask a bit mask of acceptable processors
* (bit index i corresponds to processor i)
@@ -92,12 +99,6 @@ LIB_API size_t os_cpu_MemoryAvailable();
**/
LIB_API uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask);
/**
* restrict the current thread to a single processor.
* it will not be rescheduled until a subsequent os_cpu_SetThreadAffinity*.
**/
LIB_API void os_cpu_SetThreadAffinity(size_t processor);
/**
* called by os_cpu_CallByEachCPU.
* @param processor ID of processor running the current thread for the

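// With os_cpu_SetThreadAffinity removed, a caller that wants to pin the current
// thread to a single processor builds the mask itself; a usage sketch against
// the declarations above (PinToProcessor is a hypothetical helper, and the
// LIB_API decoration of os_cpu_SetThreadAffinityMask is omitted here).
#include <climits>
#include <cstddef>
#include <cstdint>

uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask);    // from os_cpu.h

static void PinToProcessor(size_t processor)
{
    // processor is a contiguous index in [0, os_cpu_MaxProcessors)
    const uintptr_t processorMask = uintptr_t(1) << processor;
    (void)os_cpu_SetThreadAffinityMask(processorMask);
}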
View File

@@ -94,13 +94,6 @@ uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask)
}
void os_cpu_SetThreadAffinity(size_t processor)
{
const uintptr_t processorMask = uintptr_t(1) << processor;
(void)os_cpu_SetThreadAffinityMask(processorMask);
}
LibError cpu_CallByEachCPU(OsCpuCallback cb, uintptr_t cbData)
{
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)

View File

@@ -172,18 +172,12 @@ size_t os_cpu_MemoryAvailable()
//-----------------------------------------------------------------------------
/**
* maximum number of processors supported by the OS (determined by the
* number of bits in an affinity mask)
**/
static const DWORD maxProcessorNumber = sizeof(DWORD_PTR)*CHAR_BIT-1;
DWORD_PTR wcpu_AffinityFromProcessorMask(DWORD_PTR processAffinity, uintptr_t processorMask)
{
DWORD_PTR affinity = 0;
size_t processor = (size_t)-1;
for(DWORD processorNumber = 0; processorNumber <= maxProcessorNumber; processorNumber++)
for(DWORD processorNumber = 0; processorNumber < (DWORD)os_cpu_MaxProcessors; processorNumber++)
{
if(IsBitSet(processAffinity, processorNumber))
{
@@ -202,7 +196,7 @@ uintptr_t wcpu_ProcessorMaskFromAffinity(DWORD_PTR processAffinity, DWORD_PTR af
uintptr_t processorMask = 0;
size_t processor = (size_t)-1;
for(DWORD processorNumber = 0; processorNumber <= maxProcessorNumber; processorNumber++)
for(DWORD processorNumber = 0; processorNumber < (DWORD)os_cpu_MaxProcessors; processorNumber++)
{
if(IsBitSet(processAffinity, processorNumber))
{
@@ -271,31 +265,23 @@ uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask)
}
void os_cpu_SetThreadAffinity(size_t processor)
{
debug_assert(processor < os_cpu_NumProcessors());
const uintptr_t processorMask = uintptr_t(1) << processor;
(void)os_cpu_SetThreadAffinityMask(processorMask);
}
LibError os_cpu_CallByEachCPU(OsCpuCallback cb, uintptr_t cbData)
{
// ensure we are able to run on all system processors
// abort if we can't run on all system processors
DWORD_PTR processAffinity, systemAffinity;
{
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
debug_assert(ok);
if(processAffinity != systemAffinity)
WARN_RETURN(ERR::OS_CPU_RESTRICTED_AFFINITY);
return ERR::OS_CPU_RESTRICTED_AFFINITY; // NOWARN
}
const uintptr_t previousAffinity = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
os_cpu_SetThreadAffinity(processor);
const uintptr_t processorMask = uintptr_t(1) << processor;
os_cpu_SetThreadAffinityMask(processorMask);
cb(processor, cbData);
}
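// A standalone sketch (simplified, with hypothetical implementation details,
// since the loop bodies are truncated in the hunks above) of the mapping the
// conversion loops earlier in this file perform: OS processor numbers whose
// bits are set in the process affinity mask are assigned contiguous indices
// 0..n-1, which is why callers never see noncontiguous processor IDs.
#include <cassert>
#include <climits>
#include <cstddef>
#include <cstdint>

static const size_t maxProcessors = sizeof(uintptr_t)*CHAR_BIT;

static bool IsBitSet(uintptr_t mask, size_t bit)
{
    return (mask & (uintptr_t(1) << bit)) != 0;
}

static uintptr_t ProcessorMaskFromAffinity(uintptr_t processAffinity, uintptr_t affinity)
{
    uintptr_t processorMask = 0;
    size_t processor = 0;   // next contiguous index
    for(size_t processorNumber = 0; processorNumber < maxProcessors; processorNumber++)
    {
        if(IsBitSet(processAffinity, processorNumber))
        {
            if(IsBitSet(affinity, processorNumber))
                processorMask |= uintptr_t(1) << processor;
            processor++;
        }
    }
    return processorMask;
}

int main()
{
    // OS processors 1 and 3 are available to the process (mask 0xA); a thread
    // affinity of only OS processor 3 (mask 0x8) maps to contiguous index 1.
    assert(ProcessorMaskFromAffinity(0xA, 0x8) == 0x2);
    return 0;
}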

View File

@@ -16,222 +16,311 @@
#include "lib/sysdep/os_cpu.h"
#include "x86_x64.h"
//-----------------------------------------------------------------------------
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
// logicalPerCore across all packages.
static size_t DetectCoresPerPackage()
{
x86_x64_CpuidRegs regs;
switch(x86_x64_Vendor())
{
case X86_X64_VENDOR_INTEL:
regs.eax = 4;
regs.ecx = 0;
if(x86_x64_cpuid(&regs))
return bits(regs.eax, 26, 31)+1;
break;
case X86_X64_VENDOR_AMD:
regs.eax = 0x80000008;
if(x86_x64_cpuid(&regs))
return bits(regs.ecx, 0, 7)+1;
break;
}
return 1; // else: the CPU is single-core.
}
// detect *maximum* number of cores/packages/caches.
// note: some of them may be disabled by the OS or BIOS.
// note: Intel Appnote 485 assures us that they are uniform across packages.
static size_t CoresPerPackage()
{
static size_t coresPerPackage = 0;
if(!coresPerPackage)
coresPerPackage = DetectCoresPerPackage();
{
coresPerPackage = 1; // it's single core unless one of the following applies:
x86_x64_CpuidRegs regs;
switch(x86_x64_Vendor())
{
case X86_X64_VENDOR_INTEL:
regs.eax = 4;
regs.ecx = 0;
if(x86_x64_cpuid(&regs))
coresPerPackage = bits(regs.eax, 26, 31)+1;
break;
case X86_X64_VENDOR_AMD:
regs.eax = 0x80000008;
if(x86_x64_cpuid(&regs))
coresPerPackage = bits(regs.ecx, 0, 7)+1;
break;
}
}
return coresPerPackage;
}
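// Worked example of the bit-field extraction used above, with an illustrative
// CPUID result; bits_sketch is a stand-in for the project's bits() helper.
#include <cassert>
#include <cstdint>

static uint32_t bits_sketch(uint32_t value, unsigned lo, unsigned hi)
{
    const unsigned width = hi - lo + 1;
    const uint32_t mask = (width >= 32) ? ~uint32_t(0) : (uint32_t(1) << width) - 1;
    return (value >> lo) & mask;
}

int main()
{
    // e.g. Intel CPUID leaf 4 returning EAX[31:26] == 3 means 3+1 = 4 cores per package
    const uint32_t eax = uint32_t(3) << 26;
    assert(bits_sketch(eax, 26, 31) + 1 == 4);
    return 0;
}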
static bool IsHyperthreadingCapable()
{
// definitely not
if(!x86_x64_cap(X86_X64_CAP_HT))
return false;
// AMD N-core systems falsely set the HT bit for compatibility reasons
// (don't bother resetting it, might confuse callers)
if(x86_x64_Vendor() == X86_X64_VENDOR_AMD && x86_x64_cap(X86_X64_CAP_AMD_CMP_LEGACY))
return false;
return true;
}
static size_t DetectLogicalPerCore()
{
if(!IsHyperthreadingCapable())
return 1;
x86_x64_CpuidRegs regs;
regs.eax = 1;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const size_t logicalPerPackage = bits(regs.ebx, 16, 23);
// cores ought to be uniform WRT # logical processors
debug_assert(logicalPerPackage % CoresPerPackage() == 0);
return logicalPerPackage / CoresPerPackage();
}
static size_t LogicalPerCore()
{
static size_t logicalPerCore = 0;
if(!logicalPerCore)
logicalPerCore = DetectLogicalPerCore();
{
struct IsHyperthreadingCapable
{
bool operator()() const
{
// definitely not
if(!x86_x64_cap(X86_X64_CAP_HT))
return false;
// AMD N-core systems falsely set the HT bit for compatibility reasons
// (don't bother resetting it, might confuse callers)
if(x86_x64_Vendor() == X86_X64_VENDOR_AMD && x86_x64_cap(X86_X64_CAP_AMD_CMP_LEGACY))
return false;
return true;
}
};
if(!IsHyperthreadingCapable()())
logicalPerCore = 1;
else
{
x86_x64_CpuidRegs regs;
regs.eax = 1;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const size_t logicalPerPackage = bits(regs.ebx, 16, 23);
// cores ought to be uniform WRT # logical processors
debug_assert(logicalPerPackage % CoresPerPackage() == 0);
logicalPerCore = logicalPerPackage / CoresPerPackage();
}
}
return logicalPerCore;
}
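// Worked example (illustrative numbers) of the division above: CPUID.1
// EBX[23:16] reports logical processors per package, so dividing by cores per
// package yields SMT units ("hyperthreads") per core.
#include <cassert>
#include <cstddef>

int main()
{
    const size_t logicalPerPackage = 8; // hypothetical CPUID.1 EBX[23:16]
    const size_t coresPerPackage = 4;   // from CoresPerPackage() above
    assert(logicalPerPackage % coresPerPackage == 0);
    assert(logicalPerPackage / coresPerPackage == 2);   // 2 logical units per core
    return 0;
}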
enum CacheType
{
CT_NONE = 0,
CT_DATA = 1,
CT_INSTRUCTION = 2,
CT_UNIFIED = 3
};
static bool IsL2DataCache(CacheType type, size_t level)
{
if(type != CT_DATA && type != CT_UNIFIED)
return false;
if(level != 2)
return false;
return true;
}
static size_t DetectLogicalPerCache()
{
// note: Intel Appnote 485 says the order in which caches are returned is
// undefined, so we need to loop through all of them.
for(u32 count = 0; ; count++)
{
x86_x64_CpuidRegs regs;
regs.eax = 4;
regs.ecx = count;
x86_x64_cpuid(&regs);
const CacheType type = (CacheType)bits(regs.eax, 0, 4);
// no more caches left
if(type == CT_NONE)
{
debug_assert(0); // we somehow didn't find the L2d
return 1;
}
const size_t level = bits(regs.eax, 5, 7);
if(IsL2DataCache(type, level))
{
const size_t logicalPerCache = bits(regs.eax, 14, 25)+1;
return logicalPerCache;
}
}
}
static size_t LogicalPerCache()
{
static size_t logicalPerCache;
if(!logicalPerCache)
logicalPerCache = DetectLogicalPerCache();
{
logicalPerCache = 1; // caches aren't shared unless we find a descriptor
// note: Intel Appnote 485 says the order in which caches are returned is
// undefined, so we need to loop through all of them.
for(u32 count = 0; ; count++)
{
// get next cache descriptor
x86_x64_CpuidRegs regs;
regs.eax = 4;
regs.ecx = count;
x86_x64_cpuid(&regs);
const u32 type = bits(regs.eax, 0, 4);
if(type == 0) // no more remaining
break;
struct IsL2DataCache
{
bool operator()(u32 type, u32 level) const
{
if(type != 1 && type != 3) // neither data nor unified
return false;
if(level != 2)
return false;
return true;
}
};
const u32 level = bits(regs.eax, 5, 7);
if(IsL2DataCache()(type, level))
logicalPerCache = bits(regs.eax, 14, 25)+1;
}
}
return logicalPerCache;
}
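// Worked decode (illustrative value) of a CPUID leaf 4 cache descriptor as
// interpreted above: type in EAX[4:0], level in EAX[7:5], and the maximum
// number of logical processors sharing the cache in EAX[25:14] plus one.
#include <cassert>
#include <cstdint>

int main()
{
    // hypothetical descriptor: unified (type 3) level-2 cache shared by 2 logical processors
    const uint32_t eax = 3u | (2u << 5) | (1u << 14);
    assert((eax & 0x1Fu) == 3);                 // type: unified
    assert(((eax >> 5) & 0x7u) == 2);           // level: L2
    assert(((eax >> 14) & 0xFFFu) + 1 == 2);    // logical processors sharing it
    return 0;
}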
//-----------------------------------------------------------------------------
// determination of enabled cores/HTs
// the above functions give the maximum number of cores/logical units.
// however, some of them may actually be disabled by the BIOS!
// what we can do is to analyze the APIC IDs. they are allocated sequentially
// for all "processors". treating the IDs as variable-width bit fields
// (according to the number of cores/logical units present) allows
// determining the exact topology as well as number of packages.
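// Worked example (hypothetical topology) of that field decomposition: with
// 2 logical units per core and 4 cores per package, the low ceil_log2(2)=1 bit
// of an APIC ID is the logical unit, the next ceil_log2(4)=2 bits are the core,
// and the remaining bits identify the package.
#include <cassert>
#include <cstdint>

int main()
{
    const uint8_t apicId = 0x0D;            // binary 00001'10'1
    assert((apicId & 0x1) == 1);            // logical unit 1
    assert(((apicId >> 1) & 0x3) == 2);     // core 2
    assert((apicId >> 3) == 1);             // package 1
    return 0;
}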
// these are set by DetectProcessorTopology.
static size_t numPackages = 0; // i.e. sockets; > 1 => true SMP system
static size_t enabledCoresPerPackage = 0;
static size_t enabledLogicalPerCore = 0; // hyperthreading units
typedef std::vector<u8> Ids;
// add the currently running processor's APIC ID to a list of IDs.
static void StoreApicId(size_t UNUSED(processor), uintptr_t cbData)
{
Ids* const apicIds = (Ids*)cbData;
apicIds->push_back(x86_x64_ApicId());
}
// if successful, apicIds[i] contains the unique ID of OS processor i.
static bool GatherApicIds(Ids& apicIds)
{
// old APIC (see x86_x64_ApicId for details)
if(x86_x64_Generation() < 8)
return false;
// process affinity prevents us from seeing all APIC IDs
if(PopulationCount(os_cpu_ProcessorMask()) != os_cpu_NumProcessors())
return false;
const LibError ret = os_cpu_CallByEachCPU(StoreApicId, (uintptr_t)&apicIds);
debug_assert(ret == INFO::OK);
// ensure we got a unique ID for every processor
{
Ids tmp(apicIds);
Ids::iterator end = tmp.end();
std::sort(tmp.begin(), end);
debug_assert(std::unique(tmp.begin(), end) == end);
debug_assert(std::distance(tmp.begin(), end) == (ptrdiff_t)os_cpu_NumProcessors());
}
return true;
}
typedef std::set<u8> IdSet;
// APIC IDs consist of variable-length fields identifying the logical unit,
// core, package and shared cache. if they are available, we can determine
// the exact topology; otherwise we have to guess.
/**
* "field" := a range of bits sufficient to represent <numValues> integers.
* for each id in <apicIds>: extract the value of the field starting at
* <offset> and insert it into <ids>. afterwards, adjust <offset> to the
* next field.
*
* used to gather e.g. all core IDs from all APIC IDs.
* @return an array of the processors' unique APIC IDs or zero if
* no APIC is present or process affinity is limited.
**/
static void ExtractFieldIntoSet(const Ids& apicIds, size_t& offset, size_t numValues, IdSet& ids)
static const u8* ApicIds()
{
static u8 apicIdStorage[os_cpu_MaxProcessors];
static const u8* apicIds;
static volatile uintptr_t initialized = 0;
if(cpu_CAS(&initialized, 0, 1))
{
// requires 'new' APIC (see x86_x64_ApicId for details)
if(x86_x64_Generation() >= 8)
{
// store each processor's APIC ID in turn
struct StoreApicId
{
static void Callback(size_t processor, uintptr_t UNUSED(cbData))
{
apicIdStorage[processor] = x86_x64_ApicId();
}
};
if(os_cpu_CallByEachCPU(StoreApicId::Callback, (uintptr_t)&apicIds) == INFO::OK)
apicIds = apicIdStorage; // success, return valid array from now on
}
}
return apicIds;
}
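// The cpu_CAS(&initialized, 0, 1) guard above lets exactly one caller perform
// the detection; a standalone sketch of the same one-shot idiom, with
// std::atomic standing in for cpu_CAS. As in the code above, a second caller
// arriving while initialization is still in progress does not wait and simply
// sees the not-yet-initialized value.
#include <atomic>

static std::atomic<int> initialized(0);
static int detected = 0;    // hypothetical lazily-detected value

static int Detected()
{
    int expected = 0;
    if(initialized.compare_exchange_strong(expected, 1))
        detected = 42;  // expensive one-time detection goes here
    return detected;
}

int main()
{
    return Detected() == 42 ? 0 : 1;
}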
/**
* count the number of unique values assumed by a certain field (i.e. part
* of the APIC ID).
* @param numBits width of the field; must be set to ceil_log2 of the
* maximum value that can be assumed by the field.
* @return number of unique values (one if numBits is zero - this is
* convenient and kind of justified by counting the empty symbol)
**/
static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numBits)
{
const size_t numBits = ceil_log2(numValues);
if(numBits == 0)
return;
return 1; // see above
const u8 mask = bit_mask<u8>(numBits);
for(size_t i = 0; i < apicIds.size(); i++)
typedef std::set<u8> IdSet;
IdSet ids;
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
const u8 apicId = apicIds[i];
const u8 apicId = apicIds[processor];
const u8 field = u8(apicId >> offset) & mask;
ids.insert(field);
}
offset += numBits;
return ids.size();
}
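// Worked example (hypothetical APIC IDs) of the counting above: four IDs
// {0,1,4,5} with a 1-bit logical-unit field at offset 0 yield 2 unique values,
// a wider field starting at offset 2 also yields 2, and a zero-width field
// counts as 1 by the convention documented above.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <set>

static size_t NumUniqueValues(const uint8_t* ids, size_t n, size_t offset, size_t numBits)
{
    if(numBits == 0)
        return 1;
    const uint8_t mask = uint8_t((1u << numBits) - 1);
    std::set<uint8_t> values;
    for(size_t i = 0; i < n; i++)
        values.insert(uint8_t(ids[i] >> offset) & mask);
    return values.size();
}

int main()
{
    const uint8_t apicIds[] = { 0, 1, 4, 5 };
    assert(NumUniqueValues(apicIds, 4, 0, 1) == 2); // logical units per core
    assert(NumUniqueValues(apicIds, 4, 2, 6) == 2); // e.g. two cores or packages
    assert(NumUniqueValues(apicIds, 4, 0, 0) == 1); // zero-width field
    return 0;
}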
static size_t numCaches = 0; // L2d
static std::vector<size_t> processorsCache;
static std::vector<uintptr_t> cachesProcessorMask;
size_t cpu_NumPackages()
{
static size_t numPackages = 0;
if(!numPackages)
{
const u8* apicIds = ApicIds();
if(apicIds)
{
const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
const size_t numBits = 8;
numPackages = NumUniqueValuesInField(apicIds, offset, numBits);
}
else
{
// note: correct results cannot be guaranteed because unreported
// and disabled logical units are indistinguishable. the below
// assumptions are reasonable because we care most about packages
// (i.e. whether the system is truly SMP). in contrast, it is
// safe to overestimate the number of cores because that
// only determines if memory barriers are needed or not.
// note: requiring modern processors featuring an APIC does not
// prevent this from being reached (the cause may be lack of
// OS support or restricted process affinity).
// assume cores are enabled and count as processors.
const size_t numPackagesTimesLogical = os_cpu_NumProcessors() / CoresPerPackage();
debug_assert(numPackagesTimesLogical != 0);
// assume hyperthreads are enabled; check if they count as processors.
if(numPackagesTimesLogical > LogicalPerCore())
numPackages = numPackagesTimesLogical / LogicalPerCore();
}
}
return numPackages;
}
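// Worked example (hypothetical counts) of the fallback estimate above: a
// dual-package system with 4 cores per package and 2 logical units per core
// that the OS reports as 16 processors.
#include <cassert>
#include <cstddef>

int main()
{
    const size_t numProcessors = 16, coresPerPackage = 4, logicalPerCore = 2;
    const size_t numPackagesTimesLogical = numProcessors / coresPerPackage;     // 4
    assert(numPackagesTimesLogical > logicalPerCore);   // hyperthreads count as processors here
    assert(numPackagesTimesLogical / logicalPerCore == 2);  // 2 packages
    return 0;
}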
size_t cpu_CoresPerPackage()
{
static size_t enabledCoresPerPackage;
class CacheManager
if(!enabledCoresPerPackage)
{
const u8* apicIds = ApicIds();
if(apicIds)
{
const size_t offset = ceil_log2(LogicalPerCore());
const size_t numBits = ceil_log2(CoresPerPackage());
enabledCoresPerPackage = NumUniqueValuesInField(apicIds, offset, numBits);
}
else
{
// guess (must match cpu_NumPackages's assumptions)
enabledCoresPerPackage = CoresPerPackage();
}
}
return enabledCoresPerPackage;
}
size_t cpu_LogicalPerCore()
{
static size_t enabledLogicalPerCore;
if(!enabledLogicalPerCore)
{
const u8* apicIds = ApicIds();
if(apicIds)
{
const size_t offset = 0;
const size_t numBits = ceil_log2(LogicalPerCore());
enabledLogicalPerCore = NumUniqueValuesInField(apicIds, offset, numBits);
}
else
{
// guess (must match cpu_NumPackages's assumptions)
enabledLogicalPerCore = LogicalPerCore();
}
}
return enabledLogicalPerCore;
}
//-----------------------------------------------------------------------------
// cache topology
// note: Windows 2003 GetLogicalProcessorInformation provides similar
// functionality but returns incorrect results. (it claims all cores in
// an Intel Core2 Quad processor share a single L2 cache.)
size_t cpu_NumCaches()
{
static size_t numCaches;
if(!numCaches)
{
const u8* apicIds = ApicIds();
if(apicIds)
{
const size_t offset = 0;
const size_t numBits = ceil_log2(LogicalPerCache());
numCaches = NumUniqueValuesInField(apicIds, offset, numBits);
}
else
{
// assume each processor has its own cache
numCaches = os_cpu_NumProcessors();
}
}
return numCaches;
}
class CacheTopology
{
public:
/**
* add processor to the processor mask owned by cache identified by <id>
**/
void Add(u8 id, size_t processor)
{
SharedCache* cache = Find(id);
@@ -243,14 +332,20 @@ public:
cache->Add(processor);
}
void StoreProcessorMasks(std::vector<uintptr_t>& processorMasks)
/**
* store topology in an array (one entry per cache) of masks
* representing the processors that share a cache.
**/
void StoreProcessorMasks(uintptr_t* processorMasks)
{
processorMasks.resize(m_caches.size());
for(size_t i = 0; i < m_caches.size(); i++)
processorMasks[i] = m_caches[i].ProcessorMask();
}
private:
/**
* stores ID and tracks which processors share this cache
**/
class SharedCache
{
public:
@@ -293,150 +388,64 @@ private:
std::vector<SharedCache> m_caches;
};
static void DetectCacheTopology(const Ids& apicIds)
uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
{
const size_t numBits = ceil_log2(LogicalPerCache());
const u8 cacheIdMask = u8(0xFF << numBits);
static uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
CacheManager cacheManager;
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
static volatile uintptr_t initialized = 0;
if(cpu_CAS(&initialized, 0, 1))
{
const u8 apicId = apicIds[processor];
const u8 cacheId = apicId & cacheIdMask;
cacheManager.Add(cacheId, processor);
}
cacheManager.StoreProcessorMasks(cachesProcessorMask);
numCaches = cachesProcessorMask.size();
const size_t invalidCache = ~(size_t)0;
processorsCache.resize(os_cpu_NumProcessors(), invalidCache);
for(size_t cache = 0; cache < numCaches; cache++)
{
const uintptr_t processorMask = cachesProcessorMask[cache];
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
const u8* apicIds = ApicIds();
if(apicIds)
{
if(IsBitSet(processorMask, processor))
processorsCache[processor] = cache;
const size_t numBits = ceil_log2(LogicalPerCache());
const u8 cacheIdMask = u8(0xFF << numBits);
CacheTopology cacheManager;
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
const u8 apicId = apicIds[processor];
const u8 cacheId = apicId & cacheIdMask;
cacheManager.Add(cacheId, processor);
}
cacheManager.StoreProcessorMasks(cachesProcessorMask);
}
else
{
// assume each cache belongs to exactly one processor and
// cache index == processor index.
for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
cachesProcessorMask[cache] = uintptr_t(1) << cache;
}
}
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
debug_assert(processorsCache[processor] != invalidCache);
debug_assert(processorsCache[processor] < numCaches);
}
}
// @return false if unavailable / no information can be returned.
static bool DetectProcessorTopologyViaApicIds()
{
Ids apicIds;
if(!GatherApicIds(apicIds))
return false;
// extract values from all 3 ID bit fields into separate sets
size_t offset = 0;
IdSet logicalIds;
ExtractFieldIntoSet(apicIds, offset, LogicalPerCore(), logicalIds);
IdSet coreIds;
ExtractFieldIntoSet(apicIds, offset, CoresPerPackage(), coreIds);
IdSet packageIds;
ExtractFieldIntoSet(apicIds, offset, 0xFF, packageIds);
numPackages = std::max(packageIds.size(), size_t(1));
enabledCoresPerPackage = std::max(coreIds .size(), size_t(1));
enabledLogicalPerCore = std::max(logicalIds.size(), size_t(1));
// note: cache ID possibly overlaps the other fields. we also want to
// retrieve more information (mappings between processor and cache ID),
// so this needs to be handled separately.
DetectCacheTopology(apicIds);
return true;
}
static void GuessProcessorTopologyViaOsCount()
{
const size_t numProcessors = os_cpu_NumProcessors();
// note: we cannot hope to always return correct results since disabled
// cores/logical units cannot be distinguished from the situation of the
// OS simply not reporting them as "processors". unfortunately this
// function won't always only be called for older (#core = #logical = 1)
// systems because DetectProcessorTopologyViaApicIds may fail due to
// lack of OS support. what we'll do is assume nothing is disabled; this
// is reasonable because we care most about #packages. it's fine to assume
// more cores (without inflating the total #processors) because that
// count only indicates memory barriers etc. ought to be used.
enabledCoresPerPackage = CoresPerPackage();
enabledLogicalPerCore = LogicalPerCore();
const size_t numPackagesTimesLogical = numProcessors / CoresPerPackage();
debug_assert(numPackagesTimesLogical != 0); // otherwise processors didn't include cores, which would be stupid
numPackages = numPackagesTimesLogical / LogicalPerCore();
if(!numPackages) // processors didn't include logical units (reasonable)
numPackages = numPackagesTimesLogical;
}
// determine how many CoresPerPackage and LogicalPerCore are
// actually enabled and also count numPackages.
static void DetectProcessorTopology()
{
// authoritative, but requires OS support and fairly recent CPUs
if(DetectProcessorTopologyViaApicIds())
return; // success, we're done.
GuessProcessorTopologyViaOsCount();
}
size_t cpu_NumPackages()
{
if(!numPackages)
DetectProcessorTopology();
return numPackages;
}
size_t cpu_CoresPerPackage()
{
if(!enabledCoresPerPackage)
DetectProcessorTopology();
return enabledCoresPerPackage;
}
size_t cpu_LogicalPerCore()
{
if(!enabledLogicalPerCore)
DetectProcessorTopology();
return enabledLogicalPerCore;
}
size_t cpu_NumCaches()
{
if(!numCaches)
DetectProcessorTopology();
return numCaches;
debug_assert(cache < cpu_NumCaches());
return cachesProcessorMask[cache];
}
size_t cpu_CacheFromProcessor(size_t processor)
{
static size_t processorsCache[os_cpu_MaxProcessors];
static volatile uintptr_t initialized = 0;
if(cpu_CAS(&initialized, 0, 1))
{
for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
{
// write to all entries that share this cache
const uintptr_t processorMask = cpu_ProcessorMaskFromCache(cache);
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
if(IsBitSet(processorMask, processor))
{
debug_assert(processorsCache[processor] == 0);
processorsCache[processor] = cache;
}
}
}
}
debug_assert(processor < os_cpu_NumProcessors());
DetectProcessorTopology();
return processorsCache.at(processor);
return processorsCache[processor];
}
uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
{
debug_assert(cache < cpu_NumCaches());
DetectProcessorTopology();
return cachesProcessorMask.at(cache);
}
// note: Windows 2003 GetLogicalProcessorInformation returns incorrect
// information, claiming all cores in an Intel Core2 Quad processor
// share an L2 cache.

View File

@@ -11,9 +11,13 @@
#ifndef INCLUDED_TOPOLOGY
#define INCLUDED_TOPOLOGY
// OSes report hyperthreading units and cores as "processors". we need to
// drill down and find out the exact counts (for thread pool dimensioning
// and cache sharing considerations).
//-----------------------------------------------------------------------------
// CPU
// OSes typically consider both SMT units and cores to be "processors".
// the following routines determine how many of each are actually present and
// enabled. this information is useful for detecting SMP systems, predicting
// performance and dimensioning thread pools.
/**
* @return number of *enabled* CPU packages / sockets.
@@ -36,6 +40,11 @@ LIB_API size_t cpu_LogicalPerCore();
//-----------------------------------------------------------------------------
// L2 cache
// some CPU micro-architectures (e.g. Intel Core2) feature partitioned
// L2 caches. if the cores sharing a cache work together on the same
// sub-problem, contention may be reduced and effective capacity increased.
// the following routines allow discovery of the L2 cache topology:
/**
* @return number of distinct L2 caches
**/
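// Usage sketch (hypothetical caller; the include paths are assumed) combining
// the routines declared in this header: size a worker pool from the enabled
// core count and group processors that share an L2 cache.
#include <cstddef>
#include <cstdint>
#include "lib/sysdep/os_cpu.h"
#include "topology.h"   // assumed include path for this header

static void DimensionPools()
{
    const size_t numCores = cpu_NumPackages() * cpu_CoresPerPackage();
    const size_t numWorkers = numCores;     // e.g. one worker thread per enabled core
    (void)numWorkers;

    for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
    {
        // processors in this mask share an L2 cache; scheduling threads that
        // work on the same sub-problem onto them reduces contention.
        const uintptr_t processorMask = cpu_ProcessorMaskFromCache(cache);
        (void)processorMask;
    }
}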