|
|
|
// NOTE(review): leftover unified-diff hunk marker ("@ -16,222 +16,311 @@")
// from a corrupted merge - remove once the merge is resolved.
|
|
|
|
|
#include "lib/sysdep/os_cpu.h"
|
|
|
|
|
#include "x86_x64.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
|
|
|
|
|
// logicalPerCore across all packages.
|
|
|
|
|
|
|
|
|
|
/**
 * @return the maximum number of cores per package reported by CPUID,
 * or 1 if the vendor/leaf is not supported (single-core CPU).
 * note: fixes mojibake from a corrupted merge ("®s" was "&regs").
 */
static size_t DetectCoresPerPackage()
{
	x86_x64_CpuidRegs regs;
	switch(x86_x64_Vendor())
	{
	case X86_X64_VENDOR_INTEL:
		// CPUID.4 EAX bits 26..31 = (max cores per package) - 1
		regs.eax = 4;
		regs.ecx = 0;
		if(x86_x64_cpuid(&regs))
			return bits(regs.eax, 26, 31)+1;
		break;

	case X86_X64_VENDOR_AMD:
		// CPUID.8000_0008 ECX bits 0..7 = (number of cores) - 1
		regs.eax = 0x80000008;
		if(x86_x64_cpuid(&regs))
			return bits(regs.ecx, 0, 7)+1;
		break;

	default:
		break;	// unknown vendor: fall through to single-core default
	}

	return 1;	// else: the CPU is single-core.
}
|
|
|
|
|
// detect *maximum* number of cores/packages/caches.
|
|
|
|
|
// note: some of them may be disabled by the OS or BIOS.
|
|
|
|
|
// note: Intel Appnote 485 assures us that they are uniform across packages.
|
|
|
|
|
|
|
|
|
|
/**
 * @return maximum number of cores per package (memoized).
 * note: a botched merge had left an un-gated copy of the detection logic
 * after the memoization check, re-running CPUID and overwriting the cached
 * value on every call; the duplicate (identical to DetectCoresPerPackage)
 * has been removed.
 */
static size_t CoresPerPackage()
{
	static size_t coresPerPackage = 0;	// 0 = not yet detected

	if(!coresPerPackage)
		coresPerPackage = DetectCoresPerPackage();

	return coresPerPackage;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static bool IsHyperthreadingCapable()
|
|
|
|
|
{
|
|
|
|
|
// definitely not
|
|
|
|
|
if(!x86_x64_cap(X86_X64_CAP_HT))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
// AMD N-core systems falsely set the HT bit for compatibility reasons
|
|
|
|
|
// (don't bother resetting it, might confuse callers)
|
|
|
|
|
if(x86_x64_Vendor() == X86_X64_VENDOR_AMD && x86_x64_cap(X86_X64_CAP_AMD_CMP_LEGACY))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * @return number of logical units (hyperthreads) per core.
 * note: fixes mojibake from a corrupted merge ("®s" was "&regs").
 */
static size_t DetectLogicalPerCore()
{
	if(!IsHyperthreadingCapable())
		return 1;

	// CPUID.1 EBX bits 16..23 = logical processors per package
	x86_x64_CpuidRegs regs;
	regs.eax = 1;
	if(!x86_x64_cpuid(&regs))
		DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
	const size_t logicalPerPackage = bits(regs.ebx, 16, 23);

	// cores ought to be uniform WRT # logical processors
	debug_assert(logicalPerPackage % CoresPerPackage() == 0);

	return logicalPerPackage / CoresPerPackage();
}
|
|
|
|
|
|
|
|
|
|
/**
 * @return number of logical units per core (memoized).
 * note: a botched merge had left an un-gated duplicate of the detection
 * logic (including a local-functor copy of IsHyperthreadingCapable) after
 * the memoization check, overwriting the cached value on every call; the
 * duplicate has been removed in favor of DetectLogicalPerCore.
 */
static size_t LogicalPerCore()
{
	static size_t logicalPerCore = 0;	// 0 = not yet detected

	if(!logicalPerCore)
		logicalPerCore = DetectLogicalPerCore();

	return logicalPerCore;
}
|
|
|
|
|
|
|
|
|
|
// cache type as encoded in CPUID.4 EAX bits 0..4
enum CacheType
{
	CT_NONE = 0,
	CT_DATA = 1,
	CT_INSTRUCTION = 2,
	CT_UNIFIED = 3
};

// does this cache descriptor denote the level-2 data (or unified) cache?
static bool IsL2DataCache(CacheType type, size_t level)
{
	const bool holdsData = (type == CT_DATA) || (type == CT_UNIFIED);
	return holdsData && (level == 2);
}
|
|
|
|
|
|
|
|
|
|
/**
 * @return maximum number of logical processors sharing the L2d cache,
 * or 1 if no L2d descriptor is found (should not happen; asserts).
 * note: fixes mojibake from a corrupted merge ("®s" was "&regs").
 */
static size_t DetectLogicalPerCache()
{
	// note: Intel Appnote 485 says the order in which caches are returned is
	// undefined, so we need to loop through all of them.
	for(u32 count = 0; ; count++)
	{
		// get the next cache descriptor (CPUID.4, subleaf <count>)
		x86_x64_CpuidRegs regs;
		regs.eax = 4;
		regs.ecx = count;
		x86_x64_cpuid(&regs);

		const CacheType type = (CacheType)bits(regs.eax, 0, 4);
		// no more caches left
		if(type == CT_NONE)
		{
			debug_assert(0);	// we somehow didn't find the L2d
			return 1;
		}

		const size_t level = bits(regs.eax, 5, 7);
		if(IsL2DataCache(type, level))
		{
			// EAX bits 14..25 = (max threads sharing this cache) - 1
			const size_t logicalPerCache = bits(regs.eax, 14, 25)+1;
			return logicalPerCache;
		}
	}
}
|
|
|
|
|
|
|
|
|
|
/**
 * @return maximum number of logical processors sharing the L2d cache
 * (memoized).
 * note: a botched merge had left an un-gated duplicate of the detection
 * loop (including a local-functor copy of IsL2DataCache) after the
 * memoization check, re-running CPUID on every call; the duplicate has
 * been removed in favor of DetectLogicalPerCache.
 */
static size_t LogicalPerCache()
{
	static size_t logicalPerCache;	// 0 = not yet detected

	if(!logicalPerCache)
		logicalPerCache = DetectLogicalPerCache();

	return logicalPerCache;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
// determination of enabled cores/HTs
|
|
|
|
|
|
|
|
|
|
// the above functions give the maximum number of cores/logical units.
|
|
|
|
|
// however, some of them may actually be disabled by the BIOS!
|
|
|
|
|
// what we can do is to analyze the APIC IDs. they are allocated sequentially
|
|
|
|
|
// for all "processors". treating the IDs as variable-width bit fields
|
|
|
|
|
// (according to the number of cores/logical units present) allows
|
|
|
|
|
// determining the exact topology as well as number of packages.
|
|
|
|
|
|
|
|
|
|
// these are set by DetectProcessorTopology.
|
|
|
|
|
static size_t numPackages = 0; // i.e. sockets; > 1 => true SMP system
|
|
|
|
|
static size_t enabledCoresPerPackage = 0;
|
|
|
|
|
static size_t enabledLogicalPerCore = 0; // hyperthreading units
|
|
|
|
|
|
|
|
|
|
typedef std::vector<u8> Ids;
|
|
|
|
|
|
|
|
|
|
// add the currently running processor's APIC ID to a list of IDs.
|
|
|
|
|
// os_cpu_CallByEachCPU callback: append the currently running processor's
// APIC ID to the Ids vector passed via cbData.
static void StoreApicId(size_t UNUSED(processor), uintptr_t cbData)
{
	Ids& apicIds = *(Ids*)cbData;
	apicIds.push_back(x86_x64_ApicId());
}
|
|
|
|
|
|
|
|
|
|
// if successful, apicIds[i] contains the unique ID of OS processor i.
|
|
|
|
|
/**
 * gather the APIC ID of each OS processor.
 * @param apicIds receives one unique ID per OS processor on success.
 * @return false if the APIC is too old or process affinity prevents us
 * from seeing all processors.
 */
static bool GatherApicIds(Ids& apicIds)
{
	// old APIC (see x86_x64_ApicId for details)
	if(x86_x64_Generation() < 8)
		return false;

	// process affinity prevents us from seeing all APIC IDs
	if(PopulationCount(os_cpu_ProcessorMask()) != os_cpu_NumProcessors())
		return false;

	const LibError ret = os_cpu_CallByEachCPU(StoreApicId, (uintptr_t)&apicIds);
	debug_assert(ret == INFO::OK);

	// ensure we got a unique ID for every processor.
	// note: std::unique is evaluated outside of debug_assert so the check
	// cannot silently change if the macro compiles away in release builds.
	{
		Ids tmp(apicIds);
		const Ids::iterator end = tmp.end();
		std::sort(tmp.begin(), end);
		const Ids::iterator endOfUnique = std::unique(tmp.begin(), end);
		debug_assert(endOfUnique == end);
		debug_assert(std::distance(tmp.begin(), end) == (ptrdiff_t)os_cpu_NumProcessors());
	}

	return true;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
typedef std::set<u8> IdSet;
|
|
|
|
|
// APIC IDs consist of variable-length fields identifying the logical unit,
|
|
|
|
|
// core, package and shared cache. if they are available, we can determine
|
|
|
|
|
// the exact topology; otherwise we have to guess.
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* "field" := a range of bits sufficient to represent <numValues> integers.
|
|
|
|
|
* for each id in <apicIds>: extract the value of the field starting at
|
|
|
|
|
* <offset> and insert it into <ids>. afterwards, adjust <offset> to the
|
|
|
|
|
* next field.
|
|
|
|
|
*
|
|
|
|
|
* used to gather e.g. all core IDs from all APIC IDs.
|
|
|
|
|
* @return an array of the processors' unique APIC IDs or zero if
|
|
|
|
|
* no APIC is present or process affinity is limited.
|
|
|
|
|
**/
|
|
|
|
|
static void ExtractFieldIntoSet(const Ids& apicIds, size_t& offset, size_t numValues, IdSet& ids)
|
|
|
|
|
static const u8* ApicIds()
|
|
|
|
|
{
|
|
|
|
|
static u8 apicIdStorage[os_cpu_MaxProcessors];
|
|
|
|
|
static const u8* apicIds;
|
|
|
|
|
|
|
|
|
|
static volatile uintptr_t initialized = 0;
|
|
|
|
|
if(cpu_CAS(&initialized, 0, 1))
|
|
|
|
|
{
|
|
|
|
|
// requires 'new' APIC (see x86_x64_ApicId for details)
|
|
|
|
|
if(x86_x64_Generation() >= 8)
|
|
|
|
|
{
|
|
|
|
|
// store each processor's APIC ID in turn
|
|
|
|
|
struct StoreApicId
|
|
|
|
|
{
|
|
|
|
|
static void Callback(size_t processor, uintptr_t UNUSED(cbData))
|
|
|
|
|
{
|
|
|
|
|
apicIdStorage[processor] = x86_x64_ApicId();
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
if(os_cpu_CallByEachCPU(StoreApicId::Callback, (uintptr_t)&apicIds) == INFO::OK)
|
|
|
|
|
apicIds = apicIdStorage; // success, return valid array from now on
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return apicIds;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* count the number of unique values assumed by a certain field (i.e. part
|
|
|
|
|
* of the APIC ID).
|
|
|
|
|
* @param numBits width of the field; must be set to ceil_log2 of the
|
|
|
|
|
* maximum value that can be assumed by the field.
|
|
|
|
|
* @return number of unique values (one if numBits is zero - this is
|
|
|
|
|
* convenient and kind of justified by counting the empty symbol)
|
|
|
|
|
**/
|
|
|
|
|
/**
 * count the number of unique values assumed by a certain field (i.e. part
 * of the APIC ID) across all processors.
 * @param apicIds array of per-processor APIC IDs (see ApicIds).
 * @param offset bit position where the field starts.
 * @param numBits width of the field (ceil_log2 of its maximum value).
 * @return number of unique values (one if numBits is zero - convenient
 * and kind of justified by counting the empty symbol).
 * note: the previous text was a merge of the old ExtractFieldIntoSet body
 * and this function (duplicate apicId declarations, a bare "return;",
 * references to undeclared numValues/i); reconstructed from the lines
 * present in both versions.
 */
static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numBits)
{
	if(numBits == 0)
		return 1;	// see above

	const u8 mask = bit_mask<u8>(numBits);

	// gather each processor's field value; the set discards duplicates.
	typedef std::set<u8> IdSet;
	IdSet ids;
	for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
	{
		const u8 apicId = apicIds[processor];
		const u8 field = u8(apicId >> offset) & mask;
		ids.insert(field);
	}

	return ids.size();
}
|
|
|
|
|
|
|
|
|
|
static size_t numCaches = 0; // L2d
|
|
|
|
|
static std::vector<size_t> processorsCache;
|
|
|
|
|
static std::vector<uintptr_t> cachesProcessorMask;
|
|
|
|
|
|
|
|
|
|
/**
 * @return the number of packages (i.e. sockets; > 1 => true SMP system),
 * memoized.
 * note: the no-APIC fallback previously left numPackages at 0 whenever
 * numPackagesTimesLogical <= LogicalPerCore(), so the function returned 0;
 * fixed to mirror the fallback logic used elsewhere in this file
 * (GuessProcessorTopologyViaOsCount).
 */
size_t cpu_NumPackages()
{
	static size_t numPackages = 0;	// 0 = not yet determined

	if(!numPackages)
	{
		const u8* apicIds = ApicIds();
		if(apicIds)
		{
			// package ID occupies the APIC ID bits above the core and
			// logical-unit fields.
			const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
			const size_t numBits = 8;
			numPackages = NumUniqueValuesInField(apicIds, offset, numBits);
		}
		else
		{
			// note: correct results cannot be guaranteed because unreported
			// and disabled logical units are indistinguishable. the below
			// assumptions are reasonable because we care most about packages
			// (i.e. whether the system is truly SMP). in contrast, it is
			// safe to overestimate the number of cores because that
			// only determines if memory barriers are needed or not.
			// note: requiring modern processors featuring an APIC does not
			// prevent this from being reached (the cause may be lack of
			// OS support or restricted process affinity).

			// assume cores are enabled and count as processors.
			const size_t numPackagesTimesLogical = os_cpu_NumProcessors() / CoresPerPackage();
			debug_assert(numPackagesTimesLogical != 0);
			// assume hyperthreads are enabled and count as processors.
			numPackages = numPackagesTimesLogical / LogicalPerCore();
			// if they are not counted as processors, the quotient is 0:
			if(!numPackages)
				numPackages = numPackagesTimesLogical;
		}
	}

	return numPackages;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * @return the number of *enabled* cores per package (memoized).
 * note: removed a stray "class CacheManager" line (residue of a corrupted
 * merge) that preceded the memoization check.
 */
size_t cpu_CoresPerPackage()
{
	static size_t enabledCoresPerPackage;	// 0 = not yet determined

	if(!enabledCoresPerPackage)
	{
		const u8* apicIds = ApicIds();
		if(apicIds)
		{
			// core ID field lies just above the logical-unit bits.
			const size_t offset = ceil_log2(LogicalPerCore());
			const size_t numBits = ceil_log2(CoresPerPackage());
			enabledCoresPerPackage = NumUniqueValuesInField(apicIds, offset, numBits);
		}
		else
		{
			// guess (must match cpu_NumPackages's assumptions)
			enabledCoresPerPackage = CoresPerPackage();
		}
	}

	return enabledCoresPerPackage;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// @return the number of *enabled* logical units (hyperthreads) per core
// (memoized; zero means "not yet determined").
size_t cpu_LogicalPerCore()
{
	static size_t enabledLogicalPerCore;

	if(enabledLogicalPerCore == 0)
	{
		const u8* const ids = ApicIds();
		if(!ids)
		{
			// guess (must match cpu_NumPackages's assumptions)
			enabledLogicalPerCore = LogicalPerCore();
		}
		else
		{
			// logical-unit ID occupies the low bits of the APIC ID.
			const size_t fieldWidth = ceil_log2(LogicalPerCore());
			enabledLogicalPerCore = NumUniqueValuesInField(ids, 0, fieldWidth);
		}
	}

	return enabledLogicalPerCore;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
// cache topology
|
|
|
|
|
|
|
|
|
|
// note: Windows 2003 GetLogicalProcessorInformation provides similar
|
|
|
|
|
// functionality but returns incorrect results. (it claims all cores in
|
|
|
|
|
// an Intel Core2 Quad processor share a single L2 cache.)
|
|
|
|
|
|
|
|
|
|
// @return the number of L2d caches (memoized; zero means "not yet
// determined").
size_t cpu_NumCaches()
{
	static size_t numCaches;

	if(numCaches == 0)
	{
		const u8* const ids = ApicIds();
		if(!ids)
		{
			// assume each processor has its own cache
			numCaches = os_cpu_NumProcessors();
		}
		else
		{
			// cache ID occupies the low bits of the APIC ID.
			const size_t fieldWidth = ceil_log2(LogicalPerCache());
			numCaches = NumUniqueValuesInField(ids, 0, fieldWidth);
		}
	}

	return numCaches;
}
|
|
|
|
|
|
|
|
|
|
class CacheTopology
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
/**
|
|
|
|
|
* add processor to the processor mask owned by cache identified by <id>
|
|
|
|
|
**/
|
|
|
|
|
void Add(u8 id, size_t processor)
|
|
|
|
|
{
|
|
|
|
|
SharedCache* cache = Find(id);
|
|
|
|
// NOTE(review): leftover diff hunk marker ("@ -243,14 +332,20") - lines of
// the CacheTopology class body are missing here.
public:
|
|
|
|
|
cache->Add(processor);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void StoreProcessorMasks(std::vector<uintptr_t>& processorMasks)
|
|
|
|
|
/**
|
|
|
|
|
* store topology in an array (one entry per cache) of masks
|
|
|
|
|
* representing the processors that share a cache.
|
|
|
|
|
**/
|
|
|
|
|
void StoreProcessorMasks(uintptr_t* processorMasks)
|
|
|
|
|
{
|
|
|
|
|
processorMasks.resize(m_caches.size());
|
|
|
|
|
for(size_t i = 0; i < m_caches.size(); i++)
|
|
|
|
|
processorMasks[i] = m_caches[i].ProcessorMask();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
/**
|
|
|
|
|
* stores ID and tracks which processors share this cache
|
|
|
|
|
**/
|
|
|
|
|
class SharedCache
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
// NOTE(review): leftover diff hunk marker ("@ -293,150 +388,64") - lines of
// the SharedCache class body are missing here.
private:
|
|
|
|
|
std::vector<SharedCache> m_caches;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static void DetectCacheTopology(const Ids& apicIds)
|
|
|
|
|
uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
|
|
|
|
|
{
|
|
|
|
|
const size_t numBits = ceil_log2(LogicalPerCache());
|
|
|
|
|
const u8 cacheIdMask = u8(0xFF << numBits);
|
|
|
|
|
static uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
|
|
|
|
|
|
|
|
|
|
CacheManager cacheManager;
|
|
|
|
|
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
|
|
|
|
static volatile uintptr_t initialized = 0;
|
|
|
|
|
if(cpu_CAS(&initialized, 0, 1))
|
|
|
|
|
{
|
|
|
|
|
const u8 apicId = apicIds[processor];
|
|
|
|
|
const u8 cacheId = apicId & cacheIdMask;
|
|
|
|
|
cacheManager.Add(cacheId, processor);
|
|
|
|
|
}
|
|
|
|
|
cacheManager.StoreProcessorMasks(cachesProcessorMask);
|
|
|
|
|
numCaches = cachesProcessorMask.size();
|
|
|
|
|
|
|
|
|
|
const size_t invalidCache = ~(size_t)0;
|
|
|
|
|
processorsCache.resize(os_cpu_NumProcessors(), invalidCache);
|
|
|
|
|
for(size_t cache = 0; cache < numCaches; cache++)
|
|
|
|
|
{
|
|
|
|
|
const uintptr_t processorMask = cachesProcessorMask[cache];
|
|
|
|
|
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
|
|
|
|
const u8* apicIds = ApicIds();
|
|
|
|
|
if(apicIds)
|
|
|
|
|
{
|
|
|
|
|
if(IsBitSet(processorMask, processor))
|
|
|
|
|
processorsCache[processor] = cache;
|
|
|
|
|
const size_t numBits = ceil_log2(LogicalPerCache());
|
|
|
|
|
const u8 cacheIdMask = u8(0xFF << numBits);
|
|
|
|
|
|
|
|
|
|
CacheTopology cacheManager;
|
|
|
|
|
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
|
|
|
|
{
|
|
|
|
|
const u8 apicId = apicIds[processor];
|
|
|
|
|
const u8 cacheId = apicId & cacheIdMask;
|
|
|
|
|
cacheManager.Add(cacheId, processor);
|
|
|
|
|
}
|
|
|
|
|
cacheManager.StoreProcessorMasks(cachesProcessorMask);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
// assume each cache belongs to exactly one processor and
|
|
|
|
|
// cache index == processor index.
|
|
|
|
|
for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
|
|
|
|
|
cachesProcessorMask[cache] = uintptr_t(1) << cache;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
|
|
|
|
{
|
|
|
|
|
debug_assert(processorsCache[processor] != invalidCache);
|
|
|
|
|
debug_assert(processorsCache[processor] < numCaches);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// @return false if unavailable / no information can be returned.
|
|
|
|
|
// determine topology (numPackages / enabledCoresPerPackage /
// enabledLogicalPerCore globals) from the processors' APIC IDs.
static bool DetectProcessorTopologyViaApicIds()
{
	// can't do anything without one APIC ID per processor
	Ids apicIds;
	if(!GatherApicIds(apicIds))
		return false;

	// extract values from all 3 ID bit fields into separate sets
	// (per the ExtractFieldIntoSet contract, <offset> is advanced past each
	// field, so call order matters: logical units occupy the lowest bits,
	// then cores, then packages)
	size_t offset = 0;
	IdSet logicalIds;
	ExtractFieldIntoSet(apicIds, offset, LogicalPerCore(), logicalIds);
	IdSet coreIds;
	ExtractFieldIntoSet(apicIds, offset, CoresPerPackage(), coreIds);
	IdSet packageIds;
	ExtractFieldIntoSet(apicIds, offset, 0xFF, packageIds);

	// each set's cardinality = number of enabled units; std::max guards
	// against an empty set when a field has zero width.
	numPackages = std::max(packageIds.size(), size_t(1));
	enabledCoresPerPackage = std::max(coreIds .size(), size_t(1));
	enabledLogicalPerCore = std::max(logicalIds.size(), size_t(1));

	// note: cache ID possibly overlaps the other fields. we also want to
	// retrieve more information (mappings between processor and cache ID),
	// so this needs to be handled separately.
	DetectCacheTopology(apicIds);

	return true;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void GuessProcessorTopologyViaOsCount()
|
|
|
|
|
{
|
|
|
|
|
const size_t numProcessors = os_cpu_NumProcessors();
|
|
|
|
|
|
|
|
|
|
// note: we cannot hope to always return correct results since disabled
|
|
|
|
|
// cores/logical units cannot be distinguished from the situation of the
|
|
|
|
|
// OS simply not reporting them as "processors". unfortunately this
|
|
|
|
|
// function won't always only be called for older (#core = #logical = 1)
|
|
|
|
|
// systems because DetectProcessorTopologyViaApicIds may fail due to
|
|
|
|
|
// lack of OS support. what we'll do is assume nothing is disabled; this
|
|
|
|
|
// is reasonable because we care most about #packages. it's fine to assume
|
|
|
|
|
// more cores (without inflating the total #processors) because that
|
|
|
|
|
// count only indicates memory barriers etc. ought to be used.
|
|
|
|
|
enabledCoresPerPackage = CoresPerPackage();
|
|
|
|
|
enabledLogicalPerCore = LogicalPerCore();
|
|
|
|
|
|
|
|
|
|
const size_t numPackagesTimesLogical = numProcessors / CoresPerPackage();
|
|
|
|
|
debug_assert(numPackagesTimesLogical != 0); // otherwise processors didn't include cores, which would be stupid
|
|
|
|
|
|
|
|
|
|
numPackages = numPackagesTimesLogical / LogicalPerCore();
|
|
|
|
|
if(!numPackages) // processors didn't include logical units (reasonable)
|
|
|
|
|
numPackages = numPackagesTimesLogical;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// determine how many CoresPerPackage and LogicalPerCore are
|
|
|
|
|
// actually enabled and also count numPackages.
|
|
|
|
|
static void DetectProcessorTopology()
|
|
|
|
|
{
|
|
|
|
|
// authoritative, but requires OS support and fairly recent CPUs
|
|
|
|
|
if(DetectProcessorTopologyViaApicIds())
|
|
|
|
|
return; // success, we're done.
|
|
|
|
|
|
|
|
|
|
GuessProcessorTopologyViaOsCount();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// NOTE(review): this file also contains another definition of
// cpu_NumPackages further up (corrupted merge) - only one may remain.
// @return number of packages, lazily computed via DetectProcessorTopology
// (numPackages == 0 means "not yet detected").
size_t cpu_NumPackages()
{
	if(!numPackages)
		DetectProcessorTopology();
	return numPackages;
}
|
|
|
|
|
|
|
|
|
|
// NOTE(review): this file also contains another definition of
// cpu_CoresPerPackage further up (corrupted merge) - only one may remain.
// @return number of *enabled* cores per package, lazily computed via
// DetectProcessorTopology (0 means "not yet detected").
size_t cpu_CoresPerPackage()
{
	if(!enabledCoresPerPackage)
		DetectProcessorTopology();
	return enabledCoresPerPackage;
}
|
|
|
|
|
|
|
|
|
|
// NOTE(review): this file also contains another definition of
// cpu_LogicalPerCore further up (corrupted merge) - only one may remain.
// @return number of *enabled* logical units (hyperthreads) per core,
// lazily computed via DetectProcessorTopology (0 means "not yet detected").
size_t cpu_LogicalPerCore()
{
	if(!enabledLogicalPerCore)
		DetectProcessorTopology();
	return enabledLogicalPerCore;
}
|
|
|
|
|
|
|
|
|
|
// NOTE(review): this file also contains another definition of
// cpu_NumCaches further up (corrupted merge) - only one may remain.
/**
 * @return number of L2d caches, lazily computed via
 * DetectProcessorTopology (numCaches == 0 means "not yet detected").
 * note: removed unreachable merge residue after the return statement
 * (it referenced an undeclared variable "cache" and belonged to
 * cpu_ProcessorMaskFromCache).
 */
size_t cpu_NumCaches()
{
	if(!numCaches)
		DetectProcessorTopology();
	return numCaches;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * @param processor OS processor index (< os_cpu_NumProcessors()).
 * @return index of the L2d cache used by this processor.
 * note: removed merge residue (a second "DetectProcessorTopology(); return
 * processorsCache.at(processor);" epilogue from the old vector-based
 * version) and renamed the inner loop variable, which shadowed the
 * <processor> parameter.
 */
size_t cpu_CacheFromProcessor(size_t processor)
{
	// processor -> cache mapping, filled once on first call.
	static size_t processorsCache[os_cpu_MaxProcessors];

	static volatile uintptr_t initialized = 0;
	if(cpu_CAS(&initialized, 0, 1))
	{
		for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
		{
			// write to all entries that share this cache
			const uintptr_t processorMask = cpu_ProcessorMaskFromCache(cache);
			for(size_t i = 0; i < os_cpu_NumProcessors(); i++)
			{
				if(IsBitSet(processorMask, i))
				{
					debug_assert(processorsCache[i] == 0);	// not already assigned
					processorsCache[i] = cache;
				}
			}
		}
	}

	debug_assert(processor < os_cpu_NumProcessors());
	return processorsCache[processor];
}
|
|
|
|
|
|
|
|
|
|
// NOTE(review): old-style implementation relying on the file-scope
// std::vector cachesProcessorMask; a newer array-based version is
// interleaved in the corrupted region above - only one may remain.
// @param cache L2d cache index (< cpu_NumCaches()).
// @return bit mask of the OS processors sharing the given cache.
uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
{
	debug_assert(cache < cpu_NumCaches());
	// presumably populates cachesProcessorMask via DetectCacheTopology -
	// verify once the corrupted region is resolved.
	DetectProcessorTopology();
	return cachesProcessorMask.at(cache);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// note: Windows 2003 GetLogicalProcessorInformation returns incorrect
|
|
|
|
|
// information, claiming all cores in an Intel Core2 Quad processor
|
|
|
|
|
// share an L2 cache.
|
|
|
|
|