|
|
|
// NOTE(review): leftover unified-diff hunk marker ("@ -16,222 +16,311 @@")
// from a corrupted merge - remove once the merge is resolved.
|
|
|
|
|
#include "lib/sysdep/os_cpu.h"
|
|
|
|
|
#include "x86_x64.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
|
|
|
|
|
// logicalPerCore across all packages.
|
|
|
|
|
|
|
|
|
|
/**
 * @return the maximum number of cores per package reported by CPUID,
 * or 1 if the vendor/leaf is not supported (single-core CPU).
 * note: fixes mojibake from a corrupted merge ("®s" was "&regs").
 */
static size_t DetectCoresPerPackage()
{
	x86_x64_CpuidRegs regs;
	switch(x86_x64_Vendor())
	{
	case X86_X64_VENDOR_INTEL:
		// CPUID.4 EAX bits 26..31 = (max cores per package) - 1
		regs.eax = 4;
		regs.ecx = 0;
		if(x86_x64_cpuid(&regs))
			return bits(regs.eax, 26, 31)+1;
		break;

	case X86_X64_VENDOR_AMD:
		// CPUID.8000_0008 ECX bits 0..7 = (number of cores) - 1
		regs.eax = 0x80000008;
		if(x86_x64_cpuid(&regs))
			return bits(regs.ecx, 0, 7)+1;
		break;

	default:
		break;	// unknown vendor: fall through to single-core default
	}

	return 1;	// else: the CPU is single-core.
}
|
|
|
|
|
// detect *maximum* number of cores/packages/caches.
|
|
|
|
|
// note: some of them may be disabled by the OS or BIOS.
|
|
|
|
|
// note: Intel Appnote 485 assures us that they are uniform across packages.
|
|
|
|
|
|
|
|
|
|
/**
 * @return maximum number of cores per package (memoized).
 * note: a botched merge had left an un-gated copy of the detection logic
 * after the memoization check, re-running CPUID and overwriting the cached
 * value on every call; the duplicate (identical to DetectCoresPerPackage)
 * has been removed.
 */
static size_t CoresPerPackage()
{
	static size_t coresPerPackage = 0;	// 0 = not yet detected

	if(!coresPerPackage)
		coresPerPackage = DetectCoresPerPackage();

	return coresPerPackage;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static bool IsHyperthreadingCapable()
|
|
|
|
|
{
|
|
|
|
|
// definitely not
|
|
|
|
|
if(!x86_x64_cap(X86_X64_CAP_HT))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
// AMD N-core systems falsely set the HT bit for compatibility reasons
|
|
|
|
|
// (don't bother resetting it, might confuse callers)
|
|
|
|
|
if(x86_x64_Vendor() == X86_X64_VENDOR_AMD && x86_x64_cap(X86_X64_CAP_AMD_CMP_LEGACY))
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * @return number of logical units (hyperthreads) per core.
 * note: fixes mojibake from a corrupted merge ("®s" was "&regs").
 */
static size_t DetectLogicalPerCore()
{
	if(!IsHyperthreadingCapable())
		return 1;

	// CPUID.1 EBX bits 16..23 = logical processors per package
	x86_x64_CpuidRegs regs;
	regs.eax = 1;
	if(!x86_x64_cpuid(&regs))
		DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
	const size_t logicalPerPackage = bits(regs.ebx, 16, 23);

	// cores ought to be uniform WRT # logical processors
	debug_assert(logicalPerPackage % CoresPerPackage() == 0);

	return logicalPerPackage / CoresPerPackage();
}
|
|
|
|
|
|
|
|
|
|
/**
 * @return number of logical units per core (memoized).
 * note: a botched merge had left an un-gated duplicate of the detection
 * logic (including a local-functor copy of IsHyperthreadingCapable) after
 * the memoization check, overwriting the cached value on every call; the
 * duplicate has been removed in favor of DetectLogicalPerCore.
 */
static size_t LogicalPerCore()
{
	static size_t logicalPerCore = 0;	// 0 = not yet detected

	if(!logicalPerCore)
		logicalPerCore = DetectLogicalPerCore();

	return logicalPerCore;
}
|
|
|
|
|
|
|
|
|
|
// cache type as encoded in CPUID.4 EAX bits 0..4
enum CacheType
{
	CT_NONE = 0,
	CT_DATA = 1,
	CT_INSTRUCTION = 2,
	CT_UNIFIED = 3
};

// does this cache descriptor denote the level-2 data (or unified) cache?
static bool IsL2DataCache(CacheType type, size_t level)
{
	const bool holdsData = (type == CT_DATA) || (type == CT_UNIFIED);
	return holdsData && (level == 2);
}
|
|
|
|
|
|
|
|
|
|
/**
 * @return maximum number of logical processors sharing the L2d cache,
 * or 1 if no L2d descriptor is found (should not happen; asserts).
 * note: fixes mojibake from a corrupted merge ("®s" was "&regs").
 */
static size_t DetectLogicalPerCache()
{
	// note: Intel Appnote 485 says the order in which caches are returned is
	// undefined, so we need to loop through all of them.
	for(u32 count = 0; ; count++)
	{
		// get the next cache descriptor (CPUID.4, subleaf <count>)
		x86_x64_CpuidRegs regs;
		regs.eax = 4;
		regs.ecx = count;
		x86_x64_cpuid(&regs);

		const CacheType type = (CacheType)bits(regs.eax, 0, 4);
		// no more caches left
		if(type == CT_NONE)
		{
			debug_assert(0);	// we somehow didn't find the L2d
			return 1;
		}

		const size_t level = bits(regs.eax, 5, 7);
		if(IsL2DataCache(type, level))
		{
			// EAX bits 14..25 = (max threads sharing this cache) - 1
			const size_t logicalPerCache = bits(regs.eax, 14, 25)+1;
			return logicalPerCache;
		}
	}
}
|
|
|
|
|
|
|
|
|
|
/**
 * @return maximum number of logical processors sharing the L2d cache
 * (memoized).
 * note: a botched merge had left an un-gated duplicate of the detection
 * loop (including a local-functor copy of IsL2DataCache) after the
 * memoization check, re-running CPUID on every call; the duplicate has
 * been removed in favor of DetectLogicalPerCache.
 */
static size_t LogicalPerCache()
{
	static size_t logicalPerCache;	// 0 = not yet detected

	if(!logicalPerCache)
		logicalPerCache = DetectLogicalPerCache();

	return logicalPerCache;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
// determination of enabled cores/HTs
|
|
|
|
|
|
|
|
|
|
// the above functions give the maximum number of cores/logical units.
|
|
|
|
|
// however, some of them may actually be disabled by the BIOS!
|
|
|
|
|
// what we can do is to analyze the APIC IDs. they are allocated sequentially
|
|
|
|
|
// for all "processors". treating the IDs as variable-width bit fields
|
|
|
|
|
// (according to the number of cores/logical units present) allows
|
|
|
|
|
// determining the exact topology as well as number of packages.
|
|
|
|
|
|
|
|
|
|
// these are set by DetectProcessorTopology.
|
|
|
|
|
static size_t numPackages = 0; // i.e. sockets; > 1 => true SMP system
|
|
|
|
|
static size_t enabledCoresPerPackage = 0;
|
|
|
|
|
static size_t enabledLogicalPerCore = 0; // hyperthreading units
|
|
|
|
|
|
|
|
|
|
typedef std::vector<u8> Ids;
|
|
|
|
|
|
|
|
|
|
// add the currently running processor's APIC ID to a list of IDs.
|
|
|
|
|
// os_cpu_CallByEachCPU callback: append the currently running processor's
// APIC ID to the Ids vector passed via cbData.
static void StoreApicId(size_t UNUSED(processor), uintptr_t cbData)
{
	Ids& apicIds = *(Ids*)cbData;
	apicIds.push_back(x86_x64_ApicId());
}
|
|
|
|
|
|
|
|
|
|
// if successful, apicIds[i] contains the unique ID of OS processor i.
|
|
|
|
|
/**
 * gather the APIC ID of each OS processor.
 * @param apicIds receives one unique ID per OS processor on success.
 * @return false if the APIC is too old or process affinity prevents us
 * from seeing all processors.
 */
static bool GatherApicIds(Ids& apicIds)
{
	// old APIC (see x86_x64_ApicId for details)
	if(x86_x64_Generation() < 8)
		return false;

	// process affinity prevents us from seeing all APIC IDs
	if(PopulationCount(os_cpu_ProcessorMask()) != os_cpu_NumProcessors())
		return false;

	const LibError ret = os_cpu_CallByEachCPU(StoreApicId, (uintptr_t)&apicIds);
	debug_assert(ret == INFO::OK);

	// ensure we got a unique ID for every processor.
	// note: std::unique is evaluated outside of debug_assert so the check
	// cannot silently change if the macro compiles away in release builds.
	{
		Ids tmp(apicIds);
		const Ids::iterator end = tmp.end();
		std::sort(tmp.begin(), end);
		const Ids::iterator endOfUnique = std::unique(tmp.begin(), end);
		debug_assert(endOfUnique == end);
		debug_assert(std::distance(tmp.begin(), end) == (ptrdiff_t)os_cpu_NumProcessors());
	}

	return true;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
typedef std::set<u8> IdSet;
|
|
|
|
|
// APIC IDs consist of variable-length fields identifying the logical unit,
|
|
|
|
|
// core, package and shared cache. if they are available, we can determine
|
|
|
|
|
// the exact topology; otherwise we have to guess.
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* "field" := a range of bits sufficient to represent <numValues> integers.
|
|
|
|
|
* for each id in <apicIds>: extract the value of the field starting at
|
|
|
|
|
* <offset> and insert it into <ids>. afterwards, adjust <offset> to the
|
|
|
|
|
* next field.
|
|
|
|
|
*
|
|
|
|
|
* used to gather e.g. all core IDs from all APIC IDs.
|
|
|
|
|
* @return an array of the processors' unique APIC IDs or zero if
|
|
|
|
|
* no APIC is present or process affinity is limited.
|
|
|
|
|
**/
|
|
|
|
|
static void ExtractFieldIntoSet(const Ids& apicIds, size_t& offset, size_t numValues, IdSet& ids)
|
|
|
|
|
static const u8* ApicIds()
|
|
|
|
|
{
|
|
|
|
|
static u8 apicIdStorage[os_cpu_MaxProcessors];
|
|
|
|
|
static const u8* apicIds;
|
|
|
|
|
|
|
|
|
|
static volatile uintptr_t initialized = 0;
|
|
|
|
|
if(cpu_CAS(&initialized, 0, 1))
|
|
|
|
|
{
|
|
|
|
|
// requires 'new' APIC (see x86_x64_ApicId for details)
|
|
|
|
|
if(x86_x64_Generation() >= 8)
|
|
|
|
|
{
|
|
|
|
|
// store each processor's APIC ID in turn
|
|
|
|
|
struct StoreApicId
|
|
|
|
|
{
|
|
|
|
|
static void Callback(size_t processor, uintptr_t UNUSED(cbData))
|
|
|
|
|
{
|
|
|
|
|
apicIdStorage[processor] = x86_x64_ApicId();
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
if(os_cpu_CallByEachCPU(StoreApicId::Callback, (uintptr_t)&apicIds) == INFO::OK)
|
|
|
|
|
apicIds = apicIdStorage; // success, return valid array from now on
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return apicIds;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* count the number of unique values assumed by a certain field (i.e. part
|
|
|
|
|
* of the APIC ID).
|
|
|
|
|
* @param numBits width of the field; must be set to ceil_log2 of the
|
|
|
|
|
* maximum value that can be assumed by the field.
|
|
|
|
|
* @return number of unique values (one if numBits is zero - this is
|
|
|
|
|
* convenient and kind of justified by counting the empty symbol)
|
|
|
|
|
**/
|
|
|
|
|
/**
 * count the number of unique values assumed by a certain field (i.e. part
 * of the APIC ID) across all processors.
 * @param apicIds array of per-processor APIC IDs (see ApicIds).
 * @param offset bit position where the field starts.
 * @param numBits width of the field (ceil_log2 of its maximum value).
 * @return number of unique values (one if numBits is zero - convenient
 * and kind of justified by counting the empty symbol).
 * note: the previous text was a merge of the old ExtractFieldIntoSet body
 * and this function (duplicate apicId declarations, a bare "return;",
 * references to undeclared numValues/i); reconstructed from the lines
 * present in both versions.
 */
static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numBits)
{
	if(numBits == 0)
		return 1;	// see above

	const u8 mask = bit_mask<u8>(numBits);

	// gather each processor's field value; the set discards duplicates.
	typedef std::set<u8> IdSet;
	IdSet ids;
	for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
	{
		const u8 apicId = apicIds[processor];
		const u8 field = u8(apicId >> offset) & mask;
		ids.insert(field);
	}

	return ids.size();
}
|
|
|
|
|
|
|
|
|
|
static size_t numCaches = 0; // L2d
|
|
|
|
|
static std::vector<size_t> processorsCache;
|
|
|
|
|
static std::vector<uintptr_t> cachesProcessorMask;
|
|
|
|
|
|
|
|
|
|
/**
 * @return the number of packages (i.e. sockets; > 1 => true SMP system),
 * memoized.
 * note: the no-APIC fallback previously left numPackages at 0 whenever
 * numPackagesTimesLogical <= LogicalPerCore(), so the function returned 0;
 * fixed to mirror the fallback logic used elsewhere in this file
 * (GuessProcessorTopologyViaOsCount).
 */
size_t cpu_NumPackages()
{
	static size_t numPackages = 0;	// 0 = not yet determined

	if(!numPackages)
	{
		const u8* apicIds = ApicIds();
		if(apicIds)
		{
			// package ID occupies the APIC ID bits above the core and
			// logical-unit fields.
			const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
			const size_t numBits = 8;
			numPackages = NumUniqueValuesInField(apicIds, offset, numBits);
		}
		else
		{
			// note: correct results cannot be guaranteed because unreported
			// and disabled logical units are indistinguishable. the below
			// assumptions are reasonable because we care most about packages
			// (i.e. whether the system is truly SMP). in contrast, it is
			// safe to overestimate the number of cores because that
			// only determines if memory barriers are needed or not.
			// note: requiring modern processors featuring an APIC does not
			// prevent this from being reached (the cause may be lack of
			// OS support or restricted process affinity).

			// assume cores are enabled and count as processors.
			const size_t numPackagesTimesLogical = os_cpu_NumProcessors() / CoresPerPackage();
			debug_assert(numPackagesTimesLogical != 0);
			// assume hyperthreads are enabled and count as processors.
			numPackages = numPackagesTimesLogical / LogicalPerCore();
			// if they are not counted as processors, the quotient is 0:
			if(!numPackages)
				numPackages = numPackagesTimesLogical;
		}
	}

	return numPackages;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * @return the number of *enabled* cores per package (memoized).
 * note: removed a stray "class CacheManager" line (residue of a corrupted
 * merge) that preceded the memoization check.
 */
size_t cpu_CoresPerPackage()
{
	static size_t enabledCoresPerPackage;	// 0 = not yet determined

	if(!enabledCoresPerPackage)
	{
		const u8* apicIds = ApicIds();
		if(apicIds)
		{
			// core ID field lies just above the logical-unit bits.
			const size_t offset = ceil_log2(LogicalPerCore());
			const size_t numBits = ceil_log2(CoresPerPackage());
			enabledCoresPerPackage = NumUniqueValuesInField(apicIds, offset, numBits);
		}
		else
		{
			// guess (must match cpu_NumPackages's assumptions)
			enabledCoresPerPackage = CoresPerPackage();
		}
	}

	return enabledCoresPerPackage;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// @return the number of *enabled* logical units (hyperthreads) per core
// (memoized; zero means "not yet determined").
size_t cpu_LogicalPerCore()
{
	static size_t enabledLogicalPerCore;

	if(enabledLogicalPerCore == 0)
	{
		const u8* const ids = ApicIds();
		if(!ids)
		{
			// guess (must match cpu_NumPackages's assumptions)
			enabledLogicalPerCore = LogicalPerCore();
		}
		else
		{
			// logical-unit ID occupies the low bits of the APIC ID.
			const size_t fieldWidth = ceil_log2(LogicalPerCore());
			enabledLogicalPerCore = NumUniqueValuesInField(ids, 0, fieldWidth);
		}
	}

	return enabledLogicalPerCore;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
// cache topology
|
|
|
|
|
|
|
|
|
|
// note: Windows 2003 GetLogicalProcessorInformation provides similar
|
|
|
|
|
// functionality but returns incorrect results. (it claims all cores in
|
|
|
|
|
// an Intel Core2 Quad processor share a single L2 cache.)
|
|
|
|
|
|
|
|
|
|
// @return the number of L2d caches (memoized; zero means "not yet
// determined").
size_t cpu_NumCaches()
{
	static size_t numCaches;

	if(numCaches == 0)
	{
		const u8* const ids = ApicIds();
		if(!ids)
		{
			// assume each processor has its own cache
			numCaches = os_cpu_NumProcessors();
		}
		else
		{
			// cache ID occupies the low bits of the APIC ID.
			const size_t fieldWidth = ceil_log2(LogicalPerCache());
			numCaches = NumUniqueValuesInField(ids, 0, fieldWidth);
		}
	}

	return numCaches;
}
|
|
|
|
|
|
|
|
|
|
class CacheTopology
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
|
/**
|
|
|
|
|
* add processor to the processor mask owned by cache identified by <id>
|
|
|
|
|
**/
|
|
|
|
|
void Add(u8 id, size_t processor)
|
|
|
|
|
{
|
|
|
|
|
SharedCache* cache = Find(id);
|
|
|
|
// NOTE(review): leftover diff hunk marker ("@ -243,14 +332,20") - lines of
// the CacheTopology class body are missing here.
public:
|
|
|
|
|
cache->Add(processor);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void StoreProcessorMasks(std::vector<uintptr_t>& processorMasks)
|
|
|
|
|
/**
|
|
|
|
|
* store topology in an array (one entry per cache) of masks
|
|
|
|
|
* representing the processors that share a cache.
|
|
|
|
|
**/
|
|
|
|
|
void StoreProcessorMasks(uintptr_t* processorMasks)
|
|
|
|
|
{
|
|
|
|
|
processorMasks.resize(m_caches.size());
|
|
|
|
|
for(size_t i = 0; i < m_caches.size(); i++)
|
|
|
|
|
processorMasks[i] = m_caches[i].ProcessorMask();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
/**
|
|
|
|
|
* stores ID and tracks which processors share this cache
|
|
|
|
|
**/
|
|
|
|
|
class SharedCache
|
|
|
|
|
{
|
|
|
|
|
public:
|
|
|
|
// NOTE(review): leftover diff hunk marker ("@ -293,150 +388,64") - lines of
// the SharedCache class body are missing here.
private:
|
|
|
|
|
std::vector<SharedCache> m_caches;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static void DetectCacheTopology(const Ids& apicIds)
|
|
|
|
|
uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
|
|
|
|
|
{
|
|
|
|
|
const size_t numBits = ceil_log2(LogicalPerCache());
|
|
|
|
|
const u8 cacheIdMask = u8(0xFF << numBits);
|
|
|
|
|
static uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
|
|
|
|
|
|
|
|
|
|
CacheManager cacheManager;
|
|
|
|
|
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
|
|
|
|
static volatile uintptr_t initialized = 0;
|
|
|
|
|
if(cpu_CAS(&initialized, 0, 1))
|
|
|
|
|
{
|
|
|
|
|
const u8 apicId = apicIds[processor];
|
|
|
|
|
const u8 cacheId = apicId & cacheIdMask;
|
|
|
|
|
cacheManager.Add(cacheId, processor);
|
|
|
|
|
}
|
|
|
|
|
cacheManager.StoreProcessorMasks(cachesProcessorMask);
|
|
|
|
|
numCaches = cachesProcessorMask.size();
|
|
|
|
|
|
|
|
|
|
const size_t invalidCache = ~(size_t)0;
|
|
|
|
|
processorsCache.resize(os_cpu_NumProcessors(), invalidCache);
|
|
|
|
|
for(size_t cache = 0; cache < numCaches; cache++)
|
|
|
|
|
{
|
|
|
|
|
const uintptr_t processorMask = cachesProcessorMask[cache];
|
|
|
|
|
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
|
|
|
|
const u8* apicIds = ApicIds();
|
|
|
|
|
if(apicIds)
|
|
|
|
|
{
|
|
|
|
|
if(IsBitSet(processorMask, processor))
|
|
|
|
|
processorsCache[processor] = cache;
|
|
|
|
|
const size_t numBits = ceil_log2(LogicalPerCache());
|
|
|
|
|
const u8 cacheIdMask = u8(0xFF << numBits);
|
|
|
|
|
|
|
|
|
|
CacheTopology cacheManager;
|
|
|
|
|
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
|
|
|
|
{
|
|
|
|
|
const u8 apicId = apicIds[processor];
|
|
|
|
|
const u8 cacheId = apicId & cacheIdMask;
|
|
|
|
|
cacheManager.Add(cacheId, processor);
|
|
|
|
|
}
|
|
|
|
|
cacheManager.StoreProcessorMasks(cachesProcessorMask);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
// assume each cache belongs to exactly one processor and
|
|
|
|
|
// cache index == processor index.
|
|
|
|
|
for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
|
|
|
|
|
cachesProcessorMask[cache] = uintptr_t(1) << cache;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
|
|
|
|
{
|
|
|
|
|
debug_assert(processorsCache[processor] != invalidCache);
|
|
|
|
|
debug_assert(processorsCache[processor] < numCaches);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// @return false if unavailable / no information can be returned.
|
|
|
|
|
// determine topology (numPackages / enabledCoresPerPackage /
// enabledLogicalPerCore globals) from the processors' APIC IDs.
static bool DetectProcessorTopologyViaApicIds()
{
	// can't do anything without one APIC ID per processor
	Ids apicIds;
	if(!GatherApicIds(apicIds))
		return false;

	// extract values from all 3 ID bit fields into separate sets
	// (per the ExtractFieldIntoSet contract, <offset> is advanced past each
	// field, so call order matters: logical units occupy the lowest bits,
	// then cores, then packages)
	size_t offset = 0;
	IdSet logicalIds;
	ExtractFieldIntoSet(apicIds, offset, LogicalPerCore(), logicalIds);
	IdSet coreIds;
	ExtractFieldIntoSet(apicIds, offset, CoresPerPackage(), coreIds);
	IdSet packageIds;
	ExtractFieldIntoSet(apicIds, offset, 0xFF, packageIds);

	// each set's cardinality = number of enabled units; std::max guards
	// against an empty set when a field has zero width.
	numPackages = std::max(packageIds.size(), size_t(1));
	enabledCoresPerPackage = std::max(coreIds .size(), size_t(1));
	enabledLogicalPerCore = std::max(logicalIds.size(), size_t(1));

	// note: cache ID possibly overlaps the other fields. we also want to
	// retrieve more information (mappings between processor and cache ID),
	// so this needs to be handled separately.
	DetectCacheTopology(apicIds);

	return true;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void GuessProcessorTopologyViaOsCount()
|
|
|
|
|
{
|
|
|
|
|
const size_t numProcessors = os_cpu_NumProcessors();
|
|
|
|
|
|
|
|
|
|
// note: we cannot hope to always return correct results since disabled
|
|
|
|
|
// cores/logical units cannot be distinguished from the situation of the
|
|
|
|
|
// OS simply not reporting them as "processors". unfortunately this
|
|
|
|
|
// function won't always only be called for older (#core = #logical = 1)
|
|
|
|
|
// systems because DetectProcessorTopologyViaApicIds may fail due to
|
|
|
|
|
// lack of OS support. what we'll do is assume nothing is disabled; this
|
|
|
|
|
// is reasonable because we care most about #packages. it's fine to assume
|
|
|
|
|
// more cores (without inflating the total #processors) because that
|
|
|
|
|
// count only indicates memory barriers etc. ought to be used.
|
|
|
|
|
enabledCoresPerPackage = CoresPerPackage();
|
|
|
|
|
enabledLogicalPerCore = LogicalPerCore();
|
|
|
|
|
|
|
|
|
|
const size_t numPackagesTimesLogical = numProcessors / CoresPerPackage();
|
|
|
|
|
debug_assert(numPackagesTimesLogical != 0); // otherwise processors didn't include cores, which would be stupid
|
|
|
|
|
|
|
|
|
|
numPackages = numPackagesTimesLogical / LogicalPerCore();
|
|
|
|
|
if(!numPackages) // processors didn't include logical units (reasonable)
|
|
|
|
|
numPackages = numPackagesTimesLogical;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// determine how many CoresPerPackage and LogicalPerCore are
|
|
|
|
|
// actually enabled and also count numPackages.
|
|
|
|
|
static void DetectProcessorTopology()
|
|
|
|
|
{
|
|
|
|
|
// authoritative, but requires OS support and fairly recent CPUs
|
|
|
|
|
if(DetectProcessorTopologyViaApicIds())
|
|
|
|
|
return; // success, we're done.
|
|
|
|
|
|
|
|
|
|
GuessProcessorTopologyViaOsCount();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// NOTE(review): this file also contains another definition of
// cpu_NumPackages further up (corrupted merge) - only one may remain.
// @return number of packages, lazily computed via DetectProcessorTopology
// (numPackages == 0 means "not yet detected").
size_t cpu_NumPackages()
{
	if(!numPackages)
		DetectProcessorTopology();
	return numPackages;
}
|
|
|
|
|
|
|
|
|
|
// NOTE(review): this file also contains another definition of
// cpu_CoresPerPackage further up (corrupted merge) - only one may remain.
// @return number of *enabled* cores per package, lazily computed via
// DetectProcessorTopology (0 means "not yet detected").
size_t cpu_CoresPerPackage()
{
	if(!enabledCoresPerPackage)
		DetectProcessorTopology();
	return enabledCoresPerPackage;
}
|
|
|
|
|
|
|
|
|
|
// NOTE(review): this file also contains another definition of
// cpu_LogicalPerCore further up (corrupted merge) - only one may remain.
// @return number of *enabled* logical units (hyperthreads) per core,
// lazily computed via DetectProcessorTopology (0 means "not yet detected").
size_t cpu_LogicalPerCore()
{
	if(!enabledLogicalPerCore)
		DetectProcessorTopology();
	return enabledLogicalPerCore;
}
|
|
|
|
|
|
|
|
|
|
// NOTE(review): this file also contains another definition of
// cpu_NumCaches further up (corrupted merge) - only one may remain.
/**
 * @return number of L2d caches, lazily computed via
 * DetectProcessorTopology (numCaches == 0 means "not yet detected").
 * note: removed unreachable merge residue after the return statement
 * (it referenced an undeclared variable "cache" and belonged to
 * cpu_ProcessorMaskFromCache).
 */
size_t cpu_NumCaches()
{
	if(!numCaches)
		DetectProcessorTopology();
	return numCaches;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * @param processor OS processor index (< os_cpu_NumProcessors()).
 * @return index of the L2d cache used by this processor.
 * note: removed merge residue (a second "DetectProcessorTopology(); return
 * processorsCache.at(processor);" epilogue from the old vector-based
 * version) and renamed the inner loop variable, which shadowed the
 * <processor> parameter.
 */
size_t cpu_CacheFromProcessor(size_t processor)
{
	// processor -> cache mapping, filled once on first call.
	static size_t processorsCache[os_cpu_MaxProcessors];

	static volatile uintptr_t initialized = 0;
	if(cpu_CAS(&initialized, 0, 1))
	{
		for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
		{
			// write to all entries that share this cache
			const uintptr_t processorMask = cpu_ProcessorMaskFromCache(cache);
			for(size_t i = 0; i < os_cpu_NumProcessors(); i++)
			{
				if(IsBitSet(processorMask, i))
				{
					debug_assert(processorsCache[i] == 0);	// not already assigned
					processorsCache[i] = cache;
				}
			}
		}
	}

	debug_assert(processor < os_cpu_NumProcessors());
	return processorsCache[processor];
}
|
|
|
|
|
|
|
|
|
|
// NOTE(review): old-style implementation relying on the file-scope
// std::vector cachesProcessorMask; a newer array-based version is
// interleaved in the corrupted region above - only one may remain.
// @param cache L2d cache index (< cpu_NumCaches()).
// @return bit mask of the OS processors sharing the given cache.
uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
{
	debug_assert(cache < cpu_NumCaches());
	// presumably populates cachesProcessorMask via DetectCacheTopology -
	// verify once the corrupted region is resolved.
	DetectProcessorTopology();
	return cachesProcessorMask.at(cache);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// note: Windows 2003 GetLogicalProcessorInformation returns incorrect
|
|
|
|
|
// information, claiming all cores in an Intel Core2 Quad processor
|
|
|
|
|
// share an L2 cache.
|
|
|
|
|