fixes+improvements from work:

- add AlignedAllocator - an STL allocator that returns cache-line-aligned objects (required to avoid RFOs when threads write to various independent items in a container)
- bits: bit_mask can now be used for N=0..numBits (works around full-word-shifts-are-undefined issue)
- precompiled.h: remove scoped_ptr, add function-related stuff from TR1
- numa:
  . add numa_IsMemoryInterleaved
  . numa_Allocate is now able to allocate large pages as well (reduces TLB misses)
- os_cpu: interface change to support 32-bit apps running on WoW64 systems with > 4 GB of memory
- topology: use new x86_x64_EnumerateCaches API; fix detection of cache ID
- x86_x64: provide the means of enumerating all caches returned by CPUID and detect L1 cache size

This was SVN commit r6004.
2008-06-01 08:25:12 +00:00 · 2008-06-01 08:25:12 +00:00 · 5d80d2ee5d
commit 5d80d2ee5d
parent 0118bfd634
14 changed files with 411 additions and 128 deletions
--- a/source/lib/allocators/aligned_allocator.cpp
+++ b/source/lib/allocators/aligned_allocator.cpp
@ -0,0 +1,12 @@
+/**
+ * =========================================================================
+ * File        : aligned_allocator.cpp
+ * Project     : 0 A.D.
+ * Description : STL allocator for aligned memory
+ * =========================================================================
+ */
+
+// license: GPL; see lib/license.txt
+
+#include "precompiled.h"
+#include "aligned_allocator.h"
--- a/source/lib/allocators/aligned_allocator.h
+++ b/source/lib/allocators/aligned_allocator.h
@ -0,0 +1,130 @@
+/**
+ * =========================================================================
+ * File        : aligned_allocator.h
+ * Project     : 0 A.D.
+ * Description : STL allocator for aligned memory
+ * =========================================================================
+ */
+
+// license: GPL; see lib/license.txt
+
+#ifndef ALIGNED_ALLOCATOR
+#define ALIGNED_ALLOCATOR
+
+#include <cstddef>	// std::size_t, std::ptrdiff_t
+#include <limits>	// std::numeric_limits
+#include <new>	// std::bad_alloc, placement new
+
+#include "lib/bits.h"	// round_up
+#include "lib/sysdep/x86_x64/x86_x64.h"	// x86_x64_L1CacheLineSize
+
+
+/**
+ * stateless STL allocator that aligns each allocation to the L1 cache
+ * line size.
+ *
+ * note: the alignment is not configurable (it is always the value returned
+ * by x86_x64_L1CacheLineSize) so as to avoid any allocator state.
+ * this avoids portability problems, which is important since allocators
+ * are rather poorly specified.
+ *
+ * note: only the start of each allocated block is guaranteed to be aligned.
+ * since pointer is a plain T*, consecutive elements are sizeof(T) apart;
+ * every element is therefore aligned only if sizeof(T) is itself a
+ * multiple of the cache line size. (the storage request is nevertheless
+ * sized as if each element were padded to a full line.)
+ *
+ * references:
+ * http://www.tantalon.com/pete/customallocators.ppt
+ * http://www.flipcode.com/archives/Aligned_Block_Allocation.shtml
+ * http://www.josuttis.com/cppcode/allocator.html
+ *
+ * derived from code that bears the following copyright notice:
+ * (C) Copyright Nicolai M. Josuttis 1999.
+ * Permission to copy, use, modify, sell and distribute this software
+ * is granted provided this copyright notice appears in all copies.
+ * This software is provided "as is" without express or implied
+ * warranty, and with no claim as to its suitability for any purpose.
+ **/
+template<class T>
+class AlignedAllocator
+{
+public:
+	// type definitions
+	typedef T        value_type;
+	typedef T*       pointer;
+	typedef const T* const_pointer;
+	typedef T&       reference;
+	typedef const T& const_reference;
+	typedef std::size_t    size_type;
+	typedef std::ptrdiff_t difference_type;
+
+	// rebind allocator to type U
+	template <class U>
+	struct rebind
+	{
+		typedef AlignedAllocator<U> other;
+	};
+
+	pointer address(reference value) const
+	{
+		return &value;
+	}
+
+	const_pointer address(const_reference value) const
+	{
+		return &value;
+	}
+
+	// (all constructors are empty because there is no state to copy)
+	AlignedAllocator() throw()
+	{
+	}
+
+	AlignedAllocator(const AlignedAllocator&) throw()
+	{
+	}
+
+	template <class U>
+	AlignedAllocator (const AlignedAllocator<U>&) throw()
+	{
+	}
+
+	~AlignedAllocator() throw()
+	{
+	}
+
+	size_type max_size() const throw()
+	{
+		// maximum number of *elements* that can be allocated
+		return std::numeric_limits<std::size_t>::max() / sizeof(T);
+	}
+
+	/**
+	 * allocate uninitialized storage for numElements objects.
+	 *
+	 * @param numElements number of objects the storage must hold.
+	 * (the second parameter is a locality hint, which is ignored.)
+	 * @return pointer to storage aligned to the L1 cache line size.
+	 * @throw std::bad_alloc if the size computation would overflow or
+	 * the underlying allocation fails.
+	 **/
+	pointer allocate(size_type numElements, const void* /*hint*/ = 0)
+	{
+		const size_type alignment = x86_x64_L1CacheLineSize();
+		const size_type elementSize = round_up(sizeof(T), alignment);
+		// prevent the multiplication below from silently wrapping around,
+		// which would hand out a block that is far too small.
+		if(numElements > std::numeric_limits<size_type>::max() / elementSize)
+			throw std::bad_alloc();
+		const size_type size = numElements * elementSize;
+		pointer p = (pointer)_aligned_malloc(size, alignment);
+		// allocators must report failure via exception - containers do not
+		// check for a null return value. (a null result for a zero-sized
+		// request is passed through unchanged.)
+		if(!p && size != 0)
+			throw std::bad_alloc();
+		return p;
+	}
+
+	// deallocate storage of elements that have been destroyed.
+	// (the element count is not needed by _aligned_free.)
+	void deallocate(pointer p, size_type /*num*/)
+	{
+		_aligned_free((void*)p);
+	}
+
+	// copy-construct an element in previously allocated storage
+	void construct(pointer p, const T& value)
+	{
+		new((void*)p) T(value);
+	}
+
+	// run the destructor without releasing the storage
+	void destroy(pointer p)
+	{
+		p->~T();
+	}
+};
+
+// indicate that all specializations of this allocator are interchangeable
+// (AlignedAllocator holds no state, so instances - even for different
+// element types - always compare equal.)
+template <class T1, class T2>
+bool operator==(const AlignedAllocator<T1>&, const AlignedAllocator<T2>&) throw()
+{
+	return true;
+}
+
+// counterpart of operator==: AlignedAllocator instances never differ.
+template <class T1, class T2>
+bool operator!=(const AlignedAllocator<T1>&, const AlignedAllocator<T2>&) throw()
+{
+	return false;
+}
+
+#endif	// #ifndef ALIGNED_ALLOCATOR
--- a/source/lib/bits.h
+++ b/source/lib/bits.h
@ -44,9 +44,14 @@ bool IsBitSet(T value, size_t index)
 * @param num_bits number of bits in mask
 **/
 template<typename T>
-T bit_mask(size_t num_bits)
+T bit_mask(size_t numBits)
 {
-	return (T)(T(1) << num_bits)-1;
+	const size_t bitsInT = sizeof(T)*CHAR_BIT;
+	// (the cast back to T is essential: for types narrower than int,
+	// ~T(0) is evaluated as int and would otherwise be -1.)
+	const T allBits = (T)~T(0);
+	// a shift count of bitsInT or more would be undefined, so the
+	// full-width mask is returned directly. (larger counts saturate.)
+	if(numBits >= bitsInT)
+		return allBits;
+	// note: allBits >> (bitsInT-numBits) is avoided because the operand
+	// is promoted to a signed type for small or signed T, in which case
+	// the right shift would drag in sign bits and yield all ones.
+	// numBits is now known to be less than bitsInT, so this shift is
+	// well-defined and also yields 0 for numBits == 0.
+	return (T)((T(1) << numBits)-1);
 }


@ -64,7 +69,7 @@ template<typename T>
 inline T bits(T num, size_t lo_idx, size_t hi_idx)
 {
+	// extract the bit range [lo_idx, hi_idx] (both inclusive) from num and
+	// return it shifted down so that bit lo_idx ends up at position 0.
+	// (hi_idx < lo_idx is not checked.)
 	const size_t count = (hi_idx - lo_idx)+1;	// # bits to return
-	T result = num >> lo_idx;
+	T result = num >> T(lo_idx);
 	result &= bit_mask<T>(count);
 	return result;
 }
--- a/source/lib/precompiled.h
+++ b/source/lib/precompiled.h
@ -63,10 +63,15 @@
 # define BOOST_ALL_DYN_LINK 
 #endif
 #include <boost/utility.hpp>	// noncopyable
-#include <boost/shared_array.hpp>
+// the following boost libraries have been included in TR1 and are
+// thus deemed usable:
 #include <boost/shared_ptr.hpp>
-#include <boost/scoped_ptr.hpp>
-using boost::shared_ptr;	// has been added to TR1
+using boost::shared_ptr;
+#include <boost/mem_fn.hpp>
+using boost::mem_fn;
+#include <boost/function.hpp>
+using boost::function;
+#include <boost/bind.hpp>
 #include "lib/external_libraries/boost_filesystem.h"

 // (this must come after boost and common lib headers)
--- a/source/lib/rand.h
+++ b/source/lib/rand.h
@ -16,6 +16,6 @@
 * avoids several common pitfalls; see discussion at
 * http://www.azillionmonkeys.com/qed/random.html
 **/
-extern size_t rand(size_t min_inclusive, size_t max_exclusive);
+LIB_API size_t rand(size_t min_inclusive, size_t max_exclusive);

 #endif	// #ifndef INCLUDED_RAND
--- a/source/lib/sysdep/linux/lcpu.cpp
+++ b/source/lib/sysdep/linux/lcpu.cpp
@ -70,7 +70,10 @@ size_t os_cpu_MemorySize()
 	static size_t memorySize;

 	if(!memorySize)
-		memorySize = sysconf(_SC_PHYS_PAGES) * os_cpu_PageSize();
+	{
+		const uint64_t memorySizeBytes = (uint64_t)sysconf(_SC_PHYS_PAGES) * os_cpu_PageSize();
+		memorySize = size_t(memorySizeBytes / MiB);
+	}

 	return memorySize;
 }
@ -78,7 +81,8 @@ size_t os_cpu_MemorySize()

 size_t os_cpu_MemoryAvailable()
 {
-	const size_t memoryAvailable = sysconf(_SC_AVPHYS_PAGES) * os_cpu_PageSize();
+	// the byte count is computed in 64 bits and converted to MiB before
+	// being narrowed to size_t.
+	// NOTE(review): a sysconf failure (-1) is not detected here - confirm
+	// whether that can occur on supported systems.
+	const uint64_t memoryAvailableBytes = (uint64_t)sysconf(_SC_AVPHYS_PAGES) * os_cpu_PageSize();
+	const size_t memoryAvailable = size_t(memoryAvailableBytes / MiB);
 	return memoryAvailable;
 }

--- a/source/lib/sysdep/numa.h
+++ b/source/lib/sysdep/numa.h
@ -36,16 +36,19 @@ LIB_API size_t numa_AvailableMemory(size_t node);
 **/
 LIB_API double numa_Factor();

+/**
+ * @return an indication of whether memory pages are node-interleaved.
+ *
+ * note: this requires ACPI access, which may not be available on
+ * least-permission accounts. the default is to return false so as
+ * not to cause callers to panic and trigger performance warnings.
+ **/
+LIB_API bool numa_IsMemoryInterleaved();
+

 //-----------------------------------------------------------------------------
 // allocator

-/**
- * simple allocator that "does the right thing" on NUMA systems - page frames
- * will be taken from the node that first accesses them.
- **/
-LIB_API void* numa_Allocate(size_t size);
-
 enum LargePageDisposition
 {
 	LPD_DEFAULT,
@ -54,15 +57,25 @@ enum LargePageDisposition
 };

 /**
- * allocate memory from a specific node.
+ * simple allocator that "does the right thing" on NUMA systems.
 *
- * @param node node number (zero-based)
 * @param largePageDisposition - allows forcibly enabling/disabling the use
 * of large pages; the default decision involves a heuristic.
 * @param pageSize if non-zero, receives the size [bytes] of a single page
 * out of those used to map the memory.
+ *
+ * note: page frames will be taken from the node that first accesses them.
 **/
-LIB_API void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* pageSize = 0);
+LIB_API void* numa_Allocate(size_t size, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* ppageSize = 0);
+
+/**
+ * allocate memory from a specific node.
+ *
+ * @param node node number (zero-based)
+ * @param size number of bytes to allocate
+ * @param largePageDisposition - see numa_Allocate
+ * @param pageSize - see numa_Allocate
+ **/
+LIB_API void* numa_AllocateOnNode(size_t node, size_t size, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* pageSize = 0);

 /**
 * release memory that had been handed out by one of the above allocators.
--- a/source/lib/sysdep/os_cpu.h
+++ b/source/lib/sysdep/os_cpu.h
@ -76,12 +76,12 @@ LIB_API size_t os_cpu_PageSize();
 LIB_API size_t os_cpu_LargePageSize();

 /**
- * @return the size [bytes] of physical memory.
+ * @return the size [MB] of physical memory.
 **/
 LIB_API size_t os_cpu_MemorySize();

 /**
- * @return the size [bytes] of currently available memory.
+ * @return the size [MB] of currently available memory.
 **/
 LIB_API size_t os_cpu_MemoryAvailable();

--- a/source/lib/sysdep/osx/ocpu.cpp
+++ b/source/lib/sysdep/osx/ocpu.cpp
@ -69,6 +69,7 @@ size_t os_cpu_MemorySize()
 		// Argh, the API doesn't seem to be const-correct
 		/*const*/ int mib[2] = { CTL_HW, HW_PHYSMEM };
 		sysctl(mib, 2, &memorySize, &len, 0, 0);
+		memorySize /= MiB;
 	}

 	return memorySize;
@ -82,6 +83,7 @@ size_t os_cpu_MemoryAvailable()
 	// Argh, the API doesn't seem to be const-correct
 	/*const*/ int mib[2] = { CTL_HW, HW_USERMEM };
 	sysctl(mib, 2, &memoryAvailable, &len, 0, 0);
+	memoryAvailable /= MiB;
 	return memoryAvailable;
 }

--- a/source/lib/sysdep/win/wcpu.cpp
+++ b/source/lib/sysdep/win/wcpu.cpp
@ -142,31 +142,33 @@ static void GetMemoryStatus(MEMORYSTATUSEX& mse)

 size_t os_cpu_MemorySize()
 {
-	static size_t memorySize;
+	// cached result [MiB]; zero means "not yet determined".
+	static size_t memorySizeMiB;

-	if(memorySize == 0)
+	if(memorySizeMiB == 0)
 	{
 		MEMORYSTATUSEX mse;
 		GetMemoryStatus(mse);
-		memorySize = (size_t)mse.ullTotalPhys;
+		// (kept as a 64-bit quantity until it has been converted to MiB)
+		DWORDLONG memorySize = mse.ullTotalPhys;

 		// Richter, "Programming Applications for Windows": the reported
 		// value doesn't include non-paged pool reserved during boot;
 		// it's not considered available to the kernel. (the amount is
 		// 528 KiB on a 512 MiB WinXP/Win2k machine). we'll round up
 		// to the nearest megabyte to fix this.
-		memorySize = round_up(memorySize, 1*MiB);
+		memorySize = round_up(memorySize, DWORDLONG(1*MiB));
+
+		memorySizeMiB = size_t(memorySize / MiB);
 	}

-	return memorySize;
+	return memorySizeMiB;
 }

 size_t os_cpu_MemoryAvailable()
 {
 	MEMORYSTATUSEX mse;
 	GetMemoryStatus(mse);
-	const size_t memoryAvailable = (size_t)mse.ullAvailPhys;
-	return memoryAvailable;
+	// divide the 64-bit byte count before narrowing to size_t.
+	// (unlike os_cpu_MemorySize, the value is queried anew on every call
+	// and is truncated rather than rounded up.)
+	const size_t memoryAvailableMiB = size_t(mse.ullAvailPhys / MiB);
+	return memoryAvailableMiB;
 }


--- a/source/lib/sysdep/win/wnuma.cpp
+++ b/source/lib/sysdep/win/wnuma.cpp
@ -4,6 +4,7 @@
 #include "lib/bits.h"	// round_up, PopulationCount
 #include "lib/timer.h"
 #include "lib/sysdep/os_cpu.h"
+#include "lib/sysdep/acpi.h"
 #include "win.h"
 #include "wutil.h"
 #include "wcpu.h"
@ -141,7 +142,8 @@ size_t numa_AvailableMemory(size_t node)
 		ULONGLONG availableBytes;
 		const BOOL ok = pGetNumaAvailableMemoryNode((UCHAR)node, &availableBytes);
 		debug_assert(ok);
-		return (size_t)availableBytes;
+		const size_t availableMiB = size_t(availableBytes / MiB);
+		return availableMiB;
 	}
 	// NUMA not supported - return available system memory
 	else
@ -194,22 +196,34 @@ double numa_Factor()
 }


+bool numa_IsMemoryInterleaved()
+{
+	WinScopedLock lock(WNUMA_CS);
+
+	// tri-state cache: -1 = not yet determined, 0 = no (or unknown), 1 = yes
+	static int cachedResult = -1;
+	if(cachedResult != -1)
+		return cachedResult != 0;
+
+	// without ACPI access we can't tell; report "not interleaved".
+	if(!acpi_Init())
+	{
+		cachedResult = 0;
+		return false;
+	}
+
+	// the BIOS only generates an SRAT (System Resource Affinity Table)
+	// if node interleaving is disabled, so its absence indicates
+	// interleaving.
+	const bool sratIsMissing = (acpi_GetTable("SRAT") == 0);
+	acpi_Shutdown();
+
+	cachedResult = sratIsMissing? 1 : 0;
+	return sratIsMissing;
+}
+
+
 //-----------------------------------------------------------------------------
 // allocator
 //-----------------------------------------------------------------------------

-void* numa_Allocate(size_t size)
-{
-	void* const mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
-	if(!mem)
-		throw std::bad_alloc();
-	return mem;
-}
-
-
 static bool largePageAllocationTookTooLong = false;

-static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize, size_t node)
+static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize)
 {
 	// can't, OS does not support large pages
 	if(os_cpu_LargePageSize() == 0)
@ -236,7 +250,7 @@ static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocat
 		// we want there to be plenty of memory available, otherwise the
 		// page frames are going to be terribly fragmented and even a
 		// single allocation would take SECONDS.
-		if(numa_AvailableMemory(node) < 2*GiB)
+		if(os_cpu_MemoryAvailable() < 2000)	// 2 GB
 			return false;
 	}

@ -244,6 +258,44 @@ static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocat
 }


+// allocate <size> bytes via VirtualAlloc, preferring large pages if
+// ShouldUseLargePages agrees and falling back to regular pages otherwise.
+// if ppageSize is non-zero, it receives the page size of the attempt that
+// was made last (i.e. the one that produced the returned memory).
+// throws std::bad_alloc if every attempt fails.
+void* numa_Allocate(size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize)
+{
+	void* mem = 0;
+
+	// try allocating with large pages (reduces TLB misses)
+	if(ShouldUseLargePages(largePageDisposition, size))
+	{
+		const size_t largePageSize = os_cpu_LargePageSize();
+		const size_t paddedSize = round_up(size, largePageSize);	// required by MEM_LARGE_PAGES
+		// note: this call can take SECONDS, which is why several checks are
+		// undertaken before we even try. these aren't authoritative, so we
+		// at least prevent future attempts if it takes too long.
+		const double startTime = timer_Time();
+		mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
+		// (written even if the allocation failed; the fallback below
+		// overwrites it in that case)
+		if(ppageSize)
+			*ppageSize = largePageSize;
+		const double elapsedTime = timer_Time() - startTime;
+		debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime);
+		// NOTE(review): threshold is presumably in seconds - confirm
+		// against timer_Time.
+		if(elapsedTime > 1.0)
+			largePageAllocationTookTooLong = true;
+	}
+
+	// try (again) with regular pages
+	if(!mem)
+	{
+		mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
+		if(ppageSize)
+			*ppageSize = os_cpu_PageSize();
+	}
+
+	// all attempts failed - we're apparently out of memory.
+	if(!mem)
+		throw std::bad_alloc();
+
+	return mem;
+}
+
+
 static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node)
 {
 	typedef BOOL (WINAPI *PQueryWorkingSetEx)(HANDLE hProcess, PVOID buffer, DWORD bufferSize);
@ -294,61 +346,35 @@ static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node)
 }


-void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition, size_t* ppageSize)
+// allocate <size> bytes via numa_Allocate, then touch and verify the pages
+// so that they end up on <node>. if ppageSize is non-zero, it receives the
+// page size reported by numa_Allocate.
+void* numa_AllocateOnNode(size_t node, size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize)
 {
 	debug_assert(node < numa_NumNodes());

 	// see if there will be enough memory (non-authoritative, for debug purposes only)
 	{
-		const size_t availableBytes = numa_AvailableMemory(node);
-		if(availableBytes < size)
-			debug_printf("NUMA: warning: node reports insufficient memory (%d vs %d)\n", availableBytes, size);
+		const size_t sizeMiB = size/MiB;
+		const size_t availableMiB = numa_AvailableMemory(node);
+		// (%d expects int; passing size_t directly is a type mismatch
+		// on 64-bit builds. MiB counts comfortably fit in an int.)
+		if(availableMiB < sizeMiB)
+			debug_printf("NUMA: warning: node reports insufficient memory (%d vs %d MB)\n", (int)availableMiB, (int)sizeMiB);
 	}

-	void* mem = 0;
-	size_t pageSize = 0;
-
-	// try allocating with large pages (reduces TLB misses)
-	if(ShouldUseLargePages(largePageDisposition, size, node))
-	{
-		const size_t largePageSize = os_cpu_LargePageSize();
-		const size_t paddedSize = round_up(size, largePageSize);	// required by MEM_LARGE_PAGES
-		// note: this call can take SECONDS, which is why several checks are
-		// undertaken before we even try. these aren't authoritative, so we
-		// at least prevent future attempts if it takes too long.
-		const double startTime = timer_Time();
-		mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
-		pageSize = largePageSize;
-		const double elapsedTime = timer_Time() - startTime;
-		debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime);
-		if(elapsedTime > 1.0)
-			largePageAllocationTookTooLong = true;
-	}
-
-	// try (again) with regular pages
-	if(!mem)
-	{
-		mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
-		pageSize = os_cpu_PageSize();
-	}
-
-	// all attempts failed - we're apparently out of memory.
-	if(!mem)
-		throw std::bad_alloc();
+	size_t pageSize;	// (used below even if ppageSize is zero)
+	void* const mem = numa_Allocate(size, largePageDisposition, &pageSize);
+	if(ppageSize)
+		*ppageSize = pageSize;

 	// we can't use VirtualAllocExNuma - it's only available in Vista and Server 2008.
 	// workaround: fault in all pages now to ensure they are allocated from the
 	// current node, then verify page attributes.
 	// (note: VirtualAlloc's MEM_COMMIT only maps virtual pages and does not
-	// actually allocate page frames. Windows uses a first-touch heuristic -
-	// the page will be taken from the node whose processor caused the fault.)
+	// actually allocate page frames. Windows XP uses a first-touch heuristic -
+	// the page will be taken from the node whose processor caused the fault.
+	// Windows Vista allocates on the "preferred" node, so affinity should be
+	// set such that this thread is running on <node>.)
 	memset(mem, 0, size);

 	VerifyPages(mem, size, pageSize, node);

-	if(ppageSize)
-		*ppageSize = pageSize;
-
 	return mem;
 }

--- a/source/lib/sysdep/x86_x64/topology.cpp
+++ b/source/lib/sysdep/x86_x64/topology.cpp
@ -12,7 +12,7 @@
 #include "topology.h"

 #include "lib/bits.h"
-#include "lib/sysdep/cpu.h"
+#include "lib/sysdep/cpu.h"	// ERR::CPU_FEATURE_MISSING
 #include "lib/sysdep/os_cpu.h"
 #include "x86_x64.h"

@ -99,36 +99,20 @@ static size_t LogicalPerCache()

 	if(!logicalPerCache)
 	{
-		logicalPerCache = 1;	// caches aren't shared unless we find a descriptor
+		logicalPerCache = 1;	// (default in case DetectL2Sharing fails)

-		// note: Intel Appnote 485 says the order in which caches are returned is
-		// undefined, so we need to loop through all of them.
-		for(u32 count = 0; ; count++)
+		struct DetectL2Sharing
 		{
-			// get next cache descriptor
-			x86_x64_CpuidRegs regs;
-			regs.eax = 4;
-			regs.ecx = count;
-			x86_x64_cpuid(&regs);
-			const u32 type = bits(regs.eax, 0, 4);
-			if(type == 0)	// no more remaining
-				break;
-			
-			struct IsL2DataCache
+			static void Callback(const x86_x64_CacheParameters* cache)
 			{
-				bool operator()(u32 type, u32 level) const
-				{
-					if(type != 1 && type != 3)	// neither data nor unified
-						return false;
-					if(level != 2)
-						return false;
-					return true;
-				}
-			};
-			const u32 level = bits(regs.eax, 5, 7);
-			if(IsL2DataCache()(type, level))
-				logicalPerCache = bits(regs.eax, 14, 25)+1;
-		}
+				if(cache->type != X86_X64_CACHE_TYPE_DATA && cache->type != X86_X64_CACHE_TYPE_UNIFIED)
+					return;
+				if(cache->level != 2)
+					return;
+				logicalPerCache = cache->sharedBy;
+			}
+		};
+		x86_x64_EnumerateCaches(DetectL2Sharing::Callback);
 	}

 	return logicalPerCache;
@ -177,25 +161,18 @@ static const u8* ApicIds()


 /**
- * count the number of unique values assumed by a certain field (i.e. part
- * of the APIC ID).
- * @param numBits width of the field; must be set to ceil_log2 of the
- * maximum value that can be assumed by the field.
- * @return number of unique values (one if numBits is zero - this is
- * convenient and kind of justified by counting the empty symbol)
+ * count the number of unique APIC IDs after application of a mask.
+ *
+ * this is used to implement NumUniqueValuesInField and also required
+ * for counting the number of caches.
 **/
-static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numBits)
+static size_t NumUniqueMaskedValues(const u8* apicIds, u8 mask)
 {
-	if(numBits == 0)
-		return 1;	// see above
-	const u8 mask = bit_mask<u8>(numBits);
-
-	typedef std::set<u8> IdSet;
-	IdSet ids;
+	std::set<u8> ids;
 	for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
 	{
 		const u8 apicId = apicIds[processor];
-		const u8 field = u8(apicId >> offset) & mask;
+		const u8 field = apicId & mask;
 		ids.insert(field);
 	}

@ -203,13 +180,31 @@ static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t nu
 }


+/**
+ * determine how many distinct values a bit field within the APIC IDs
+ * actually takes on.
+ *
+ * @param apicIds APIC IDs to examine (forwarded to NumUniqueMaskedValues).
+ * @param offset index of the field's least significant bit.
+ * @param numValues how many values the field is able to represent;
+ * one denotes a zero-width field.
+ * @return count of distinct values encountered. a zero-width field yields
+ * one, which is what the topology code finds convenient.
+ **/
+static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numValues)
+{
+	if(numValues != 1)
+	{
+		// build a mask that selects the field in place (no need to shift
+		// the IDs down, since only the number of distinct values matters).
+		const size_t fieldWidth = ceil_log2(numValues);
+		const u8 unshiftedMask = bit_mask<u8>(fieldWidth);
+		const u8 fieldMask = u8((unshiftedMask << offset) & 0xFF);
+		return NumUniqueMaskedValues(apicIds, fieldMask);
+	}
+
+	// zero-width field
+	return 1;
+}
+
+
 static size_t NumPackages(const u8* apicIds)
 {
 	if(apicIds)
 	{
 		const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
-		const size_t numBits = 8;
-		return NumUniqueValuesInField(apicIds, offset, numBits);
+		return NumUniqueValuesInField(apicIds, offset, 256);
 	}
 	else
 	{
@ -241,8 +236,7 @@ static size_t CoresPerPackage(const u8* apicIds)
 	if(apicIds)
 	{
 		const size_t offset = ceil_log2(LogicalPerCore());
-		const size_t numBits = ceil_log2(CoresPerPackage());
-		return NumUniqueValuesInField(apicIds, offset, numBits);
+		return NumUniqueValuesInField(apicIds, offset, CoresPerPackage());
 	}
 	else
 	{
@ -257,8 +251,7 @@ static size_t LogicalPerCore(const u8* apicIds)
 	if(apicIds)
 	{
 		const size_t offset = 0;
-		const size_t numBits = ceil_log2(LogicalPerCore());
-		return NumUniqueValuesInField(apicIds, offset, numBits);
+		return NumUniqueValuesInField(apicIds, offset, LogicalPerCore());
 	}
 	else
 	{
@ -320,9 +313,9 @@ static size_t NumCaches(const u8* apicIds)
 {
 	if(apicIds)
 	{
-		const size_t offset = 0;
 		const size_t numBits = ceil_log2(LogicalPerCache());
-		return NumUniqueValuesInField(apicIds, offset, numBits);
+		const u8 mask = u8((0xFF << numBits) & 0xFF);
+		return NumUniqueMaskedValues(apicIds, mask);
 	}
 	else
 	{
--- a/source/lib/sysdep/x86_x64/x86_x64.cpp
+++ b/source/lib/sysdep/x86_x64/x86_x64.cpp
@ -223,6 +223,63 @@ size_t x86_x64_Generation()
 }


+//-----------------------------------------------------------------------------
+// cache
+
+// invoke <callback> once for each cache described by CPUID function 4.
+// enumeration ends at the first descriptor of type X86_X64_CACHE_TYPE_NULL.
+void x86_x64_EnumerateCaches(x86_x64_CacheCallback callback)
+{
+	// safety net: enumeration normally ends when CPUID reports a NULL
+	// cache type. if function 4 is unsupported or misreported, that
+	// terminator may never appear, so the number of queries is capped
+	// (at a value far above any plausible number of caches) to rule out
+	// an endless loop.
+	const u32 maxCaches = 64;
+
+	for(u32 count = 0; count < maxCaches; count++)
+	{
+		// get next cache descriptor (selected via ecx)
+		x86_x64_CpuidRegs regs;
+		regs.eax = 4;
+		regs.ecx = count;
+		x86_x64_cpuid(&regs);
+
+		x86_x64_CacheParameters cache;
+		cache.type = (x86_x64_CacheType)bits(regs.eax, 0, 4);
+		if(cache.type == X86_X64_CACHE_TYPE_NULL)	// no more remaining
+			break;
+		cache.level = (size_t)bits(regs.eax, 5, 7);
+		cache.associativity = (size_t)bits(regs.ebx, 22, 31)+1;
+		cache.lineSize = (size_t)bits(regs.ebx, 0, 11)+1;	// (yes, this also uses +1 encoding)
+		cache.sharedBy = (size_t)bits(regs.eax, 14, 25)+1;
+		{
+			const size_t partitions = (size_t)bits(regs.ebx, 12, 21)+1;
+			const size_t sets = (size_t)bits(regs.ecx, 0, 31)+1;
+			cache.size = cache.associativity * partitions * cache.lineSize * sets;
+		}
+
+		callback(&cache);
+	}
+}
+
+
+size_t x86_x64_L1CacheLineSize()
+{
+	// cached result; zero means "not yet determined".
+	static size_t l1CacheLineSize;
+	if(l1CacheLineSize != 0)
+		return l1CacheLineSize;
+
+	// fallback that remains in effect if no matching cache is reported
+	l1CacheLineSize = 64;
+
+	// picks out level-1 caches that hold data (i.e. data or unified)
+	struct L1DataCacheFilter
+	{
+		static void Callback(const x86_x64_CacheParameters* cache)
+		{
+			const bool holdsData = (cache->type == X86_X64_CACHE_TYPE_DATA || cache->type == X86_X64_CACHE_TYPE_UNIFIED);
+			if(holdsData && cache->level == 1)
+				l1CacheLineSize = cache->lineSize;
+		}
+	};
+	x86_x64_EnumerateCaches(L1DataCacheFilter::Callback);
+
+	return l1CacheLineSize;
+}
+
+
 //-----------------------------------------------------------------------------
 // identifier string

--- a/source/lib/sysdep/x86_x64/x86_x64.h
+++ b/source/lib/sysdep/x86_x64/x86_x64.h
@ -96,6 +96,40 @@ enum x86_x64_Cap
 LIB_API bool x86_x64_cap(x86_x64_Cap cap);


+//-----------------------------------------------------------------------------
+// cache
+
+// kind of cache described by a CPUID function 4 descriptor.
+// (x86_x64_EnumerateCaches casts the 5-bit type field of eax directly to
+// this enum, so the enumerator values must not be reordered.)
+enum x86_x64_CacheType
+{
+	X86_X64_CACHE_TYPE_NULL,	// never passed to the callback
+	X86_X64_CACHE_TYPE_DATA,
+	X86_X64_CACHE_TYPE_INSTRUCTION,
+	X86_X64_CACHE_TYPE_UNIFIED
+	// note: further values are "reserved"
+};
+
+// description of a single cache, as passed to x86_x64_CacheCallback.
+struct x86_x64_CacheParameters
+{
+	x86_x64_CacheType type;
+	size_t level;	// e.g. 1 for L1, 2 for L2
+	size_t associativity;
+	size_t lineSize;	// presumably in bytes - TODO confirm
+	size_t sharedBy;	// used by the topology code as the number of logical processors per cache
+	size_t size;	// associativity * partitions * lineSize * sets
+};
+
+typedef void (CALL_CONV *x86_x64_CacheCallback)(const x86_x64_CacheParameters*);
+
+/**
+ * call back for each cache reported by CPUID.
+ *
+ * note: ordering is undefined (see Intel AP-485)
+ **/
+LIB_API void x86_x64_EnumerateCaches(x86_x64_CacheCallback callback);
+
+/**
+ * @return the line size of the level-1 data or unified cache as reported
+ * via x86_x64_EnumerateCaches, or 64 if no such cache is reported.
+ **/
+LIB_API size_t x86_x64_L1CacheLineSize();
+
+
 //-----------------------------------------------------------------------------
 // stateless