From 5d80d2ee5d549e809332bf242270c678f6fd68e5 Mon Sep 17 00:00:00 2001 From: janwas Date: Sun, 1 Jun 2008 08:25:12 +0000 Subject: [PATCH] fixes+improvements from work: - add AlignedAllocator - an STL allocator that returns cache-line-aligned objects (required to avoid RFOs when threads write to various independent items in a container) - bits: bit_mask can now be used for N=0..numBits (works around full-word-shifts-are-undefined issue) - precompiled.h: remove scoped_ptr, add function-related stuff from TR1 - numa: . add numa_IsMemoryInterleaved . numa_Allocate is now able to allocate large pages as well (reduces TLB misses) - os_cpu: interface change to support 32-bit apps running on WoW64 systems with > 4 GB of memory - topology: use new x86_x64_EnumerateCaches API; fix detection of cache ID - x86_x64: provide the means of enumerating all caches returned by CPUID and detect L1 cache size This was SVN commit r6004. --- source/lib/allocators/aligned_allocator.cpp | 12 ++ source/lib/allocators/aligned_allocator.h | 130 ++++++++++++++++++++ source/lib/bits.h | 11 +- source/lib/precompiled.h | 11 +- source/lib/rand.h | 2 +- source/lib/sysdep/linux/lcpu.cpp | 8 +- source/lib/sysdep/numa.h | 31 +++-- source/lib/sysdep/os_cpu.h | 4 +- source/lib/sysdep/osx/ocpu.cpp | 2 + source/lib/sysdep/win/wcpu.cpp | 16 +-- source/lib/sysdep/win/wnuma.cpp | 128 +++++++++++-------- source/lib/sysdep/x86_x64/topology.cpp | 93 +++++++------- source/lib/sysdep/x86_x64/x86_x64.cpp | 57 +++++++++ source/lib/sysdep/x86_x64/x86_x64.h | 34 +++++ 14 files changed, 411 insertions(+), 128 deletions(-) create mode 100644 source/lib/allocators/aligned_allocator.cpp create mode 100644 source/lib/allocators/aligned_allocator.h diff --git a/source/lib/allocators/aligned_allocator.cpp b/source/lib/allocators/aligned_allocator.cpp new file mode 100644 index 0000000000..a8a08a1d17 --- /dev/null +++ b/source/lib/allocators/aligned_allocator.cpp @@ -0,0 +1,12 @@ +/** + * ========================================================================= + * File : aligned_allocator.cpp + * Project : 0 A.D. + * Description : STL allocator for aligned memory + * ========================================================================= + */ + +// license: GPL; see lib/license.txt + +#include "precompiled.h" +#include "aligned_allocator.h" diff --git a/source/lib/allocators/aligned_allocator.h b/source/lib/allocators/aligned_allocator.h new file mode 100644 index 0000000000..c1d42b5b6a --- /dev/null +++ b/source/lib/allocators/aligned_allocator.h @@ -0,0 +1,130 @@ +/** + * ========================================================================= + * File : aligned_allocator.h + * Project : 0 A.D. + * Description : STL allocator for aligned memory + * ========================================================================= + */ + +// license: GPL; see lib/license.txt + +#ifndef ALIGNED_ALLOCATOR +#define ALIGNED_ALLOCATOR + +#include "lib/bits.h" // round_up +#include "lib/sysdep/x86_x64/x86_x64.h" // x86_x64_L1CacheLineSize + + +/** + * stateless STL allocator that aligns elements to the L1 cache line size. + * + * note: the alignment is hard-coded to avoid any allocator state. + * this avoids portability problems, which is important since allocators + * are rather poorly specified. 
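 *
 * usage sketch (illustrative; "Item" is a hypothetical element type):
 *   std::vector<Item, AlignedAllocator<Item> > items;
 *   // the vector's storage now starts on an L1 cache line boundary; if
 *   // sizeof(Item) is also a multiple of the line size, each element ends
 *   // up on its own cache line(s), so threads writing to different
 *   // elements avoid RFO transfers.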
+ * + * references: + * http://www.tantalon.com/pete/customallocators.ppt + * http://www.flipcode.com/archives/Aligned_Block_Allocation.shtml + * http://www.josuttis.com/cppcode/allocator.html + * + * derived from code that bears the following copyright notice: + * (C) Copyright Nicolai M. Josuttis 1999. + * Permission to copy, use, modify, sell and distribute this software + * is granted provided this copyright notice appears in all copies. + * This software is provided "as is" without express or implied + * warranty, and with no claim as to its suitability for any purpose. + **/ +template +class AlignedAllocator +{ +public: + // type definitions + typedef T value_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + // rebind allocator to type U + template + struct rebind + { + typedef AlignedAllocator other; + }; + + pointer address(reference value) const + { + return &value; + } + + const_pointer address(const_reference value) const + { + return &value; + } + + AlignedAllocator() throw() + { + } + + AlignedAllocator(const AlignedAllocator&) throw() + { + } + + template + AlignedAllocator (const AlignedAllocator&) throw() + { + } + + ~AlignedAllocator() throw() + { + } + + size_type max_size() const throw() + { + // maximum number of *elements* that can be allocated + return std::numeric_limits::max() / sizeof(T); + } + + // allocate uninitialized storage + pointer allocate(size_type numElements, const void* hint = 0) + { + const size_type alignment = x86_x64_L1CacheLineSize(); + const size_type elementSize = round_up(sizeof(T), alignment); + const size_type size = numElements * elementSize; + pointer p = (pointer)_aligned_malloc(size, alignment); + return p; + } + + // deallocate storage of elements that have been destroyed + void deallocate(pointer p, size_type num) + { + _aligned_free((void*)p); + } + + void construct(pointer p, const T& value) + { + new((void*)p) T(value); + } + + void destroy(pointer p) + { + p->~T(); + } +}; + +// indicate that all specializations of this allocator are interchangeable +template +bool operator==(const AlignedAllocator&, const AlignedAllocator&) throw() +{ + return true; +} + +template +bool operator!=(const AlignedAllocator&, const AlignedAllocator&) throw() +{ + return false; +} + +#endif // #ifndef ALIGNED_ALLOCATOR diff --git a/source/lib/bits.h b/source/lib/bits.h index 62d36bf243..a330e1073d 100644 --- a/source/lib/bits.h +++ b/source/lib/bits.h @@ -44,9 +44,14 @@ bool IsBitSet(T value, size_t index) * @param num_bits number of bits in mask **/ template -T bit_mask(size_t num_bits) +T bit_mask(size_t numBits) { - return (T)(T(1) << num_bits)-1; + if(numBits == 0) // prevent shift count == bitsInT, which would be undefined. + return 0; + // note: the perhaps more intuitive (1 << numBits)-1 cannot + // handle numBits == bitsInT, but this implementation does. 
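+ // worked examples (for T = u8): bit_mask<u8>(3) == 0x07, bit_mask<u8>(8) == 0xFF,
+ // and bit_mask<u8>(0) == 0 via the early-out above.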
+ const T bitsInT = sizeof(T)*CHAR_BIT; + return ~T(0) >> T(bitsInT-numBits); } @@ -64,7 +69,7 @@ template inline T bits(T num, size_t lo_idx, size_t hi_idx) { const size_t count = (hi_idx - lo_idx)+1; // # bits to return - T result = num >> lo_idx; + T result = num >> T(lo_idx); result &= bit_mask(count); return result; } diff --git a/source/lib/precompiled.h b/source/lib/precompiled.h index 2d11e2baf4..e1ab670cd9 100644 --- a/source/lib/precompiled.h +++ b/source/lib/precompiled.h @@ -63,10 +63,15 @@ # define BOOST_ALL_DYN_LINK #endif #include // noncopyable -#include +// the following boost libraries have been included in TR1 and are +// thus deemed usable: #include -#include -using boost::shared_ptr; // has been added to TR1 +using boost::shared_ptr; +#include +using boost::mem_fn; +#include +using boost::function; +#include #include "lib/external_libraries/boost_filesystem.h" // (this must come after boost and common lib headers) diff --git a/source/lib/rand.h b/source/lib/rand.h index 7c6fdf40cf..f709751f11 100644 --- a/source/lib/rand.h +++ b/source/lib/rand.h @@ -16,6 +16,6 @@ * avoids several common pitfalls; see discussion at * http://www.azillionmonkeys.com/qed/random.html **/ -extern size_t rand(size_t min_inclusive, size_t max_exclusive); +LIB_API size_t rand(size_t min_inclusive, size_t max_exclusive); #endif // #ifndef INCLUDED_RAND diff --git a/source/lib/sysdep/linux/lcpu.cpp b/source/lib/sysdep/linux/lcpu.cpp index 34fe56c7aa..305efb847a 100644 --- a/source/lib/sysdep/linux/lcpu.cpp +++ b/source/lib/sysdep/linux/lcpu.cpp @@ -70,7 +70,10 @@ size_t os_cpu_MemorySize() static size_t memorySize; if(!memorySize) - memorySize = sysconf(_SC_PHYS_PAGES) * os_cpu_PageSize(); + { + const uint64_t memorySizeBytes = (uint64_t)sysconf(_SC_PHYS_PAGES) * os_cpu_PageSize(); + memorySize = size_t(memorySizeBytes / MiB); + } return memorySize; } @@ -78,7 +81,8 @@ size_t os_cpu_MemorySize() size_t os_cpu_MemoryAvailable() { - const size_t memoryAvailable = sysconf(_SC_AVPHYS_PAGES) * os_cpu_PageSize(); + const uint64_t memoryAvailableBytes = (uint64_t)sysconf(_SC_AVPHYS_PAGES) * os_cpu_PageSize(); + const size_t memoryAvailable = size_t(memoryAvailableBytes / MiB); return memoryAvailable; } diff --git a/source/lib/sysdep/numa.h b/source/lib/sysdep/numa.h index 934df81d32..1d2a0fbc81 100644 --- a/source/lib/sysdep/numa.h +++ b/source/lib/sysdep/numa.h @@ -36,16 +36,19 @@ LIB_API size_t numa_AvailableMemory(size_t node); **/ LIB_API double numa_Factor(); +/** + * @return an indication of whether memory pages are node-interleaved. + * + * note: this requires ACPI access, which may not be available on + * least-permission accounts. the default is to return false so as + * not to cause callers to panic and trigger performance warnings. + **/ +LIB_API bool numa_IsMemoryInterleaved(); + //----------------------------------------------------------------------------- // allocator -/** - * simple allocator that "does the right thing" on NUMA systems - page frames - * will be taken from the node that first accesses them. - **/ -LIB_API void* numa_Allocate(size_t size); - enum LargePageDisposition { LPD_DEFAULT, @@ -54,15 +57,25 @@ enum LargePageDisposition }; /** - * allocate memory from a specific node. + * simple allocator that "does the right thing" on NUMA systems. * - * @param node node number (zero-based) * @param largePageDisposition - allows forcibly enabling/disabling the use * of large pages; the default decision involves a heuristic. 
* @param pageSize if non-zero, receives the size [bytes] of a single page * out of those used to map the memory. + * + * note: page frames will be taken from the node that first accesses them. **/ -LIB_API void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* pageSize = 0); +LIB_API void* numa_Allocate(size_t size, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* ppageSize = 0); + +/** + * allocate memory from a specific node. + * + * @param node node number (zero-based) + * @param largePageDisposition - see numa_Allocate + * @param pageSize - see numa_Allocate + **/ +LIB_API void* numa_AllocateOnNode(size_t node, size_t size, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* pageSize = 0); /** * release memory that had been handed out by one of the above allocators. diff --git a/source/lib/sysdep/os_cpu.h b/source/lib/sysdep/os_cpu.h index cbc2070a44..97b260367c 100644 --- a/source/lib/sysdep/os_cpu.h +++ b/source/lib/sysdep/os_cpu.h @@ -76,12 +76,12 @@ LIB_API size_t os_cpu_PageSize(); LIB_API size_t os_cpu_LargePageSize(); /** - * @return the size [bytes] of physical memory. + * @return the size [MB] of physical memory. **/ LIB_API size_t os_cpu_MemorySize(); /** - * @return the size [bytes] of currently available memory. + * @return the size [MB] of currently available memory. **/ LIB_API size_t os_cpu_MemoryAvailable(); diff --git a/source/lib/sysdep/osx/ocpu.cpp b/source/lib/sysdep/osx/ocpu.cpp index 57cf237edb..713e78b551 100644 --- a/source/lib/sysdep/osx/ocpu.cpp +++ b/source/lib/sysdep/osx/ocpu.cpp @@ -69,6 +69,7 @@ size_t os_cpu_MemorySize() // Argh, the API doesn't seem to be const-correct /*const*/ int mib[2] = { CTL_HW, HW_PHYSMEM }; sysctl(mib, 2, &memorySize, &len, 0, 0); + memorySize /= MiB; } return memorySize; @@ -82,6 +83,7 @@ size_t os_cpu_MemoryAvailable() // Argh, the API doesn't seem to be const-correct /*const*/ int mib[2] = { CTL_HW, HW_USERMEM }; sysctl(mib, 2, &memoryAvailable, &len, 0, 0); + memoryAvailable /= MiB; return memoryAvailable; } diff --git a/source/lib/sysdep/win/wcpu.cpp b/source/lib/sysdep/win/wcpu.cpp index 7bc38e9e8a..0a5bce04fb 100644 --- a/source/lib/sysdep/win/wcpu.cpp +++ b/source/lib/sysdep/win/wcpu.cpp @@ -142,31 +142,33 @@ static void GetMemoryStatus(MEMORYSTATUSEX& mse) size_t os_cpu_MemorySize() { - static size_t memorySize; + static size_t memorySizeMiB; - if(memorySize == 0) + if(memorySizeMiB == 0) { MEMORYSTATUSEX mse; GetMemoryStatus(mse); - memorySize = (size_t)mse.ullTotalPhys; + DWORDLONG memorySize = mse.ullTotalPhys; // Richter, "Programming Applications for Windows": the reported // value doesn't include non-paged pool reserved during boot; // it's not considered available to the kernel. (the amount is // 528 KiB on a 512 MiB WinXP/Win2k machine). we'll round up // to the nearest megabyte to fix this. 
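 // (worked example: 512 MiB - 528 KiB = 536,330,240 bytes would be
 // reported; rounding up to the next MiB restores the full 512 MiB.)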
- memorySize = round_up(memorySize, 1*MiB); + memorySize = round_up(memorySize, DWORDLONG(1*MiB)); + + memorySizeMiB = size_t(memorySize / MiB); } - return memorySize; + return memorySizeMiB; } size_t os_cpu_MemoryAvailable() { MEMORYSTATUSEX mse; GetMemoryStatus(mse); - const size_t memoryAvailable = (size_t)mse.ullAvailPhys; - return memoryAvailable; + const size_t memoryAvailableMiB = size_t(mse.ullAvailPhys / MiB); + return memoryAvailableMiB; } diff --git a/source/lib/sysdep/win/wnuma.cpp b/source/lib/sysdep/win/wnuma.cpp index a8a0368c48..2c78216b65 100644 --- a/source/lib/sysdep/win/wnuma.cpp +++ b/source/lib/sysdep/win/wnuma.cpp @@ -4,6 +4,7 @@ #include "lib/bits.h" // round_up, PopulationCount #include "lib/timer.h" #include "lib/sysdep/os_cpu.h" +#include "lib/sysdep/acpi.h" #include "win.h" #include "wutil.h" #include "wcpu.h" @@ -141,7 +142,8 @@ size_t numa_AvailableMemory(size_t node) ULONGLONG availableBytes; const BOOL ok = pGetNumaAvailableMemoryNode((UCHAR)node, &availableBytes); debug_assert(ok); - return (size_t)availableBytes; + const size_t availableMiB = size_t(availableBytes / MiB); + return availableMiB; } // NUMA not supported - return available system memory else @@ -194,22 +196,34 @@ double numa_Factor() } +bool numa_IsMemoryInterleaved() +{ + WinScopedLock lock(WNUMA_CS); + static int isInterleaved = -1; + if(isInterleaved == -1) + { + if(acpi_Init()) + { + // the BIOS only generates an SRAT (System Resource Affinity Table) + // if node interleaving is disabled. + isInterleaved = acpi_GetTable("SRAT") == 0; + acpi_Shutdown(); + } + else + isInterleaved = 0; // can't tell + } + + return isInterleaved != 0; +} + + //----------------------------------------------------------------------------- // allocator //----------------------------------------------------------------------------- -void* numa_Allocate(size_t size) -{ - void* const mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); - if(!mem) - throw std::bad_alloc(); - return mem; -} - - static bool largePageAllocationTookTooLong = false; -static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize, size_t node) +static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize) { // can't, OS does not support large pages if(os_cpu_LargePageSize() == 0) @@ -236,7 +250,7 @@ static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocat // we want there to be plenty of memory available, otherwise the // page frames are going to be terribly fragmented and even a // single allocation would take SECONDS. - if(numa_AvailableMemory(node) < 2*GiB) + if(os_cpu_MemoryAvailable() < 2000) // 2 GB return false; } @@ -244,6 +258,44 @@ static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocat } +void* numa_Allocate(size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize) +{ + void* mem = 0; + + // try allocating with large pages (reduces TLB misses) + if(ShouldUseLargePages(largePageDisposition, size)) + { + const size_t largePageSize = os_cpu_LargePageSize(); + const size_t paddedSize = round_up(size, largePageSize); // required by MEM_LARGE_PAGES + // note: this call can take SECONDS, which is why several checks are + // undertaken before we even try. these aren't authoritative, so we + // at least prevent future attempts if it takes too long. 
+ const double startTime = timer_Time(); + mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE); + if(ppageSize) + *ppageSize = largePageSize; + const double elapsedTime = timer_Time() - startTime; + debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime); + if(elapsedTime > 1.0) + largePageAllocationTookTooLong = true; + } + + // try (again) with regular pages + if(!mem) + { + mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); + if(ppageSize) + *ppageSize = os_cpu_PageSize(); + } + + // all attempts failed - we're apparently out of memory. + if(!mem) + throw std::bad_alloc(); + + return mem; +} + + static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node) { typedef BOOL (WINAPI *PQueryWorkingSetEx)(HANDLE hProcess, PVOID buffer, DWORD bufferSize); @@ -294,61 +346,35 @@ static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node) } -void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition, size_t* ppageSize) +void* numa_AllocateOnNode(size_t node, size_t size, LargePageDisposition largePageDisposition, size_t* ppageSize) { debug_assert(node < numa_NumNodes()); // see if there will be enough memory (non-authoritative, for debug purposes only) { - const size_t availableBytes = numa_AvailableMemory(node); - if(availableBytes < size) - debug_printf("NUMA: warning: node reports insufficient memory (%d vs %d)\n", availableBytes, size); + const size_t sizeMiB = size/MiB; + const size_t availableMiB = numa_AvailableMemory(node); + if(availableMiB < sizeMiB) + debug_printf("NUMA: warning: node reports insufficient memory (%d vs %d MB)\n", availableMiB, sizeMiB); } - void* mem = 0; - size_t pageSize = 0; - - // try allocating with large pages (reduces TLB misses) - if(ShouldUseLargePages(largePageDisposition, size, node)) - { - const size_t largePageSize = os_cpu_LargePageSize(); - const size_t paddedSize = round_up(size, largePageSize); // required by MEM_LARGE_PAGES - // note: this call can take SECONDS, which is why several checks are - // undertaken before we even try. these aren't authoritative, so we - // at least prevent future attempts if it takes too long. - const double startTime = timer_Time(); - mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE); - pageSize = largePageSize; - const double elapsedTime = timer_Time() - startTime; - debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime); - if(elapsedTime > 1.0) - largePageAllocationTookTooLong = true; - } - - // try (again) with regular pages - if(!mem) - { - mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE); - pageSize = os_cpu_PageSize(); - } - - // all attempts failed - we're apparently out of memory. - if(!mem) - throw std::bad_alloc(); + size_t pageSize; // (used below even if ppageSize is zero) + void* const mem = numa_Allocate(size, largePageDisposition, &pageSize); + if(ppageSize) + *ppageSize = pageSize; // we can't use VirtualAllocExNuma - it's only available in Vista and Server 2008. // workaround: fault in all pages now to ensure they are allocated from the // current node, then verify page attributes. // (note: VirtualAlloc's MEM_COMMIT only maps virtual pages and does not - // actually allocate page frames. Windows uses a first-touch heuristic - - // the page will be taken from the node whose processor caused the fault.) + // actually allocate page frames. 
Windows XP uses a first-touch heuristic - + // the page will be taken from the node whose processor caused the fault. + // Windows Vista allocates on the "preferred" node, so affinity should be + // set such that this thread is running on .) memset(mem, 0, size); VerifyPages(mem, size, pageSize, node); - if(ppageSize) - *ppageSize = pageSize; - return mem; } diff --git a/source/lib/sysdep/x86_x64/topology.cpp b/source/lib/sysdep/x86_x64/topology.cpp index 2cecbf9601..acf31d5f30 100644 --- a/source/lib/sysdep/x86_x64/topology.cpp +++ b/source/lib/sysdep/x86_x64/topology.cpp @@ -12,7 +12,7 @@ #include "topology.h" #include "lib/bits.h" -#include "lib/sysdep/cpu.h" +#include "lib/sysdep/cpu.h" // ERR::CPU_FEATURE_MISSING #include "lib/sysdep/os_cpu.h" #include "x86_x64.h" @@ -99,36 +99,20 @@ static size_t LogicalPerCache() if(!logicalPerCache) { - logicalPerCache = 1; // caches aren't shared unless we find a descriptor + logicalPerCache = 1; // (default in case DetectL2Sharing fails) - // note: Intel Appnote 485 says the order in which caches are returned is - // undefined, so we need to loop through all of them. - for(u32 count = 0; ; count++) + struct DetectL2Sharing { - // get next cache descriptor - x86_x64_CpuidRegs regs; - regs.eax = 4; - regs.ecx = count; - x86_x64_cpuid(®s); - const u32 type = bits(regs.eax, 0, 4); - if(type == 0) // no more remaining - break; - - struct IsL2DataCache + static void Callback(const x86_x64_CacheParameters* cache) { - bool operator()(u32 type, u32 level) const - { - if(type != 1 && type != 3) // neither data nor unified - return false; - if(level != 2) - return false; - return true; - } - }; - const u32 level = bits(regs.eax, 5, 7); - if(IsL2DataCache()(type, level)) - logicalPerCache = bits(regs.eax, 14, 25)+1; - } + if(cache->type != X86_X64_CACHE_TYPE_DATA && cache->type != X86_X64_CACHE_TYPE_UNIFIED) + return; + if(cache->level != 2) + return; + logicalPerCache = cache->sharedBy; + } + }; + x86_x64_EnumerateCaches(DetectL2Sharing::Callback); } return logicalPerCache; @@ -177,25 +161,18 @@ static const u8* ApicIds() /** - * count the number of unique values assumed by a certain field (i.e. part - * of the APIC ID). - * @param numBits width of the field; must be set to ceil_log2 of the - * maximum value that can be assumed by the field. - * @return number of unique values (one if numBits is zero - this is - * convenient and kind of justified by counting the empty symbol) + * count the number of unique APIC IDs after application of a mask. + * + * this is used to implement NumUniqueValuesInField and also required + * for counting the number of caches. **/ -static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numBits) +static size_t NumUniqueMaskedValues(const u8* apicIds, u8 mask) { - if(numBits == 0) - return 1; // see above - const u8 mask = bit_mask(numBits); - - typedef std::set IdSet; - IdSet ids; + std::set ids; for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++) { const u8 apicId = apicIds[processor]; - const u8 field = u8(apicId >> offset) & mask; + const u8 field = apicId & mask; ids.insert(field); } @@ -203,13 +180,31 @@ static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t nu } +/** + * count the number of values assumed by a certain field within APIC IDs. + * + * @param offset index of the lowest bit that is part of the field. + * @param numValues number of values that can be assumed by the field. + * if equal to one, the field is zero-width. 
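+ * (example: offset = 1 and numValues = 4 select bits 1..2 of the APIC ID)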
+ * @return number of unique values (for convenience of the topology code, + * this is always at least one) + **/ +static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t numValues) +{ + if(numValues == 1) + return 1; // see above + const size_t numBits = ceil_log2(numValues); + const u8 mask = u8((bit_mask(numBits) << offset) & 0xFF); + return NumUniqueMaskedValues(apicIds, mask); +} + + static size_t NumPackages(const u8* apicIds) { if(apicIds) { const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore()); - const size_t numBits = 8; - return NumUniqueValuesInField(apicIds, offset, numBits); + return NumUniqueValuesInField(apicIds, offset, 256); } else { @@ -241,8 +236,7 @@ static size_t CoresPerPackage(const u8* apicIds) if(apicIds) { const size_t offset = ceil_log2(LogicalPerCore()); - const size_t numBits = ceil_log2(CoresPerPackage()); - return NumUniqueValuesInField(apicIds, offset, numBits); + return NumUniqueValuesInField(apicIds, offset, CoresPerPackage()); } else { @@ -257,8 +251,7 @@ static size_t LogicalPerCore(const u8* apicIds) if(apicIds) { const size_t offset = 0; - const size_t numBits = ceil_log2(LogicalPerCore()); - return NumUniqueValuesInField(apicIds, offset, numBits); + return NumUniqueValuesInField(apicIds, offset, LogicalPerCore()); } else { @@ -320,9 +313,9 @@ static size_t NumCaches(const u8* apicIds) { if(apicIds) { - const size_t offset = 0; const size_t numBits = ceil_log2(LogicalPerCache()); - return NumUniqueValuesInField(apicIds, offset, numBits); + const u8 mask = u8((0xFF << numBits) & 0xFF); + return NumUniqueMaskedValues(apicIds, mask); } else { diff --git a/source/lib/sysdep/x86_x64/x86_x64.cpp b/source/lib/sysdep/x86_x64/x86_x64.cpp index fcc966b2b7..1e9bb493d0 100644 --- a/source/lib/sysdep/x86_x64/x86_x64.cpp +++ b/source/lib/sysdep/x86_x64/x86_x64.cpp @@ -223,6 +223,63 @@ size_t x86_x64_Generation() } +//----------------------------------------------------------------------------- +// cache + +void x86_x64_EnumerateCaches(x86_x64_CacheCallback callback) +{ + for(u32 count = 0; ; count++) + { + x86_x64_CpuidRegs regs; + regs.eax = 4; + regs.ecx = count; + x86_x64_cpuid(®s); + + x86_x64_CacheParameters cache; + cache.type = (x86_x64_CacheType)bits(regs.eax, 0, 4); + if(cache.type == X86_X64_CACHE_TYPE_NULL) // no more remaining + break; + cache.level = (size_t)bits(regs.eax, 5, 7); + cache.associativity = (size_t)bits(regs.ebx, 22, 31)+1; + cache.lineSize = (size_t)bits(regs.ebx, 0, 11)+1; // (yes, this also uses +1 encoding) + cache.sharedBy = (size_t)bits(regs.eax, 14, 25)+1; + { + const size_t partitions = (size_t)bits(regs.ebx, 12, 21)+1; + const size_t sets = (size_t)bits(regs.ecx, 0, 31)+1; + cache.size = cache.associativity * partitions * cache.lineSize * sets; + } + + callback(&cache); + } +} + + +size_t x86_x64_L1CacheLineSize() +{ + static size_t l1CacheLineSize; + + if(!l1CacheLineSize) + { + l1CacheLineSize = 64; // (default in case DetectL1CacheLineSize fails) + + struct DetectL1CacheLineSize + { + static void Callback(const x86_x64_CacheParameters* cache) + { + if(cache->type != X86_X64_CACHE_TYPE_DATA && cache->type != X86_X64_CACHE_TYPE_UNIFIED) + return; + if(cache->level != 1) + return; + l1CacheLineSize = cache->lineSize; + } + }; + x86_x64_EnumerateCaches(DetectL1CacheLineSize::Callback); + } + + return l1CacheLineSize; +} + + //----------------------------------------------------------------------------- // identifier string diff --git a/source/lib/sysdep/x86_x64/x86_x64.h 
b/source/lib/sysdep/x86_x64/x86_x64.h index 74146854db..87159e5c21 100644 --- a/source/lib/sysdep/x86_x64/x86_x64.h +++ b/source/lib/sysdep/x86_x64/x86_x64.h @@ -96,6 +96,40 @@ enum x86_x64_Cap LIB_API bool x86_x64_cap(x86_x64_Cap cap); +//----------------------------------------------------------------------------- +// cache + +enum x86_x64_CacheType +{ + X86_X64_CACHE_TYPE_NULL, // never passed to the callback + X86_X64_CACHE_TYPE_DATA, + X86_X64_CACHE_TYPE_INSTRUCTION, + X86_X64_CACHE_TYPE_UNIFIED + // note: further values are "reserved" +}; + +struct x86_x64_CacheParameters +{ + x86_x64_CacheType type; + size_t level; + size_t associativity; + size_t lineSize; + size_t sharedBy; + size_t size; +}; + +typedef void (CALL_CONV *x86_x64_CacheCallback)(const x86_x64_CacheParameters*); + +/** + * call back for each cache reported by CPUID. + * + * note: ordering is undefined (see Intel AP-485) + **/ +LIB_API void x86_x64_EnumerateCaches(x86_x64_CacheCallback callback); + +LIB_API size_t x86_x64_L1CacheLineSize(); + + //----------------------------------------------------------------------------- // stateless
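
Usage sketch for the new cache-enumeration API (minimal and illustrative; the PrintCache/DumpCaches helpers and their output format are hypothetical, not part of this patch):

#include <cstdio>
#include "lib/sysdep/x86_x64/x86_x64.h"

// invoked once per cache reported by CPUID; ordering is undefined (Intel AP-485).
static void CALL_CONV PrintCache(const x86_x64_CacheParameters* cache)
{
	const char* type = (cache->type == X86_X64_CACHE_TYPE_DATA)? "data" :
		(cache->type == X86_X64_CACHE_TYPE_INSTRUCTION)? "instruction" : "unified";
	printf("L%d %s: %d KiB, %d-way, %d B lines, shared by %d logical processor(s)\n",
		(int)cache->level, type, (int)(cache->size/1024), (int)cache->associativity,
		(int)cache->lineSize, (int)cache->sharedBy);
}

static void DumpCaches()
{
	x86_x64_EnumerateCaches(PrintCache);
	printf("L1 line size: %d bytes\n", (int)x86_x64_L1CacheLineSize());
}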