
add NUMA and shared-L2-cache detection code (required at work)

enable most of the IA-32-specific code to be used on AMD64 (it now
resides in the directory lib/sysdep/x86_x64)

bits: add IsBitSet
remove mem_PageSize (use os_cpu_PageSize instead)
cpuid: change interface so that later subfunctions requiring input
parameters can be supported gracefully (see the sketch after this list)
amd64_asm.asm: add amd64 implementation of cpuid
cpu: move functions provided by OS to sysdep/os_cpu.cpp
cpu topology: avoid trouble when process affinity is restricted, by
remapping processor numbers to [0, PopulationCount(processAffinity))
topology.cpp: move ex-ia32 topology code here.
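
(Illustrative sketch, not part of the commit itself: the reworked interface takes a register block in which the caller fills eax and, for subfunctions such as CPUID.4, also ecx; the function returns false if that subfunction is unsupported. The names below are the x86_x64 counterparts of the Ia32 declarations visible further down in this diff.)

x86_x64_CpuidRegs regs;
regs.eax = 4;	// deterministic cache parameters (Intel only)
regs.ecx = 0;	// subfunction: first cache level
if(x86_x64_cpuid(&regs))
{
	const size_t coresPerPackage = bits(regs.eax, 26, 31)+1;	// cf. DetectCoresPerPackage
}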

This was SVN commit r5945.
janwas 2008-05-12 18:15:08 +00:00
parent 7152e4a3e6
commit ffdff6888d
29 changed files with 2091 additions and 1062 deletions

View File

@ -12,7 +12,7 @@
#include "maths/MathUtil.h"
#include "graphics/SColor.h"
#include "lib/sysdep/ia32/ia32.h"
#include "lib/sysdep/x86_x64/x86_x64.h"
static u32 fallback_ConvertRGBColorTo4ub(const RGBColor& src)
{
@ -39,7 +39,7 @@ void ColorActivateFastImpl()
{
}
#if ARCH_IA32
else if (ia32_cap(IA32_CAP_SSE))
else if (x86_x64_cap(X86_X64_CAP_SSE))
{
ConvertRGBColorTo4ub = sse_ConvertRGBColorTo4ub;
}

View File

@ -13,23 +13,17 @@
#include "lib/bits.h" // round_up
#include "lib/posix/posix_mman.h"
#include "lib/sysdep/cpu.h" // cpu_PageSize
#include "lib/sysdep/os_cpu.h" // os_cpu_PageSize
size_t mem_PageSize()
{
static const size_t page_size = cpu_PageSize();
return page_size;
}
bool mem_IsPageMultiple(uintptr_t x)
{
return (x & (mem_PageSize()-1)) == 0;
return (x & (os_cpu_PageSize()-1)) == 0;
}
size_t mem_RoundUpToPage(size_t size)
{
return round_up(size, mem_PageSize());
return round_up(size, os_cpu_PageSize());
}
size_t mem_RoundUpToAlignment(size_t size)

View File

@ -11,14 +11,6 @@
#ifndef INCLUDED_MEM_UTIL
#define INCLUDED_MEM_UTIL
/**
* @return page size
*
* (this routine caches the result of cpu_PageSize and ensures the value
* is available before static initializers have run.)
**/
extern size_t mem_PageSize();
extern bool mem_IsPageMultiple(uintptr_t x);
extern size_t mem_RoundUpToPage(size_t size);

View File

@ -25,6 +25,13 @@
**/
#define BIT64(n) (1ull << (n))
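/**
 * @return whether bit number <index> of <value> is set,
 * e.g. IsBitSet(0x05u, 2) is true while IsBitSet(0x05u, 1) is false.
 **/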
template<typename T>
bool IsBitSet(T value, size_t index)
{
const T bit = T(1) << index;
return (value & bit) != 0;
}
// these are declared in the header and inlined to aid compiler optimizations
// (they can easily end up being time-critical).

View File

@ -18,7 +18,7 @@
#include "lib/allocators/allocators.h"
#include "lib/allocators/shared_ptr.h"
#include "lib/allocators/headerless.h"
#include "lib/allocators/mem_util.h" // mem_PageSize
#include "lib/sysdep/os_cpu.h" // os_cpu_PageSize
//-----------------------------------------------------------------------------

View File

@ -0,0 +1,36 @@
; =========================================================================
; File : amd64_asm.asm
; Project : 0 A.D.
; Description :
; =========================================================================
; license: GPL; see lib/license.txt
; extern "C" void __cdecl amd64_asm_cpuid(Ia32CpuidRegs* reg);
; reference: http://softwarecommunity.intel.com/articles/eng/2669.htm
PUBLIC amd64_asm_cpuid
.CODE
ALIGN 8
amd64_asm_cpuid PROC FRAME
sub rsp, 32
.allocstack 32
push rbx
.pushreg rbx
.endprolog
mov r8, rcx
mov eax, DWORD PTR [r8+0]
mov ecx, DWORD PTR [r8+8]
cpuid
mov DWORD PTR [r8+0], eax
mov DWORD PTR [r8+4], ebx
mov DWORD PTR [r8+8], ecx
mov DWORD PTR [r8+12], edx
pop rbx
add rsp, 32
ret
ALIGN 8
amd64_asm_cpuid ENDP
_TEXT ENDS

View File

@ -14,4 +14,3 @@
ERROR_ASSOCIATE(ERR::CPU_FEATURE_MISSING, "This CPU doesn't support a required feature", -1);
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_OPCODE, "Disassembly failed", -1);
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_VENDOR, "CPU vendor unknown", -1);
ERROR_ASSOCIATE(ERR::CPU_RESTRICTED_AFFINITY, "Cannot set desired CPU affinity", -1);

View File

@ -16,15 +16,9 @@ namespace ERR
const LibError CPU_FEATURE_MISSING = -130000;
const LibError CPU_UNKNOWN_OPCODE = -130001;
const LibError CPU_UNKNOWN_VENDOR = -130002;
const LibError CPU_RESTRICTED_AFFINITY = -130003;
}
// (some of these functions may be implemented in external asm files)
#ifdef __cplusplus
extern "C" {
#endif
//-----------------------------------------------------------------------------
// CPU detection
@ -44,52 +38,6 @@ LIB_API const char* cpu_IdentifierString();
**/
LIB_API double cpu_ClockFrequency();
/**
* @return the number of what the OS deems "processors" or -1 on failure.
*
* this is used by ia32 when it cannot determine the number via APIC IDs.
* in other situations, the cpu_NumPackages function is preferable since
* it is more specific.
*
* note: this function is necessary because POSIX sysconf _SC_NPROCESSORS_CONF
* is not supported on MacOSX, else we would use that.
**/
LIB_API size_t cpu_NumProcessors();
/**
* @return number of *enabled* CPU packages / sockets.
**/
LIB_API size_t cpu_NumPackages();
/**
* @return number of *enabled* CPU cores per package.
* (2 on dual-core systems)
**/
LIB_API size_t cpu_CoresPerPackage();
/**
* @return number of *enabled* hyperthreading units per core.
* (2 on P4 EE)
**/
LIB_API size_t cpu_LogicalPerCore();
/**
* @return the size [bytes] of a MMU page.
* (4096 on most IA-32 systems)
**/
LIB_API size_t cpu_PageSize();
enum CpuMemoryIndicators
{
CPU_MEM_TOTAL,
CPU_MEM_AVAILABLE
};
/**
* @return the amount [bytes] of available or total physical memory.
**/
LIB_API size_t cpu_MemorySize(CpuMemoryIndicators mem_type);
//-----------------------------------------------------------------------------
// lock-free support routines
@ -105,6 +53,16 @@ LIB_API size_t cpu_MemorySize(CpuMemoryIndicators mem_type);
**/
LIB_API bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t newValue);
/**
* specialization of cpu_CAS for pointer types. this avoids error-prone
* casting in user code.
**/
template<typename T>
bool cpu_CAS(volatile T* location, T expected, T new_value)
{
return cpu_CAS((volatile uintptr_t*)location, (uintptr_t)expected, (uintptr_t)new_value);
}
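// illustrative usage (not part of this diff): given a hypothetical 'Node* volatile head',
// cpu_CAS(&head, oldHead, newHead) swaps in newHead without manual uintptr_t casts.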
/**
* add a signed value to a variable without the possibility of interference
* from other threads/CPUs.
@ -130,17 +88,6 @@ LIB_API void cpu_MemoryFence();
**/
LIB_API void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size);
/**
* execute the specified function once on each CPU.
* this includes logical HT units and proceeds serially (function
* is never re-entered) in order of increasing OS CPU ID.
* note: implemented by switching thread affinity masks and forcing
* a reschedule, which is apparently not possible with POSIX.
*
* may fail if e.g. OS is preventing us from running on some CPUs.
**/
typedef void (*CpuCallback)(void* param);
LIB_API LibError cpu_CallByEachCPU(CpuCallback cb, void* param);
/**
* set the FPU control word to "desirable" values (see implementation)
@ -155,19 +102,4 @@ LIB_API void cpu_ConfigureFloatingPoint();
#define cpu_i32FromDouble(d) ((i32)d)
#define cpu_i64FromDouble(d) ((i64)d)
#ifdef __cplusplus
}
#endif
/**
* specialization of cpu_CAS for pointer types. this avoids error-prone
* casting in user code.
**/
template<typename T>
bool cpu_CAS(volatile T* location, T expected, T new_value)
{
return cpu_CAS((volatile uintptr_t*)location, (uintptr_t)expected, (uintptr_t)new_value);
}
#endif // #ifndef INCLUDED_CPU

View File

@ -2,7 +2,7 @@
* =========================================================================
* File : ia32.cpp
* Project : 0 A.D.
* Description : C++ and inline asm implementations of IA-32 functions
* Description : routines specific to IA-32
* =========================================================================
*/
@ -11,715 +11,11 @@
#include "precompiled.h"
#include "ia32.h"
#include <string.h>
#include <stdio.h>
#include <vector>
#include <set>
#include <algorithm>
#include "lib/posix/posix.h" // pthread
#include "lib/bits.h"
#include "lib/timer.h"
#include "lib/sysdep/cpu.h"
#include "ia32_memcpy.h"
#include "ia32_asm.h"
#include "../amd64/amd64_asm.h"
#include <intrin.h>
#if !MSC_VERSION && !GCC_VERSION
# error we currently only support MSC/ICC or GCC
#endif
// note: unfortunately the MSC __cpuid intrinsic does not allow passing
// additional inputs (e.g. ecx = count), so we need to implement this
// in assembly for both IA-32 and AMD64.
static void cpuid_impl(Ia32CpuidRegs* regs)
{
#if ARCH_IA32
ia32_asm_cpuid(regs);
#else // i.e. ARCH_AMD64
amd64_asm_cpuid(regs);
#endif
}
bool ia32_cpuid(Ia32CpuidRegs* regs)
{
static u32 maxFunction;
static u32 maxExtendedFunction;
if(!maxFunction)
{
regs->eax = 0;
cpuid_impl(regs);
maxFunction = regs->eax;
regs->eax = 0x80000000;
cpuid_impl(regs);
maxExtendedFunction = regs->eax;
}
const u32 function = regs->eax;
if(function > maxExtendedFunction)
return false;
if(function < 0x80000000 && function > maxFunction)
return false;
cpuid_impl(regs);
return true;
}
//-----------------------------------------------------------------------------
// capability bits
static void DetectFeatureFlags(u32 caps[4])
{
Ia32CpuidRegs regs;
regs.eax = 1;
if(ia32_cpuid(&regs))
{
caps[0] = regs.ecx;
caps[1] = regs.edx;
}
regs.eax = 0x80000001;
if(ia32_cpuid(&regs))
{
caps[2] = regs.ecx;
caps[3] = regs.edx;
}
}
bool ia32_cap(IA32Cap cap)
{
// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
// keep in sync with enum CpuCap!
static u32 ia32_caps[4];
// (since relevant CPUs will surely advertise at least one standard flag,
// they are zero iff we haven't been initialized yet)
if(!ia32_caps[1])
DetectFeatureFlags(ia32_caps);
const size_t tbl_idx = cap >> 5;
const size_t bit_idx = cap & 0x1f;
if(tbl_idx > 3)
{
DEBUG_WARN_ERR(ERR::INVALID_PARAM);
return false;
}
return (ia32_caps[tbl_idx] & BIT(bit_idx)) != 0;
}
//-----------------------------------------------------------------------------
// CPU identification
static Ia32Vendor DetectVendor()
{
Ia32CpuidRegs regs;
regs.eax = 0;
if(!ia32_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
// copy regs to string
// note: 'strange' ebx,edx,ecx reg order is due to ModR/M encoding order.
char vendor_str[13];
u32* vendor_str_u32 = (u32*)vendor_str;
vendor_str_u32[0] = regs.ebx;
vendor_str_u32[1] = regs.edx;
vendor_str_u32[2] = regs.ecx;
vendor_str[12] = '\0'; // 0-terminate
if(!strcmp(vendor_str, "AuthenticAMD"))
return IA32_VENDOR_AMD;
else if(!strcmp(vendor_str, "GenuineIntel"))
return IA32_VENDOR_INTEL;
else
{
DEBUG_WARN_ERR(ERR::CPU_UNKNOWN_VENDOR);
return IA32_VENDOR_UNKNOWN;
}
}
Ia32Vendor ia32_Vendor()
{
static Ia32Vendor vendor = IA32_VENDOR_UNKNOWN;
if(vendor == IA32_VENDOR_UNKNOWN)
vendor = DetectVendor();
return vendor;
}
static void DetectSignature(size_t* model, size_t* family)
{
Ia32CpuidRegs regs;
regs.eax = 1;
if(!ia32_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
*model = bits(regs.eax, 4, 7);
*family = bits(regs.eax, 8, 11);
}
static size_t DetectGeneration()
{
size_t model, family;
DetectSignature(&model, &family);
switch(ia32_Vendor())
{
case IA32_VENDOR_AMD:
switch(family)
{
case 5:
if(model < 6)
return 5; // K5
else
return 6; // K6
case 6:
return 7; // K7 (Athlon)
case 0xF:
return 8; // K8 (Opteron)
}
break;
case IA32_VENDOR_INTEL:
switch(family)
{
case 5:
return 5; // Pentium
case 6:
if(model <= 0xD)
return 6; // Pentium Pro/II/III/M
else
return 8; // Core2Duo
case 0xF:
if(model <= 6)
return 7; // Pentium 4/D
}
break;
}
debug_assert(0); // unknown CPU generation
return family;
}
size_t ia32_Generation()
{
static size_t generation;
if(!generation)
generation = DetectGeneration();
return generation;
}
//-----------------------------------------------------------------------------
// identifier string
/// functor to remove substrings from the CPU identifier string
class StringStripper
{
char* m_string;
size_t m_max_chars;
public:
StringStripper(char* string, size_t max_chars)
: m_string(string), m_max_chars(max_chars)
{
}
// remove all instances of substring from m_string
void operator()(const char* substring)
{
const size_t substring_length = strlen(substring);
for(;;)
{
char* substring_pos = strstr(m_string, substring);
if(!substring_pos)
break;
const size_t substring_ofs = substring_pos - m_string;
const size_t num_chars = m_max_chars - substring_ofs - substring_length;
memmove(substring_pos, substring_pos+substring_length, num_chars);
}
}
};
static void DetectIdentifierString(char* identifierString, size_t maxChars)
{
// get brand string (if available)
char* pos = identifierString;
bool have_brand_string = true;
for(u32 function = 0x80000002; function <= 0x80000004; function++)
{
Ia32CpuidRegs regs;
regs.eax = function;
have_brand_string &= ia32_cpuid(&regs);
memcpy(pos, &regs, 16);
pos += 16;
}
// fall back to manual detect of CPU type because either:
// - CPU doesn't support brand string (we use a flag to indicate this
// rather than comparing against a default value because it is safer);
// - the brand string is useless, e.g. "Unknown". this happens on
// some older boards whose BIOS reprograms the string for CPUs it
// doesn't recognize.
if(!have_brand_string || strncmp(identifierString, "Unknow", 6) == 0)
{
size_t model, family;
DetectSignature(&model, &family);
switch(ia32_Vendor())
{
case IA32_VENDOR_AMD:
// everything else is either too old, or should have a brand string.
if(family == 6)
{
if(model == 3 || model == 7)
strcpy_s(identifierString, maxChars, "AMD Duron");
else if(model <= 5)
strcpy_s(identifierString, maxChars, "AMD Athlon");
else
{
if(ia32_cap(IA32_CAP_AMD_MP))
strcpy_s(identifierString, maxChars, "AMD Athlon MP");
else
strcpy_s(identifierString, maxChars, "AMD Athlon XP");
}
}
break;
case IA32_VENDOR_INTEL:
// everything else is either too old, or should have a brand string.
if(family == 6)
{
if(model == 1)
strcpy_s(identifierString, maxChars, "Intel Pentium Pro");
else if(model == 3 || model == 5)
strcpy_s(identifierString, maxChars, "Intel Pentium II");
else if(model == 6)
strcpy_s(identifierString, maxChars, "Intel Celeron");
else
strcpy_s(identifierString, maxChars, "Intel Pentium III");
}
break;
}
}
// identifierString already holds a valid brand string; pretty it up.
else
{
const char* const undesired_strings[] = { "(tm)", "(TM)", "(R)", "CPU " };
std::for_each(undesired_strings, undesired_strings+ARRAY_SIZE(undesired_strings),
StringStripper(identifierString, strlen(identifierString)+1));
// note: Intel brand strings include a frequency, but we can't rely
// on it because the CPU may be overclocked. we'll leave it in the
// string to show measurement accuracy and if SpeedStep is active.
}
}
const char* cpu_IdentifierString()
{
// 3 calls x 4 registers x 4 bytes = 48
static char identifierString[48+1] = {'\0'};
if(identifierString[0] == '\0')
DetectIdentifierString(identifierString, ARRAY_SIZE(identifierString));
return identifierString;
}
//-----------------------------------------------------------------------------
// CPU frequency
// set scheduling priority and restore when going out of scope.
class ScopedSetPriority
{
int m_old_policy;
sched_param m_old_param;
public:
ScopedSetPriority(int new_priority)
{
// get current scheduling policy and priority
pthread_getschedparam(pthread_self(), &m_old_policy, &m_old_param);
// set new priority
sched_param new_param = {0};
new_param.sched_priority = new_priority;
pthread_setschedparam(pthread_self(), SCHED_FIFO, &new_param);
}
~ScopedSetPriority()
{
// restore previous policy and priority.
pthread_setschedparam(pthread_self(), m_old_policy, &m_old_param);
}
};
// note: this function uses timer.cpp!timer_Time, which is implemented via
// whrt.cpp on Windows, which again calls ia32_Init. be careful that
// this function isn't called from there as well, else WHRT will be used
// before its init completes.
double ia32_ClockFrequency()
{
// if the TSC isn't available, there's really no good way to count the
// actual CPU clocks per known time interval, so bail.
// note: loop iterations ("bogomips") are not a reliable measure due
// to differing IPC and compiler optimizations.
if(!ia32_cap(IA32_CAP_TSC))
return -1.0; // impossible value
// increase priority to reduce interference while measuring.
const int priority = sched_get_priority_max(SCHED_FIFO)-1;
ScopedSetPriority ssp(priority);
// note: no need to "warm up" cpuid - it will already have been
// called several times by the time this code is reached.
// (background: it's used in ia32_rdtsc() to serialize instruction flow;
// the first call is documented to be slower on Intel CPUs)
int num_samples = 16;
// if the clock is low-res, take fewer samples so this doesn't take too long.
// balance measuring time (~ 10 ms) against accuracy (< 0.1% error -
// ok for using the TSC as a time reference)
if(timer_Resolution() >= 1e-3)
num_samples = 8;
std::vector<double> samples(num_samples);
for(int i = 0; i < num_samples; i++)
{
double dt;
i64 dc; // i64 because VC6 can't convert u64 -> double,
// and we don't need all 64 bits.
// count # of clocks in max{1 tick, 1 ms}:
// .. wait for start of tick.
const double t0 = timer_Time();
u64 c1; double t1;
do
{
// note: timer_Time effectively has a long delay (up to 5 us)
// before returning the time. we call it before ia32_rdtsc to
// minimize the delay between actually sampling time / TSC,
// thus decreasing the chance for interference.
// (if unavoidable background activity, e.g. interrupts,
// delays the second reading, inaccuracy is introduced).
t1 = timer_Time();
c1 = ia32_rdtsc();
}
while(t1 == t0);
// .. wait until start of next tick and at least 1 ms elapsed.
do
{
const double t2 = timer_Time();
const u64 c2 = ia32_rdtsc();
dc = (i64)(c2 - c1);
dt = t2 - t1;
}
while(dt < 1e-3);
// .. freq = (delta_clocks) / (delta_seconds);
// ia32_rdtsc/timer overhead is negligible.
const double freq = dc / dt;
samples[i] = freq;
}
std::sort(samples.begin(), samples.end());
// median filter (remove upper and lower 25% and average the rest).
// note: don't just take the lowest value! it could conceivably be
// too low, if background processing delays reading c1 (see above).
double sum = 0.0;
const int lo = num_samples/4, hi = 3*num_samples/4;
for(int i = lo; i < hi; i++)
sum += samples[i];
const double clock_frequency = sum / (hi-lo);
return clock_frequency;
}
//-----------------------------------------------------------------------------
// processor topology
u8 ia32_ApicId()
{
Ia32CpuidRegs regs;
regs.eax = 1;
if(!ia32_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const u8 apicId = (u8)bits(regs.ebx, 24, 31);
return apicId;
}
// OSes report hyperthreading units and cores as "processors". we need to
// drill down and find out the exact counts (for thread pool dimensioning
// and cache sharing considerations).
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
// logicalPerCore.
static size_t DetectCoresPerPackage()
{
Ia32CpuidRegs regs;
switch(ia32_Vendor())
{
case IA32_VENDOR_INTEL:
regs.eax = 4;
if(ia32_cpuid(&regs))
return bits(regs.eax, 26, 31)+1;
break;
case IA32_VENDOR_AMD:
regs.eax = 0x80000008;
if(ia32_cpuid(&regs))
return bits(regs.ecx, 0, 7)+1;
break;
}
return 1; // else: the CPU is single-core.
}
static size_t CoresPerPackage()
{
static size_t coresPerPackage = 0;
if(!coresPerPackage)
coresPerPackage = DetectCoresPerPackage();
return coresPerPackage;
}
static bool IsHyperthreadingCapable()
{
// definitely not
if(!ia32_cap(IA32_CAP_HT))
return false;
// AMD N-core systems falsely set the HT bit for compatibility reasons
// (don't bother resetting it, might confuse callers)
if(ia32_Vendor() == IA32_VENDOR_AMD && ia32_cap(IA32_CAP_AMD_CMP_LEGACY))
return false;
return true;
}
static size_t DetectLogicalPerCore()
{
if(!IsHyperthreadingCapable())
return 1;
Ia32CpuidRegs regs;
regs.eax = 1;
if(!ia32_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const size_t logicalPerPackage = bits(regs.ebx, 16, 23);
// cores ought to be uniform WRT # logical processors
debug_assert(logicalPerPackage % CoresPerPackage() == 0);
return logicalPerPackage / CoresPerPackage();
}
static size_t LogicalPerCore()
{
static size_t logicalPerCore = 0;
if(!logicalPerCore)
logicalPerCore = DetectLogicalPerCore();
return logicalPerCore;
}
// the above two functions give the maximum number of cores/logical units.
// however, some of them may actually be disabled by the BIOS!
// what we can do is to analyze the APIC IDs. they are allocated sequentially
// for all "processors". treating the IDs as variable-width bitfields
// (according to the number of cores/logical units present) allows
// determining the exact topology as well as number of packages.
// these are set by DetectProcessorTopology.
static size_t numPackages = 0; // i.e. sockets; > 1 => true SMP system
static size_t enabledCoresPerPackage = 0;
static size_t enabledLogicalPerCore = 0; // hyperthreading units
typedef std::vector<u8> Ids;
typedef std::set<u8> IdSet;
// add the currently running processor's APIC ID to a list of IDs.
static void StoreApicId(void* param)
{
Ids* apicIds = (Ids*)param;
apicIds->push_back(ia32_ApicId());
}
// field := a range of bits sufficient to represent <num_values> integers.
// for each id in apicIds: extract the value of the field at offset bit_pos
// and insert it into ids. afterwards, adjust bit_pos to the next field.
// used to gather e.g. all core IDs from all APIC IDs.
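// worked example (illustrative addition): with LogicalPerCore() == 2 and CoresPerPackage() == 2,
// each APIC ID is split as [package bits | core (1 bit) | logical (1 bit)], so the IDs 0..7
// yield 2 distinct logical IDs, 2 core IDs and 2 package IDs.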
static void ExtractFieldsIntoSet(const Ids& apicIds, size_t& bit_pos, size_t num_values, IdSet& ids)
{
const size_t id_bits = ceil_log2(num_values);
if(id_bits == 0)
return;
const u8 mask = bit_mask<u8>(id_bits);
for(size_t i = 0; i < apicIds.size(); i++)
{
const u8 apic_id = apicIds[i];
const u8 field = u8(apic_id >> bit_pos) & mask;
ids.insert(field);
}
bit_pos += id_bits;
}
// @return false if unavailable / no information can be returned.
static bool DetectProcessorTopologyViaApicIds()
{
// old APIC (see ia32_ApicId for details)
if(ia32_Generation() < 8)
return false;
// get the set of all APIC IDs
Ids apicIds;
// .. OS affinity support is missing or excludes us from some processors
if(cpu_CallByEachCPU(StoreApicId, &apicIds) != INFO::OK)
return false;
// .. if IDs aren't unique, cpu_CallByEachCPU is broken.
std::sort(apicIds.begin(), apicIds.end());
debug_assert(std::unique(apicIds.begin(), apicIds.end()) == apicIds.end());
// extract values from all 3 ID bitfields into separate sets
size_t bit_pos = 0;
IdSet logicalIds;
ExtractFieldsIntoSet(apicIds, bit_pos, LogicalPerCore(), logicalIds);
IdSet coreIds;
ExtractFieldsIntoSet(apicIds, bit_pos, CoresPerPackage(), coreIds);
IdSet packageIds;
ExtractFieldsIntoSet(apicIds, bit_pos, 0xFF, packageIds);
// (the set cardinality is representative of all packages/cores since
// their numbers are uniform across the system.)
numPackages = std::max((size_t)packageIds.size(), 1u);
enabledCoresPerPackage = std::max((size_t)coreIds .size(), 1u);
enabledLogicalPerCore = std::max((size_t)logicalIds.size(), 1u);
// note: even though APIC IDs are assigned sequentially, we can't make any
// assumptions about the values/ordering because we get them according to
// the CPU affinity mask, which is unknown.
return true;
}
static void GuessProcessorTopologyViaOsCount()
{
const size_t numProcessors = cpu_NumProcessors();
// note: we cannot hope to always return correct results since disabled
// cores/logical units cannot be distinguished from the situation of the
// OS simply not reporting them as "processors". unfortunately this
// function won't always only be called for older (#core = #logical = 1)
// systems because DetectProcessorTopologyViaApicIds may fail due to
// lack of OS support. what we'll do is assume nothing is disabled; this
// is reasonable because we care most about #packages. it's fine to assume
// more cores (without inflating the total #processors) because that
// count only indicates memory barriers etc. ought to be used.
enabledCoresPerPackage = CoresPerPackage();
enabledLogicalPerCore = LogicalPerCore();
const size_t numPackagesTimesLogical = numProcessors / CoresPerPackage();
debug_assert(numPackagesTimesLogical != 0); // otherwise processors didn't include cores, which would be stupid
numPackages = numPackagesTimesLogical / LogicalPerCore();
if(!numPackages) // processors didn't include logical units (reasonable)
numPackages = numPackagesTimesLogical;
}
// determine how many CoresPerPackage and LogicalPerCore are
// actually enabled and also count numPackages.
static void DetectProcessorTopology()
{
// authoritative, but requires newer CPU, and OS support.
if(DetectProcessorTopologyViaApicIds())
return; // success, we're done.
GuessProcessorTopologyViaOsCount();
}
size_t cpu_NumPackages()
{
if(!numPackages)
DetectProcessorTopology();
return (size_t)numPackages;
}
size_t cpu_CoresPerPackage()
{
if(!enabledCoresPerPackage)
DetectProcessorTopology();
return (size_t)enabledCoresPerPackage;
}
size_t cpu_LogicalPerCore()
{
if(!enabledLogicalPerCore)
DetectProcessorTopology();
return (size_t)enabledLogicalPerCore;
}
//-----------------------------------------------------------------------------
// misc stateless functions
u64 ia32_rdtsc()
{
#if MSC_VERSION
return (u64)__rdtsc();
#elif GCC_VERSION
// GCC supports "portable" assembly for both x86 and x86_64
volatile u32 lo, hi;
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
return u64_from_u32(hi, lo);
#endif
}
void ia32_DebugBreak()
{
#if MSC_VERSION
__debugbreak();
#elif GCC_VERSION
// note: this probably isn't necessary, since unix_debug_break
// (SIGTRAP) is most probably available if GCC_VERSION.
// we include it for completeness, though.
__asm__ __volatile__ ("int $3");
#endif
}
// enforce strong memory ordering.
void cpu_MemoryFence()
{
if(ia32_cap(IA32_CAP_SSE2))
_mm_mfence();
}
// checks if there is an IA-32 CALL instruction right before ret_addr.
// returns INFO::OK if so and ERR::FAIL if not.
// also attempts to determine the call target. if that is possible
// (directly addressed relative or indirect jumps), it is stored in
// target, which is otherwise 0.
//
// this is useful for walking the stack manually.
LibError ia32_GetCallTarget(void* ret_addr, void** target)
{
*target = 0;
@ -799,25 +95,17 @@ void cpu_ConfigureFloatingPoint()
}
//-----------------------------------------------------------------------------
// thunk functions for ia32_asm to allow DLL export
void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
{
ia32_asm_AtomicAdd(location, increment);
}
bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value)
{
return ia32_asm_CAS(location, expected, new_value);
}
void cpu_Serialize()
{
Ia32CpuidRegs regs;
regs.eax = 1;
ia32_cpuid(&regs); // CPUID serializes execution.
}
void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size)
{

View File

@ -2,7 +2,7 @@
* =========================================================================
* File : ia32.h
* Project : 0 A.D.
* Description : C++ and inline asm implementations of IA-32 functions
* Description : routines specific to IA-32
* =========================================================================
*/
@ -11,106 +11,10 @@
#ifndef INCLUDED_IA32
#define INCLUDED_IA32
#if !ARCH_IA32 && !ARCH_AMD64
#error "including ia32.h without ARCH_IA32=1 or ARCH_AMD64=1"
#if !ARCH_IA32
# error "including ia32.h without ARCH_IA32=1"
#endif
/**
* registers used/returned by ia32_cpuid
**/
struct Ia32CpuidRegs
{
u32 eax;
u32 ebx;
u32 ecx;
u32 edx;
};
/**
* invoke CPUID instruction.
* @param regs input/output registers.
* regs->eax must be set to the desired function.
* some functions (e.g. 4) require regs->ecx to be set as well.
* rationale: this interface (input/output structure vs. function parameters)
* avoids unnecessary copying/initialization if some inputs aren't needed
* and allows graceful expansion to functions that require further inputs.
* @return true on success or false if the sub-function isn't supported.
**/
extern bool ia32_cpuid(Ia32CpuidRegs* regs);
/**
* CPU vendor.
* (this is exposed because some CPUID functions are vendor-specific.)
* (an enum is easier to compare than the original string values.)
**/
enum Ia32Vendor
{
IA32_VENDOR_UNKNOWN,
IA32_VENDOR_INTEL,
IA32_VENDOR_AMD,
};
LIB_API Ia32Vendor ia32_Vendor();
/**
* @return the colloquial processor generation
* (5 = Pentium, 6 = Pentium Pro/II/III / K6, 7 = Pentium4 / Athlon, 8 = Core / Opteron)
**/
LIB_API size_t ia32_Generation();
/**
* bit indices of CPU capability flags (128 bits).
* values are defined by IA-32 CPUID feature flags - do not change!
**/
enum IA32Cap
{
// standard (ecx) - currently only defined by Intel
IA32_CAP_SSE3 = 0+0, // Streaming SIMD Extensions 3
IA32_CAP_EST = 0+7, // Enhanced Speedstep Technology
// standard (edx)
IA32_CAP_FPU = 32+0, // Floating Point Unit
IA32_CAP_TSC = 32+4, // TimeStamp Counter
IA32_CAP_CMOV = 32+15, // Conditional MOVe
IA32_CAP_TM_SCC = 32+22, // Thermal Monitoring and Software Controlled Clock
IA32_CAP_MMX = 32+23, // MultiMedia eXtensions
IA32_CAP_SSE = 32+25, // Streaming SIMD Extensions
IA32_CAP_SSE2 = 32+26, // Streaming SIMD Extensions 2
IA32_CAP_HT = 32+28, // HyperThreading
// extended (ecx)
IA32_CAP_AMD_CMP_LEGACY = 64+1, // N-core and IA32_CAP_HT is falsely set
// extended (edx)
IA32_CAP_AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
IA32_CAP_AMD_MMX_EXT = 96+22,
IA32_CAP_AMD_3DNOW_PRO = 96+30,
IA32_CAP_AMD_3DNOW = 96+31
};
/**
* @return whether the CPU supports the indicated IA32Cap / feature flag.
**/
LIB_API bool ia32_cap(IA32Cap cap);
//-----------------------------------------------------------------------------
// stateless
/**
* @return APIC ID of the currently executing processor.
*
* the implementation uses CPUID.1 and only works on >= 8th generation CPUs;
* (P4/Athlon XP); otherwise it returns 0. the alternative of accessing the
* APIC mmio registers is not feasible - mahaf_MapPhysicalMemory only works
* reliably on WinXP. also, the OS already has the APIC registers mapped and
* in constant use, and we don't want to interfere.
**/
LIB_API u8 ia32_ApicId();
/**
* check if there is an IA-32 CALL instruction right before ret_addr.
* @return INFO::OK if so and ERR::FAIL if not.
@ -123,45 +27,4 @@ LIB_API u8 ia32_ApicId();
**/
LIB_API LibError ia32_GetCallTarget(void* ret_addr, void** target);
/**
* @return the current value of the TimeStampCounter (a counter of
* CPU cycles since power-on, which is useful for high-resolution timing
* but potentially differs between multiple CPUs)
**/
LIB_API u64 ia32_rdtsc();
/**
* trigger a breakpoint inside this function when it is called.
**/
LIB_API void ia32_DebugBreak(void);
/// fpclassify return values
#define IA32_FP_NAN 0x0100
#define IA32_FP_NORMAL 0x0400
#define IA32_FP_INFINITE (IA32_FP_NAN | IA32_FP_NORMAL)
#define IA32_FP_ZERO 0x4000
#define IA32_FP_SUBNORMAL (IA32_FP_NORMAL | IA32_FP_ZERO)
// FPU control word (for ia32_asm_control87)
// .. Precision Control:
#define IA32_MCW_PC 0x0300
#define IA32_PC_24 0x0000
// .. Rounding Control:
#define IA32_MCW_RC 0x0C00
#define IA32_RC_NEAR 0x0000
#define IA32_RC_DOWN 0x0400
#define IA32_RC_UP 0x0800
#define IA32_RC_CHOP 0x0C00
// .. Exception Mask:
#define IA32_MCW_EM 0x003f
#define IA32_EM_INVALID BIT(0)
#define IA32_EM_DENORMAL BIT(1)
#define IA32_EM_ZERODIVIDE BIT(2)
#define IA32_EM_OVERFLOW BIT(3)
#define IA32_EM_UNDERFLOW BIT(4)
#define IA32_EM_INEXACT BIT(5)
#endif // #ifndef INCLUDED_IA32

View File

@ -17,7 +17,7 @@
; CPUID support
;-------------------------------------------------------------------------------
; extern "C" void __cdecl ia32_asm_cpuid(Ia32CpuidRegs* regs);
; extern "C" void __cdecl ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
global sym(ia32_asm_cpuid)
sym(ia32_asm_cpuid):
push ebx ; (clobbered by CPUID)
@ -90,7 +90,7 @@ round_bias dd 0.4999999
__SECT__
; extern "C" size_t __cdecl ia32_asm_control87(size_t new_cw, size_t mask);
; extern "C" u32 __cdecl ia32_asm_control87(u32 new_cw, u32 mask);
global sym(ia32_asm_control87)
sym(ia32_asm_control87):
push eax

View File

@ -15,29 +15,52 @@
extern "C" {
#endif
struct Ia32CpuidRegs;
extern void CALL_CONV ia32_asm_cpuid(Ia32CpuidRegs* regs);
struct x86_x64_CpuidRegs;
extern void CALL_CONV ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
extern void CALL_CONV ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
extern bool CALL_CONV ia32_asm_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
/// control87
// FPU control word
// .. Precision Control:
const u32 IA32_MCW_PC = 0x0300;
const u32 IA32_PC_24 = 0x0000;
// .. Rounding Control:
const u32 IA32_MCW_RC = 0x0C00;
const u32 IA32_RC_NEAR = 0x0000;
const u32 IA32_RC_DOWN = 0x0400;
const u32 IA32_RC_UP = 0x0800;
const u32 IA32_RC_CHOP = 0x0C00;
// .. Exception Mask:
const u32 IA32_MCW_EM = 0x3F;
const u32 IA32_EM_INVALID = 0x01;
const u32 IA32_EM_DENORMAL = 0x02;
const u32 IA32_EM_ZERODIVIDE = 0x04;
const u32 IA32_EM_OVERFLOW = 0x08;
const u32 IA32_EM_UNDERFLOW = 0x10;
const u32 IA32_EM_INEXACT = 0x20;
/**
* for all 1-bits in mask, update the corresponding FPU control word bits
* with the bit values in new_val.
* @return 0 to indicate success.
**/
extern size_t CALL_CONV ia32_asm_control87(size_t new_val, size_t mask);
extern u32 CALL_CONV ia32_asm_control87(u32 new_val, u32 mask);
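// illustrative usage (not part of this diff): ia32_asm_control87(IA32_RC_CHOP, IA32_MCW_RC)
// selects truncation as the rounding mode and leaves all other control-word bits unchanged.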
/// see POSIX fpclassify
/// POSIX fpclassify
#define IA32_FP_NAN 0x0100
#define IA32_FP_NORMAL 0x0400
#define IA32_FP_INFINITE (IA32_FP_NAN | IA32_FP_NORMAL)
#define IA32_FP_ZERO 0x4000
#define IA32_FP_SUBNORMAL (IA32_FP_NORMAL | IA32_FP_ZERO)
extern size_t CALL_CONV ia32_asm_fpclassifyd(double d);
extern size_t CALL_CONV ia32_asm_fpclassifyf(float f);
/// see POSIX rintf
/// POSIX rintf
extern float CALL_CONV ia32_asm_rintf(float);
extern double CALL_CONV ia32_asm_rint(double);
/// see POSIX fminf
/// POSIX fminf
extern float CALL_CONV ia32_asm_fminf(float, float);
extern float CALL_CONV ia32_asm_fmaxf(float, float);
@ -45,7 +68,6 @@ extern i32 CALL_CONV ia32_asm_i32FromFloat(float f);
extern i32 CALL_CONV ia32_asm_i32FromDouble(double d);
extern i64 CALL_CONV ia32_asm_i64FromDouble(double d);
/**
* write the current execution state (e.g. all register values) into
* (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).

87
source/lib/sysdep/numa.h Normal file
View File

@ -0,0 +1,87 @@
#ifndef INCLUDED_NUMA
#define INCLUDED_NUMA
//-----------------------------------------------------------------------------
// node topology
/**
* @return number of NUMA "nodes" (i.e. groups of CPUs with local memory).
**/
LIB_API size_t numa_NumNodes();
/**
* @return node number (zero-based) to which <processor> belongs.
**/
LIB_API size_t numa_NodeFromProcessor(size_t processor);
/**
* @return bit-mask of all processors constituting <node>.
**/
LIB_API uintptr_t numa_ProcessorMaskFromNode(size_t node);
//-----------------------------------------------------------------------------
// memory
/**
* @return bytes of memory available for allocation on <node>.
**/
LIB_API size_t numa_AvailableMemory(size_t node);
/**
* @return the ratio between maximum and minimum times that one processor
* from each node required to fill a globally allocated array.
* in other words, this is the maximum slowdown for NUMA-oblivious
* memory accesses. Microsoft guidelines require it to be <= 3.
**/
LIB_API double numa_Factor();
//-----------------------------------------------------------------------------
// allocator
/**
* simple allocator that "does the right thing" on NUMA systems - page frames
* will be taken from the node that first accesses them.
**/
LIB_API void* numa_Allocate(size_t size);
enum LargePageDisposition
{
LPD_DEFAULT,
LPD_ALWAYS,
LPD_NEVER
};
/**
* allocate memory from a specific node.
*
* @param node node number (zero-based)
* @param largePageDisposition - allows forcibly enabling/disabling the use
* of large pages; the default decision involves a heuristic.
* @param pageSize if non-zero, receives the size [bytes] of a single page
* out of those used to map the memory.
**/
LIB_API void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* pageSize = 0);
/**
* release memory that had been handed out by one of the above allocators.
**/
LIB_API void numa_Deallocate(void* mem);
#ifdef __cplusplus
// for use with shared_ptr
template<typename T>
struct numa_Deleter
{
void operator()(T* p) const
{
numa_Deallocate(p);
}
};
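// illustrative usage (not part of this diff), mirroring numa_Factor in wnuma.cpp:
//   shared_ptr<u8> buffer((u8*)numa_AllocateOnNode(16*MiB, 0), numa_Deleter<u8>());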
#endif
#endif // #ifndef INCLUDED_NUMA

View File

@ -0,0 +1,14 @@
/**
* =========================================================================
* File : os_cpu.cpp
* Project : 0 A.D.
* Description : OS-specific support functions relating to CPU and memory
* =========================================================================
*/
// license: GPL; see lib/license.txt
#include "precompiled.h"
#include "os_cpu.h"
ERROR_ASSOCIATE(ERR::OS_CPU_RESTRICTED_AFFINITY, "Cannot set desired CPU affinity", -1);

source/lib/sysdep/os_cpu.h (new file, 117 lines added)
View File

@ -0,0 +1,117 @@
/**
* =========================================================================
* File : os_cpu.h
* Project : 0 A.D.
* Description : OS-specific support functions relating to CPU and memory
* =========================================================================
*/
// license: GPL; see lib/license.txt
#ifndef INCLUDED_OS_CPU
#define INCLUDED_OS_CPU
namespace ERR
{
const LibError OS_CPU_RESTRICTED_AFFINITY = -130100;
}
//-----------------------------------------------------------------------------
// processor topology
// processor ID = [0, os_cpu_NumProcessors())
// they are a numbering of the bits of the process affinity mask where the
// least significant nonzero bit corresponds to ID 0.
// rationale: this spares users from having to deal with noncontiguous IDs,
// e.g. when administrative tools are used to restrict process affinity.
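// example (illustrative): a process affinity mask of 0x0A exposes OS processors 1 and 3,
// so os_cpu_NumProcessors() is 2 and processor IDs 0 and 1 refer to OS processors 1 and 3.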
/**
* @return bit mask of processors that exist and are available to
* this process.
* its population count is by definition equal to os_cpu_NumProcessors().
**/
LIB_API uintptr_t os_cpu_ProcessorMask();
/**
* @return the number of processors available to this process.
*
* note: this function is necessary because POSIX sysconf _SC_NPROCESSORS_CONF
* is not supported on MacOSX, else we would use that.
**/
LIB_API size_t os_cpu_NumProcessors();
// note: we do not provide an os_cpu_CurrentProcessor routine. that would
// require Windows 2003 or a lot of work. worse, its results would be
// worthless because they may change immediately afterwards. instead,
// the recommended approach is to pin OpenMP threads (whose ID can be
// queried) to the processor with the same number.
//-----------------------------------------------------------------------------
// CPU and memory characteristics
/**
* @return a rough estimate of the CPU clock frequency.
* this is usually accurate to a few MHz and is faster than measurement loops.
**/
LIB_API double os_cpu_ClockFrequency();
/**
* @return the size [bytes] of a MMU page (4096 on most IA-32 systems)
**/
LIB_API size_t os_cpu_PageSize();
/**
* @return the size [bytes] of a large MMU page (4 MiB on most IA-32 systems)
* or zero if they are not supported.
**/
LIB_API size_t os_cpu_LargePageSize();
/**
* @return the size [bytes] of physical memory.
**/
LIB_API size_t os_cpu_MemorySize();
/**
* @return the size [bytes] of currently available memory.
**/
LIB_API size_t os_cpu_MemoryAvailable();
//-----------------------------------------------------------------------------
// scheduling
/**
* restrict the current thread to a set of processors.
* it will not be rescheduled until a subsequent os_cpu_SetThreadAffinity*.
*
* @param processorMask a bit mask of acceptable processors
* (bit index i corresponds to processor i)
* @return the previous mask
**/
LIB_API uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask);
/**
* restrict the current thread to a single processor.
* it will not be rescheduled until a subsequent os_cpu_SetThreadAffinity*.
**/
LIB_API void os_cpu_SetThreadAffinity(size_t processor);
/**
* called by os_cpu_CallByEachCPU.
* @param processor ID of processor running the current thread for the
* duration of this function.
* @param cbData user-specified data passed through os_cpu_CallByEachCPU.
**/
typedef void (*OsCpuCallback)(size_t processor, uintptr_t cbData);
/**
* execute the specified function once on each processor.
* this proceeds serially (the callback is never reentered) in increasing
* order of processor ID.
* fails if process affinity prevents running on all processors.
**/
LIB_API LibError os_cpu_CallByEachCPU(OsCpuCallback cb, uintptr_t cbData);
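// illustrative usage (not part of this diff):
//   static void LogProcessor(size_t processor, uintptr_t cbData)
//   {
//       (void)cbData;
//       printf("now running on processor %d\n", (int)processor);
//   }
//   os_cpu_CallByEachCPU(LogProcessor, 0);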
#endif // #ifndef INCLUDED_OS_CPU

View File

@ -1,6 +1,6 @@
#include "lib/self_test.h"
#include "lib/sysdep/ia32/ia32.h"
#include "lib/sysdep/x86_x64/x86_x64.h"
// note: ia32_i??_from_*, ia32_rint*, ia32_fm??f are all tested within
// sysdep to avoid test duplication (both the ia32 versions and
@ -12,17 +12,17 @@ public:
void test_rdtsc()
{
// must increase monotonously
const u64 c1 = ia32_rdtsc();
const u64 c2 = ia32_rdtsc();
const u64 c3 = ia32_rdtsc();
const u64 c1 = x86_x64_rdtsc();
const u64 c2 = x86_x64_rdtsc();
const u64 c3 = x86_x64_rdtsc();
TS_ASSERT(c1 < c2 && c2 < c3);
}
void test_ia32_cap()
{
// make sure the really common/basic caps end up reported as true
TS_ASSERT(ia32_cap(IA32_CAP_FPU));
TS_ASSERT(ia32_cap(IA32_CAP_TSC));
TS_ASSERT(ia32_cap(IA32_CAP_MMX));
TS_ASSERT(x86_x64_cap(X86_X64_CAP_FPU));
TS_ASSERT(x86_x64_cap(X86_X64_CAP_TSC));
TS_ASSERT(x86_x64_cap(X86_X64_CAP_MMX));
}
};

View File

@ -9,20 +9,62 @@
// license: GPL; see lib/license.txt
#include "precompiled.h"
#include "../cpu.h"
#include "lib/sysdep/os_cpu.h"
#include "win.h"
#include "lib/bits.h"
#include "lib/module_init.h"
#ifdef _OPENMP
# include <omp.h>
#endif
static LibError ReadFrequencyFromRegistry(DWORD* freqMhz)
uintptr_t os_cpu_ProcessorMask()
{
static uintptr_t processorMask;
if(!processorMask)
{
const HANDLE hProcess = GetCurrentProcess();
DWORD_PTR processAffinity, systemAffinity;
const BOOL ok = GetProcessAffinityMask(hProcess, &processAffinity, &systemAffinity);
debug_assert(ok);
processorMask = processAffinity;
}
return processorMask;
}
size_t os_cpu_NumProcessors()
{
static size_t numProcessors;
if(!numProcessors)
{
numProcessors = PopulationCount(os_cpu_ProcessorMask());
// sanity check
SYSTEM_INFO si;
GetSystemInfo(&si); // guaranteed to succeed
debug_assert(numProcessors <= (size_t)si.dwNumberOfProcessors);
}
return numProcessors;
}
//-----------------------------------------------------------------------------
static LibError ReadFrequencyFromRegistry(DWORD& freqMhz)
{
HKEY hKey;
if(RegOpenKeyEx(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0, KEY_QUERY_VALUE, &hKey) != ERROR_SUCCESS)
return ERR::NO_SYS;
DWORD size = sizeof(*freqMhz);
LONG ret = RegQueryValueEx(hKey, "~MHz", 0, 0, (LPBYTE)freqMhz, &size);
DWORD size = sizeof(freqMhz);
LONG ret = RegQueryValueEx(hKey, "~MHz", 0, 0, (LPBYTE)&freqMhz, &size);
RegCloseKey(hKey);
@ -32,95 +74,232 @@ static LibError ReadFrequencyFromRegistry(DWORD* freqMhz)
return INFO::OK;
}
double cpu_ClockFrequency()
double os_cpu_ClockFrequency()
{
DWORD freqMhz;
if(ReadFrequencyFromRegistry(&freqMhz) < 0)
return -1.0;
static double clockFrequency;
if(clockFrequency == 0.0)
{
DWORD freqMhz;
if(ReadFrequencyFromRegistry(freqMhz) == INFO::OK)
clockFrequency = freqMhz * 1e6;
else
clockFrequency = -1.0;
}
const double clockFrequency = freqMhz * 1e6;
return clockFrequency;
}
size_t cpu_NumProcessors()
size_t os_cpu_PageSize()
{
SYSTEM_INFO si;
GetSystemInfo(&si); // can't fail
const size_t numProcessors = (size_t)si.dwNumberOfProcessors;
return numProcessors;
static size_t systemPageSize;
if(!systemPageSize)
{
SYSTEM_INFO si;
GetSystemInfo(&si); // guaranteed to succeed
systemPageSize = (size_t)si.dwPageSize;
}
return systemPageSize;
}
size_t cpu_PageSize()
size_t os_cpu_LargePageSize()
{
SYSTEM_INFO si;
GetSystemInfo(&si); // can't fail
const size_t pageSize = (size_t)si.dwPageSize;
return pageSize;
static size_t largePageSize = ~(size_t)0; // "0" has special significance
if(largePageSize == ~(size_t)0)
{
typedef SIZE_T (WINAPI *PGetLargePageMinimum)(void);
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
const PGetLargePageMinimum pGetLargePageMinimum = (PGetLargePageMinimum)GetProcAddress(hKernel32, "GetLargePageMinimum");
if(pGetLargePageMinimum)
{
largePageSize = pGetLargePageMinimum();
debug_assert(largePageSize != 0); // IA-32 and AMD64 definitely support large pages
debug_assert(largePageSize > os_cpu_PageSize());
}
// no OS support for large pages
else
largePageSize = 0;
}
return largePageSize;
}
size_t cpu_MemorySize(CpuMemoryIndicators mem_type)
static void GetMemoryStatus(MEMORYSTATUSEX& mse)
{
// note: we no longer bother dynamically importing GlobalMemoryStatusEx -
// it's available on Win2k and above. this function safely handles
// systems with > 4 GB of memory.
MEMORYSTATUSEX mse = { sizeof(mse) };
BOOL ok = GlobalMemoryStatusEx(&mse);
mse.dwLength = sizeof(mse);
const BOOL ok = GlobalMemoryStatusEx(&mse);
WARN_IF_FALSE(ok);
}
if(mem_type == CPU_MEM_TOTAL)
size_t os_cpu_MemorySize()
{
static size_t memorySize;
if(memorySize == 0)
{
size_t memoryTotal = (size_t)mse.ullTotalPhys;
MEMORYSTATUSEX mse;
GetMemoryStatus(mse);
memorySize = (size_t)mse.ullTotalPhys;
// Richter, "Programming Applications for Windows": the reported
// value doesn't include non-paged pool reserved during boot;
// it's not considered available to the kernel. (the amount is
// 528 KiB on a 512 MiB WinXP/Win2k machine). we'll round up
// to the nearest megabyte to fix this.
memoryTotal = round_up(memoryTotal, 1*MiB);
return memoryTotal;
memorySize = round_up(memorySize, 1*MiB);
}
return memorySize;
}
size_t os_cpu_MemoryAvailable()
{
MEMORYSTATUSEX mse;
GetMemoryStatus(mse);
const size_t memoryAvailable = (size_t)mse.ullAvailPhys;
return memoryAvailable;
}
//-----------------------------------------------------------------------------
/**
* maximum number of processors supported by the OS (determined by the
* number of bits in an affinity mask)
**/
static const DWORD maxProcessorNumber = sizeof(DWORD_PTR)*CHAR_BIT-1;
DWORD_PTR wcpu_AffinityFromProcessorMask(DWORD_PTR processAffinity, uintptr_t processorMask)
{
DWORD_PTR affinity = 0;
size_t processor = (size_t)-1;
for(DWORD processorNumber = 0; processorNumber <= maxProcessorNumber; processorNumber++)
{
if(IsBitSet(processAffinity, processorNumber))
{
++processor; // now corresponds to processorNumber
if(IsBitSet(processorMask, processor))
affinity |= DWORD_PTR(1) << processorNumber;
}
}
return affinity;
}
uintptr_t wcpu_ProcessorMaskFromAffinity(DWORD_PTR processAffinity, DWORD_PTR affinity)
{
uintptr_t processorMask = 0;
size_t processor = (size_t)-1;
for(DWORD processorNumber = 0; processorNumber <= maxProcessorNumber; processorNumber++)
{
if(IsBitSet(processAffinity, processorNumber))
{
++processor; // now corresponds to processorNumber
if(IsBitSet(affinity, processorNumber))
processorMask |= uintptr_t(1) << processor;
}
}
return processorMask;
}
static const DWORD invalidProcessorNumber = (DWORD)-1;
static DWORD CurrentProcessorNumber()
{
typedef DWORD (WINAPI *PGetCurrentProcessorNumber)(void);
static PGetCurrentProcessorNumber pGetCurrentProcessorNumber;
static bool initialized;
if(!initialized)
{
initialized = true;
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
// note: NtGetCurrentProcessorNumber and RtlGetCurrentProcessorNumber aren't
// implemented on WinXP SP2, so we can't use those either.
pGetCurrentProcessorNumber = (PGetCurrentProcessorNumber)GetProcAddress(hKernel32, "GetCurrentProcessorNumber");
}
if(pGetCurrentProcessorNumber)
return pGetCurrentProcessorNumber();
else
{
const size_t memoryAvailable = (size_t)mse.ullAvailPhys;
return memoryAvailable;
// note: we won't bother mapping APIC IDs to processor numbers or
// using LSL to re-implement GetCurrentProcessorNumber because
// this routine is just a debug aid.
return invalidProcessorNumber;
}
}
LibError cpu_CallByEachCPU(CpuCallback cb, void* param)
uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask)
{
const HANDLE hProcess = GetCurrentProcess();
DWORD_PTR process_affinity, system_affinity;
if(!GetProcessAffinityMask(hProcess, &process_affinity, &system_affinity))
WARN_RETURN(ERR::FAIL);
// our affinity != system affinity: OS is limiting the CPUs that
// this process can run on. fail (cannot call back for each CPU).
if(process_affinity != system_affinity)
WARN_RETURN(ERR::CPU_RESTRICTED_AFFINITY);
debug_assert((processorMask >> os_cpu_NumProcessors()) == 0);
for(DWORD_PTR cpu_bit = 1; cpu_bit != 0 && cpu_bit <= process_affinity; cpu_bit *= 2)
DWORD_PTR processAffinity, systemAffinity;
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
debug_assert(ok);
const DWORD_PTR affinity = wcpu_AffinityFromProcessorMask(processAffinity, processorMask);
const DWORD_PTR previousAffinity = SetThreadAffinityMask(GetCurrentThread(), affinity);
debug_assert(previousAffinity != 0); // ensure function didn't fail
// hopefully reschedule our thread
Sleep(0);
// verify we're running on the correct processor
const DWORD currentProcessorNumber = CurrentProcessorNumber();
if(currentProcessorNumber != invalidProcessorNumber)
debug_assert(IsBitSet(affinity, currentProcessorNumber));
const uintptr_t previousProcessorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, previousAffinity);
return previousProcessorMask;
}
void os_cpu_SetThreadAffinity(size_t processor)
{
debug_assert(processor < os_cpu_NumProcessors());
const uintptr_t processorMask = uintptr_t(1) << processor;
(void)os_cpu_SetThreadAffinityMask(processorMask);
}
LibError os_cpu_CallByEachCPU(OsCpuCallback cb, uintptr_t cbData)
{
// ensure we are able to run on all system processors
DWORD_PTR processAffinity, systemAffinity;
{
// check if we can switch to target CPU
if(!(process_affinity & cpu_bit))
continue;
// .. and do so.
if(!SetThreadAffinityMask(GetCurrentThread(), cpu_bit))
{
WARN_ERR(ERR::CPU_RESTRICTED_AFFINITY);
continue;
}
// reschedule to make sure we switch CPUs.
Sleep(1);
cb(param);
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
debug_assert(ok);
if(processAffinity != systemAffinity)
WARN_RETURN(ERR::OS_CPU_RESTRICTED_AFFINITY);
}
// restore to original value
SetThreadAffinityMask(hProcess, process_affinity);
const uintptr_t previousAffinity = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
os_cpu_SetThreadAffinity(processor);
cb(processor, cbData);
}
(void)os_cpu_SetThreadAffinityMask(previousAffinity);
return INFO::OK;
}

View File

@ -0,0 +1,25 @@
/**
* =========================================================================
* File : wcpu.h
* Project : 0 A.D.
* Description : Windows backend of os_cpu
* =========================================================================
*/
// license: GPL; see lib/license.txt
#ifndef INCLUDED_WCPU
#define INCLUDED_WCPU
#include "win.h"
// "affinity" and "processorNumber" are what Windows sees.
// "processorMask" and "processor" are the idealized representation we expose
// to users. the latter insulates them from process affinity restrictions by
// defining IDs as indices of the nonzero bits within the process affinity.
// these routines are provided for the benefit of wnuma.
extern DWORD_PTR wcpu_AffinityFromProcessorMask(DWORD_PTR processAffinity, uintptr_t processorMask);
extern uintptr_t wcpu_ProcessorMaskFromAffinity(DWORD_PTR processAffinity, DWORD_PTR affinity);
#endif // #ifndef INCLUDED_WCPU

View File

@ -15,14 +15,9 @@
#include "lib/sysdep/win/win.h"
#include "lib/bits.h"
#if MSC_VERSION
# include <intrin.h>
# if !ICC_VERSION
# pragma intrinsic(__rdtsc)
# endif
#endif
#if ARCH_IA32
# include "lib/sysdep/ia32/ia32.h" // ia32_rdtsc
#if ARCH_IA32 || ARCH_AMD64
# include "lib/sysdep/x86_x64/x86_x64.h" // x86_x64_rdtsc
# include "lib/sysdep/x86_x64/topology.h"
#endif
@ -38,18 +33,18 @@ enum AmdPowerNowFlags
static bool IsThrottlingPossible()
{
#if ARCH_IA32
Ia32CpuidRegs regs;
switch(ia32_Vendor())
#if ARCH_IA32 || ARCH_AMD64
x86_x64_CpuidRegs regs;
switch(x86_x64_Vendor())
{
case IA32_VENDOR_INTEL:
if(ia32_cap(IA32_CAP_TM_SCC) || ia32_cap(IA32_CAP_EST))
case X86_X64_VENDOR_INTEL:
if(x86_x64_cap(X86_X64_CAP_TM_SCC) || x86_x64_cap(X86_X64_CAP_EST))
return true;
break;
case IA32_VENDOR_AMD:
case X86_X64_VENDOR_AMD:
regs.eax = 0x80000007;
if(ia32_cpuid(&regs))
if(x86_x64_cpuid(&regs))
{
if(regs.edx & (PN_FREQ_ID_CTRL|PN_SW_THERMAL_CTRL))
return true;
@ -57,9 +52,6 @@ static bool IsThrottlingPossible()
break;
}
return false;
#elif ARCH_AMD64
// not yet implemented - consider it unsafe.
return true;
#endif
}
@ -68,8 +60,8 @@ static bool IsThrottlingPossible()
LibError CounterTSC::Activate()
{
#if ARCH_IA32
if(!ia32_cap(IA32_CAP_TSC))
#if ARCH_IA32 || ARCH_AMD64
if(!x86_x64_cap(X86_X64_CAP_TSC))
return ERR::NO_SYS; // NOWARN (CPU doesn't support RDTSC)
#endif
@ -107,16 +99,16 @@ bool CounterTSC::IsSafe() const
if(cpu_NumPackages() != 1 || cpu_CoresPerPackage() != 1)
return false;
#if ARCH_IA32
#if ARCH_IA32 || ARCH_AMD64
// recent CPU:
if(ia32_Generation() >= 7)
if(x86_x64_Generation() >= 7)
{
// note: 8th generation CPUs support C1-clock ramping, which causes
// drift on multi-core systems, but those were excluded above.
Ia32CpuidRegs regs;
x86_x64_CpuidRegs regs;
regs.eax = 0x80000007;
if(ia32_cpuid(&regs))
if(x86_x64_cpuid(&regs))
{
// TSC is invariant WRT P-state, C-state and STPCLK => safe.
if(regs.edx & PN_INVARIANT_TSC)
@ -148,11 +140,7 @@ bool CounterTSC::IsSafe() const
u64 CounterTSC::Counter() const
{
#if MSC_VERSION
return __rdtsc();
#else
return ia32_rdtsc();
#endif
return x86_x64_rdtsc();
}
/**

View File

@ -0,0 +1,359 @@
#include "precompiled.h"
#include "lib/sysdep/numa.h"
#include "lib/bits.h" // round_up, PopulationCount
#include "lib/timer.h"
#include "lib/sysdep/os_cpu.h"
#include "win.h"
#include "wutil.h"
#include "wcpu.h"
#include <Psapi.h>
#ifdef _OPENMP
# include <omp.h>
#endif
//-----------------------------------------------------------------------------
// node topology
//-----------------------------------------------------------------------------
size_t numa_NumNodes()
{
static size_t numNodes;
if(!numNodes)
{
typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");
if(pGetNumaHighestNodeNumber)
{
ULONG highestNode;
const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
debug_assert(ok);
debug_assert(highestNode < os_cpu_NumProcessors()); // #nodes <= #processors
numNodes = highestNode+1;
}
// NUMA not supported
else
numNodes = 1;
}
return numNodes;
}
// note: it is easier to implement this in terms of numa_ProcessorMaskFromNode
// rather than the other way around because wcpu provides the
// wcpu_ProcessorMaskFromAffinity helper. there is no similar function to
// convert processor to processorNumber.
size_t numa_NodeFromProcessor(size_t processor)
{
debug_assert(processor < os_cpu_NumProcessors());
static std::vector<size_t> processorsNode;
#ifdef _OPENMP
#pragma omp critical
#endif
if(processorsNode.empty())
{
processorsNode.resize(os_cpu_NumProcessors(), 0);
for(size_t node = 0; node < numa_NumNodes(); node++)
{
const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
if(IsBitSet(processorMask, processor))
processorsNode[processor] = node;
}
}
}
return processorsNode.at(processor);
}
uintptr_t numa_ProcessorMaskFromNode(size_t node)
{
debug_assert(node < numa_NumNodes());
static std::vector<uintptr_t> nodesProcessorMask;
#ifdef _OPENMP
#pragma omp critical
#endif
if(nodesProcessorMask.empty())
{
typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");
if(pGetNumaNodeProcessorMask)
{
DWORD_PTR processAffinity, systemAffinity;
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
debug_assert(ok);
for(size_t node = 0; node < numa_NumNodes(); node++)
{
ULONGLONG affinity;
const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
debug_assert(ok);
const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
nodesProcessorMask.push_back(processorMask);
}
}
// NUMA not supported - consider node 0 to consist of all system processors
else
nodesProcessorMask.push_back(os_cpu_ProcessorMask());
}
return nodesProcessorMask.at(node);
}
//-----------------------------------------------------------------------------
// memory info
//-----------------------------------------------------------------------------
size_t numa_AvailableMemory(size_t node)
{
debug_assert(node < numa_NumNodes());
// note: it is said that GetNumaAvailableMemoryNode sometimes incorrectly
// reports zero bytes. the actual cause may however be unexpected
// RAM configuration, e.g. not all slots filled.
typedef BOOL (WINAPI *PGetNumaAvailableMemoryNode)(UCHAR node, PULONGLONG availableBytes);
static PGetNumaAvailableMemoryNode pGetNumaAvailableMemoryNode;
if(!pGetNumaAvailableMemoryNode)
{
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
pGetNumaAvailableMemoryNode = (PGetNumaAvailableMemoryNode)GetProcAddress(hKernel32, "GetNumaAvailableMemoryNode");
}
if(pGetNumaAvailableMemoryNode)
{
ULONGLONG availableBytes;
const BOOL ok = pGetNumaAvailableMemoryNode((UCHAR)node, &availableBytes);
debug_assert(ok);
return (size_t)availableBytes;
}
// NUMA not supported - return available system memory
else
return os_cpu_MemoryAvailable();
}
double numa_Factor()
{
static double factor;
static bool initialized;
#ifdef _OPENMP
#pragma omp critical
#endif
if(!initialized)
{
initialized = true;
// if non-NUMA, skip the (expensive) measurements below.
if(numa_NumNodes() == 1)
factor = 1.0;
else
{
// allocate memory on one node
const size_t size = 16*MiB;
shared_ptr<u8> buffer((u8*)numa_AllocateOnNode(size, 0), numa_Deleter<u8>());
const uintptr_t previousProcessorMask = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());
// measure min/max fill times required by a processor from each node
double minTime = 1e10, maxTime = 0.0;
for(size_t node = 0; node < numa_NumNodes(); node++)
{
const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
os_cpu_SetThreadAffinityMask(processorMask);
const double startTime = timer_Time();
memset(buffer.get(), 0, size);
const double elapsedTime = timer_Time() - startTime;
minTime = std::min(minTime, elapsedTime);
maxTime = std::max(maxTime, elapsedTime);
}
(void)os_cpu_SetThreadAffinityMask(previousProcessorMask);
factor = maxTime / minTime;
}
debug_assert(factor >= 1.0);
debug_assert(factor <= 3.0); // (Microsoft guideline for NUMA systems)
}
return factor;
}
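// usage sketch (illustrative; the 1.3 threshold is an arbitrary example value,
// not a measured guideline): callers can use the factor to decide whether
// node-local allocation is worth the extra bookkeeping.
//
//	const bool preferNodeLocal = (numa_NumNodes() > 1 && numa_Factor() > 1.3);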
//-----------------------------------------------------------------------------
// allocator
//-----------------------------------------------------------------------------
void* numa_Allocate(size_t size)
{
void* const mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
if(!mem)
throw std::bad_alloc();
return mem;
}
static bool largePageAllocationTookTooLong = false;
static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize, size_t node)
{
// can't, OS does not support large pages
if(os_cpu_LargePageSize() == 0)
return false;
// overrides
if(disposition == LPD_NEVER)
return false;
if(disposition == LPD_ALWAYS)
return true;
// default disposition: use a heuristic
{
// a previous attempt already took too long (Windows is apparently
// shoveling aside lots of memory).
if(largePageAllocationTookTooLong)
return false;
// allocation is rather small and would "only" use half of the
// TLBs for its pages.
if(allocationSize < 64/2 * os_cpu_PageSize())
return false;
// we want there to be plenty of memory available, otherwise the
// page frames are going to be terribly fragmented and even a
// single allocation would take SECONDS.
if(numa_AvailableMemory(node) < 2*GiB)
return false;
}
return true;
}
static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node)
{
typedef BOOL (WINAPI *PQueryWorkingSetEx)(HANDLE hProcess, PVOID buffer, DWORD bufferSize);
static PQueryWorkingSetEx pQueryWorkingSetEx;
if(!pQueryWorkingSetEx)
{
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
pQueryWorkingSetEx = (PQueryWorkingSetEx)GetProcAddress(hKernel32, "QueryWorkingSetEx");
if(!pQueryWorkingSetEx)
return true; // can't do anything
}
#if WINVER >= 0x600
// retrieve attributes of all pages constituting mem
const size_t numPages = (size + pageSize-1) / pageSize;
std::vector<PSAPI_WORKING_SET_EX_INFORMATION> wsi(numPages);
for(size_t i = 0; i < numPages; i++)
wsi[i].VirtualAddress = (u8*)mem + i*pageSize;
pQueryWorkingSetEx(GetCurrentProcess(), &wsi[0], (DWORD)(sizeof(PSAPI_WORKING_SET_EX_INFORMATION)*numPages));
// ensure each is valid and allocated on the correct node
for(size_t i = 0; i < numPages; i++)
{
const PSAPI_WORKING_SET_EX_BLOCK& attributes = wsi[i].VirtualAttributes;
if(!attributes.valid)
return false;
if(attributes.LargePage != (pageSize == os_cpu_LargePageSize()))
{
debug_printf("NUMA: is not a large page\n");
return false;
}
if(attributes.node != node)
{
debug_printf("NUMA: allocated from remote node\n");
return false;
}
}
#else
UNUSED2(mem);
UNUSED2(size);
UNUSED2(pageSize);
UNUSED2(node);
#endif
return true;
}
void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition, size_t* ppageSize)
{
debug_assert(node < numa_NumNodes());
// see if there will be enough memory (non-authoritative, for debug purposes only)
{
const size_t availableBytes = numa_AvailableMemory(node);
if(availableBytes < size)
debug_printf("NUMA: warning: node reports insufficient memory (%d vs %d)\n", availableBytes, size);
}
void* mem = 0;
size_t pageSize = 0;
// try allocating with large pages (reduces TLB misses)
if(ShouldUseLargePages(largePageDisposition, size, node))
{
const size_t largePageSize = os_cpu_LargePageSize();
const size_t paddedSize = round_up(size, largePageSize); // required by MEM_LARGE_PAGES
// note: this call can take SECONDS, which is why several checks are
// undertaken before we even try. these aren't authoritative, so we
// at least prevent future attempts if it takes too long.
const double startTime = timer_Time();
mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
pageSize = largePageSize;
const double elapsedTime = timer_Time() - startTime;
debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime);
if(elapsedTime > 1.0)
largePageAllocationTookTooLong = true;
}
// try (again) with regular pages
if(!mem)
{
mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
pageSize = os_cpu_PageSize();
}
// all attempts failed - we're apparently out of memory.
if(!mem)
throw std::bad_alloc();
// we can't use VirtualAllocExNuma - it's only available in Vista and Server 2008.
// workaround: fault in all pages now to ensure they are allocated from the
// current node, then verify page attributes.
// (note: VirtualAlloc's MEM_COMMIT only maps virtual pages and does not
// actually allocate page frames. Windows uses a first-touch heuristic -
// the page will be taken from the node whose processor caused the fault.)
memset(mem, 0, size);
VerifyPages(mem, size, pageSize, node);
if(ppageSize)
*ppageSize = pageSize;
return mem;
}
void numa_Deallocate(void* mem)
{
VirtualFree(mem, 0, MEM_RELEASE);
}
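// usage sketch (illustrative; <node> stands for the desired target node):
// since placement relies on first-touch from the calling thread (see note in
// numa_AllocateOnNode), the thread should already be running on that node.
// LPD_NEVER avoids the potentially slow large-page path.
//
//	(void)os_cpu_SetThreadAffinityMask(numa_ProcessorMaskFromNode(node));
//	size_t pageSize = 0;
//	void* buffer = numa_AllocateOnNode(16*MiB, node, LPD_NEVER, &pageSize);
//	// ... use buffer (its pages have already been faulted in) ...
//	numa_Deallocate(buffer);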

View File

@ -405,7 +405,7 @@ int aio_suspend(const struct aiocb* const cbs[], int n, const struct timespec* t
const BOOL waitAll = FALSE;
// convert timespec to milliseconds (ts == 0 => no timeout)
const DWORD timeout = ts? (DWORD)(ts->tv_sec*1000 + ts->tv_nsec/1000000) : INFINITE;
DWORD result = WaitForMultipleObjects(numPendingIos, hEvents, waitAll, timeout);
DWORD result = WaitForMultipleObjects((DWORD)numPendingIos, hEvents, waitAll, timeout);
for(size_t i = 0; i < numPendingIos; i++)
ResetEvent(hEvents[i]);

View File

@ -0,0 +1,442 @@
/**
* =========================================================================
* File : topology.cpp
* Project : 0 A.D.
* Description : detection of CPU and cache topology
* =========================================================================
*/
// license: GPL; see lib/license.txt
#include "precompiled.h"
#include "topology.h"
#include "lib/bits.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/os_cpu.h"
#include "x86_x64.h"
//-----------------------------------------------------------------------------
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
// logicalPerCore across all packages.
static size_t DetectCoresPerPackage()
{
x86_x64_CpuidRegs regs;
switch(x86_x64_Vendor())
{
case X86_X64_VENDOR_INTEL:
regs.eax = 4;
regs.ecx = 0;
if(x86_x64_cpuid(&regs))
return bits(regs.eax, 26, 31)+1;
break;
case X86_X64_VENDOR_AMD:
regs.eax = 0x80000008;
if(x86_x64_cpuid(&regs))
return bits(regs.ecx, 0, 7)+1;
break;
}
return 1; // else: the CPU is single-core.
}
static size_t CoresPerPackage()
{
static size_t coresPerPackage = 0;
if(!coresPerPackage)
coresPerPackage = DetectCoresPerPackage();
return coresPerPackage;
}
static bool IsHyperthreadingCapable()
{
// definitely not
if(!x86_x64_cap(X86_X64_CAP_HT))
return false;
// AMD N-core systems falsely set the HT bit for compatibility reasons
// (don't bother resetting it, might confuse callers)
if(x86_x64_Vendor() == X86_X64_VENDOR_AMD && x86_x64_cap(X86_X64_CAP_AMD_CMP_LEGACY))
return false;
return true;
}
static size_t DetectLogicalPerCore()
{
if(!IsHyperthreadingCapable())
return 1;
x86_x64_CpuidRegs regs;
regs.eax = 1;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const size_t logicalPerPackage = bits(regs.ebx, 16, 23);
// cores ought to be uniform WRT # logical processors
debug_assert(logicalPerPackage % CoresPerPackage() == 0);
return logicalPerPackage / CoresPerPackage();
}
static size_t LogicalPerCore()
{
static size_t logicalPerCore = 0;
if(!logicalPerCore)
logicalPerCore = DetectLogicalPerCore();
return logicalPerCore;
}
enum CacheType
{
CT_NONE = 0,
CT_DATA = 1,
CT_INSTRUCTION = 2,
CT_UNIFIED = 3
};
static bool IsL2DataCache(CacheType type, size_t level)
{
if(type != CT_DATA && type != CT_UNIFIED)
return false;
if(level != 2)
return false;
return true;
}
static size_t DetectLogicalPerCache()
{
// note: Intel Appnote 485 says the order in which caches are returned is
// undefined, so we need to loop through all of them.
for(u32 count = 0; ; count++)
{
x86_x64_CpuidRegs regs;
regs.eax = 4;
regs.ecx = count;
x86_x64_cpuid(&regs);
const CacheType type = (CacheType)bits(regs.eax, 0, 4);
// no more caches left
if(type == CT_NONE)
{
debug_assert(0); // we somehow didn't find the L2d
return 1;
}
const size_t level = bits(regs.eax, 5, 7);
if(IsL2DataCache(type, level))
{
const size_t logicalPerCache = bits(regs.eax, 14, 25)+1;
return logicalPerCache;
}
}
}
static size_t LogicalPerCache()
{
static size_t logicalPerCache;
if(!logicalPerCache)
logicalPerCache = DetectLogicalPerCache();
return logicalPerCache;
}
//-----------------------------------------------------------------------------
// the above functions give the maximum number of cores/logical units.
// however, some of them may actually be disabled by the BIOS!
// what we can do is to analyze the APIC IDs. they are allocated sequentially
// for all "processors". treating the IDs as variable-width bit fields
// (according to the number of cores/logical units present) allows
// determining the exact topology as well as number of packages.
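// worked example (hypothetical system): with LogicalPerCore() == 2 and
// CoresPerPackage() == 2, the logical field occupies bit 0, the core field
// bit 1 and the remaining upper bits select the package. APIC ID 5
// (binary 101) therefore denotes logical unit 1 on core 0 of package 1:
//
//	const size_t logicalBits = ceil_log2(2), coreBits = ceil_log2(2);	// 1 bit each
//	const u8 logical = u8(5)                & bit_mask<u8>(logicalBits);	// = 1
//	const u8 core    = u8(5 >> logicalBits) & bit_mask<u8>(coreBits);	// = 0
//	const u8 package = u8(5 >> (logicalBits+coreBits));	// = 1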
// these are set by DetectProcessorTopology.
static size_t numPackages = 0; // i.e. sockets; > 1 => true SMP system
static size_t enabledCoresPerPackage = 0;
static size_t enabledLogicalPerCore = 0; // hyperthreading units
typedef std::vector<u8> Ids;
// add the currently running processor's APIC ID to a list of IDs.
static void StoreApicId(size_t UNUSED(processor), uintptr_t cbData)
{
Ids* const apicIds = (Ids*)cbData;
apicIds->push_back(x86_x64_ApicId());
}
// if successful, apicIds[i] contains the unique ID of OS processor i.
static bool GatherApicIds(Ids& apicIds)
{
// old APIC (see x86_x64_ApicId for details)
if(x86_x64_Generation() < 8)
return false;
// process affinity prevents us from seeing all APIC IDs
if(PopulationCount(os_cpu_ProcessorMask()) != os_cpu_NumProcessors())
return false;
const LibError ret = os_cpu_CallByEachCPU(StoreApicId, (uintptr_t)&apicIds);
debug_assert(ret == INFO::OK);
// ensure we got a unique ID for every processor
{
Ids tmp(apicIds);
Ids::iterator end = tmp.end();
std::sort(tmp.begin(), end);
debug_assert(std::unique(tmp.begin(), end) == end);
debug_assert(std::distance(tmp.begin(), end) == (ptrdiff_t)os_cpu_NumProcessors());
}
return true;
}
typedef std::set<u8> IdSet;
/**
* "field" := a range of bits sufficient to represent <numValues> integers.
* for each id in <apicIds>: extract the value of the field starting at
* <offset> and insert it into <ids>. afterwards, adjust <offset> to the
* next field.
*
* used to gather e.g. all core IDs from all APIC IDs.
**/
static void ExtractFieldIntoSet(const Ids& apicIds, size_t& offset, size_t numValues, IdSet& ids)
{
const size_t numBits = ceil_log2(numValues);
if(numBits == 0)
return;
const u8 mask = bit_mask<u8>(numBits);
for(size_t i = 0; i < apicIds.size(); i++)
{
const u8 apicId = apicIds[i];
const u8 field = u8(apicId >> offset) & mask;
ids.insert(field);
}
offset += numBits;
}
static size_t numCaches = 0; // L2d
static std::vector<size_t> processorsCache;
static std::vector<uintptr_t> cachesProcessorMask;
class CacheManager
{
public:
void Add(u8 id, size_t processor)
{
SharedCache* cache = Find(id);
if(!cache)
{
m_caches.push_back(id);
cache = &m_caches.back();
}
cache->Add(processor);
}
void StoreProcessorMasks(std::vector<uintptr_t>& processorMasks)
{
processorMasks.resize(m_caches.size());
for(size_t i = 0; i < m_caches.size(); i++)
processorMasks[i] = m_caches[i].ProcessorMask();
}
private:
class SharedCache
{
public:
SharedCache(u8 id)
: m_id(id), m_processorMask(0)
{
}
bool Matches(u8 id) const
{
return m_id == id;
}
void Add(size_t processor)
{
m_processorMask |= uintptr_t(1) << processor;
}
uintptr_t ProcessorMask() const
{
return m_processorMask;
}
private:
u8 m_id;
uintptr_t m_processorMask;
};
SharedCache* Find(u8 id)
{
for(size_t i = 0; i < m_caches.size(); i++)
{
if(m_caches[i].Matches(id))
return &m_caches[i];
}
return 0;
}
std::vector<SharedCache> m_caches;
};
static void DetectCacheTopology(const Ids& apicIds)
{
const size_t numBits = ceil_log2(LogicalPerCache());
const u8 cacheIdMask = u8(0xFF << numBits);
CacheManager cacheManager;
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
const u8 apicId = apicIds[processor];
const u8 cacheId = apicId & cacheIdMask;
cacheManager.Add(cacheId, processor);
}
cacheManager.StoreProcessorMasks(cachesProcessorMask);
numCaches = cachesProcessorMask.size();
const size_t invalidCache = ~(size_t)0;
processorsCache.resize(os_cpu_NumProcessors(), invalidCache);
for(size_t cache = 0; cache < numCaches; cache++)
{
const uintptr_t processorMask = cachesProcessorMask[cache];
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
if(IsBitSet(processorMask, processor))
processorsCache[processor] = cache;
}
}
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
debug_assert(processorsCache[processor] != invalidCache);
debug_assert(processorsCache[processor] < numCaches);
}
}
// @return false if unavailable / no information can be returned.
static bool DetectProcessorTopologyViaApicIds()
{
Ids apicIds;
if(!GatherApicIds(apicIds))
return false;
// extract values from all 3 ID bit fields into separate sets
size_t offset = 0;
IdSet logicalIds;
ExtractFieldIntoSet(apicIds, offset, LogicalPerCore(), logicalIds);
IdSet coreIds;
ExtractFieldIntoSet(apicIds, offset, CoresPerPackage(), coreIds);
IdSet packageIds;
ExtractFieldIntoSet(apicIds, offset, 0xFF, packageIds);
numPackages = std::max(packageIds.size(), size_t(1));
enabledCoresPerPackage = std::max(coreIds .size(), size_t(1));
enabledLogicalPerCore = std::max(logicalIds.size(), size_t(1));
// note: cache ID possibly overlaps the other fields. we also want to
// retrieve more information (mappings between processor and cache ID),
// so this needs to be handled separately.
DetectCacheTopology(apicIds);
return true;
}
static void GuessProcessorTopologyViaOsCount()
{
const size_t numProcessors = os_cpu_NumProcessors();
// note: we cannot hope to always return correct results since disabled
// cores/logical units cannot be distinguished from the situation of the
// OS simply not reporting them as "processors". unfortunately this
// function won't always only be called for older (#core = #logical = 1)
// systems because DetectProcessorTopologyViaApicIds may fail due to
// lack of OS support. what we'll do is assume nothing is disabled; this
// is reasonable because we care most about #packages. it's fine to assume
// more cores (without inflating the total #processors) because that
// count only indicates memory barriers etc. ought to be used.
enabledCoresPerPackage = CoresPerPackage();
enabledLogicalPerCore = LogicalPerCore();
const size_t numPackagesTimesLogical = numProcessors / CoresPerPackage();
debug_assert(numPackagesTimesLogical != 0); // otherwise processors didn't include cores, which would be stupid
numPackages = numPackagesTimesLogical / LogicalPerCore();
if(!numPackages) // processors didn't include logical units (reasonable)
numPackages = numPackagesTimesLogical;
}
// determine how many CoresPerPackage and LogicalPerCore are
// actually enabled and also count numPackages.
static void DetectProcessorTopology()
{
// authoritative, but requires OS support and fairly recent CPUs
if(DetectProcessorTopologyViaApicIds())
return; // success, we're done.
GuessProcessorTopologyViaOsCount();
}
size_t cpu_NumPackages()
{
if(!numPackages)
DetectProcessorTopology();
return numPackages;
}
size_t cpu_CoresPerPackage()
{
if(!enabledCoresPerPackage)
DetectProcessorTopology();
return enabledCoresPerPackage;
}
size_t cpu_LogicalPerCore()
{
if(!enabledLogicalPerCore)
DetectProcessorTopology();
return enabledLogicalPerCore;
}
size_t cpu_NumCaches()
{
if(!numCaches)
DetectProcessorTopology();
return numCaches;
}
size_t cpu_CacheFromProcessor(size_t processor)
{
debug_assert(processor < os_cpu_NumProcessors());
DetectProcessorTopology();
return processorsCache.at(processor);
}
uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
{
debug_assert(cache < cpu_NumCaches());
DetectProcessorTopology();
return cachesProcessorMask.at(cache);
}
// note: Windows 2003 GetLogicalProcessorInformation returns incorrect
// information, claiming all cores in an Intel Core2 Quad processor
// share an L2 cache.
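// usage sketch (illustrative; the helper name is hypothetical): bind worker
// <worker> to the processors sharing one L2 cache, so that workers meant to
// cooperate on the same data can be given the same cache index.
static void ExampleBindWorkerToCache(size_t worker)
{
const size_t numCaches = cpu_NumCaches();
if(numCaches == 0)	// cache topology unknown (e.g. APIC IDs unavailable)
return;
const uintptr_t processorMask = cpu_ProcessorMaskFromCache(worker % numCaches);
(void)os_cpu_SetThreadAffinityMask(processorMask);
}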

View File

@ -0,0 +1,54 @@
/**
* =========================================================================
* File : topology.h
* Project : 0 A.D.
* Description : detection of CPU and cache topology
* =========================================================================
*/
// license: GPL; see lib/license.txt
#ifndef INCLUDED_TOPOLOGY
#define INCLUDED_TOPOLOGY
// OSes report hyperthreading units and cores as "processors". we need to
// drill down and find out the exact counts (for thread pool dimensioning
// and cache sharing considerations).
/**
* @return number of *enabled* CPU packages / sockets.
**/
LIB_API size_t cpu_NumPackages();
/**
* @return number of *enabled* CPU cores per package.
* (2 on dual-core systems)
**/
LIB_API size_t cpu_CoresPerPackage();
/**
* @return number of *enabled* hyperthreading units per core.
* (2 on P4 EE)
**/
LIB_API size_t cpu_LogicalPerCore();
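/**
* usage sketch (illustrative): dimension a worker pool with one thread per
* enabled core, deliberately ignoring hyperthreading units.
*
*	const size_t numWorkers = cpu_NumPackages() * cpu_CoresPerPackage();
**/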
//-----------------------------------------------------------------------------
// L2 cache
/**
* @return number of distinct L2 caches
**/
LIB_API size_t cpu_NumCaches();
/**
* @return L2 cache number (zero-based) to which <processor> belongs.
**/
LIB_API size_t cpu_CacheFromProcessor(size_t processor);
/**
* @return bit-mask of all processors sharing <cache>.
**/
LIB_API uintptr_t cpu_ProcessorMaskFromCache(size_t cache);
#endif // #ifndef INCLUDED_TOPOLOGY

View File

@ -0,0 +1,505 @@
/**
* =========================================================================
* File : x86_x64.cpp
* Project : 0 A.D.
* Description : CPU-specific routines common to 32 and 64-bit x86
* =========================================================================
*/
// license: GPL; see lib/license.txt
#include "precompiled.h"
#include "x86_x64.h"
#include <string.h>
#include <stdio.h>
#include <vector>
#include <set>
#include <algorithm>
#include "lib/posix/posix.h" // pthread
#include "lib/bits.h"
#include "lib/timer.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/os_cpu.h"
#if ARCH_IA32
# include "../ia32/ia32_asm.h"
#else
#include "../amd64/amd64_asm.h"
# endif
#if MSC_VERSION
# include <intrin.h>
#elif GCC_VERSION
#else
# error compiler not supported
#endif
// note: unfortunately the MSC __cpuid intrinsic does not allow passing
// additional inputs (e.g. ecx = count), so we need to implement this
// in assembly for both IA-32 and AMD64.
static void cpuid_impl(x86_x64_CpuidRegs* regs)
{
#if ARCH_IA32
ia32_asm_cpuid(regs);
#else
amd64_asm_cpuid(regs);
#endif
}
bool x86_x64_cpuid(x86_x64_CpuidRegs* regs)
{
static u32 maxFunction;
static u32 maxExtendedFunction;
if(!maxFunction)
{
x86_x64_CpuidRegs regs2;
regs2.eax = 0;
cpuid_impl(&regs2);
maxFunction = regs2.eax;
regs2.eax = 0x80000000;
cpuid_impl(&regs2);
maxExtendedFunction = regs2.eax;
}
const u32 function = regs->eax;
if(function > maxExtendedFunction)
return false;
if(function < 0x80000000 && function > maxFunction)
return false;
cpuid_impl(regs);
return true;
}
//-----------------------------------------------------------------------------
// capability bits
static void DetectFeatureFlags(u32 caps[4])
{
x86_x64_CpuidRegs regs;
regs.eax = 1;
if(x86_x64_cpuid(&regs))
{
caps[0] = regs.ecx;
caps[1] = regs.edx;
}
regs.eax = 0x80000001;
if(x86_x64_cpuid(&regs))
{
caps[2] = regs.ecx;
caps[3] = regs.edx;
}
}
bool x86_x64_cap(x86_x64_Cap cap)
{
// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
// keep in sync with enum x86_x64_Cap!
static u32 x86_x64_caps[4];
// (since relevant CPUs will surely advertise at least one standard flag,
// they are zero iff we haven't been initialized yet)
if(!x86_x64_caps[1])
DetectFeatureFlags(x86_x64_caps);
const size_t tbl_idx = cap >> 5;
const size_t bit_idx = cap & 0x1f;
if(tbl_idx > 3)
{
DEBUG_WARN_ERR(ERR::INVALID_PARAM);
return false;
}
return (x86_x64_caps[tbl_idx] & BIT(bit_idx)) != 0;
}
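// worked example: X86_X64_CAP_SSE2 is defined as 32+26, hence
//	tbl_idx = (32+26) >> 5   = 1	// the standard-edx word
//	bit_idx = (32+26) & 0x1f = 26
// i.e. the query tests bit 26 of the edx value returned by CPUID function 1.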
//-----------------------------------------------------------------------------
// CPU identification
static x86_x64_Vendors DetectVendor()
{
x86_x64_CpuidRegs regs;
regs.eax = 0;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
// copy regs to string
// note: 'strange' ebx,edx,ecx reg order is due to ModR/M encoding order.
char vendor_str[13];
u32* vendor_str_u32 = (u32*)vendor_str;
vendor_str_u32[0] = regs.ebx;
vendor_str_u32[1] = regs.edx;
vendor_str_u32[2] = regs.ecx;
vendor_str[12] = '\0'; // 0-terminate
if(!strcmp(vendor_str, "AuthenticAMD"))
return X86_X64_VENDOR_AMD;
else if(!strcmp(vendor_str, "GenuineIntel"))
return X86_X64_VENDOR_INTEL;
else
{
DEBUG_WARN_ERR(ERR::CPU_UNKNOWN_VENDOR);
return X86_X64_VENDOR_UNKNOWN;
}
}
x86_x64_Vendors x86_x64_Vendor()
{
static x86_x64_Vendors vendor = X86_X64_VENDOR_UNKNOWN;
if(vendor == X86_X64_VENDOR_UNKNOWN)
vendor = DetectVendor();
return vendor;
}
static void DetectSignature(size_t* model, size_t* family)
{
x86_x64_CpuidRegs regs;
regs.eax = 1;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
*model = bits(regs.eax, 4, 7);
*family = bits(regs.eax, 8, 11);
}
static size_t DetectGeneration()
{
size_t model, family;
DetectSignature(&model, &family);
switch(x86_x64_Vendor())
{
case X86_X64_VENDOR_AMD:
switch(family)
{
case 5:
if(model < 6)
return 5; // K5
else
return 6; // K6
case 6:
return 7; // K7 (Athlon)
case 0xF:
return 8; // K8 (Opteron)
}
break;
case X86_X64_VENDOR_INTEL:
switch(family)
{
case 5:
return 5; // Pentium
case 6:
if(model <= 0xD)
return 6; // Pentium Pro/II/III/M
else
return 8; // Core2Duo
case 0xF:
if(model <= 6)
return 7; // Pentium 4/D
}
break;
}
debug_assert(0); // unknown CPU generation
return family;
}
size_t x86_x64_Generation()
{
static size_t generation;
if(!generation)
generation = DetectGeneration();
return generation;
}
//-----------------------------------------------------------------------------
// identifier string
/// functor to remove substrings from the CPU identifier string
class StringStripper
{
char* m_string;
size_t m_max_chars;
public:
StringStripper(char* string, size_t max_chars)
: m_string(string), m_max_chars(max_chars)
{
}
// remove all instances of substring from m_string
void operator()(const char* substring)
{
const size_t substring_length = strlen(substring);
for(;;)
{
char* substring_pos = strstr(m_string, substring);
if(!substring_pos)
break;
const size_t substring_ofs = substring_pos - m_string;
const size_t num_chars = m_max_chars - substring_ofs - substring_length;
memmove(substring_pos, substring_pos+substring_length, num_chars);
}
}
};
static void DetectIdentifierString(char* identifierString, size_t maxChars)
{
// get brand string (if available)
char* pos = identifierString;
bool have_brand_string = true;
for(u32 function = 0x80000002; function <= 0x80000004; function++)
{
x86_x64_CpuidRegs regs;
regs.eax = function;
have_brand_string &= x86_x64_cpuid(&regs);
memcpy(pos, &regs, 16);
pos += 16;
}
// fall back to manual detect of CPU type because either:
// - CPU doesn't support brand string (we use a flag to indicate this
// rather than comparing against a default value because it is safer);
// - the brand string is useless, e.g. "Unknown". this happens on
// some older boards whose BIOS reprograms the string for CPUs it
// doesn't recognize.
if(!have_brand_string || strncmp(identifierString, "Unknow", 6) == 0)
{
size_t model, family;
DetectSignature(&model, &family);
switch(x86_x64_Vendor())
{
case X86_X64_VENDOR_AMD:
// everything else is either too old, or should have a brand string.
if(family == 6)
{
if(model == 3 || model == 7)
strcpy_s(identifierString, maxChars, "AMD Duron");
else if(model <= 5)
strcpy_s(identifierString, maxChars, "AMD Athlon");
else
{
if(x86_x64_cap(X86_X64_CAP_AMD_MP))
strcpy_s(identifierString, maxChars, "AMD Athlon MP");
else
strcpy_s(identifierString, maxChars, "AMD Athlon XP");
}
}
break;
case X86_X64_VENDOR_INTEL:
// everything else is either too old, or should have a brand string.
if(family == 6)
{
if(model == 1)
strcpy_s(identifierString, maxChars, "Intel Pentium Pro");
else if(model == 3 || model == 5)
strcpy_s(identifierString, maxChars, "Intel Pentium II");
else if(model == 6)
strcpy_s(identifierString, maxChars, "Intel Celeron");
else
strcpy_s(identifierString, maxChars, "Intel Pentium III");
}
break;
}
}
// identifierString already holds a valid brand string; pretty it up.
else
{
const char* const undesired_strings[] = { "(tm)", "(TM)", "(R)", "CPU " };
std::for_each(undesired_strings, undesired_strings+ARRAY_SIZE(undesired_strings),
StringStripper(identifierString, strlen(identifierString)+1));
// note: Intel brand strings include a frequency, but we can't rely
// on it because the CPU may be overclocked. we'll leave it in the
// string to show measurement accuracy and if SpeedStep is active.
}
}
const char* cpu_IdentifierString()
{
// 3 calls x 4 registers x 4 bytes = 48
static char identifierString[48+1] = {'\0'};
if(identifierString[0] == '\0')
DetectIdentifierString(identifierString, ARRAY_SIZE(identifierString));
return identifierString;
}
//-----------------------------------------------------------------------------
// CPU frequency
// set scheduling priority and restore when going out of scope.
class ScopedSetPriority
{
int m_old_policy;
sched_param m_old_param;
public:
ScopedSetPriority(int new_priority)
{
// get current scheduling policy and priority
pthread_getschedparam(pthread_self(), &m_old_policy, &m_old_param);
// set new priority
sched_param new_param = {0};
new_param.sched_priority = new_priority;
pthread_setschedparam(pthread_self(), SCHED_FIFO, &new_param);
}
~ScopedSetPriority()
{
// restore previous policy and priority.
pthread_setschedparam(pthread_self(), m_old_policy, &m_old_param);
}
};
// note: this function uses timer.cpp!timer_Time, which is implemented via
// whrt.cpp on Windows, which again calls x86_x64_Init. be careful that
// this function isn't called from there as well, else WHRT will be used
// before its init completes.
double cpu_ClockFrequency()
{
// if the TSC isn't available, there's really no good way to count the
// actual CPU clocks per known time interval, so bail.
// note: loop iterations ("bogomips") are not a reliable measure due
// to differing IPC and compiler optimizations.
if(!x86_x64_cap(X86_X64_CAP_TSC))
return -1.0; // impossible value
// increase priority to reduce interference while measuring.
const int priority = sched_get_priority_max(SCHED_FIFO)-1;
ScopedSetPriority ssp(priority);
// note: no need to "warm up" cpuid - it will already have been
// called several times by the time this code is reached.
// (background: it's used in x86_x64_rdtsc() to serialize instruction flow;
// the first call is documented to be slower on Intel CPUs)
int num_samples = 16;
// if the clock is low-res, take fewer samples so this doesn't take too long.
// balance measuring time (~ 10 ms) against accuracy (< 0.1% error -
// ok for using the TSC as a time reference)
if(timer_Resolution() >= 1e-3)
num_samples = 8;
std::vector<double> samples(num_samples);
for(int i = 0; i < num_samples; i++)
{
double dt;
i64 dc; // i64 because VC6 can't convert u64 -> double,
// and we don't need all 64 bits.
// count # of clocks in max{1 tick, 1 ms}:
// .. wait for start of tick.
const double t0 = timer_Time();
u64 c1; double t1;
do
{
// note: timer_Time effectively has a long delay (up to 5 us)
// before returning the time. we call it before x86_x64_rdtsc to
// minimize the delay between actually sampling time / TSC,
// thus decreasing the chance for interference.
// (if unavoidable background activity, e.g. interrupts,
// delays the second reading, inaccuracy is introduced).
t1 = timer_Time();
c1 = x86_x64_rdtsc();
}
while(t1 == t0);
// .. wait until start of next tick and at least 1 ms elapsed.
do
{
const double t2 = timer_Time();
const u64 c2 = x86_x64_rdtsc();
dc = (i64)(c2 - c1);
dt = t2 - t1;
}
while(dt < 1e-3);
// .. freq = (delta_clocks) / (delta_seconds);
// x86_x64_rdtsc/timer overhead is negligible.
const double freq = dc / dt;
samples[i] = freq;
}
std::sort(samples.begin(), samples.end());
// median filter (remove upper and lower 25% and average the rest).
// note: don't just take the lowest value! it could conceivably be
// too low, if background processing delays reading c1 (see above).
double sum = 0.0;
const int lo = num_samples/4, hi = 3*num_samples/4;
for(int i = lo; i < hi; i++)
sum += samples[i];
const double clock_frequency = sum / (hi-lo);
return clock_frequency;
}
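// usage sketch (illustrative): convert a TSC delta into seconds. this is only
// meaningful if the measurement above succeeded (result > 0) and the TSC is
// deemed safe on this system.
//
//	const double clockFrequency = cpu_ClockFrequency();
//	if(clockFrequency > 0.0)
//	{
//		const u64 c0 = x86_x64_rdtsc();
//		// ... code being timed ...
//		const u64 c1 = x86_x64_rdtsc();
//		const double elapsedSeconds = (double)(i64)(c1 - c0) / clockFrequency;
//	}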
//-----------------------------------------------------------------------------
// misc stateless functions
u8 x86_x64_ApicId()
{
x86_x64_CpuidRegs regs;
regs.eax = 1;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const u8 apicId = (u8)bits(regs.ebx, 24, 31);
return apicId;
}
u64 x86_x64_rdtsc()
{
#if MSC_VERSION
return (u64)__rdtsc();
#elif GCC_VERSION
// GCC supports "portable" assembly for both x86 and x64
volatile u32 lo, hi;
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
return u64_from_u32(hi, lo);
#endif
}
void x86_x64_DebugBreak()
{
#if MSC_VERSION
__debugbreak();
#elif GCC_VERSION
// note: this probably isn't necessary, since unix_debug_break
// (SIGTRAP) is most probably available if GCC_VERSION.
// we include it for completeness, though.
__asm__ __volatile__ ("int $3");
#endif
}
// enforce strong memory ordering.
void cpu_MemoryFence()
{
if(x86_x64_cap(X86_X64_CAP_SSE2))
_mm_mfence();
}
void cpu_Serialize()
{
x86_x64_CpuidRegs regs;
regs.eax = 1;
x86_x64_cpuid(&regs); // CPUID serializes execution.
}

View File

@ -0,0 +1,125 @@
/**
* =========================================================================
* File : x86_x64.h
* Project : 0 A.D.
* Description : CPU-specific routines common to 32 and 64-bit x86
* =========================================================================
*/
// license: GPL; see lib/license.txt
#ifndef INCLUDED_X86_X64
#define INCLUDED_X86_X64
#if !ARCH_IA32 && !ARCH_AMD64
#error "including x86_x64.h without ARCH_IA32=1 or ARCH_AMD64=1"
#endif
/**
* registers used/returned by x86_x64_cpuid
**/
struct x86_x64_CpuidRegs
{
u32 eax;
u32 ebx;
u32 ecx;
u32 edx;
};
/**
* invoke CPUID instruction.
* @param regs input/output registers.
* regs->eax must be set to the desired function.
* some functions (e.g. 4) require regs->ecx to be set as well.
* rationale: this interface (input/output structure vs. function parameters)
* avoids unnecessary copying/initialization if some inputs aren't needed
* and allows graceful expansion to functions that require further inputs.
* @return true on success or false if the sub-function isn't supported.
**/
extern bool x86_x64_cpuid(x86_x64_CpuidRegs* regs);
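/**
* usage sketch (illustrative): query the deterministic cache parameters
* (function 4) of the second cache reported. ecx must be set because this
* function takes a sub-leaf index.
*
*	x86_x64_CpuidRegs regs;
*	regs.eax = 4;
*	regs.ecx = 1;	// cache index
*	if(x86_x64_cpuid(&regs))
*	{
*		// bits 0..4 of eax give the cache type (0 = no more caches)
*	}
**/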
/**
* CPU vendor.
* (this is exposed because some CPUID functions are vendor-specific.)
* (an enum is easier to compare than the original string values.)
**/
enum x86_x64_Vendors
{
X86_X64_VENDOR_UNKNOWN,
X86_X64_VENDOR_INTEL,
X86_X64_VENDOR_AMD,
};
LIB_API x86_x64_Vendors x86_x64_Vendor();
/**
* @return the colloquial processor generation
* (5 = Pentium, 6 = Pentium Pro/II/III / K6, 7 = Pentium4 / Athlon, 8 = Core / Opteron)
**/
LIB_API size_t x86_x64_Generation();
/**
* bit indices of CPU capability flags (128 bits).
* values are defined by IA-32 CPUID feature flags - do not change!
**/
enum x86_x64_Cap
{
// standard (ecx) - currently only defined by Intel
X86_X64_CAP_SSE3 = 0+0, // Streaming SIMD Extensions 3
X86_X64_CAP_EST = 0+7, // Enhanced Speedstep Technology
// standard (edx)
X86_X64_CAP_FPU = 32+0, // Floating Point Unit
X86_X64_CAP_TSC = 32+4, // TimeStamp Counter
X86_X64_CAP_CMOV = 32+15, // Conditional MOVe
X86_X64_CAP_TM_SCC = 32+22, // Thermal Monitoring and Software Controlled Clock
X86_X64_CAP_MMX = 32+23, // MultiMedia eXtensions
X86_X64_CAP_SSE = 32+25, // Streaming SIMD Extensions
X86_X64_CAP_SSE2 = 32+26, // Streaming SIMD Extensions 2
X86_X64_CAP_HT = 32+28, // HyperThreading
// extended (ecx)
X86_X64_CAP_AMD_CMP_LEGACY = 64+1, // N-core and X86_X64_CAP_HT is falsely set
// extended (edx)
X86_X64_CAP_AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
X86_X64_CAP_AMD_MMX_EXT = 96+22,
X86_X64_CAP_AMD_3DNOW_PRO = 96+30,
X86_X64_CAP_AMD_3DNOW = 96+31
};
/**
* @return whether the CPU supports the indicated x86_x64_Cap / feature flag.
**/
LIB_API bool x86_x64_cap(x86_x64_Cap cap);
//-----------------------------------------------------------------------------
// stateless
/**
* @return APIC ID of the currently executing processor.
*
* the implementation uses CPUID.1 and only works on >= 8th generation CPUs;
* (P4/Athlon XP); otherwise it returns 0. the alternative of accessing the
* APIC mmio registers is not feasible - mahaf_MapPhysicalMemory only works
* reliably on WinXP. also, the OS already has the APIC registers mapped and
* in constant use, and we don't want to interfere.
**/
LIB_API u8 x86_x64_ApicId();
/**
* @return the current value of the TimeStampCounter (a counter of
* CPU cycles since power-on, which is useful for high-resolution timing
* but potentially differs between multiple CPUs)
**/
LIB_API u64 x86_x64_rdtsc();
/**
* trigger a breakpoint inside this function when it is called.
**/
LIB_API void x86_x64_DebugBreak(void);
#endif // #ifndef INCLUDED_X86_X64

View File

@ -186,7 +186,7 @@ class TestMultithread : public CxxTest::TestSuite
break;
case TA_SLEEP:
usleep(sleep_duration_ms*1000);
usleep(useconds_t(sleep_duration_ms*1000));
break;
default:

View File

@ -36,7 +36,7 @@ public:
if(x == 1) ones++;
if(x == 2) twos++;
}
TS_ASSERT_EQUALS(ones+twos, 100);
TS_ASSERT_EQUALS(ones+twos, size_t(100));
TS_ASSERT(ones > 10 && twos > 10);
}
};

View File

@ -25,8 +25,8 @@
# include <unistd.h>
#endif
#include "lib/config2.h" // CONFIG2_TIMER_ALLOW_RDTSC
#if ARCH_IA32 && CONFIG2_TIMER_ALLOW_RDTSC
# include "lib/sysdep/ia32/ia32.h" // ia32_rdtsc
#if (ARCH_IA32 || ARCH_AMD64) && CONFIG2_TIMER_ALLOW_RDTSC
# include "lib/sysdep/x86_x64/x86_x64.h" // x86_x64_rdtsc
#endif
#if OS_UNIX || OS_WIN
@ -177,7 +177,7 @@ void TimerUnit::SetToZero()
void TimerUnit::SetFromTimer()
{
m_ticks = ia32_rdtsc();
m_ticks = x86_x64_rdtsc();
}
void TimerUnit::AddDifference(TimerUnit t0, TimerUnit t1)

View File

@ -8,7 +8,8 @@
#include "lib/allocators/shared_ptr.h"
#include "lib/sysdep/gfx.h"
#include "lib/sysdep/snd.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/os_cpu.h"
#include "lib/sysdep/x86_x64/topology.h"
#include "lib/tex/tex.h"
#include "lib/file/io/io_align.h" // BLOCK_SIZE
@ -87,7 +88,7 @@ void WriteSystemInfo()
fprintf(f, "\n");
// memory
fprintf(f, "Memory : %lu MiB; %lu MiB free\n", cpu_MemorySize(CPU_MEM_TOTAL)/MiB, cpu_MemorySize(CPU_MEM_AVAILABLE)/MiB);
fprintf(f, "Memory : %lu MiB; %lu MiB free\n", os_cpu_MemorySize()/MiB, os_cpu_MemoryAvailable()/MiB);
// graphics
fprintf(f, "Graphics Card : %s\n", gfx_card);