
# fix race conditions in ModuleInit and related cleanup.

CAS: uintptr_t->intptr_t to allow use of both cpu_CAS and cpu_AtomicAdd
topology: remove non-thread-safe caching, expose ApicIds, use ModuleInit
x86_x64: use ModuleInit instead of unsafe static flags; zero-init regs
instead of just setting ecx
ModuleInitState now holds the LibError returned by the init callback (so
that the second init doesn't appear to succeed despite the first
failing)
wnuma: cleanup, add ACPI SLIT relative distance detection

This was SVN commit r7741.
janwas 2010-07-12 12:57:58 +00:00
parent b1bdb2b1f2
commit 98770fa4cc
31 changed files with 649 additions and 590 deletions

View File

@ -107,7 +107,7 @@ void matrix_free(void** matrix)
// allocator optimized for single instances
//-----------------------------------------------------------------------------
void* single_calloc(void* storage, volatile uintptr_t* in_use_flag, size_t size)
void* single_calloc(void* storage, volatile intptr_t* in_use_flag, size_t size)
{
// sanity check
debug_assert(*in_use_flag == 0 || *in_use_flag == 1);
@ -126,7 +126,7 @@ void* single_calloc(void* storage, volatile uintptr_t* in_use_flag, size_t size)
}
void single_free(void* storage, volatile uintptr_t* in_use_flag, void* p)
void single_free(void* storage, volatile intptr_t* in_use_flag, void* p)
{
// sanity check
debug_assert(*in_use_flag == 0 || *in_use_flag == 1);

View File

@ -150,7 +150,7 @@ extern void matrix_free(void** matrix);
* @return allocated memory (typically = <storage>, but falls back to
* malloc if that's in-use), or 0 (with warning) if out of memory.
**/
extern void* single_calloc(void* storage, volatile uintptr_t* in_use_flag, size_t size);
extern void* single_calloc(void* storage, volatile intptr_t* in_use_flag, size_t size);
/**
* Free a memory block that had been allocated by single_calloc.
@ -159,7 +159,7 @@ extern void* single_calloc(void* storage, volatile uintptr_t* in_use_flag, size_
* @param in_use_flag Exact value passed to single_calloc.
* @param p Exact value returned by single_calloc.
**/
extern void single_free(void* storage, volatile uintptr_t* in_use_flag, void* p);
extern void single_free(void* storage, volatile intptr_t* in_use_flag, void* p);
#ifdef __cplusplus
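
For reference, a minimal usage sketch of the updated signatures (hypothetical caller, not part of this commit):

// hypothetical single-instance wrapper; the flag must now be intptr_t
// so that cpu_CAS can guard the fast path.
struct Blob { char data[256]; };
static Blob s_storage;
static volatile intptr_t s_inUse;	// 0 = free, 1 = in use

static Blob* AllocateBlob()
{
	// returns &s_storage on the fast path, or heap memory if it is taken
	return (Blob*)single_calloc(&s_storage, &s_inUse, sizeof(Blob));
}

static void FreeBlob(Blob* p)
{
	single_free(&s_storage, &s_inUse, p);
}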

View File

@ -202,7 +202,7 @@ LibError debug_WriteCrashlog(const wchar_t* text)
// initializing local static objects from constants may happen when
// this is first called, which isn't thread-safe. (see C++ 6.7.4)
cassert(IDLE == 0);
static volatile uintptr_t state;
static volatile intptr_t state;
if(!cpu_CAS(&state, IDLE, BUSY))
return ERR::REENTERED; // NOWARN
@ -500,7 +500,7 @@ enum SkipStatus
{
INVALID, VALID, BUSY
};
static uintptr_t skipStatus = INVALID; // cpu_CAS requires uintptr_t
static intptr_t skipStatus = INVALID; // cpu_CAS requires intptr_t
static LibError errorToSkip;
static size_t numSkipped;

View File

@ -149,7 +149,7 @@ private:
VfsSubdirectories m_subdirectories;
PRealDirectory m_realDirectory;
volatile uintptr_t m_shouldPopulate; // (cpu_CAS can't be used on bool)
volatile intptr_t m_shouldPopulate; // (cpu_CAS can't be used on bool)
};

View File

@ -707,21 +707,13 @@ LibError lfh_erase(LFHash* hash, uintptr_t key)
//-----------------------------------------------------------------------------
static ModuleInitState initState;
void lockfree_Init()
{
if(!ModuleShouldInitialize(&initState))
return;
tls_init();
}
void lockfree_Shutdown()
{
if(!ModuleShouldShutdown(&initState))
return;
smr_shutdown();
}

View File

@ -232,10 +232,10 @@ public:
}
private:
static const uintptr_t S_REFCNT = (~0u) >> 1; // 0x7F..F
static const uintptr_t S_EXCLUSIVE = S_REFCNT+1u; // 0x80..0
static const intptr_t S_REFCNT = (~0u) >> 1; // 0x7F..F
static const intptr_t S_EXCLUSIVE = S_REFCNT+1u; // 0x80..0
volatile uintptr_t m_status;
volatile intptr_t m_status;
};
#endif // #ifndef INCLUDED_LOCKFREE

View File

@ -27,64 +27,64 @@
#include "precompiled.h"
#include "lib/module_init.h"
#include "lib/sysdep/cpu.h" // cpu_CAS, cpu_AtomicAdd
#include "lib/sysdep/cpu.h" // cpu_CAS
// notes:
// - value must be 0 to allow users to just define uninitialized static
// variables (they don't have access to our MODULE_* symbols)
// - unlike expected in-game operation, the self-tests require repeated
// sequences of init/shutdown pairs. we therefore allow this in general
// (resetting back to MODULE_UNINITIALIZED after shutdown) because
// there's no real disadvantage other than loss of strictness.
static const ModuleInitState MODULE_UNINITIALIZED = 0u;
// (1..N = reference count)
static const ModuleInitState MODULE_ERROR = ~(uintptr_t)1u;
// not yet initialized, or already shutdown
static const ModuleInitState UNINITIALIZED = 0; // value documented in header
// running user callback - concurrent ModuleInit callers must spin
static const ModuleInitState BUSY = INFO::ALREADY_EXISTS; // never returned
// init succeeded; allow shutdown
static const ModuleInitState INITIALIZED = INFO::SKIPPED;
bool ModuleShouldInitialize(volatile ModuleInitState* pInitState)
LibError ModuleInit(volatile ModuleInitState* initState, LibError (*init)())
{
// currently uninitialized, so give the green light.
if(cpu_CAS(pInitState, MODULE_UNINITIALIZED, 1))
return true;
for(;;)
{
if(cpu_CAS(initState, UNINITIALIZED, BUSY))
{
LibError ret = init();
*initState = (ret == INFO::OK)? INITIALIZED : ret;
cpu_MemoryBarrier();
return ret;
}
// increment reference count - unless already in a final state.
retry:
ModuleInitState latchedInitState = *pInitState;
if(latchedInitState == MODULE_ERROR)
return false;
if(!cpu_CAS(pInitState, latchedInitState, latchedInitState+1))
goto retry;
return false;
const ModuleInitState latchedInitState = *initState;
if(latchedInitState == UNINITIALIZED || latchedInitState == BUSY)
{
_mm_pause();
continue;
}
debug_assert(latchedInitState == INITIALIZED || latchedInitState < 0);
return (LibError)latchedInitState;
}
}
bool ModuleShouldShutdown(volatile ModuleInitState* pInitState)
LibError ModuleShutdown(volatile ModuleInitState* initState, void (*shutdown)())
{
// decrement reference count - unless already in a final state.
retry:
ModuleInitState latchedInitState = *pInitState;
if(latchedInitState == MODULE_UNINITIALIZED || latchedInitState == MODULE_ERROR)
return false;
if(!cpu_CAS(pInitState, latchedInitState, latchedInitState-1))
goto retry;
for(;;)
{
if(cpu_CAS(initState, INITIALIZED, BUSY))
{
shutdown();
*initState = UNINITIALIZED;
cpu_MemoryBarrier();
return INFO::OK;
}
// refcount reached zero => allow shutdown.
if(latchedInitState-1 == MODULE_UNINITIALIZED)
return true;
const ModuleInitState latchedInitState = *initState;
if(latchedInitState == INITIALIZED || latchedInitState == BUSY)
{
_mm_pause();
continue;
}
return false;
}
void ModuleSetError(volatile ModuleInitState* pInitState)
{
*pInitState = MODULE_ERROR;
}
bool ModuleIsError(volatile ModuleInitState* pInitState)
{
return (*pInitState == MODULE_ERROR);
if(latchedInitState == UNINITIALIZED)
return INFO::SKIPPED;
debug_assert(latchedInitState < 0);
return (LibError)latchedInitState;
}
}

View File

@ -28,45 +28,47 @@
#define INCLUDED_MODULE_INIT
/**
* initialization state of a module: class, source file, or whatever.
*
* can be declared as a static variable => no initializer needed,
* since 0 is the correct initial value.
*
* DO NOT change the value directly! (that'd break the carefully thought-out
* lock-free implementation)
* initialization state of a module (class, source file, etc.)
* must be initialized to zero (e.g. by defining as a static variable).
* DO NOT change the value!
**/
typedef uintptr_t ModuleInitState; // uintptr_t required by cpu_CAS
typedef intptr_t ModuleInitState; // intptr_t is required by cpu_CAS
/**
* @return whether initialization should go forward, i.e. initState is
* currently MODULE_UNINITIALIZED. increments initState afterwards.
* calls a user-defined init function if initState is zero.
*
* (the reason for this function - and tricky part - is thread-safety)
**/
extern bool ModuleShouldInitialize(volatile ModuleInitState* initState);
/**
* if module reference count is valid, decrement it.
* @return whether shutdown should go forward, i.e. this is the last
* shutdown call.
**/
extern bool ModuleShouldShutdown(volatile ModuleInitState* initState);
/**
* indicate the module is unusable, e.g. due to failure during init.
* all subsequent ModuleShouldInitialize/ModuleShouldShutdown calls
* for this initState will return false.
**/
extern void ModuleSetError(volatile ModuleInitState* initState);
/**
* @return whether the module is in the failure state, i.e. ModuleSetError
* was previously called on the same initState.
* @return INFO::SKIPPED if already initialized, a LibError if the
* previous invocation failed, or the value returned by the callback.
*
* this function is provided so that modules can report init failure to
* the second or later caller.
* postcondition: initState is "initialized" if the callback returned
* INFO::OK, otherwise its LibError return value (which prevents
* shutdown from being called).
*
* thread-safe: subsequent callers spin until the callback returns
* (this prevents using partially-initialized modules)
*
* note that callbacks typically reference static data and thus do not
* require a function argument, but that can later be added if necessary.
**/
extern bool ModuleIsError(volatile ModuleInitState* initState);
LIB_API LibError ModuleInit(volatile ModuleInitState* initState, LibError (*init)());
/**
* calls a user-defined shutdown function if initState is "initialized".
*
* @return INFO::OK if shutdown occurred, INFO::SKIPPED if initState was
* zero (uninitialized), otherwise the LibError returned by ModuleInit.
*
* postcondition: initState remains set to the LibError, or has been
* reset to zero to allow multiple init/shutdown pairs, e.g. in self-tests.
*
* note: there is no provision for reference-counting because that
* turns out to be problematic (a user might call shutdown immediately
* after init; if this is the first use of the module, it will
* be shutdown prematurely, which is at least inefficient and
* possibly dangerous). instead, shutdown should only be called when
* cleanup is necessary (e.g. at exit before leak reporting) and
* it is certain that the module is no longer in use.
**/
LIB_API LibError ModuleShutdown(volatile ModuleInitState* initState, void (*shutdown)());
#endif // #ifndef INCLUDED_MODULE_INIT
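
For reference, the intended usage pattern (it mirrors the h_mgr and mahaf changes below; the module_* names are placeholders):

static ModuleInitState initState;	// zero-initialized static

static LibError Init()
{
	// acquire resources; any error returned here is latched in initState
	return INFO::OK;
}

static void Shutdown()
{
	// release resources
}

void module_Init()
{
	// concurrent callers spin until Init has returned (or failed)
	ModuleInit(&initState, Init);
}

void module_Shutdown()
{
	// no-op unless ModuleInit previously succeeded
	ModuleShutdown(&initState, Shutdown);
}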

View File

@ -805,18 +805,13 @@ int h_get_refcnt(Handle h)
static ModuleInitState initState;
void h_mgr_init()
static LibError Init()
{
if(!ModuleShouldInitialize(&initState))
return;
return INFO::OK;
}
void h_mgr_shutdown()
static void Shutdown()
{
if(!ModuleShouldShutdown(&initState))
return;
debug_printf(L"H_MGR| shutdown. any handle frees after this are leaks!\n");
// forcibly close all open handles
@ -853,3 +848,14 @@ void h_mgr_shutdown()
pages[j] = 0;
}
}
void h_mgr_init()
{
ModuleInit(&initState, Init);
}
void h_mgr_shutdown()
{
ModuleShutdown(&initState, Shutdown);
}

View File

@ -472,40 +472,40 @@ static size_t al_src_cap = AL_SRC_MAX;
// note: to catch double-free bugs and ensure all sources are
// released at exit, we segregate them into free and used lists.
static uintptr_t al_srcs_used[AL_SRC_MAX];
static uintptr_t al_srcs_free[AL_SRC_MAX];
static intptr_t al_srcs_used[AL_SRC_MAX];
static intptr_t al_srcs_free[AL_SRC_MAX];
// total number of sources allocated by al_src_init
static size_t al_src_allocated;
static void srcs_insert(uintptr_t* srcs, ALuint al_src)
static void srcs_insert(volatile intptr_t* srcs, ALuint al_src)
{
for(size_t i = 0; i < al_src_allocated; i++)
{
if(cpu_CAS(&srcs[i], 0, (uintptr_t)al_src))
if(cpu_CAS(&srcs[i], 0, (intptr_t)al_src))
return;
}
debug_assert(0); // list full (can't happen)
}
static void srcs_remove(uintptr_t* srcs, ALuint al_src)
static void srcs_remove(volatile intptr_t* srcs, ALuint al_src)
{
for(size_t i = 0; i < al_src_allocated; i++)
{
if(cpu_CAS(&srcs[i], (uintptr_t)al_src, 0))
if(cpu_CAS(&srcs[i], (intptr_t)al_src, 0))
return;
}
debug_assert(0); // source not found (can't happen)
}
// @return first nonzero entry (which is then zeroed), or zero if there are none.
static ALuint srcs_pop(uintptr_t* srcs)
static ALuint srcs_pop(volatile intptr_t* srcs)
{
for(size_t i = 0; i < al_src_allocated; i++)
{
retry:
uintptr_t al_src = srcs[i];
intptr_t al_src = srcs[i];
cpu_MemoryBarrier();
if(!cpu_CAS(&srcs[i], al_src, 0))
goto retry;
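
For illustration, an allocation helper would typically move a source between the two lists roughly as follows (a sketch, not part of this patch):

// sketch: take a source from the free list and record it in the used list
// so that double-frees and leaks at exit can be detected.
static ALuint al_src_alloc_sketch()
{
	const ALuint al_src = srcs_pop(al_srcs_free);
	if(al_src == 0)
		return 0;	// all sources are in use
	srcs_insert(al_srcs_used, al_src);
	return al_src;
}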

View File

@ -45,9 +45,9 @@ void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size)
#if MSC_VERSION
bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t newValue)
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
{
const uintptr_t initial = _InterlockedCompareExchange64((volatile __int64*)location, newValue, expected);
const intptr_t initial = _InterlockedCompareExchange64((volatile __int64*)location, newValue, expected);
return initial == expected;
}
@ -63,7 +63,7 @@ void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
amd64_AtomicAdd(location, increment);
}
bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t newValue)
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
{
return amd64_CAS(location, expected, newValue) ? true : false;
}

View File

@ -34,7 +34,7 @@ extern "C" {
struct x86_x64_CpuidRegs;
extern void CALL_CONV amd64_asm_cpuid(x86_x64_CpuidRegs* reg);
extern intptr_t CALL_CONV amd64_CAS(volatile uintptr_t *location, uintptr_t expected, uintptr_t newValue);
extern intptr_t CALL_CONV amd64_CAS(volatile intptr_t *location, intptr_t expected, intptr_t newValue);
extern void CALL_CONV amd64_AtomicAdd(volatile intptr_t *location, intptr_t increment);

View File

@ -153,7 +153,7 @@ void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
}
bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value)
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value)
{
return ia32_asm_CAS(location, expected, new_value);
}

View File

@ -35,7 +35,7 @@ struct x86_x64_CpuidRegs;
extern void CALL_CONV ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
extern void CALL_CONV ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
extern bool CALL_CONV ia32_asm_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
extern bool CALL_CONV ia32_asm_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value);
/// control87
// FPU control word

View File

@ -30,6 +30,7 @@
#include <set>
#include "lib/bits.h"
#include "lib/module_init.h"
#include "lib/sysdep/cpu.h" // ERR::CPU_FEATURE_MISSING
#include "lib/sysdep/os_cpu.h"
#include "lib/sysdep/arch/x86_x64/x86_x64.h"
@ -40,93 +41,77 @@
// note: some of them may be disabled by the OS or BIOS.
// note: Intel Appnote 485 assures us that they are uniform across packages.
static size_t CoresPerPackage()
static size_t MaxCoresPerPackage()
{
static size_t coresPerPackage = 0;
// assume single-core unless one of the following applies:
size_t maxCoresPerPackage = 1;
if(!coresPerPackage)
x86_x64_CpuidRegs regs;
switch(x86_x64_Vendor())
{
coresPerPackage = 1; // it's single core unless one of the following applies:
case X86_X64_VENDOR_INTEL:
regs.eax = 4;
regs.ecx = 0;
if(x86_x64_cpuid(&regs))
maxCoresPerPackage = bits(regs.eax, 26, 31)+1;
break;
case X86_X64_VENDOR_AMD:
regs.eax = 0x80000008;
regs.ecx = 0;
if(x86_x64_cpuid(&regs))
maxCoresPerPackage = bits(regs.ecx, 0, 7)+1;
break;
}
return maxCoresPerPackage;
}
static size_t MaxLogicalPerCore()
{
struct IsHyperthreadingCapable
{
bool operator()() const
{
// definitely not
if(!x86_x64_cap(X86_X64_CAP_HT))
return false;
// AMD N-core systems falsely set the HT bit for compatibility reasons
// (don't bother resetting it, might confuse callers)
if(x86_x64_Vendor() == X86_X64_VENDOR_AMD && x86_x64_cap(X86_X64_CAP_AMD_CMP_LEGACY))
return false;
return true;
}
};
if(IsHyperthreadingCapable()())
{
x86_x64_CpuidRegs regs;
switch(x86_x64_Vendor())
{
case X86_X64_VENDOR_INTEL:
regs.eax = 4;
regs.ecx = 0;
if(x86_x64_cpuid(&regs))
coresPerPackage = bits(regs.eax, 26, 31)+1;
break;
case X86_X64_VENDOR_AMD:
regs.eax = 0x80000008;
regs.ecx = 0;
if(x86_x64_cpuid(&regs))
coresPerPackage = bits(regs.ecx, 0, 7)+1;
break;
}
regs.eax = 1;
regs.ecx = 0;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const size_t logicalPerPackage = bits(regs.ebx, 16, 23);
const size_t maxCoresPerPackage = MaxCoresPerPackage();
// cores ought to be uniform WRT # logical processors
debug_assert(logicalPerPackage % maxCoresPerPackage == 0);
const size_t maxLogicalPerCore = logicalPerPackage / maxCoresPerPackage;
return maxLogicalPerCore;
}
return coresPerPackage;
else
return 1;
}
static size_t LogicalPerCore()
static size_t MaxLogicalPerCache()
{
static size_t logicalPerCore = 0;
if(!logicalPerCore)
{
struct IsHyperthreadingCapable
{
bool operator()() const
{
// definitely not
if(!x86_x64_cap(X86_X64_CAP_HT))
return false;
// AMD N-core systems falsely set the HT bit for compatibility reasons
// (don't bother resetting it, might confuse callers)
if(x86_x64_Vendor() == X86_X64_VENDOR_AMD && x86_x64_cap(X86_X64_CAP_AMD_CMP_LEGACY))
return false;
return true;
}
};
if(!IsHyperthreadingCapable()())
logicalPerCore = 1;
else
{
x86_x64_CpuidRegs regs;
regs.eax = 1;
regs.ecx = 0;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const size_t logicalPerPackage = bits(regs.ebx, 16, 23);
// cores ought to be uniform WRT # logical processors
debug_assert(logicalPerPackage % CoresPerPackage() == 0);
logicalPerCore = logicalPerPackage / CoresPerPackage();
}
}
return logicalPerCore;
}
static size_t LogicalPerCache()
{
static size_t logicalPerCache;
if(!logicalPerCache)
{
const x86_x64_Cache* const dcache = x86_x64_DCache();
if(dcache->levels < 2)
logicalPerCache = 1; // default
else
logicalPerCache = dcache->parameters[1].sharedBy;
}
return logicalPerCache;
const x86_x64_Cache* const dcache = x86_x64_DCache();
if(dcache->levels >= 2)
return dcache->parameters[1].sharedBy;
else
return 1; // default
}
@ -138,49 +123,46 @@ static size_t LogicalPerCache()
// the exact topology; otherwise we have to guess.
// side effect: `removes' (via std::unique) duplicate IDs.
static bool AreApicIdsUnique(u8* apicIds, size_t numProcessors)
static bool AreApicIdsUnique(u8* apicIds, size_t numIds)
{
u8* const end = std::unique(apicIds, apicIds+numProcessors);
const size_t numIds = end-apicIds;
if(numIds == numProcessors) // all unique
u8* const end = std::unique(apicIds, apicIds+numIds);
const size_t numUnique = end-apicIds;
if(numUnique == numIds) // all unique
return true;
// the only legitimate cause of duplication is when no xAPIC is
// present (i.e. all are 0)
debug_assert(numIds == 1);
debug_assert(numUnique == 1);
debug_assert(apicIds[0] == 0);
return false;
}
/**
* @return an array of the processors' unique APIC IDs or zero if
* no xAPIC is present or process affinity is limited.
**/
static const u8* ApicIds()
static u8 apicIdStorage[os_cpu_MaxProcessors];
static const u8* apicIds;
static LibError InitApicIds()
{
const u8* const uninitialized = (const u8*)1;
static const u8* apicIds = uninitialized;
if(apicIds == uninitialized)
// store each processor's APIC ID in turn
struct StoreApicId
{
apicIds = 0; // return zero from now on unless everything below succeeds
// store each processor's APIC ID in turn
static u8 apicIdStorage[os_cpu_MaxProcessors];
struct StoreApicId
static void Callback(size_t processor, uintptr_t UNUSED(cbData))
{
static void Callback(size_t processor, uintptr_t UNUSED(cbData))
{
apicIdStorage[processor] = x86_x64_ApicId();
}
};
if(os_cpu_CallByEachCPU(StoreApicId::Callback, (uintptr_t)&apicIds) == INFO::OK)
{
if(AreApicIdsUnique(apicIdStorage, os_cpu_NumProcessors()))
apicIds = apicIdStorage; // return valid array from now on
apicIdStorage[processor] = x86_x64_ApicId();
}
};
if(os_cpu_CallByEachCPU(StoreApicId::Callback, (uintptr_t)&apicIds) == INFO::OK)
{
if(AreApicIdsUnique(apicIdStorage, os_cpu_NumProcessors()))
apicIds = apicIdStorage; // return valid array from now on
}
return INFO::OK;
}
const u8* ApicIds()
{
static ModuleInitState initState;
ModuleInit(&initState, InitApicIds);
return apicIds;
}
@ -228,7 +210,7 @@ static size_t NumPackages(const u8* apicIds)
{
if(apicIds)
{
const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
const size_t offset = ceil_log2(MaxCoresPerPackage()) + ceil_log2(MaxLogicalPerCore());
return NumUniqueValuesInField(apicIds, offset, 256);
}
else
@ -244,13 +226,13 @@ static size_t NumPackages(const u8* apicIds)
// OS support or restricted process affinity).
// assume cores are enabled and count as processors.
const size_t numPackagesTimesLogical = os_cpu_NumProcessors() / CoresPerPackage();
const size_t numPackagesTimesLogical = os_cpu_NumProcessors() / MaxCoresPerPackage();
debug_assert(numPackagesTimesLogical != 0);
// assume hyperthreads are enabled.
size_t numPackages = numPackagesTimesLogical;
// if they are reported as processors, remove them from the count.
if(numPackages > LogicalPerCore())
numPackages /= LogicalPerCore();
if(numPackages > MaxLogicalPerCore())
numPackages /= MaxLogicalPerCore();
return numPackages;
}
}
@ -260,13 +242,13 @@ static size_t CoresPerPackage(const u8* apicIds)
{
if(apicIds)
{
const size_t offset = ceil_log2(LogicalPerCore());
return NumUniqueValuesInField(apicIds, offset, CoresPerPackage());
const size_t offset = ceil_log2(MaxLogicalPerCore());
return NumUniqueValuesInField(apicIds, offset, MaxCoresPerPackage());
}
else
{
// guess (must match NumPackages's assumptions)
return CoresPerPackage();
return MaxCoresPerPackage();
}
}
@ -276,12 +258,12 @@ static size_t LogicalPerCore(const u8* apicIds)
if(apicIds)
{
const size_t offset = 0;
return NumUniqueValuesInField(apicIds, offset, LogicalPerCore());
return NumUniqueValuesInField(apicIds, offset, MaxLogicalPerCore());
}
else
{
// guess (must match NumPackages's assumptions)
return LogicalPerCore();
return MaxLogicalPerCore();
}
}
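
To make the APIC ID layout concrete, here is a sketch of decomposing one ID into its fields, assuming the Max* helpers above (not part of this commit):

// sketch: an APIC ID packs logical (SMT), core and package indices into
// contiguous bit fields whose widths are the ceil_log2 of the per-level maxima.
static void DecomposeApicId(u8 apicId, size_t& logical, size_t& core, size_t& package)
{
	const size_t logicalBits = ceil_log2(MaxLogicalPerCore());
	const size_t coreBits = ceil_log2(MaxCoresPerPackage());
	logical = apicId & ((size_t(1) << logicalBits) - 1);	// SMT sibling
	core = (apicId >> logicalBits) & ((size_t(1) << coreBits) - 1);	// core within package
	package = apicId >> (logicalBits + coreBits);	// remaining high bits
}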
@ -295,20 +277,22 @@ struct CpuTopology // POD
size_t coresPerPackage;
size_t logicalPerCore;
};
static CpuTopology cpuTopology;
static LibError InitCpuTopology()
{
const u8* apicIds = ApicIds();
cpuTopology.numPackages = NumPackages(apicIds);
cpuTopology.coresPerPackage = CoresPerPackage(apicIds);
cpuTopology.logicalPerCore = LogicalPerCore(apicIds);
return INFO::OK;
}
const CpuTopology* cpu_topology_Detect()
{
static CpuTopology topology;
if(!topology.numPackages)
{
const u8* apicIds = ApicIds();
topology.numPackages = NumPackages(apicIds);
topology.coresPerPackage = CoresPerPackage(apicIds);
topology.logicalPerCore = LogicalPerCore(apicIds);
}
return &topology;
static ModuleInitState initState;
ModuleInit(&initState, InitCpuTopology);
return &cpuTopology;
}
size_t cpu_topology_NumPackages(const CpuTopology* topology)
@ -334,46 +318,36 @@ size_t cpu_topology_LogicalPerCore(const CpuTopology* topology)
// functionality but returns incorrect results. (it claims all cores in
// an Intel Core2 Quad processor share a single L2 cache.)
static size_t NumCaches(const u8* apicIds)
{
if(apicIds)
{
const size_t numBits = ceil_log2(LogicalPerCache());
const u8 mask = u8((0xFF << numBits) & 0xFF);
return NumUniqueMaskedValues(apicIds, mask);
}
else
{
// assume each processor has its own cache
return os_cpu_NumProcessors();
}
}
class CacheRelations
{
public:
/**
* add processor to the processor mask owned by cache identified by <id>
**/
void Add(u8 id, size_t processor)
void Add(u8 cacheId, size_t processor)
{
SharedCache* cache = Find(id);
SharedCache* cache = Find(cacheId);
if(!cache)
{
m_caches.push_back(id);
m_caches.push_back(cacheId);
cache = &m_caches.back();
}
cache->Add(processor);
}
size_t NumCaches() const
{
return m_caches.size();
}
/**
* store topology in an array (one entry per cache) of masks
* representing the processors that share a cache.
**/
void StoreProcessorMasks(uintptr_t* processorMasks)
void StoreProcessorMasks(uintptr_t* cachesProcessorMask)
{
for(size_t i = 0; i < m_caches.size(); i++)
processorMasks[i] = m_caches[i].ProcessorMask();
for(size_t i = 0; i < NumCaches(); i++)
cachesProcessorMask[i] = m_caches[i].ProcessorMask();
}
private:
@ -383,14 +357,14 @@ private:
class SharedCache
{
public:
SharedCache(u8 id)
: m_id(id), m_processorMask(0)
SharedCache(u8 cacheId)
: m_cacheId(cacheId), m_processorMask(0)
{
}
bool Matches(u8 id) const
{
return m_id == id;
return m_cacheId == id;
}
void Add(size_t processor)
@ -404,15 +378,15 @@ private:
}
private:
u8 m_id;
u8 m_cacheId;
uintptr_t m_processorMask;
};
SharedCache* Find(u8 id)
SharedCache* Find(u8 cacheId)
{
for(size_t i = 0; i < m_caches.size(); i++)
{
if(m_caches[i].Matches(id))
if(m_caches[i].Matches(cacheId))
return &m_caches[i];
}
@ -422,38 +396,42 @@ private:
std::vector<SharedCache> m_caches;
};
static void DetermineCachesProcessorMask(const u8* apicIds, uintptr_t* cachesProcessorMask)
static void DetermineCachesProcessorMask(const u8* apicIds, uintptr_t* cachesProcessorMask, size_t& numCaches)
{
CacheRelations cacheRelations;
if(apicIds)
{
const size_t numBits = ceil_log2(LogicalPerCache());
const u8 cacheIdMask = u8(0xFF << numBits);
CacheRelations cacheRelations;
const size_t numBits = ceil_log2(MaxLogicalPerCache());
const u8 cacheIdMask = u8((0xFF << numBits) & 0xFF);
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
const u8 apicId = apicIds[processor];
const u8 cacheId = u8(apicId & cacheIdMask);
cacheRelations.Add(cacheId, processor);
}
cacheRelations.StoreProcessorMasks(cachesProcessorMask);
}
else
{
// assume each processor has exactly one cache with matching IDs
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
cachesProcessorMask[processor] = uintptr_t(1) << processor;
{
// assume each processor has exactly one cache with matching IDs
const u8 cacheId = (u8)processor;
cacheRelations.Add(cacheId, processor);
}
}
numCaches = cacheRelations.NumCaches();
cacheRelations.StoreProcessorMasks(cachesProcessorMask);
}
static void DetermineProcessorsCache(size_t numCaches, const uintptr_t* cachesProcessorMask, size_t* processorsCache)
static void DetermineProcessorsCache(const uintptr_t* cachesProcessorMask, size_t numCaches, size_t* processorsCache, size_t numProcessors)
{
for(size_t cache = 0; cache < numCaches; cache++)
{
// write to all entries that share this cache
const uintptr_t processorMask = cachesProcessorMask[cache];
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
for(size_t processor = 0; processor < numProcessors; processor++)
{
if(IsBitSet(processorMask, processor))
{
@ -474,20 +452,21 @@ struct CacheTopology // POD
size_t processorsCache[os_cpu_MaxProcessors];
uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
};
static CacheTopology cacheTopology;
static LibError InitCacheTopology()
{
const u8* apicIds = ApicIds();
DetermineCachesProcessorMask(apicIds, cacheTopology.cachesProcessorMask, cacheTopology.numCaches);
DetermineProcessorsCache(cacheTopology.cachesProcessorMask, cacheTopology.numCaches, cacheTopology.processorsCache, os_cpu_NumProcessors());
return INFO::OK;
}
const CacheTopology* cache_topology_Detect()
{
static CacheTopology topology;
if(!topology.numCaches)
{
const u8* apicIds = ApicIds();
topology.numCaches = NumCaches(apicIds);
DetermineCachesProcessorMask(apicIds, topology.cachesProcessorMask);
DetermineProcessorsCache(topology.numCaches, topology.cachesProcessorMask, topology.processorsCache);
}
return &topology;
static ModuleInitState initState;
ModuleInit(&initState, InitCacheTopology);
return &cacheTopology;
}
size_t cache_topology_NumCaches(const CacheTopology* topology)

View File

@ -37,6 +37,15 @@
// the various threading packages (Boost, OpenMP, POSIX, Win32, ..)
/**
* @return a pointer to array (up to os_cpu_MaxProcessors entries;
* os_cpu_NumProcessors() of them are valid) of the processors'
* unique APIC IDs or zero if no xAPIC is present or
* process affinity is restricted.
**/
LIB_API const u8* ApicIds();
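
A hedged usage sketch (hypothetical caller):

static void DumpApicIds()	// hypothetical
{
	const u8* apicIds = ApicIds();
	if(!apicIds)	// no xAPIC, or affinity restricted
		return;
	for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
		debug_printf(L"processor %d: APIC ID %d\n", (int)processor, (int)apicIds[processor]);
}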
//-----------------------------------------------------------------------------
// cpu
@ -54,8 +63,6 @@ struct CpuTopology;
* initialize static storage from which topology can be retrieved by
* means of the following functions.
* @return const pointer to a shared instance.
*
* WARNING: this function must not be reentered before it has returned once.
**/
LIB_API const CpuTopology* cpu_topology_Detect();

View File

@ -36,6 +36,7 @@
#include "lib/posix/posix.h" // pthread
#include "lib/bits.h"
#include "lib/timer.h"
#include "lib/module_init.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/os_cpu.h"
@ -54,6 +55,19 @@
#endif
// some of this module's functions are frequently called but require
// non-trivial initialization, so caching is helpful. isInitialized
// flags aren't thread-safe, so we use ModuleInit. calling it from
// every function is a bit wasteful, but it is convenient to avoid
// requiring users to pass around a global state object.
// one big Init() would be prone to deadlock if its subroutines also
// call a public function (that re-enters ModuleInit), so each
// function gets its own initState.
//-----------------------------------------------------------------------------
// CPUID
// note: unfortunately the MSC __cpuid intrinsic does not allow passing
// additional inputs (e.g. ecx = count), so we need to implement this
// in assembly for both IA-32 and AMD64.
@ -66,26 +80,33 @@ static void cpuid_impl(x86_x64_CpuidRegs* regs)
#endif
}
static u32 cpuid_maxFunction;
static u32 cpuid_maxExtendedFunction;
static LibError InitCpuid()
{
x86_x64_CpuidRegs regs = { 0 };
regs.eax = 0;
cpuid_impl(&regs);
cpuid_maxFunction = regs.eax;
regs.eax = 0x80000000;
cpuid_impl(&regs);
cpuid_maxExtendedFunction = regs.eax;
return INFO::OK;
}
bool x86_x64_cpuid(x86_x64_CpuidRegs* regs)
{
static u32 maxFunction;
static u32 maxExtendedFunction;
if(!maxFunction)
{
x86_x64_CpuidRegs regs2;
regs2.eax = 0;
regs2.ecx = 0; // necessary to avoid valgrind uninitialized-value warnings
cpuid_impl(&regs2);
maxFunction = regs2.eax;
regs2.eax = 0x80000000;
cpuid_impl(&regs2);
maxExtendedFunction = regs2.eax;
}
static ModuleInitState initState;
ModuleInit(&initState, InitCpuid);
const u32 function = regs->eax;
if(function > maxExtendedFunction)
if(function > cpuid_maxExtendedFunction)
return false;
if(function < 0x80000000 && function > maxFunction)
if(function < 0x80000000 && function > cpuid_maxFunction)
return false;
cpuid_impl(regs);
@ -96,11 +117,14 @@ bool x86_x64_cpuid(x86_x64_CpuidRegs* regs)
//-----------------------------------------------------------------------------
// capability bits
static void DetectFeatureFlags(u32 caps[4])
// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
// keep in sync with enum x86_x64_Cap!
static u32 caps[4];
static LibError InitCaps()
{
x86_x64_CpuidRegs regs;
x86_x64_CpuidRegs regs = { 0 };
regs.eax = 1;
regs.ecx = 0; // necessary to avoid valgrind uninitialized-value warnings
if(x86_x64_cpuid(&regs))
{
caps[0] = regs.ecx;
@ -112,77 +136,77 @@ static void DetectFeatureFlags(u32 caps[4])
caps[2] = regs.ecx;
caps[3] = regs.edx;
}
return INFO::OK;
}
bool x86_x64_cap(x86_x64_Cap cap)
{
// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
// keep in sync with enum CpuCap!
static u32 x86_x64_caps[4];
static ModuleInitState initState;
ModuleInit(&initState, InitCaps);
// (since relevant CPUs will surely advertise at least one standard flag,
// they are zero iff we haven't been initialized yet)
if(!x86_x64_caps[1])
DetectFeatureFlags(x86_x64_caps);
const size_t tbl_idx = cap >> 5;
const size_t bit_idx = cap & 0x1f;
if(tbl_idx > 3)
const size_t index = cap >> 5;
const size_t bit = cap & 0x1F;
if(index >= ARRAY_SIZE(caps))
{
DEBUG_WARN_ERR(ERR::INVALID_PARAM);
return false;
}
return (x86_x64_caps[tbl_idx] & Bit<u32>(bit_idx)) != 0;
return IsBitSet(caps[index], bit);
}
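
Usage sketch: each x86_x64_Cap value encodes (word index * 32 + bit), matching the lookup above, so callers simply test the flag they need (illustration only):

static void ReportHyperthreading()	// hypothetical example
{
	// X86_X64_CAP_HT corresponds to CPUID.1 EDX bit 28, i.e. word 1 of caps[]
	if(x86_x64_cap(X86_X64_CAP_HT))
		debug_printf(L"hyperthreading capable\n");
}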
//-----------------------------------------------------------------------------
// CPU identification
static x86_x64_Vendors DetectVendor()
static x86_x64_Vendors vendor;
static LibError InitVendor()
{
x86_x64_CpuidRegs regs;
x86_x64_CpuidRegs regs = { 0 };
regs.eax = 0;
regs.ecx = 0;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
// copy regs to string
// note: 'strange' ebx,edx,ecx reg order is due to ModR/M encoding order.
char vendor_str[13];
memcpy(&vendor_str[0], &regs.ebx, 4);
memcpy(&vendor_str[4], &regs.edx, 4);
memcpy(&vendor_str[8], &regs.ecx, 4);
vendor_str[12] = '\0'; // 0-terminate
char vendorString[13];
memcpy(&vendorString[0], &regs.ebx, 4);
memcpy(&vendorString[4], &regs.edx, 4);
memcpy(&vendorString[8], &regs.ecx, 4);
vendorString[12] = '\0'; // 0-terminate
if(!strcmp(vendor_str, "AuthenticAMD"))
return X86_X64_VENDOR_AMD;
else if(!strcmp(vendor_str, "GenuineIntel"))
return X86_X64_VENDOR_INTEL;
if(!strcmp(vendorString, "AuthenticAMD"))
vendor = X86_X64_VENDOR_AMD;
else if(!strcmp(vendorString, "GenuineIntel"))
vendor = X86_X64_VENDOR_INTEL;
else
{
DEBUG_WARN_ERR(ERR::CPU_UNKNOWN_VENDOR);
return X86_X64_VENDOR_UNKNOWN;
vendor = X86_X64_VENDOR_UNKNOWN;
}
return INFO::OK;
}
x86_x64_Vendors x86_x64_Vendor()
{
static x86_x64_Vendors vendor = X86_X64_VENDOR_UNKNOWN;
if(vendor == X86_X64_VENDOR_UNKNOWN)
vendor = DetectVendor();
static ModuleInitState initState;
ModuleInit(&initState, InitVendor);
return vendor;
}
static void DetectSignature(size_t& model, size_t& family)
static size_t model;
static size_t family;
static void InitModelAndFamily()
{
x86_x64_CpuidRegs regs;
x86_x64_CpuidRegs regs = { 0 };
regs.eax = 1;
regs.ecx = 0;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
model = bits(regs.eax, 4, 7);
model = bits(regs.eax, 4, 7);
family = bits(regs.eax, 8, 11);
const size_t extendedModel = bits(regs.eax, 16, 19);
const size_t extendedFamily = bits(regs.eax, 20, 27);
@ -193,10 +217,11 @@ static void DetectSignature(size_t& model, size_t& family)
}
static size_t DetectGeneration()
static size_t generation;
static LibError InitGeneration()
{
size_t model, family;
DetectSignature(model, family);
InitModelAndFamily();
switch(x86_x64_Vendor())
{
@ -205,15 +230,19 @@ static size_t DetectGeneration()
{
case 5:
if(model < 6)
return 5; // K5
generation = 5; // K5
else
return 6; // K6
generation = 6; // K6
break;
case 6:
return 7; // K7 (Athlon)
generation = 7; // K7 (Athlon)
break;
case 0xF:
return 8; // K8 (Opteron)
case 0x10:
generation = 8; // K8 (Opteron)
break;
}
break;
@ -221,32 +250,34 @@ static size_t DetectGeneration()
switch(family)
{
case 5:
return 5; // Pentium
generation = 5; // Pentium
break;
case 6:
if(model < 0xF)
return 6; // Pentium Pro/II/III/M
generation = 6; // Pentium Pro/II/III/M
else
return 8; // Core2Duo
generation = 8; // Core2Duo
break;
case 0xF:
if(model <= 6)
return 7; // Pentium 4/D
generation = 7; // Pentium 4/D
break;
}
if(family >= 0x10)
return 9;
generation = 9;
break;
}
debug_assert(0); // unknown CPU generation
return family;
debug_assert(generation != 0);
return INFO::OK;
}
size_t x86_x64_Generation()
{
static size_t generation;
if(!generation)
generation = DetectGeneration();
static ModuleInitState initState;
ModuleInit(&initState, InitGeneration);
return generation;
}
@ -308,7 +339,7 @@ static x86_x64_CacheParameters L1Parameters(u32 reg, x86_x64_CacheType type)
}
// applies to L2, L3 and TLB2
const size_t associativities[16] =
static const size_t associativities[16] =
{
0, 1, 2, 0, 4, 0, 8, 0,
16, 0, 32, 48, 64, 96, 128, x86_x64_fullyAssociative
@ -397,12 +428,11 @@ static void AddTLB2ParameterPair(u32 reg, size_t pageSize)
// AMD reports maxCpuidIdFunction > 4 but consider functions 2..4 to be
// "reserved". cache characteristics are returned via ext. functions.
static void DetectCacheAndTLB()
static void InitCacheAndTLB()
{
x86_x64_CpuidRegs regs;
x86_x64_CpuidRegs regs = { 0 };
regs.eax = 0x80000005;
regs.ecx = 0;
if(x86_x64_cpuid(&regs))
{
AddTLB1Parameters(regs);
@ -433,7 +463,7 @@ static void DetectCache_CPUID4()
// note: ordering is undefined (see Intel AP-485)
for(u32 count = 0; ; count++)
{
x86_x64_CpuidRegs regs;
x86_x64_CpuidRegs regs = { 0 };
regs.eax = 4;
regs.ecx = count;
if(!x86_x64_cpuid(&regs))
@ -622,9 +652,8 @@ static void DetectTLB_CPUID2()
// TODO: ensure we are pinned to the same CPU
// extract descriptors
x86_x64_CpuidRegs regs;
x86_x64_CpuidRegs regs = { 0 };
regs.eax = 2;
regs.ecx = 0;
if(!x86_x64_cpuid(&regs))
return;
size_t iterations = bits(regs.eax, 0, 7);
@ -649,15 +678,12 @@ static void DetectTLB_CPUID2()
}
}
static void DetectCacheAndTLB()
{
static bool alreadyDone;
if(alreadyDone)
return;
alreadyDone = true;
static ModuleInitState cacheInitState;
static LibError InitCacheAndTLB()
{
if(x86_x64_Vendor() == X86_X64_VENDOR_AMD)
AMD::DetectCacheAndTLB();
AMD::InitCacheAndTLB();
else
{
DetectCache_CPUID4();
@ -678,17 +704,19 @@ static void DetectCacheAndTLB()
debug_assert(dcache.levels >= 2);
debug_assert(dcache.parameters[0].lineSize != 0);
debug_assert(dcache.parameters[1].lineSize != 0);
return INFO::OK;
}
const x86_x64_Cache* x86_x64_ICache()
{
DetectCacheAndTLB();
ModuleInit(&cacheInitState, InitCacheAndTLB);
return &icache;
}
const x86_x64_Cache* x86_x64_DCache()
{
DetectCacheAndTLB();
ModuleInit(&cacheInitState, InitCacheAndTLB);
return &dcache;
}
@ -704,18 +732,20 @@ size_t x86_x64_L2CacheLineSize()
const x86_x64_TLB* x86_x64_ITLB()
{
DetectCacheAndTLB();
ModuleInit(&cacheInitState, InitCacheAndTLB);
return &itlb;
}
const x86_x64_TLB* x86_x64_DTLB()
{
DetectCacheAndTLB();
ModuleInit(&cacheInitState, InitCacheAndTLB);
return &dtlb;
}
size_t x86_x64_TLBCoverage(const x86_x64_TLB* tlb)
{
// note: receiving a TLB pointer means InitCacheAndTLB was called.
const u64 pageSize = 4*KiB;
const u64 largePageSize = 4*MiB; // TODO: find out if we're using 2MB or 4MB
u64 totalSize = 0; // [bytes]
@ -738,12 +768,9 @@ size_t x86_x64_TLBCoverage(const x86_x64_TLB* tlb)
/// functor to remove substrings from the CPU identifier string
class StringStripper
{
char* m_string;
size_t m_max_chars;
public:
StringStripper(char* string, size_t max_chars)
: m_string(string), m_max_chars(max_chars)
: m_string(string), m_max_chars(max_chars)
{
}
@ -761,19 +788,25 @@ public:
memmove(substring_pos, substring_pos+substring_length, num_chars);
}
}
private:
char* m_string;
size_t m_max_chars;
};
static void DetectIdentifierString(char* identifierString, size_t maxChars)
// 3 calls x 4 registers x 4 bytes = 48 + 0-terminator
static char identifierString[48+1];
static LibError InitIdentifierString()
{
// get brand string (if available)
char* pos = identifierString;
bool have_brand_string = true;
bool gotBrandString = true;
for(u32 function = 0x80000002; function <= 0x80000004; function++)
{
x86_x64_CpuidRegs regs;
x86_x64_CpuidRegs regs = { 0 };
regs.eax = function;
regs.ecx = 0;
have_brand_string &= x86_x64_cpuid(&regs);
gotBrandString &= x86_x64_cpuid(&regs);
memcpy(pos, &regs, 16);
pos += 16;
}
@ -784,11 +817,9 @@ static void DetectIdentifierString(char* identifierString, size_t maxChars)
// - the brand string is useless, e.g. "Unknown". this happens on
// some older boards whose BIOS reprograms the string for CPUs it
// doesn't recognize.
if(!have_brand_string || strncmp(identifierString, "Unknow", 6) == 0)
if(!gotBrandString || strncmp(identifierString, "Unknow", 6) == 0)
{
size_t model, family;
DetectSignature(model, family);
InitModelAndFamily();
switch(x86_x64_Vendor())
{
case X86_X64_VENDOR_AMD:
@ -796,15 +827,15 @@ static void DetectIdentifierString(char* identifierString, size_t maxChars)
if(family == 6)
{
if(model == 3 || model == 7)
strcpy_s(identifierString, maxChars, "AMD Duron");
strcpy_s(identifierString, ARRAY_SIZE(identifierString), "AMD Duron");
else if(model <= 5)
strcpy_s(identifierString, maxChars, "AMD Athlon");
strcpy_s(identifierString, ARRAY_SIZE(identifierString), "AMD Athlon");
else
{
if(x86_x64_cap(X86_X64_CAP_AMD_MP))
strcpy_s(identifierString, maxChars, "AMD Athlon MP");
strcpy_s(identifierString, ARRAY_SIZE(identifierString), "AMD Athlon MP");
else
strcpy_s(identifierString, maxChars, "AMD Athlon XP");
strcpy_s(identifierString, ARRAY_SIZE(identifierString), "AMD Athlon XP");
}
}
break;
@ -814,13 +845,13 @@ static void DetectIdentifierString(char* identifierString, size_t maxChars)
if(family == 6)
{
if(model == 1)
strcpy_s(identifierString, maxChars, "Intel Pentium Pro");
strcpy_s(identifierString, ARRAY_SIZE(identifierString), "Intel Pentium Pro");
else if(model == 3 || model == 5)
strcpy_s(identifierString, maxChars, "Intel Pentium II");
strcpy_s(identifierString, ARRAY_SIZE(identifierString), "Intel Pentium II");
else if(model == 6)
strcpy_s(identifierString, maxChars, "Intel Celeron");
strcpy_s(identifierString, ARRAY_SIZE(identifierString), "Intel Celeron");
else
strcpy_s(identifierString, maxChars, "Intel Pentium III");
strcpy_s(identifierString, ARRAY_SIZE(identifierString), "Intel Pentium III");
}
break;
}
@ -828,38 +859,41 @@ static void DetectIdentifierString(char* identifierString, size_t maxChars)
// identifierString already holds a valid brand string; pretty it up.
else
{
const char* const undesired_strings[] = { "(tm)", "(TM)", "(R)", "CPU ", " " };
std::for_each(undesired_strings, undesired_strings+ARRAY_SIZE(undesired_strings),
const char* const undesiredStrings[] = { "(tm)", "(TM)", "(R)", "CPU ", " " };
std::for_each(undesiredStrings, undesiredStrings+ARRAY_SIZE(undesiredStrings),
StringStripper(identifierString, strlen(identifierString)+1));
// note: Intel brand strings include a frequency, but we can't rely
// on it because the CPU may be overclocked. we'll leave it in the
// string to show measurement accuracy and if SpeedStep is active.
}
return INFO::OK;
}
const char* cpu_IdentifierString()
{
// 3 calls x 4 registers x 4 bytes = 48
static char identifierString[48+1] = {'\0'};
if(identifierString[0] == '\0')
DetectIdentifierString(identifierString, ARRAY_SIZE(identifierString));
static ModuleInitState initState;
ModuleInit(&initState, InitIdentifierString);
return identifierString;
}
//-----------------------------------------------------------------------------
// misc stateless functions
// miscellaneous stateless functions
// these routines do not call ModuleInit (because some of them are
// time-critical, e.g. cpu_Serialize) and should also avoid the
// other x86_x64* functions and their global state.
// in particular, use cpuid_impl instead of x86_x64_cpuid.
u8 x86_x64_ApicId()
{
x86_x64_CpuidRegs regs;
x86_x64_CpuidRegs regs = { 0 };
regs.eax = 1;
regs.ecx = 0;
// note: CPUID function 1 should be available everywhere, but only
// processors with an xAPIC (e.g. P4/Athlon XP) will return a nonzero value.
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
// note: CPUID function 1 is always supported, but only processors with
// an xAPIC (e.g. P4/Athlon XP) will return a nonzero ID.
cpuid_impl(&regs);
const u8 apicId = (u8)bits(regs.ebx, 24, 31);
return apicId;
}
@ -893,10 +927,9 @@ void x86_x64_DebugBreak()
void cpu_Serialize()
{
x86_x64_CpuidRegs regs;
x86_x64_CpuidRegs regs = { 0 };
regs.eax = 1;
regs.ecx = 0;
x86_x64_cpuid(&regs); // CPUID serializes execution.
cpuid_impl(&regs); // CPUID serializes execution.
}
@ -906,26 +939,27 @@ void cpu_Serialize()
// set scheduling priority and restore when going out of scope.
class ScopedSetPriority
{
int m_old_policy;
sched_param m_old_param;
public:
ScopedSetPriority(int new_priority)
ScopedSetPriority(int newPriority)
{
// get current scheduling policy and priority
pthread_getschedparam(pthread_self(), &m_old_policy, &m_old_param);
pthread_getschedparam(pthread_self(), &m_oldPolicy, &m_oldParam);
// set new priority
sched_param new_param = {0};
new_param.sched_priority = new_priority;
pthread_setschedparam(pthread_self(), SCHED_FIFO, &new_param);
sched_param newParam = {0};
newParam.sched_priority = newPriority;
pthread_setschedparam(pthread_self(), SCHED_FIFO, &newParam);
}
~ScopedSetPriority()
{
// restore previous policy and priority.
pthread_setschedparam(pthread_self(), m_old_policy, &m_old_param);
pthread_setschedparam(pthread_self(), m_oldPolicy, &m_oldParam);
}
private:
int m_oldPolicy;
sched_param m_oldParam;
};
// note: this function uses timer.cpp!timer_Time, which is implemented via
@ -948,19 +982,18 @@ double x86_x64_ClockFrequency()
// (background: it's used in x86_x64_rdtsc() to serialize instruction flow;
// the first call is documented to be slower on Intel CPUs)
int num_samples = 16;
size_t numSamples = 16;
// if clock is low-res, do less samples so it doesn't take too long.
// balance measuring time (~ 10 ms) and accuracy (< 0.1% error -
// ok for using the TSC as a time reference)
if(timer_Resolution() >= 1e-3)
num_samples = 8;
std::vector<double> samples(num_samples);
numSamples = 8;
std::vector<double> samples(numSamples);
for(int i = 0; i < num_samples; i++)
for(size_t i = 0; i < numSamples; i++)
{
double dt;
i64 dc; // i64 because VC6 can't convert u64 -> double,
// and we don't need all 64 bits.
i64 dc; // (i64 instead of u64 for faster conversion to double)
// count # of clocks in max{1 tick, 1 ms}:
// .. wait for start of tick.
@ -1000,10 +1033,10 @@ double x86_x64_ClockFrequency()
// note: don't just take the lowest value! it could conceivably be
// too low, if background processing delays reading c1 (see above).
double sum = 0.0;
const int lo = num_samples/4, hi = 3*num_samples/4;
const int lo = numSamples/4, hi = 3*numSamples/4;
for(int i = lo; i < hi; i++)
sum += samples[i];
const double clock_frequency = sum / (hi-lo);
return clock_frequency;
const double clockFrequency = sum / (hi-lo);
return clockFrequency;
}

View File

@ -78,7 +78,7 @@ LIB_API const char* cpu_IdentifierString();
* @return false if the target word doesn't match the expected value,
* otherwise true (also overwriting the contents of location)
**/
LIB_API bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t newValue);
LIB_API bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue);
/**
* specialization of cpu_CAS for pointer types. this avoids error-prone
@ -87,7 +87,7 @@ LIB_API bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t
template<typename T>
bool cpu_CAS(volatile T* location, T expected, T new_value)
{
return cpu_CAS((volatile uintptr_t*)location, (uintptr_t)expected, (uintptr_t)new_value);
return cpu_CAS((volatile intptr_t*)location, (intptr_t)expected, (intptr_t)new_value);
}
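
A sketch of the pointer specialization in use (hypothetical singleton, illustration only):

struct Foo {};	// hypothetical
static Foo* volatile s_instance;

static Foo* GetInstance()
{
	if(!s_instance)
	{
		Foo* newInstance = new Foo;
		// the specialization avoids casting to intptr_t by hand;
		// if another thread won the race, discard our copy.
		if(!cpu_CAS(&s_instance, (Foo*)0, newInstance))
			delete newInstance;
	}
	return s_instance;
}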
/**

View File

@ -295,45 +295,42 @@ static fs::wpath DriverPathname()
//-----------------------------------------------------------------------------
static ModuleInitState initState;
bool mahaf_Init()
static LibError Init()
{
if(ModuleIsError(&initState))
return false;
if(!ModuleShouldInitialize(&initState))
return true;
if(wutil_HasCommandLineArgument(L"-wNoMahaf"))
goto fail;
return ERR::NOT_SUPPORTED;
{
const fs::wpath driverPathname = DriverPathname();
StartDriver(driverPathname);
const fs::wpath driverPathname = DriverPathname();
StartDriver(driverPathname);
}
{
const DWORD shareMode = 0;
hAken = CreateFileW(L"\\\\.\\Aken", GENERIC_READ, shareMode, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
if(hAken == INVALID_HANDLE_VALUE)
goto fail;
const DWORD shareMode = 0;
hAken = CreateFileW(L"\\\\.\\Aken", GENERIC_READ, shareMode, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
if(hAken == INVALID_HANDLE_VALUE)
return ERR::INVALID_HANDLE;
}
return true;
fail:
ModuleSetError(&initState);
return false;
return INFO::OK;
}
void mahaf_Shutdown()
static void Shutdown()
{
if(!ModuleShouldShutdown(&initState))
return;
CloseHandle(hAken);
hAken = INVALID_HANDLE_VALUE;
UninstallDriver();
}
static ModuleInitState initState;
LibError mahaf_Init()
{
return ModuleInit(&initState, Init);
}
void mahaf_Shutdown()
{
ModuleShutdown(&initState, Shutdown);
}

View File

@ -42,7 +42,7 @@
extern bool mahaf_IsPhysicalMappingDangerous();
extern bool mahaf_Init();
extern LibError mahaf_Init();
extern void mahaf_Shutdown();
extern u8 mahaf_ReadPort8 (u16 port);

View File

@ -32,8 +32,8 @@
#include <set>
#include "lib/byte_order.h"
#include "lib/module_init.h"
#include "lib/sysdep/cpu.h"
#include "lib/debug_stl.h"
#include "lib/app_hooks.h"
#include "lib/path_util.h"
@ -65,7 +65,7 @@ static HANDLE hProcess;
static uintptr_t mod_base;
#if !IA32_STACK_WALK_ENABLED
// for StackWalk64; taken from PE header by sym_init.
// for StackWalk64; taken from PE header by InitDbghelp.
static WORD machine;
#endif
@ -76,16 +76,8 @@ static WORD machine;
static WUTIL_FUNC(pRtlCaptureContext, VOID, (PCONTEXT));
// call on-demand (allows handling exceptions raised before winit.cpp
// init functions are called); no effect if already initialized.
static LibError sym_init()
static LibError InitDbghelp()
{
// bail if already initialized (there's nothing to do).
// don't use pthread_once because we need to return success/error code.
static uintptr_t already_initialized = 0;
if(!cpu_CAS(&already_initialized, 0, 1))
return INFO::OK;
hProcess = GetCurrentProcess();
dbghelp_ImportFunctions();
@ -111,7 +103,7 @@ static LibError sym_init()
const BOOL ok = pSymInitializeW(hProcess, UserSearchPath, fInvadeProcess);
WARN_IF_FALSE(ok);
mod_base = (uintptr_t)pSymGetModuleBase64(hProcess, (u64)&sym_init);
mod_base = (uintptr_t)pSymGetModuleBase64(hProcess, (u64)&InitDbghelp);
#if !IA32_STACK_WALK_ENABLED
IMAGE_NT_HEADERS* const header = pImageNtHeader((void*)(uintptr_t)mod_base);
@ -121,6 +113,16 @@ static LibError sym_init()
return INFO::OK;
}
// ensure dbghelp is initialized exactly once.
// call every time before dbghelp functions are used.
// (on-demand initialization allows handling exceptions raised before
// winit.cpp init functions are called)
static void sym_init()
{
static ModuleInitState initState;
ModuleInit(&initState, InitDbghelp);
}
struct SYMBOL_INFO_PACKAGEW2 : public SYMBOL_INFO_PACKAGEW
{
@ -1834,8 +1836,8 @@ static LibError dump_frame_cb(const _tagSTACKFRAME64* sf, uintptr_t UNUSED(cbDat
LibError debug_DumpStack(wchar_t* buf, size_t maxChars, void* pcontext, const wchar_t* lastFuncToSkip)
{
static uintptr_t already_in_progress;
if(!cpu_CAS(&already_in_progress, 0, 1))
static intptr_t busy;
if(!cpu_CAS(&busy, 0, 1))
return ERR::REENTERED; // NOWARN
out_init(buf, maxChars);
@ -1843,7 +1845,8 @@ LibError debug_DumpStack(wchar_t* buf, size_t maxChars, void* pcontext, const wc
LibError ret = wdbg_sym_WalkStack(dump_frame_cb, 0, (const CONTEXT*)pcontext, lastFuncToSkip);
already_in_progress = 0;
busy = 0;
cpu_MemoryBarrier();
return ret;
}

View File

@ -75,7 +75,7 @@ static ICounter* ConstructCounterAt(size_t id, void* address, size_t size)
}
static volatile uintptr_t isCounterAllocated;
static volatile intptr_t isCounterAllocated;
ICounter* CreateCounter(size_t id)
{

View File

@ -89,9 +89,6 @@ public:
mahaf_UnmapPhysicalMemory((void*)m_hpetRegisters);
m_hpetRegisters = 0;
}
acpi_Shutdown();
mahaf_Shutdown();
}
bool IsSafe() const
@ -152,8 +149,7 @@ private:
{
if(mahaf_IsPhysicalMappingDangerous())
return ERR::FAIL; // NOWARN (happens on Win2k)
if(!mahaf_Init())
return ERR::FAIL; // NOWARN (no Administrator privileges)
RETURN_ERR(mahaf_Init()); // (fails without Administrator privileges)
const HpetDescriptionTable* hpet = (const HpetDescriptionTable*)acpi_GetTable("HPET");
if(!hpet)

View File

@ -66,8 +66,7 @@ public:
LibError Activate()
{
// mahaf is needed for port I/O.
if(!mahaf_Init())
return ERR::FAIL; // NOWARN (no Administrator privileges)
RETURN_ERR(mahaf_Init()); // (fails without Administrator privileges)
// (note: it's called FADT, but the signature is "FACP")
const FADT* fadt = (const FADT*)acpi_GetTable("FACP");
if(!fadt)
@ -79,7 +78,6 @@ public:
void Shutdown()
{
mahaf_Shutdown();
}
bool IsSafe() const

View File

@ -44,14 +44,10 @@ static ModuleInitState initState;
static LibError Init()
{
if(!ModuleShouldInitialize(&initState))
return INFO::SKIPPED;
HRESULT hr;
hr = CoInitialize(0);
if(FAILED(hr))
WARN_RETURN(ERR::_1);
debug_assert(hr == S_OK || hr == S_FALSE); // S_FALSE => already initialized
hr = CoInitializeSecurity(0, -1, 0, 0, RPC_C_AUTHN_LEVEL_DEFAULT, RPC_C_IMP_LEVEL_IMPERSONATE, 0, EOAC_NONE, 0);
if(FAILED(hr))
@ -75,22 +71,23 @@ static LibError Init()
return INFO::OK;
}
void wmi_Shutdown()
static void Shutdown()
{
if(!ModuleShouldShutdown(&initState))
return;
pSvc->Release();
// note: don't shut down COM because other modules may still be using it.
//CoUninitialize();
}
void wmi_Shutdown()
{
ModuleShutdown(&initState, Shutdown);
}
LibError wmi_GetClass(const wchar_t* className, WmiMap& wmiMap)
{
RETURN_ERR(Init());
RETURN_ERR(ModuleInit(&initState, Init));
IEnumWbemClassObjectPtr pEnum = 0;
wchar_t query[200];

View File

@ -41,7 +41,9 @@ WINIT_REGISTER_EARLY_INIT(wnuma_Init);
// node topology
//-----------------------------------------------------------------------------
static size_t NumNodes()
// @return maximum (not actual) number of nodes, because Windows doesn't
// guarantee node numbers are contiguous.
static size_t MaxNodes()
{
WUTIL_FUNC(pGetNumaHighestNodeNumber, BOOL, (PULONG));
WUTIL_IMPORT_KERNEL32(GetNumaHighestNodeNumber, pGetNumaHighestNodeNumber);
@ -49,8 +51,8 @@ static size_t NumNodes()
{
ULONG highestNode;
const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
debug_assert(ok);
debug_assert(highestNode < os_cpu_NumProcessors()); // #nodes <= #processors
WARN_IF_FALSE(ok);
debug_assert(highestNode < os_cpu_NumProcessors()); // node index < #processors
return highestNode+1;
}
// NUMA not supported
@ -59,7 +61,8 @@ static size_t NumNodes()
}
static void FillNodesProcessorMask(uintptr_t* nodesProcessorMask)
// @param nodesProcessorMask array of processor masks for each node
static void FillNodesProcessorMask(uintptr_t* nodesProcessorMask, size_t maxNodes)
{
WUTIL_FUNC(pGetNumaNodeProcessorMask, BOOL, (UCHAR, PULONGLONG));
WUTIL_IMPORT_KERNEL32(GetNumaNodeProcessorMask, pGetNumaNodeProcessorMask);
@ -68,15 +71,15 @@ static void FillNodesProcessorMask(uintptr_t* nodesProcessorMask)
DWORD_PTR processAffinity, systemAffinity;
{
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
debug_assert(ok);
WARN_IF_FALSE(ok);
}
for(size_t node = 0; node < numa_NumNodes(); node++)
for(size_t node = 0; node < maxNodes; node++)
{
ULONGLONG affinity;
{
const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
debug_assert(ok);
WARN_IF_FALSE(ok);
}
const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
nodesProcessorMask[node] = processorMask;
@ -92,16 +95,21 @@ static void FillNodesProcessorMask(uintptr_t* nodesProcessorMask)
// rather than the other way around because wcpu provides the
// wcpu_ProcessorMaskFromAffinity helper. there is no similar function to
// convert processor to processorNumber.
static void FillProcessorsNode(size_t numNodes, const uintptr_t* nodesProcessorMask, size_t* processorsNode)
static void FillProcessorsNode(const uintptr_t* nodesProcessorMask, size_t maxNodes, size_t* processorsNode, size_t numProcessors)
{
for(size_t node = 0; node < numNodes; node++)
for(size_t processor = 0; processor < numProcessors; processor++)
{
const uintptr_t processorMask = nodesProcessorMask[node];
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
bool foundNode = false;
for(size_t node = 0; node < maxNodes; node++)
{
if(IsBitSet(processorMask, processor))
if(IsBitSet(nodesProcessorMask[node], processor))
{
processorsNode[processor] = node;
foundNode = true;
break;
}
}
debug_assert(foundNode);
}
}
@ -111,7 +119,7 @@ static void FillProcessorsNode(size_t numNodes, const uintptr_t* nodesProcessorM
struct NodeTopology // POD
{
size_t numNodes;
size_t maxNodes;
size_t processorsNode[os_cpu_MaxProcessors];
uintptr_t nodesProcessorMask[os_cpu_MaxProcessors];
};
@ -119,14 +127,14 @@ static NodeTopology s_nodeTopology;
static void DetectNodeTopology()
{
s_nodeTopology.numNodes = NumNodes();
FillNodesProcessorMask(s_nodeTopology.nodesProcessorMask);
FillProcessorsNode(s_nodeTopology.numNodes, s_nodeTopology.nodesProcessorMask, s_nodeTopology.processorsNode);
s_nodeTopology.maxNodes = MaxNodes();
FillNodesProcessorMask(s_nodeTopology.nodesProcessorMask, s_nodeTopology.maxNodes);
FillProcessorsNode(s_nodeTopology.nodesProcessorMask, s_nodeTopology.maxNodes, s_nodeTopology.processorsNode, os_cpu_NumProcessors());
}
size_t numa_NumNodes()
{
return s_nodeTopology.numNodes;
return s_nodeTopology.maxNodes;
}
size_t numa_NodeFromProcessor(size_t processor)
@ -137,7 +145,7 @@ size_t numa_NodeFromProcessor(size_t processor)
uintptr_t numa_ProcessorMaskFromNode(size_t node)
{
debug_assert(node < s_nodeTopology.numNodes);
debug_assert(node < s_nodeTopology.maxNodes);
return s_nodeTopology.nodesProcessorMask[node];
}
@ -159,7 +167,7 @@ size_t numa_AvailableMemory(size_t node)
{
ULONGLONG availableBytes;
const BOOL ok = pGetNumaAvailableMemoryNode((UCHAR)node, &availableBytes);
debug_assert(ok);
WARN_IF_FALSE(ok);
const size_t availableMiB = size_t(availableBytes / MiB);
return availableMiB;
}
@ -169,43 +177,71 @@ size_t numa_AvailableMemory(size_t node)
}
#pragma pack(push, 1)
// ACPI System Locality Information Table
struct SLIT
{
AcpiTable header;
u64 numSystemLocalities;
u8 entries[1]; // numSystemLocalities*numSystemLocalities entries
};
#pragma pack(pop)
static double DetectRelativeDistance()
{
// trust values reported by the BIOS, if available
const SLIT* slit = (const SLIT*)acpi_GetTable("SLIT");
if(slit)
{
const size_t n = slit->numSystemLocalities;
debug_assert(slit->header.size == sizeof(SLIT)-sizeof(slit->entries)+n*n);
// diagonals are specified to be 10
for(size_t i = 0; i < n; i++)
debug_assert(slit->entries[i*n+i] == 10);
// entries = relativeDistance * 10
return *std::max_element(slit->entries, slit->entries+n*n) / 10.0;
}
// if non-NUMA, skip the (expensive) measurement below.
if(numa_NumNodes() == 1)
return 1.0;
// allocate memory on one node
const size_t size = 16*MiB;
shared_ptr<u8> buffer((u8*)numa_AllocateOnNode(size, 0), numa_Deleter<u8>());
const uintptr_t previousProcessorMask = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());
// measure min/max fill times required by a processor from each node
double minTime = 1e10, maxTime = 0.0;
for(size_t node = 0; node < numa_NumNodes(); node++)
{
const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
os_cpu_SetThreadAffinityMask(processorMask);
const double startTime = timer_Time();
memset(buffer.get(), 0, size);
const double elapsedTime = timer_Time() - startTime;
minTime = std::min(minTime, elapsedTime);
maxTime = std::max(maxTime, elapsedTime);
}
(void)os_cpu_SetThreadAffinityMask(previousProcessorMask);
return maxTime / minTime;
}
double numa_Factor()
{
WinScopedLock lock(WNUMA_CS);
static double factor;
if(factor == 0.0)
{
// if non-NUMA, skip the (expensive) measurements below.
if(numa_NumNodes() == 1)
factor = 1.0;
else
{
// allocate memory on one node
const size_t size = 16*MiB;
shared_ptr<u8> buffer((u8*)numa_AllocateOnNode(size, 0), numa_Deleter<u8>());
const uintptr_t previousProcessorMask = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());
// measure min/max fill times required by a processor from each node
double minTime = 1e10, maxTime = 0.0;
for(size_t node = 0; node < numa_NumNodes(); node++)
{
const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
os_cpu_SetThreadAffinityMask(processorMask);
const double startTime = timer_Time();
memset(buffer.get(), 0, size);
const double elapsedTime = timer_Time() - startTime;
minTime = std::min(minTime, elapsedTime);
maxTime = std::max(maxTime, elapsedTime);
}
(void)os_cpu_SetThreadAffinityMask(previousProcessorMask);
factor = maxTime / minTime;
}
factor = DetectRelativeDistance();
debug_assert(factor >= 1.0);
debug_assert(factor <= 3.0); // (Microsoft guideline for NUMA systems)
}
@ -214,16 +250,25 @@ double numa_Factor()
}
static int DetectMemoryInterleaving()
{
// not NUMA => no interleaving
if(numa_NumNodes() == 1)
return 0;
// the BIOS only generates an SRAT (System Resource Affinity Table) if node interleaving is disabled
if(acpi_GetTable("SRAT"))
return 0;
return 1;
}
bool numa_IsMemoryInterleaved()
{
WinScopedLock lock(WNUMA_CS);
static int isInterleaved = -1;
if(isInterleaved == -1)
{
// the BIOS only generates an SRAT (System Resource Affinity Table)
// if node interleaving is disabled.
isInterleaved = acpi_GetTable("SRAT") == 0;
}
isInterleaved = DetectMemoryInterleaving();
return isInterleaved != 0;
}
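For context on the SLIT parsing above: the table's entries form an N×N row-major matrix of relative distances scaled by 10, where entry [i*N+j] is the cost of node i accessing memory on node j (the diagonal is defined to be 10, i.e. 1.0). A small sketch of looking up a single node-to-node distance, reusing the SLIT struct and debug_assert shown above (the helper name is made up):

```cpp
// hypothetical helper: relative memory-access distance from one node to another,
// derived from the same SLIT layout parsed by DetectRelativeDistance above.
static double RelativeDistance(const SLIT* slit, size_t from, size_t to)
{
	const size_t n = (size_t)slit->numSystemLocalities;
	debug_assert(from < n && to < n);
	// entries are row-major and scaled by 10; entries[i*n+i] == 10 (distance 1.0)
	return slit->entries[from*n + to] / 10.0;
}
```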

View File

@ -173,7 +173,7 @@ struct WDIR
// thread-safe.
static WDIR global_wdir;
static uintptr_t global_wdir_is_in_use;
static intptr_t global_wdir_is_in_use;
// zero-initializes the WDIR (code below relies on this)
static inline WDIR* wdir_alloc()

View File

@ -60,7 +60,7 @@ pthread_t pthread_self()
int pthread_once(pthread_once_t* once, void (*init_routine)())
{
if(cpu_CAS(once, 0, 1))
if(cpu_CAS((volatile intptr_t*)once, 0, 1))
init_routine();
return 0;
}
@ -137,7 +137,7 @@ int pthread_key_create(pthread_key_t* key, void (*dtor)(void*))
size_t i;
for(i = 0; i < MAX_DTORS; i++)
{
if(cpu_CAS((volatile uintptr_t*)&dtors[i].dtor, 0, (uintptr_t)dtor))
if(cpu_CAS((volatile intptr_t*)&dtors[i].dtor, (intptr_t)0, (intptr_t)dtor))
goto have_slot;
}

View File

@ -54,7 +54,7 @@ enum
//
// one-time init
typedef uintptr_t pthread_once_t;
typedef intptr_t pthread_once_t; // required for cpu_CAS
#define PTHREAD_ONCE_INIT 0 // static pthread_once_t x = PTHREAD_ONCE_INIT;
LIB_API int pthread_once(pthread_once_t*, void (*init_routine)());
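A brief usage sketch of this emulation (the names below are illustrative, not from the codebase): because pthread_once_t is now an intptr_t, cpu_CAS can atomically flip it from PTHREAD_ONCE_INIT to 1, so the init routine runs at most once.

```cpp
// illustrative usage of the emulated pthread_once (DoOneTimeSetup is hypothetical)
static pthread_once_t s_once = PTHREAD_ONCE_INIT;

static void DoOneTimeSetup()
{
	// one-time initialization goes here
}

void SomePublicEntryPoint()
{
	pthread_once(&s_once, DoOneTimeSetup);	// first caller runs DoOneTimeSetup
	// ... the rest of the function can assume setup has been triggered
}
```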

View File

@ -54,9 +54,8 @@ const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; // ::1
// surprising users. speed is irrelevant here. manually writing these stubs
// is ugly, but delay-load error handling is hairy, so don't use that.
//
// the first call of these stubs must trigger wsock_ActualInit in case no
// other winsock function was called yet. we can't simply rely on
// ModuleShouldInitialize because taking references prevents shutdown.
// the first call of these stubs must trigger OnLoad in case no
// other winsock function has been called yet.
// adding an extra haveInitialized flag would be redundant. instead,
// we use a clever but safe hack: call a harmless winsock function, which
// triggers the delay load (or does nothing if init has already happened).
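A hedged sketch of what one such stub could look like; the stub name, signature, and return value are assumptions for illustration, but WSASetLastError is a real, harmless ws2_32 export whose call forces the delay-load notification (and thus OnLoad) if nothing else has triggered it yet:

```cpp
// illustrative only: fallback stub used when the optional import is missing.
// calling any ws2_32 function (here WSASetLastError) triggers the delay load.
static int WSAAPI stub_getnameinfo(const struct sockaddr* sa, socklen_t saLen,
	char* host, DWORD hostLen, char* serv, DWORD servLen, int flags)
{
	WSASetLastError(0);	// harmless; ensures OnLoad has run
	return EAI_FAIL;	// callers fall back to numeric addresses
}
```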
@ -115,15 +114,8 @@ static void ImportOptionalFunctions()
//-----------------------------------------------------------------------------
static ModuleInitState initState;
// called from delay loader the first time a wsock function is called
// (shortly before the actual wsock function is called).
static LibError wsock_ActualInit()
static LibError Init()
{
if(!ModuleShouldInitialize(&initState))
return INFO::OK;
char d[1024];	// scratch buffer large enough for WSADATA
int ret = WSAStartup(0x0002, (WSADATA*)d);	// want version 2.0
debug_assert(ret == 0);
@ -133,22 +125,31 @@ static LibError wsock_ActualInit()
return INFO::OK;
}
static void Shutdown()
{
int ret = WSACleanup();
debug_assert(ret >= 0);
}
static ModuleInitState initState;
// called from delay loader the first time a wsock function is called
// (shortly before the actual wsock function is called).
static LibError OnLoad()
{
return ModuleInit(&initState, Init);
}
static LibError wsock_Init()
{
// trigger wsock_ActualInit when someone first calls a winsock function.
static WdllLoadNotify loadNotify = { "ws2_32", wsock_ActualInit };
// trigger OnLoad when someone first calls a wsock function.
static WdllLoadNotify loadNotify = { "ws2_32", OnLoad };
wdll_add_notify(&loadNotify);
return INFO::OK;
}
static LibError wsock_Shutdown()
{
if(!ModuleShouldShutdown(&initState))
return INFO::OK;
int ret = WSACleanup();
debug_assert(ret >= 0);
ModuleShutdown(&initState, Shutdown);
return INFO::OK;
}
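The conversions in this file and in wsdl.cpp below follow the same pattern; as a general sketch (the ModuleInit/ModuleShutdown signatures are inferred from the call sites in this diff, and foo_* is a placeholder module name):

```cpp
// sketch of the init pattern this commit converts modules to
static ModuleInitState initState;

static LibError Init()
{
	// acquire resources; per the commit message, the returned LibError is
	// stored in initState so repeated init attempts report the same result
	return INFO::OK;
}

static void Shutdown()
{
	// release resources
}

LibError foo_Init()
{
	return ModuleInit(&initState, Init);
}

void foo_Shutdown()
{
	ModuleShutdown(&initState, Shutdown);
}
```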

View File

@ -1365,21 +1365,14 @@ void* SDL_GL_GetProcAddress(const char* name)
// around ATI driver breakage)
static ModuleInitState initState;
int SDL_Init(Uint32 UNUSED(flags))
static LibError Init()
{
if(!ModuleShouldInitialize(&initState))
return 0;
key_Init();
return 0;
return INFO::OK;
}
void SDL_Quit()
static void Shutdown()
{
if(!ModuleShouldShutdown(&initState))
return;
is_quitting = true;
if(wutil_IsValidHandle(g_hDC))
@ -1391,6 +1384,16 @@ void SDL_Quit()
video_Shutdown();
}
int SDL_Init(Uint32 UNUSED(flags))
{
return (ModuleInit(&initState, Init) < 0)? -1 : 0;
}
void SDL_Quit()
{
ModuleShutdown(&initState, Shutdown);
}
static void RedirectStdout()
{