fixes/improvements to lib code from work

- topology.cpp: modify interface due to thread-safety issue. caller is now responsible for ensuring the first _Detect call isn't reentered; everything else is safe.
- fix thread-safety issue in wnuma; use winit mechanism to ensure it's ready before use
- VFS now takes a cacheSize parameter (required for being able to disable read-only file caches for the image loader at work)
- allow dynarray that isn't actually holding memory
- debug_stl: VC9 fix (disable this code except on the exact STL version on which it was tested)
- zlib, lib_api: changes to macro used to toggle between static and dynamic linking
- add boost filesystem header in external_libraries
- amd64: cpu_ topology functions are now provided by x86_x64
- cpu: remove cpu_ClockFrequency (dangerous, may be tempting to use during WHRT init which would cause a crash). use x86_x64_ClockFrequency or os_cpu_ClockFrequency instead.
- werrno: cope with newer boost versions
- wmman: follow SUSv3 in rejecting zero-length mappings

This was SVN commit r5954.
2008-05-13 19:43:02 +00:00 · 2008-05-13 19:43:02 +00:00 · 04127c7af3
commit 04127c7af3
parent bafc8d0cfa
37 changed files with 520 additions and 412 deletions
--- a/source/graphics/tests/test_MeshManager.h
+++ b/source/graphics/tests/test_MeshManager.h
@ -45,7 +45,7 @@ class TestMeshManager : public CxxTest::TestSuite
 		TS_ASSERT(fs::create_directory(MOD_PATH.external_directory_string()));
 		TS_ASSERT(fs::create_directory(CACHE_PATH.external_directory_string()));

-		g_VFS = CreateVfs();
+		g_VFS = CreateVfs(20*MiB);

 		TS_ASSERT_OK(g_VFS->Mount("", MOD_PATH));
 		TS_ASSERT_OK(g_VFS->Mount("collada/", "tests/collada"));
--- a/source/lib/allocators/dynarray.cpp
+++ b/source/lib/allocators/dynarray.cpp
@ -25,14 +25,15 @@ static LibError validate_da(DynArray* da)
 {
 	if(!da)
 		WARN_RETURN(ERR::INVALID_PARAM);
-	u8* const base           = da->base;
+//	u8* const base           = da->base;
 	const size_t max_size_pa = da->max_size_pa;
 	const size_t cur_size    = da->cur_size;
 	const size_t pos         = da->pos;
 	const int prot           = da->prot;

-	if(debug_is_pointer_bogus(base))
-		WARN_RETURN(ERR::_1);
+	// note: this happens if max_size == 0
+//	if(debug_is_pointer_bogus(base))
+//		WARN_RETURN(ERR::_1);
 	// note: don't check if base is page-aligned -
 	// might not be true for 'wrapped' mem regions.
 //	if(!mem_IsPageMultiple((uintptr_t)base))
@ -56,8 +57,9 @@ LibError da_alloc(DynArray* da, size_t max_size)
 {
 	const size_t max_size_pa = mem_RoundUpToPage(max_size);

-	u8* p;
-	RETURN_ERR(mem_Reserve(max_size_pa, &p));
+	u8* p = 0;
+	if(max_size_pa)	// (avoid mmap failure)
+		RETURN_ERR(mem_Reserve(max_size_pa, &p));

 	da->base        = p;
 	da->max_size_pa = max_size_pa;
@ -85,7 +87,7 @@ LibError da_free(DynArray* da)
 	// skip mem_Release if <da> was allocated via da_wrap_fixed
 	// (i.e. it doesn't actually own any memory). don't complain;
 	// da_free is supposed to be called even in the above case.
-	if(!was_wrapped)
+	if(!was_wrapped && size_pa)
 		RETURN_ERR(mem_Release(p, size_pa));
 	return INFO::OK;
 }
--- a/source/lib/debug_stl.cpp
+++ b/source/lib/debug_stl.cpp
@ -211,7 +211,7 @@ struct ContainerBase : public Container

 struct Any_deque : public ContainerBase<std::deque<int> >
 {
-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405

 	bool IsValid(size_t el_size) const
 	{
@ -277,7 +277,7 @@ struct Any_list : public ContainerBase<std::list<int> >
 };


-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405

 template<class _Traits>
 struct Any_tree : public std::_Tree<_Traits>
@ -385,7 +385,7 @@ struct Any_vector: public ContainerBase<std::vector<int> >
 		return true;
 	}

-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405

 	size_t NumElements(size_t el_size) const
 	{
@ -416,7 +416,7 @@ struct Any_vector: public ContainerBase<std::vector<int> >
 };


-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405

 struct Any_basic_string : public ContainerBase<std::string>
 {
@ -461,7 +461,7 @@ struct Any_stack : public Any_deque

 struct Any_hash_map: public ContainerBase<STL_HASH_MAP<int,int> >
 {
-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405

 	bool IsValid(size_t el_size) const
 	{
@ -482,7 +482,7 @@ struct Any_hash_multimap : public Any_hash_map

 struct Any_hash_set: public ContainerBase<STL_HASH_SET<int> >
 {
-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405

 	bool IsValid(size_t el_size) const
 	{
@ -610,7 +610,7 @@ LibError debug_stl_get_container_info(const char* type_name, const u8* p, size_t
 	STD_CONTAINER(deque)
 	STD_CONTAINER(list)
 	STD_CONTAINER(vector)
-#if STL_DINKUMWARE
+#if STL_DINKUMWARE == 405
 	STD_CONTAINER(map)
 	STD_CONTAINER(multimap)
 	STD_CONTAINER(set)
--- a/source/lib/external_libraries/boost_filesystem.h
+++ b/source/lib/external_libraries/boost_filesystem.h
@ -0,0 +1,26 @@
+/**
+ * =========================================================================
+ * File        : boost_filesystem.h
+ * Project     : 0 A.D.
+ * Description : bring in Boost filesystem library
+ * =========================================================================
+ */
+
+// license: GPL; see lib/license.txt
+
+#ifndef INCLUDED_BOOST_FILESYSTEM
+#define INCLUDED_BOOST_FILESYSTEM
+
+// not W4-clean
+#if MSC_VERSION
+# pragma warning(push, 3)
+#endif
+
+#include "boost/filesystem.hpp"
+namespace fs = boost::filesystem;
+
+#if MSC_VERSION
+# pragma warning(pop)
+#endif
+
+#endif	// #ifndef INCLUDED_BOOST_FILESYSTEM
--- a/source/lib/external_libraries/zlib.h
+++ b/source/lib/external_libraries/zlib.h
@ -20,7 +20,7 @@
 # define WINAPIV __cdecl
 #endif

-#ifndef FOM_ZLIB
+#ifndef ZLIB_STATIC
 #define ZLIB_DLL
 #endif

@ -28,18 +28,10 @@

 // automatically link against the required library
 #if MSC_VERSION
-# ifdef FOM_ZLIB
-#  ifdef NDEBUG
-#   pragma comment(lib, "fom_zlib.lib")
-#  else
-#   pragma comment(lib, "fom_zlib_d.lib")
-#  endif
+# ifdef NDEBUG
+#  pragma comment(lib, "zlib1.lib")
 # else
-#  ifdef NDEBUG
-#   pragma comment(lib, "zlib1.lib")
-#  else
-#   pragma comment(lib, "zlib1d.lib")
-#  endif
+#  pragma comment(lib, "zlib1d.lib")
 # endif
 #endif

--- a/source/lib/file/vfs/vfs.cpp
+++ b/source/lib/file/vfs/vfs.cpp
@ -25,8 +25,8 @@
 class VFS : public IVFS
 {
 public:
-	VFS()
-		: m_fileCache(ChooseCacheSize())
+	VFS(size_t cacheSize)
+		: m_cacheSize(cacheSize), m_fileCache(m_cacheSize)
 		, m_trace(CreateTrace(4*MiB))
 	{
 	}
@ -106,7 +106,7 @@ public:
 			// safely handle zero-length files
 			if(!size)
 				fileContents = DummySharedPtr((u8*)0);
-			else if(size > ChooseCacheSize())
+			else if(size > m_cacheSize)
 			{
 				fileContents = io_Allocate(size);
 				RETURN_ERR(file->Load(fileContents));
@ -152,19 +152,15 @@ public:
 	}

 private:
-	static size_t ChooseCacheSize()
-	{
-		return 96*MiB;
-	}
-
-	mutable VfsDirectory m_rootDirectory;
+	size_t m_cacheSize;
 	FileCache m_fileCache;
 	PITrace m_trace;
+	mutable VfsDirectory m_rootDirectory;
 };

 //-----------------------------------------------------------------------------

-PIVFS CreateVfs()
+PIVFS CreateVfs(size_t cacheSize)
 {
-	return PIVFS(new VFS);
+	return PIVFS(new VFS(cacheSize));
 }
--- a/source/lib/file/vfs/vfs.h
+++ b/source/lib/file/vfs/vfs.h
@ -100,6 +100,6 @@ struct IVFS
 };

 typedef shared_ptr<IVFS> PIVFS;
-LIB_API PIVFS CreateVfs();
+LIB_API PIVFS CreateVfs(size_t cacheSize);

 #endif	// #ifndef INCLUDED_VFS
--- a/source/lib/lib_api.h
+++ b/source/lib/lib_api.h
@ -3,22 +3,19 @@
 // note: EXTERN_C cannot be used because shared_ptr is often returned
 // by value, which requires C++ linkage.

-#ifdef LIB_DLL
+#ifdef LIB_STATIC_LINK
+# define LIB_API
+#else
 # ifdef LIB_BUILD
 #  define LIB_API __declspec(dllexport)
 # else
 #  define LIB_API __declspec(dllimport)
-# endif
-#else
-# define LIB_API
-#endif
-
-#if defined(LIB_DLL) && !defined(LIB_BUILD)
-# if MSC_VERSION
-#  ifdef NDEBUG
-#   pragma comment(lib, "lib.lib")
-#  else
-#   pragma comment(lib, "lib_d.lib")
+#  if MSC_VERSION
+#   ifdef NDEBUG
+#    pragma comment(lib, "lib.lib")
+#   else
+#    pragma comment(lib, "lib_d.lib")
+#   endif
 #  endif
 # endif
 #endif
--- a/source/lib/precompiled.h
+++ b/source/lib/precompiled.h
@ -56,19 +56,18 @@
 #include "lib/code_annotation.h"

 // Boost
+// .. if this package isn't going to be statically linked, we're better off
+// using Boost via DLL. (otherwise, we would have to ensure the exact same
+// compiler is used, which is a pain because MSC8, MSC9 and ICC 10 are in use)
+#ifndef LIB_STATIC_LINK
+# define BOOST_ALL_DYN_LINK 
+#endif
 #include <boost/utility.hpp>	// noncopyable
 #include <boost/shared_array.hpp>
 #include <boost/shared_ptr.hpp>
 #include <boost/scoped_ptr.hpp>
-#if MSC_VERSION
-# pragma warning(push, 3)	// filesystem isn't W4-clean
-#endif
-#include <boost/filesystem.hpp>
-#if MSC_VERSION
-# pragma warning(pop)
-#endif
 using boost::shared_ptr;	// has been added to TR1
-namespace fs = boost::filesystem;
+#include "lib/external_libraries/boost_filesystem.h"

 // (this must come after boost and common lib headers)
 #include "lib/posix/posix.h"
--- a/source/lib/sysdep/amd64/amd64.cpp
+++ b/source/lib/sysdep/amd64/amd64.cpp
@ -26,16 +26,3 @@ void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
 }

 #endif
-
-// note: ACPI processor detection not yet implemented here, so we treat
-// dual-core systems as multiprocessors.
-
-size_t cpu_NumPackages()
-{
-	return cpu_NumProcessors();
-}
-
-size_t cpu_CoresPerPackage()
-{
-	return 1;
-}
--- a/source/lib/sysdep/cpu.h
+++ b/source/lib/sysdep/cpu.h
@ -28,16 +28,6 @@ namespace ERR
 **/
 LIB_API const char* cpu_IdentifierString();

-/**
- * @return a rough estimate of the CPU clock frequency.
- *
- * note: the accuracy of this value is not important. while it is used by
- * the TSC timing backend, thermal drift is an issue that requires
- * continual recalibration anyway, which makes the initial accuracy moot.
- * querying frequency via OS is also much faster than ia32's measurement loop.
- **/
-LIB_API double cpu_ClockFrequency();
-

 //-----------------------------------------------------------------------------
 // lock-free support routines
--- a/source/lib/sysdep/win/manifest.cpp
+++ b/source/lib/sysdep/win/manifest.cpp
@ -11,7 +11,7 @@ to add the necessary parts to that generated manifest.
 ICC 10.1 IPO considers this string to be an input file, hence this
 is currently disabled there.
 */
-#if MSC_VERSION >= 1400 && !ICC_VERSION
+#if MSC_VERSION >= 1400 && !ICC_VERSION && defined(LIB_STATIC_LINK)
 # if ARCH_IA32
 #  pragma comment(linker, "\"/manifestdependency:type='win32' name='Microsoft.Windows.Common-Controls' version='6.0.0.0' processorArchitecture='X86' publicKeyToken='6595b64144ccf1df'\"")
 # elif ARCH_AMD64
--- a/source/lib/sysdep/win/whrt/counter.h
+++ b/source/lib/sysdep/win/whrt/counter.h
@ -46,7 +46,7 @@ public:

 	/**
 	 * initial measurement of the tick rate. not necessarily correct
-	 * (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+	 * (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 	 **/
 	virtual double NominalFrequency() const = 0;

--- a/source/lib/sysdep/win/whrt/hpet.cpp
+++ b/source/lib/sysdep/win/whrt/hpet.cpp
@ -113,7 +113,7 @@ size_t CounterHPET::CounterBits() const

 /**
 * initial measurement of the tick rate. not necessarily correct
- * (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+ * (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 double CounterHPET::NominalFrequency() const
 {
--- a/source/lib/sysdep/win/whrt/hpet.h
+++ b/source/lib/sysdep/win/whrt/hpet.h
@ -42,7 +42,7 @@ public:

 	/**
 	 * initial measurement of the tick rate. not necessarily correct
-	 * (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+	 * (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 	 **/
 	virtual double NominalFrequency() const;

--- a/source/lib/sysdep/win/whrt/pmt.cpp
+++ b/source/lib/sysdep/win/whrt/pmt.cpp
@ -84,7 +84,7 @@ size_t CounterPMT::CounterBits() const

 /**
 * initial measurement of the tick rate. not necessarily correct
- * (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+ * (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 double CounterPMT::NominalFrequency() const
 {
--- a/source/lib/sysdep/win/whrt/pmt.h
+++ b/source/lib/sysdep/win/whrt/pmt.h
@ -43,7 +43,7 @@ public:

 	/**
 	 * initial measurement of the tick rate. not necessarily correct
-	 * (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+	 * (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 	 **/
 	virtual double NominalFrequency() const;

--- a/source/lib/sysdep/win/whrt/qpc.cpp
+++ b/source/lib/sysdep/win/whrt/qpc.cpp
@ -11,7 +11,7 @@
 #include "precompiled.h"
 #include "qpc.h"

-#include "lib/sysdep/cpu.h"
+#include "lib/sysdep/os_cpu.h"
 #include "lib/sysdep/win/win.h"
 #include "lib/sysdep/win/wutil.h"	// wutil_argv
 #include "pit.h"	// PIT_FREQ
@ -65,10 +65,10 @@ bool CounterQPC::IsSafe() const
 	// used on MP HAL systems and can be detected by comparing QPF with the
 	// CPU clock. we consider it unsafe unless the user promises (via
 	// command line) that it's patched and thus reliable on their system.
-	bool usesTsc = IsSimilarMagnitude(m_frequency, cpu_ClockFrequency());
+	bool usesTsc = IsSimilarMagnitude(m_frequency, os_cpu_ClockFrequency());
 	// unconfirmed reports indicate QPC sometimes uses 1/3 of the
 	// CPU clock frequency, so check that as well.
-	usesTsc |= IsSimilarMagnitude(m_frequency, cpu_ClockFrequency()/3);
+	usesTsc |= IsSimilarMagnitude(m_frequency, os_cpu_ClockFrequency()/3);
 	if(usesTsc)
 	{
 		const bool isTscSafe = wutil_HasCommandLineArgument("-wQpcTscSafe");
@ -108,7 +108,7 @@ size_t CounterQPC::CounterBits() const

 /**
 * initial measurement of the tick rate. not necessarily correct
- * (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+ * (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 double CounterQPC::NominalFrequency() const
 {
--- a/source/lib/sysdep/win/whrt/qpc.h
+++ b/source/lib/sysdep/win/whrt/qpc.h
@ -41,7 +41,7 @@ public:

 	/**
 	 * initial measurement of the tick rate. not necessarily correct
-	 * (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+	 * (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 	 **/
 	virtual double NominalFrequency() const;

--- a/source/lib/sysdep/win/whrt/tgt.cpp
+++ b/source/lib/sysdep/win/whrt/tgt.cpp
@ -69,7 +69,7 @@ size_t CounterTGT::CounterBits() const

 /**
 * initial measurement of the tick rate. not necessarily correct
- * (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+ * (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 double CounterTGT::NominalFrequency() const
 {
--- a/source/lib/sysdep/win/whrt/tgt.h
+++ b/source/lib/sysdep/win/whrt/tgt.h
@ -36,7 +36,7 @@ public:

 	/**
 	 * initial measurement of the tick rate. not necessarily correct
-	 * (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+	 * (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 	 **/
 	virtual double NominalFrequency() const;

--- a/source/lib/sysdep/win/whrt/tsc.cpp
+++ b/source/lib/sysdep/win/whrt/tsc.cpp
@ -11,9 +11,10 @@
 #include "precompiled.h"
 #include "tsc.h"

-#include "lib/sysdep/cpu.h"
-#include "lib/sysdep/win/win.h"
 #include "lib/bits.h"
+#include "lib/sysdep/os_cpu.h"
+#include "lib/sysdep/win/win.h"
+#include "lib/sysdep/win/wutil.h"

 #if ARCH_IA32 || ARCH_AMD64
 # include "lib/sysdep/x86_x64/x86_x64.h"	// x86_x64_rdtsc
@ -96,8 +97,12 @@ bool CounterTSC::IsSafe() const
 	// per-core counter state and the abovementioned race condition.
 	// however, we won't bother, since such platforms aren't yet widespread
 	// and would surely support the nice and safe HPET, anyway)
-	if(cpu_NumPackages() != 1 || cpu_CoresPerPackage() != 1)
-		return false;
+	{
+		WinScopedLock lock(WHRT_CS);
+		const CpuTopology* topology = cpu_topology_Detect();
+		if(cpu_topology_NumPackages(topology) != 1 || cpu_topology_CoresPerPackage(topology) != 1)
+			return false;
+	}

 #if ARCH_IA32 || ARCH_AMD64
 	// recent CPU:
@ -154,9 +159,16 @@ size_t CounterTSC::CounterBits() const

 /**
 * initial measurement of the tick rate. not necessarily correct
- * (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+ * (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 **/
 double CounterTSC::NominalFrequency() const
 {
-	return cpu_ClockFrequency();
+	// WARNING: do not call x86_x64_ClockFrequency because it uses the
+	// HRT, which we're currently in the process of initializing.
+	// instead query CPU clock frequency via OS.
+	//
+	// note: even here, initial accuracy isn't critical because the
+	// clock is subject to thermal drift and would require continual
+	// recalibration anyway.
+	return os_cpu_ClockFrequency();
 }
--- a/source/lib/sysdep/win/whrt/tsc.h
+++ b/source/lib/sysdep/win/whrt/tsc.h
@ -36,7 +36,7 @@ public:

 	/**
 	 * initial measurement of the tick rate. not necessarily correct
-	 * (e.g. when using TSC: cpu_ClockFrequency isn't exact).
+	 * (e.g. when using TSC: os_cpu_ClockFrequency isn't exact).
 	 **/
 	virtual double NominalFrequency() const;
 };
--- a/source/lib/sysdep/win/wnuma.cpp
+++ b/source/lib/sysdep/win/wnuma.cpp
@ -7,107 +7,113 @@
 #include "win.h"
 #include "wutil.h"
 #include "wcpu.h"
+#include "winit.h"
 #include <Psapi.h>

-#ifdef _OPENMP
-# include <omp.h>
-#endif
+
+WINIT_REGISTER_EARLY_INIT(wnuma_Init);


 //-----------------------------------------------------------------------------
 // node topology
 //-----------------------------------------------------------------------------

-size_t numa_NumNodes()
+static size_t NumNodes()
 {
-	static size_t numNodes;
-
-	if(!numNodes)
+	typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
+	const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
+	const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");
+	if(pGetNumaHighestNodeNumber)
 	{
-		typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
-		const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
-		const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");
-		if(pGetNumaHighestNodeNumber)
-		{
-			ULONG highestNode;
-			const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
-			debug_assert(ok);
-			debug_assert(highestNode < os_cpu_NumProcessors());	// #nodes <= #processors
-			numNodes = highestNode+1;
-		}
-		// NUMA not supported
-		else
-			numNodes = 1;
+		ULONG highestNode;
+		const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
+		debug_assert(ok);
+		debug_assert(highestNode < os_cpu_NumProcessors());	// #nodes <= #processors
+		return highestNode+1;
 	}
-
-	return numNodes;
+	// NUMA not supported
+	else
+		return 1;
 }


-// note: it is easier to implement this in terms of numa_ProcessorMaskFromNode
+static void FillNodesProcessorMask(uintptr_t* nodesProcessorMask)
+{
+	typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
+	const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
+	const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");
+	if(pGetNumaNodeProcessorMask)
+	{
+		DWORD_PTR processAffinity, systemAffinity;
+		const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
+		debug_assert(ok);
+
+		for(size_t node = 0; node < numa_NumNodes(); node++)
+		{
+			ULONGLONG affinity;
+			const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
+			debug_assert(ok);
+			const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
+			nodesProcessorMask[node] = processorMask;
+		}
+	}
+	// NUMA not supported - consider node 0 to consist of all system processors
+	else
+		nodesProcessorMask[0] = os_cpu_ProcessorMask();
+}
+
+
+// note: it is easier to implement this in terms of nodesProcessorMask
 // rather than the other way around because wcpu provides the
 // wcpu_ProcessorMaskFromAffinity helper. there is no similar function to
 // convert processor to processorNumber.
-size_t numa_NodeFromProcessor(size_t processor)
+static void FillProcessorsNode(size_t numNodes, const uintptr_t* nodesProcessorMask, size_t* processorsNode)
 {
-	debug_assert(processor < os_cpu_NumProcessors());
-
-	static std::vector<size_t> processorsNode;
-#ifdef _OPENMP
-#pragma omp critical
-#endif
-	if(processorsNode.empty())
+	for(size_t node = 0; node < numNodes; node++)
 	{
-		processorsNode.resize(os_cpu_NumProcessors(), 0);
-		for(size_t node = 0; node < numa_NumNodes(); node++)
+		const uintptr_t processorMask = nodesProcessorMask[node];
+		for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
 		{
-			const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
-			for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
-			{
-				if(IsBitSet(processorMask, processor))
-					processorsNode[processor] = node;
-			}
+			if(IsBitSet(processorMask, processor))
+				processorsNode[processor] = node;
 		}
 	}
-
-	return processorsNode.at(processor);
 }


+//-----------------------------------------------------------------------------
+// node topology interface
+
+struct NodeTopology	// POD
+{
+	size_t numNodes;
+	size_t processorsNode[os_cpu_MaxProcessors];
+	uintptr_t nodesProcessorMask[os_cpu_MaxProcessors];
+};
+static NodeTopology s_nodeTopology;
+
+static void DetectNodeTopology()
+{
+	s_nodeTopology.numNodes = NumNodes();
+	FillNodesProcessorMask(s_nodeTopology.nodesProcessorMask);
+	FillProcessorsNode(s_nodeTopology.numNodes, s_nodeTopology.nodesProcessorMask, s_nodeTopology.processorsNode);
+}
+
+size_t numa_NumNodes()
+{
+	return s_nodeTopology.numNodes;
+}
+
+size_t numa_NodeFromProcessor(size_t processor)
+{
+	debug_assert(processor < os_cpu_NumProcessors());
+	return s_nodeTopology.processorsNode[processor];
+}
+
 uintptr_t numa_ProcessorMaskFromNode(size_t node)
 {
-	debug_assert(node < numa_NumNodes());
-
-	static std::vector<uintptr_t> nodesProcessorMask;
-#ifdef _OPENMP
-#pragma omp critical
-#endif
-	if(nodesProcessorMask.empty())
-	{
-		typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
-		const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
-		const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");
-		if(pGetNumaNodeProcessorMask)
-		{
-			DWORD_PTR processAffinity, systemAffinity;
-			const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
-			debug_assert(ok);
-
-			for(size_t node = 0; node < numa_NumNodes(); node++)
-			{
-				ULONGLONG affinity;
-				const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
-				debug_assert(ok);
-				const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
-				nodesProcessorMask.push_back(processorMask);
-			}
-		}
-		// NUMA not supported - consider node 0 to consist of all system processors
-		else
-			nodesProcessorMask.push_back(os_cpu_ProcessorMask());
-	}
-
-	return nodesProcessorMask.at(node);
+	debug_assert(node < s_nodeTopology.numNodes);
+	return s_nodeTopology.nodesProcessorMask[node];
 }


@ -145,16 +151,10 @@ size_t numa_AvailableMemory(size_t node)

 double numa_Factor()
 {
+	WinScopedLock lock(WNUMA_CS);
 	static double factor;
-
-	static bool initialized;
-#ifdef _OPENMP
-#pragma omp critical
-#endif
-	if(!initialized)
+	if(factor == 0.0)
 	{
-		initialized = true;
-
 		// if non-NUMA, skip the (expensive) measurements below.
 		if(numa_NumNodes() == 1)
 			factor = 1.0;
@ -357,3 +357,12 @@ void numa_Deallocate(void* mem)
 {
 	VirtualFree(mem, 0, MEM_RELEASE);
 }
+
+
+//-----------------------------------------------------------------------------
+
+static LibError wnuma_Init()
+{
+	DetectNodeTopology();
+	return INFO::OK;
+}
--- a/source/lib/sysdep/win/wposix/werrno.h
+++ b/source/lib/sysdep/win/wposix/werrno.h
@ -58,6 +58,8 @@

 // defined by winsock2 and also Linux (with different values)
 // (values derived from winsock2 WSA* constants minus WSABASEERR)
+// update: disabled on newer Boost versions because filesystem drags in boost/cerrno.hpp
+#if BOOST_VERSION <= 103401
 #define EWOULDBLOCK     35
 #define EINPROGRESS     36
 #define EALREADY        37
@ -84,6 +86,7 @@
 #define EHOSTUNREACH    65
 #define EDQUOT          69
 #define ESTALE          70
+#endif

 // defined by winsock2 but not Linux
 // (commented out because they're not portable)
--- a/source/lib/sysdep/win/wposix/wmman.cpp
+++ b/source/lib/sysdep/win/wposix/wmman.cpp
@ -38,7 +38,7 @@ static DWORD win32_prot(int prot)
 	NODEFAULT;
 	}

-	UNREACHABLE;
+	return 0;	// UNREACHABLE
 }


@ -176,6 +176,13 @@ static LibError mmap_file(void* start, size_t len, int prot, int flags, int fd,

 void* mmap(void* start, size_t len, int prot, int flags, int fd, off_t ofs)
 {
+	if(len == 0)	// POSIX says this must cause mmap to fail
+	{
+		debug_assert(0);
+		errno = EINVAL;
+		return MAP_FAILED;
+	}
+
 	void* p;
 	LibError err;
 	if(flags & MAP_ANONYMOUS)
--- a/source/lib/sysdep/win/wseh.cpp
+++ b/source/lib/sysdep/win/wseh.cpp
@ -330,7 +330,7 @@ C++ classes. this way is more reliable/documented, but has several drawbacks:

 */

-#ifndef LIB_DLL
+#ifdef LIB_STATIC_LINK

 EXTERN_C int mainCRTStartup();

--- a/source/lib/sysdep/win/wutil.cpp
+++ b/source/lib/sysdep/win/wutil.cpp
@ -413,7 +413,7 @@ WinScopedDisableWow64Redirection::~WinScopedDisableWow64Redirection()
 //-----------------------------------------------------------------------------
 // module handle

-#ifdef LIB_DLL
+#ifndef LIB_STATIC_LINK

 HMODULE wutil_LibModuleHandle;

--- a/source/lib/sysdep/win/wutil.h
+++ b/source/lib/sysdep/win/wutil.h
@ -35,6 +35,8 @@ enum WinLockId
 {
 	WAIO_CS,
 	WDBG_SYM_CS,	// protects (non-reentrant) dbghelp.dll
+	WHRT_CS,
+	WNUMA_CS,

 	NUM_CS
 };
--- a/source/lib/sysdep/x86_x64/topology.cpp
+++ b/source/lib/sysdep/x86_x64/topology.cpp
@ -16,6 +16,7 @@
 #include "lib/sysdep/os_cpu.h"
 #include "x86_x64.h"

+
 //-----------------------------------------------------------------------------
 // detect *maximum* number of cores/packages/caches.
 // note: some of them may be disabled by the OS or BIOS.
@ -143,20 +144,22 @@ static size_t LogicalPerCache()

 /**
 * @return an array of the processors' unique APIC IDs or zero if
- * no APIC is present or process affinity is limited.
+ * no xAPIC is present or process affinity is limited.
 **/
 static const u8* ApicIds()
 {
-	static u8 apicIdStorage[os_cpu_MaxProcessors];
-	static const u8* apicIds;
+	const u8* const uninitialized = (const u8*)1;
+	static const u8* apicIds = uninitialized;

-	static volatile uintptr_t initialized = 0;
-	if(cpu_CAS(&initialized, 0, 1))
+	if(apicIds == uninitialized)
 	{
-		// requires 'new' APIC (see x86_x64_ApicId for details)
+		apicIds = 0;	// return zero from now on unless the below succeeds
+
+		// requires xAPIC (see x86_x64_ApicId for details)
 		if(x86_x64_Generation() >= 8)
 		{
 			// store each processor's APIC ID in turn
+			static u8 apicIdStorage[os_cpu_MaxProcessors];
 			struct StoreApicId
 			{
 				static void Callback(size_t processor, uintptr_t UNUSED(cbData))
@ -200,89 +203,109 @@ static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t nu
 }


-size_t cpu_NumPackages()
+static size_t NumPackages(const u8* apicIds)
 {
-	static size_t numPackages = 0;
-
-	if(!numPackages)
+	if(apicIds)
 	{
-		const u8* apicIds = ApicIds();
-		if(apicIds)
-		{
-			const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
-			const size_t numBits = 8;
-			numPackages = NumUniqueValuesInField(apicIds, offset, numBits);
-		}
-		else
-		{
-			// note: correct results cannot be guaranteed because unreported
-			// and disable logical units are indistinguishable. the below
-			// assumptions are reasonable because we care most about packages
-			// (i.e. whether the system is truly SMP). in contrast, it is
-			// safe to overestimate the number of cores because that
-			// only determines if memory barriers are needed or not.
-			// note: requiring modern processors featuring an APIC does not
-			// prevent this from being reached (the cause may be lack of
-			// OS support or restricted process affinity).
-
-			// assume cores are enabled and count as processors.
-			const size_t numPackagesTimesLogical = os_cpu_NumProcessors() / CoresPerPackage();
-			debug_assert(numPackagesTimesLogical != 0);
-			// assume hyperthreads are enabled; check if they count as processors.
-			if(numPackagesTimesLogical > LogicalPerCore())
-				numPackages = numPackagesTimesLogical / LogicalPerCore();
-		}
+		const size_t offset = ceil_log2(CoresPerPackage()) + ceil_log2(LogicalPerCore());
+		const size_t numBits = 8;
+		return NumUniqueValuesInField(apicIds, offset, numBits);
 	}
+	else
+	{
+		// note: correct results cannot be guaranteed because unreported
+		// and disable logical units are indistinguishable. the below
+		// assumptions are reasonable because we care most about packages
+		// (i.e. whether the system is truly SMP). in contrast, it is
+		// safe to overestimate the number of cores because that
+		// only determines if memory barriers are needed or not.
+		// note: requiring modern processors featuring an APIC does not
+		// prevent this from being reached (the cause may be lack of
+		// OS support or restricted process affinity).

-	return numPackages;
+		// assume cores are enabled and count as processors.
+		const size_t numPackagesTimesLogical = os_cpu_NumProcessors() / CoresPerPackage();
+		debug_assert(numPackagesTimesLogical != 0);
+		// assume hyperthreads are enabled.
+		size_t numPackages = numPackagesTimesLogical;
+		// if they are reported as processors, remove them from the count.
+		if(numPackages > LogicalPerCore())
+			numPackages /= LogicalPerCore();
+		return numPackages;
+	}
 }


-size_t cpu_CoresPerPackage()
+static size_t CoresPerPackage(const u8* apicIds)
 {
-	static size_t enabledCoresPerPackage;
-
-	if(!enabledCoresPerPackage)
+	if(apicIds)
 	{
-		const u8* apicIds = ApicIds();
-		if(apicIds)
-		{
-			const size_t offset = ceil_log2(LogicalPerCore());
-			const size_t numBits = ceil_log2(CoresPerPackage());
-			enabledCoresPerPackage = NumUniqueValuesInField(apicIds, offset, numBits);
-		}
-		else
-		{
-			// guess (must match cpu_NumPackages's assumptions)
-			enabledCoresPerPackage = CoresPerPackage();
-		}
+		const size_t offset = ceil_log2(LogicalPerCore());
+		const size_t numBits = ceil_log2(CoresPerPackage());
+		return NumUniqueValuesInField(apicIds, offset, numBits);
+	}
+	else
+	{
+		// guess (must match NumPackages's assumptions)
+		return CoresPerPackage();
 	}
-
-	return enabledCoresPerPackage;
 }


-size_t cpu_LogicalPerCore()
+static size_t LogicalPerCore(const u8* apicIds)
 {
-	static size_t enabledLogicalPerCore;
+	if(apicIds)
+	{
+		const size_t offset = 0;
+		const size_t numBits = ceil_log2(LogicalPerCore());
+		return NumUniqueValuesInField(apicIds, offset, numBits);
+	}
+	else
+	{
+		// guess (must match NumPackages's assumptions)
+		return LogicalPerCore();
+	}
+}

-	if(!enabledLogicalPerCore)
+
+//-----------------------------------------------------------------------------
+// CPU topology interface
+
+struct CpuTopology	// POD
+{
+	size_t numPackages;
+	size_t coresPerPackage;
+	size_t logicalPerCore;
+};
+
+const CpuTopology* cpu_topology_Detect()
+{
+	static CpuTopology topology;
+
+	if(!topology.numPackages)
 	{
 		const u8* apicIds = ApicIds();
-		if(apicIds)
-		{
-			const size_t offset = 0;
-			const size_t numBits = ceil_log2(LogicalPerCore());
-			enabledLogicalPerCore = NumUniqueValuesInField(apicIds, offset, numBits);
-		}
-		else
-		{
-			// guess (must match cpu_NumPackages's assumptions)
-			enabledLogicalPerCore = LogicalPerCore();
-		}
+		topology.numPackages = NumPackages(apicIds);
+		topology.coresPerPackage = CoresPerPackage(apicIds);
+		topology.logicalPerCore = LogicalPerCore(apicIds);
 	}

-	return enabledLogicalPerCore;
+	return &topology;
+}
+
+size_t cpu_topology_NumPackages(const CpuTopology* topology)
+{
+	return topology->numPackages;
+}
+
+size_t cpu_topology_CoresPerPackage(const CpuTopology* topology)
+{
+	return topology->coresPerPackage;
+}
+
+size_t cpu_topology_LogicalPerCore(const CpuTopology* topology)
+{
+	return topology->logicalPerCore;
 }


@ -293,29 +316,22 @@ size_t cpu_LogicalPerCore()
 // functionality but returns incorrect results. (it claims all cores in
 // an Intel Core2 Quad processor share a single L2 cache.)

-size_t cpu_NumCaches()
+static size_t NumCaches(const u8* apicIds)
 {
-	static size_t numCaches;
-	if(!numCaches)
+	if(apicIds)
 	{
-		const u8* apicIds = ApicIds();
-		if(apicIds)
-		{
-			const size_t offset = 0;
-			const size_t numBits = ceil_log2(LogicalPerCache());
-			numCaches = NumUniqueValuesInField(apicIds, offset, numBits);
-		}
-		else
-		{
-			// assume each processor has its own cache
-			numCaches = os_cpu_NumProcessors();
-		}
+		const size_t offset = 0;
+		const size_t numBits = ceil_log2(LogicalPerCache());
+		return NumUniqueValuesInField(apicIds, offset, numBits);
+	}
+	else
+	{
+		// assume each processor has its own cache
+		return os_cpu_NumProcessors();
 	}
-
-	return numCaches;
 }

-class CacheTopology
+class CacheRelations
 {
 public:
 	/**
@ -388,64 +404,87 @@ private:
 	std::vector<SharedCache> m_caches;
 };

-uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
+static void DetermineCachesProcessorMask(const u8* apicIds, uintptr_t* cachesProcessorMask)
 {
-	static uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
+	if(apicIds)
+	{
+		const size_t numBits = ceil_log2(LogicalPerCache());
+		const u8 cacheIdMask = u8(0xFF << numBits);

-	static volatile uintptr_t initialized = 0;
-	if(cpu_CAS(&initialized, 0, 1))
+		CacheRelations cacheRelations;
+		for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
+		{
+			const u8 apicId = apicIds[processor];
+			const u8 cacheId = apicId & cacheIdMask;
+			cacheRelations.Add(cacheId, processor);
+		}
+		cacheRelations.StoreProcessorMasks(cachesProcessorMask);
+	}
+	else
+	{
+		// assume each processor has exactly one cache with matching IDs
+		for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
+			cachesProcessorMask[processor] = uintptr_t(1) << processor;
+	}
+}
+
+
+static void DetermineProcessorsCache(size_t numCaches, const uintptr_t* cachesProcessorMask, size_t* processorsCache)
+{
+	for(size_t cache = 0; cache < numCaches; cache++)
+	{
+		// write to all entries that share this cache
+		const uintptr_t processorMask = cachesProcessorMask[cache];
+		for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
+		{
+			if(IsBitSet(processorMask, processor))
+			{
+				debug_assert(processorsCache[processor] == 0);
+				processorsCache[processor] = cache;
+			}
+		}
+	}
+}
+
+
+//-----------------------------------------------------------------------------
+// cache topology interface
+
+struct CacheTopology	// POD
+{
+	size_t numCaches;
+	size_t processorsCache[os_cpu_MaxProcessors];
+	uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
+};
+
+const CacheTopology* cache_topology_Detect()
+{
+	static CacheTopology topology;
+
+	if(!topology.numCaches)
 	{
 		const u8* apicIds = ApicIds();
-		if(apicIds)
-		{
-			const size_t numBits = ceil_log2(LogicalPerCache());
-			const u8 cacheIdMask = u8(0xFF << numBits);
-
-			CacheTopology cacheManager;
-			for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
-			{
-				const u8 apicId = apicIds[processor];
-				const u8 cacheId = apicId & cacheIdMask;
-				cacheManager.Add(cacheId, processor);
-			}
-			cacheManager.StoreProcessorMasks(cachesProcessorMask);
-		}
-		else
-		{
-			// assume each cache belongs to exactly one processor and
-			// cache index == processor index.
-			for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
-				cachesProcessorMask[cache] = uintptr_t(1) << cache;
-		}
+		topology.numCaches = NumCaches(apicIds);
+		DetermineCachesProcessorMask(apicIds, topology.cachesProcessorMask);
+		DetermineProcessorsCache(topology.numCaches, topology.cachesProcessorMask, topology.processorsCache);
 	}

-	debug_assert(cache < cpu_NumCaches());
-	return cachesProcessorMask[cache];
+	return &topology;
 }

-
-size_t cpu_CacheFromProcessor(size_t processor)
+size_t cache_topology_NumCaches(const CacheTopology* topology)
 {
-	static size_t processorsCache[os_cpu_MaxProcessors];
-
-	static volatile uintptr_t initialized = 0;
-	if(cpu_CAS(&initialized, 0, 1))
-	{
-		for(size_t cache = 0; cache < cpu_NumCaches(); cache++)
-		{
-			// write to all entries that share this cache
-			const uintptr_t processorMask = cpu_ProcessorMaskFromCache(cache);
-			for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
-			{
-				if(IsBitSet(processorMask, processor))
-				{
-					debug_assert(processorsCache[processor] == 0);
-					processorsCache[processor] = cache;
-				}
-			}
-		}
-	}
-
-	debug_assert(processor < os_cpu_NumProcessors());
-	return processorsCache[processor];
+	return topology->numCaches;
+}
+
+size_t cache_topology_CacheFromProcessor(const CacheTopology* topology, size_t processor)
+{
+	debug_assert(processor < os_cpu_NumProcessors());
+	return topology->processorsCache[processor];
+}
+
+uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology* topology, size_t cache)
+{
+	debug_assert(cache < topology->numCaches);
+	return topology->cachesProcessorMask[cache];
 }
--- a/source/lib/sysdep/x86_x64/topology.h
+++ b/source/lib/sysdep/x86_x64/topology.h
@ -11,53 +11,91 @@
 #ifndef INCLUDED_TOPOLOGY
 #define INCLUDED_TOPOLOGY

-//-----------------------------------------------------------------------------
-// CPU
+// interface rationale:
+// - explicit initialization avoids the difficulty and overhead of
+//   thread-safe lazy initialization checks.
+// - requiring an opaque struct to be passed in ensures users call the
+//   init function before using the accessors.
+// - delegating responsibility for thread-safety to the caller of the
+//   first *_Detect invocation avoids overhead and keeps us independent of
+//   the various threading packages (Boost, OpenMP, POSIX, Win32, ..)

-// OSes typically consider both SMT units and cores to be "processors".
-// the following routines determine how many of each are actually present and
-// enabled. this information is useful for detecting SMP systems, predicting
-// performance and dimensioning thread pools.
+
+//-----------------------------------------------------------------------------
+// cpu
+
+/**
+ * stores CPU topology, i.e. how many packages, cores and SMT units are
+ * actually present and enabled. this is useful for detecting SMP systems,
+ * predicting performance and dimensioning thread pools.
+ *
+ * note: OS abstractions usually only mention "processors", which could be
+ * any mix of the above.
+ **/
+struct CpuTopology;
+
+/**
+ * initialize static storage from which topology can be retrieved by
+ * means of the following functions.
+ * @return const pointer to a shared instance.
+ *
+ * WARNING: this function must not be reentered before it has returned once.
+ **/
+LIB_API const CpuTopology* cpu_topology_Detect();

 /**
 * @return number of *enabled* CPU packages / sockets.
 **/
-LIB_API size_t cpu_NumPackages();
+LIB_API size_t cpu_topology_NumPackages(const CpuTopology*);

 /**
 * @return number of *enabled* CPU cores per package.
 * (2 on dual-core systems)
 **/
-LIB_API size_t cpu_CoresPerPackage();
+LIB_API size_t cpu_topology_CoresPerPackage(const CpuTopology*);

 /**
 * @return number of *enabled* hyperthreading units per core.
 * (2 on P4 EE)
 **/
-LIB_API size_t cpu_LogicalPerCore();
+LIB_API size_t cpu_topology_LogicalPerCore(const CpuTopology*);


 //-----------------------------------------------------------------------------
 // L2 cache

-// some CPU micro-architectures (e.g. Intel Core2) feature partitioned
-// L2 caches. if the cores sharing a cache work together on the same
-// sub-problem, contention may be reduced and effective capacity increased.
-// the following routines allow discovery of the L2 cache topology:
+/**
+ * stores L2 cache topology, i.e. the mapping between processors and caches.
+ * this allows cores sharing a cache to work together on the same dataset,
+ * which may reduce contention and increase effective capacity.
+ *
+ * example: some CPU micro-architectures (e.g. Intel Core2) feature
+ * partitioned L2 caches shared by two cores.
+ **/
+struct CacheTopology;
+
+/**
+ * initialize static storage from which topology can be retrieved by
+ * means of the following functions.
+ * @return const pointer to a shared instance.
+ *
+ * WARNING: this function must not be reentered before it has returned once.
+ **/
+LIB_API const CacheTopology* cache_topology_Detect();

 /**
 * @return number of distinct L2 caches
 **/
-LIB_API size_t cpu_NumCaches();
+LIB_API size_t cache_topology_NumCaches(const CacheTopology*);

 /**
 * @return L2 cache number (zero-based) to which <processor> belongs.
 **/
-LIB_API size_t cpu_CacheFromProcessor(size_t processor);
+LIB_API size_t cache_topology_CacheFromProcessor(const CacheTopology*, size_t processor);

 /**
 * @return bit-mask of all processors sharing <cache>.
 **/
-LIB_API uintptr_t cpu_ProcessorMaskFromCache(size_t cache);
+LIB_API uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology*, size_t cache);

 #endif	// #ifndef INCLUDED_TOPOLOGY
--- a/source/lib/sysdep/x86_x64/x86_x64.cpp
+++ b/source/lib/sysdep/x86_x64/x86_x64.cpp
@ -338,6 +338,65 @@ const char* cpu_IdentifierString()
 }


+//-----------------------------------------------------------------------------
+// misc stateless functions
+
+u8 x86_x64_ApicId()
+{
+	x86_x64_CpuidRegs regs;
+	regs.eax = 1;
+	// note: CPUID function 1 should be available everywhere, but only
+	// processors with an xAPIC (8th generation or above, e.g. P4/Athlon XP)
+	// will return a nonzero value.
+	if(!x86_x64_cpuid(&regs))
+		DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
+	const u8 apicId = (u8)bits(regs.ebx, 24, 31);
+	return apicId;
+}
+
+
+u64 x86_x64_rdtsc()
+{
+#if MSC_VERSION
+	return (u64)__rdtsc();
+#elif GCC_VERSION
+	// GCC supports "portable" assembly for both x86 and x64
+	volatile u32 lo, hi;
+	asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
+	return u64_from_u32(hi, lo);
+#endif
+}
+
+
+void x86_x64_DebugBreak()
+{
+#if MSC_VERSION
+	__debugbreak();
+#elif GCC_VERSION
+	// note: this probably isn't necessary, since unix_debug_break
+	// (SIGTRAP) is most probably available if GCC_VERSION.
+	// we include it for completeness, though.
+	__asm__ __volatile__ ("int $3");
+#endif
+}
+
+
+// enforce strong memory ordering.
+void cpu_MemoryFence()
+{
+	if(x86_x64_cap(X86_X64_CAP_SSE2))
+		_mm_mfence();
+}
+
+
+void cpu_Serialize()
+{
+	x86_x64_CpuidRegs regs;
+	regs.eax = 1;
+	x86_x64_cpuid(&regs);	// CPUID serializes execution.
+}
+
+
 //-----------------------------------------------------------------------------
 // CPU frequency

@ -367,10 +426,8 @@ public:
 };

 // note: this function uses timer.cpp!timer_Time, which is implemented via
-// whrt.cpp on Windows, which again calls x86_x64_Init. be careful that
-// this function isn't called from there as well, else WHRT will be used
-// before its init completes.
-double cpu_ClockFrequency()
+// whrt.cpp on Windows.
+double x86_x64_ClockFrequency()
 {
 	// if the TSC isn't available, there's really no good way to count the
 	// actual CPU clocks per known time interval, so bail.
@ -447,59 +504,3 @@ double cpu_ClockFrequency()
 	const double clock_frequency = sum / (hi-lo);
 	return clock_frequency;
 }
-
-
-//-----------------------------------------------------------------------------
-// misc stateless functions
-
-u8 x86_x64_ApicId()
-{
-	x86_x64_CpuidRegs regs;
-	regs.eax = 1;
-	if(!x86_x64_cpuid(&regs))
-		DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
-	const u8 apicId = (u8)bits(regs.ebx, 24, 31);
-	return apicId;
-}
-
-
-u64 x86_x64_rdtsc()
-{
-#if MSC_VERSION
-	return (u64)__rdtsc();
-#elif GCC_VERSION
-	// GCC supports "portable" assembly for both x86 and x64
-	volatile u32 lo, hi;
-	asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
-	return u64_from_u32(hi, lo);
-#endif
-}
-
-
-void x86_x64_DebugBreak()
-{
-#if MSC_VERSION
-	__debugbreak();
-#elif GCC_VERSION
-	// note: this probably isn't necessary, since unix_debug_break
-	// (SIGTRAP) is most probably available if GCC_VERSION.
-	// we include it for completeness, though.
-	__asm__ __volatile__ ("int $3");
-#endif
-}
-
-
-// enforce strong memory ordering.
-void cpu_MemoryFence()
-{
-	if(x86_x64_cap(X86_X64_CAP_SSE2))
-		_mm_mfence();
-}
-
-
-void cpu_Serialize()
-{
-	x86_x64_CpuidRegs regs;
-	regs.eax = 1;
-	x86_x64_cpuid(&regs);	// CPUID serializes execution.
-}
--- a/source/lib/sysdep/x86_x64/x86_x64.h
+++ b/source/lib/sysdep/x86_x64/x86_x64.h
@ -100,13 +100,12 @@ LIB_API bool x86_x64_cap(x86_x64_Cap cap);
 // stateless

 /**
- * @return APIC ID of the currently executing processor.
+ * @return APIC ID of the currently executing processor or zero if the
+ * platform does not have an xAPIC (i.e. 7th generation x86 or below).
 *
- * the implementation uses CPUID.1 and only works on >= 8th generation CPUs;
- * (P4/Athlon XP); otherwise it returns 0. the alternative of accessing the
- * APIC mmio registers is not feasible - mahaf_MapPhysicalMemory only works
- * reliably on WinXP. also, the OS already has the APIC registers mapped and
- * in constant use, and we don't want to interfere.
+ * rationale: the alternative of accessing the APIC mmio registers is not
+ * feasible - mahaf_MapPhysicalMemory only works reliably on WinXP. we also
+ * don't want to interfere with the OS's constant use of the APIC registers.
 **/
 LIB_API u8 x86_x64_ApicId();

@ -122,4 +121,12 @@ LIB_API u64 x86_x64_rdtsc();
 **/
 LIB_API void x86_x64_DebugBreak(void);

+/**
+ * measure the CPU clock frequency via x86_x64_rdtsc and timer_Time.
+ * (it follows that this must not be called from WHRT init.)
+ * this takes several milliseconds (i.e. much longer than
+ * os_cpu_ClockFrequency) but delivers accurate measurements.
+ **/
+LIB_API double x86_x64_ClockFrequency();
+
 #endif	// #ifndef INCLUDED_X86_X64
--- a/source/network/NetLog.cpp
+++ b/source/network/NetLog.cpp
@ -141,7 +141,7 @@ void CNetLogSink::DoSink( const CNetLogEvent& event )
 //-----------------------------------------------------------------------------
 void CNetLogSink::DoBulkSink( const CNetLogEvent* pEvents, size_t eventCount )
 {
-	unsigned*	pIndices	= NULL;
+	size_t*	pIndices	= NULL;
 	size_t	indexCount  = 0;
 	size_t	i;

--- a/source/ps/GameSetup/GameSetup.cpp
+++ b/source/ps/GameSetup/GameSetup.cpp
@ -577,7 +577,7 @@ static void InitVfs(const CmdLineArgs& args)
 	// the VFS prevents any accesses to files above this directory.
 	path_SetRoot(args.GetArg0(), "../data");

-	g_VFS = CreateVfs();
+	g_VFS = CreateVfs(96*MiB);

 	g_VFS->Mount("screenshots/", "screenshots");
 	g_VFS->Mount("config/", "config");
--- a/source/ps/Util.cpp
+++ b/source/ps/Util.cpp
@ -75,8 +75,9 @@ void WriteSystemInfo()
 	fprintf(f, "OS             : %s %s (%s)\n", un.sysname, un.release, un.version);

 	// CPU
-	fprintf(f, "CPU            : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), cpu_NumPackages(), cpu_CoresPerPackage(), cpu_LogicalPerCore());
-	const double cpu_freq = cpu_ClockFrequency();
+	const CpuTopology* topology = cpu_topology_Detect();
+	fprintf(f, "CPU            : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), cpu_topology_NumPackages(topology), cpu_topology_CoresPerPackage(topology), cpu_topology_LogicalPerCore(topology));
+	const double cpu_freq = os_cpu_ClockFrequency();
 	if(cpu_freq != 0.0f)
 	{
 		if(cpu_freq < 1e9)
--- a/source/ps/XML/tests/test_Xeromyces.h
+++ b/source/ps/XML/tests/test_Xeromyces.h
@ -10,7 +10,7 @@ public:
 	void test_paths()
 	{
 		TS_ASSERT_OK(path_SetRoot(0, "../data"));
-		PIVFS vfs = CreateVfs();
+		PIVFS vfs = CreateVfs(20*MiB);

 		TS_ASSERT_OK(vfs->Mount("", "mods/_test.xero"));