diff --git a/source/lib/sysdep/arch/ia32/ia32.cpp b/source/lib/sysdep/arch/ia32/ia32.cpp
index af54b9183e..7710951956 100644
--- a/source/lib/sysdep/arch/ia32/ia32.cpp
+++ b/source/lib/sysdep/arch/ia32/ia32.cpp
@@ -158,6 +158,11 @@ bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value)
 	return ia32_asm_CAS(location, expected, new_value);
 }
 
+bool cpu_CAS64(volatile u64* location, u64 expected, u64 new_value)
+{
+	return ia32_asm_CAS64(location, expected, new_value);
+}
+
 
 void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size)
 {
diff --git a/source/lib/sysdep/arch/ia32/ia32_asm.asm b/source/lib/sysdep/arch/ia32/ia32_asm.asm
index a0b218b2cf..1f5addb997 100644
--- a/source/lib/sysdep/arch/ia32/ia32_asm.asm
+++ b/source/lib/sysdep/arch/ia32/ia32_asm.asm
@@ -92,6 +92,25 @@ db	0xf0	; LOCK prefix
 	ret
 
 
+; extern bool CALL_CONV ia32_asm_CAS64(volatile u64* location, u64 expected, u64 new_value);
+global sym(ia32_asm_CAS64)
+sym(ia32_asm_CAS64):
+	push	ebx
+	push	esi
+	mov	esi, [esp+8+4]	; location
+	mov	eax, [esp+8+8]
+	mov	edx, [esp+8+12]	; edx:eax = expected
+	mov	ebx, [esp+8+16]
+	mov	ecx, [esp+8+20]	; ecx:ebx = new_value
+db	0xf0	; LOCK prefix
+	cmpxchg8b	[esi]
+	sete	al
+	movzx	eax, al
+	pop	esi
+	pop	ebx
+	ret
+
+
 ;-------------------------------------------------------------------------------
 ; FPU
 ;-------------------------------------------------------------------------------
diff --git a/source/lib/sysdep/arch/ia32/ia32_asm.h b/source/lib/sysdep/arch/ia32/ia32_asm.h
index 78b15731a4..2ac7c18e15 100644
--- a/source/lib/sysdep/arch/ia32/ia32_asm.h
+++ b/source/lib/sysdep/arch/ia32/ia32_asm.h
@@ -36,6 +36,7 @@ extern void CALL_CONV ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
 
 extern intptr_t CALL_CONV ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
 extern bool CALL_CONV ia32_asm_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value);
+extern bool CALL_CONV ia32_asm_CAS64(volatile u64* location, u64 expected, u64 new_value);
 
 /// control87
 // FPU control word
diff --git a/source/lib/sysdep/arch/x86_x64/tests/test_topology.h b/source/lib/sysdep/arch/x86_x64/tests/test_topology.h
index 9993238446..e118e11fb2 100644
--- a/source/lib/sysdep/arch/x86_x64/tests/test_topology.h
+++ b/source/lib/sysdep/arch/x86_x64/tests/test_topology.h
@@ -29,8 +29,8 @@ class TestTopology : public CxxTest::TestSuite
 public:
 	void test_run()
 	{
-		// Just run the function, ignoring the return value, so
-		// Valgrind can check it's not doing anything very bad
-		cpu_topology_Detect();
+		TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_NumPackages());
+		TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_CoresPerPackage());
+		TS_ASSERT_LESS_THAN_EQUALS(1, cpu_topology_LogicalPerCore());
 	}
 };
diff --git a/source/lib/sysdep/arch/x86_x64/topology.cpp b/source/lib/sysdep/arch/x86_x64/topology.cpp
index e14a2dea95..4e3d475555 100644
--- a/source/lib/sysdep/arch/x86_x64/topology.cpp
+++ b/source/lib/sysdep/arch/x86_x64/topology.cpp
@@ -105,9 +105,9 @@ static size_t MaxLogicalPerCore()
 
 static size_t MaxLogicalPerCache()
 {
-	const x86_x64_Cache* const dcache = x86_x64_DCache();
-	if(dcache->levels >= 2)
-		return dcache->parameters[1].sharedBy;
+	const x86_x64_Caches* const dcaches = x86_x64_DCaches();
+	if(dcaches->numLevels >= 2)
+		return dcaches->levels[1].sharedBy;
 	else
 		return 1;	// default
 }
@@ -204,8 +204,9 @@ static size_t NumUniqueValuesInField(const u8* apicIds, size_t offset, size_t nu
 }
 
 
-static size_t NumPackages(const u8* apicIds)
+size_t cpu_topology_NumPackages()
 {
+	const u8* apicIds = ApicIds();
 	if(apicIds)
 	{
 		const size_t offset = ceil_log2(MaxCoresPerPackage()) + ceil_log2(MaxLogicalPerCore());
@@ -236,8 +237,9 @@ static size_t NumPackages(const u8* apicIds)
 }
 
 
-static size_t CoresPerPackage(const u8* apicIds)
+size_t cpu_topology_CoresPerPackage()
 {
+	const u8* apicIds = ApicIds();
 	if(apicIds)
 	{
 		const size_t offset = ceil_log2(MaxLogicalPerCore());
@@ -251,8 +253,9 @@ static size_t CoresPerPackage(const u8* apicIds)
 }
 
 
-static size_t LogicalPerCore(const u8* apicIds)
+size_t cpu_topology_LogicalPerCore()
 {
+	const u8* apicIds = ApicIds();
 	if(apicIds)
 	{
 		const size_t offset = 0;
@@ -266,49 +269,6 @@ static size_t LogicalPerCore(const u8* apicIds)
 }
 
 
-//-----------------------------------------------------------------------------
-// CPU topology interface
-
-struct CpuTopology	// POD
-{
-	size_t numPackages;
-	size_t coresPerPackage;
-	size_t logicalPerCore;
-};
-static CpuTopology cpuTopology;
-
-static LibError InitCpuTopology()
-{
-	const u8* apicIds = ApicIds();
-	cpuTopology.numPackages = NumPackages(apicIds);
-	cpuTopology.coresPerPackage = CoresPerPackage(apicIds);
-	cpuTopology.logicalPerCore = LogicalPerCore(apicIds);
-	return INFO::OK;
-}
-
-const CpuTopology* cpu_topology_Detect()
-{
-	static ModuleInitState initState;
-	ModuleInit(&initState, InitCpuTopology);
-	return &cpuTopology;
-}
-
-size_t cpu_topology_NumPackages(const CpuTopology* topology)
-{
-	return topology->numPackages;
-}
-
-size_t cpu_topology_CoresPerPackage(const CpuTopology* topology)
-{
-	return topology->coresPerPackage;
-}
-
-size_t cpu_topology_LogicalPerCore(const CpuTopology* topology)
-{
-	return topology->logicalPerCore;
-}
-
-
 //-----------------------------------------------------------------------------
 // cache topology
 
@@ -451,6 +411,7 @@ struct CacheTopology	// POD
 	uintptr_t cachesProcessorMask[os_cpu_MaxProcessors];
 };
 static CacheTopology cacheTopology;
+static ModuleInitState cacheInitState;
 
 static LibError InitCacheTopology()
 {
@@ -460,26 +421,22 @@ static LibError InitCacheTopology()
 	return INFO::OK;
 }
 
-const CacheTopology* cache_topology_Detect()
+size_t cache_topology_NumCaches()
 {
-	static ModuleInitState initState;
-	ModuleInit(&initState, InitCacheTopology);
-	return &cacheTopology;
+	ModuleInit(&cacheInitState, InitCacheTopology);
+	return cacheTopology.numCaches;
 }
 
-size_t cache_topology_NumCaches(const CacheTopology* topology)
-{
-	return topology->numCaches;
-}
-
-size_t cache_topology_CacheFromProcessor(const CacheTopology* topology, size_t processor)
+size_t cache_topology_CacheFromProcessor(size_t processor)
 {
+	ModuleInit(&cacheInitState, InitCacheTopology);
 	debug_assert(processor < os_cpu_NumProcessors());
-	return topology->processorsCache[processor];
+	return cacheTopology.processorsCache[processor];
 }
 
-uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology* topology, size_t cache)
+uintptr_t cache_topology_ProcessorMaskFromCache(size_t cache)
 {
-	debug_assert(cache < topology->numCaches);
-	return topology->cachesProcessorMask[cache];
+	ModuleInit(&cacheInitState, InitCacheTopology);
+	debug_assert(cache < cacheTopology.numCaches);
+	return cacheTopology.cachesProcessorMask[cache];
 }
diff --git a/source/lib/sysdep/arch/x86_x64/topology.h b/source/lib/sysdep/arch/x86_x64/topology.h
index 077e561c4e..4e231b64d4 100644
--- a/source/lib/sysdep/arch/x86_x64/topology.h
+++ b/source/lib/sysdep/arch/x86_x64/topology.h
@@ -21,22 +21,13 @@
  */
 
 /*
- * detection of CPU and cache topology
+ * detection of CPU and cache topology.
+ * thread-safe, no explicit initialization is required.
  */
 
 #ifndef INCLUDED_TOPOLOGY
 #define INCLUDED_TOPOLOGY
 
-// interface rationale:
-// - explicit initialization avoids the difficulty and overhead of
-//   thread-safe lazy initialization checks.
-// - requiring an opaque struct to be passed in ensures users call the
-//   init function before using the accessors.
-// - delegating responsibility for thread-safety to the caller of the
-//   first *_Detect invocation avoids overhead and keeps us independent of
-//   the various threading packages (Boost, OpenMP, POSIX, Win32, ..)
-
-
 /**
  * @return a pointer to array (up to os_cpu_MaxProcessors entries;
  * os_cpu_NumProcessors() of them are valid) of the processors'
@@ -49,76 +40,54 @@ LIB_API const u8* ApicIds();
 //-----------------------------------------------------------------------------
 // cpu
 
-/**
- * stores CPU topology, i.e. how many packages, cores and SMT units are
- * actually present and enabled. this is useful for detecting SMP systems,
- * predicting performance and dimensioning thread pools.
- *
- * note: OS abstractions usually only mention "processors", which could be
- * any mix of the above.
- **/
-struct CpuTopology;
-
-/**
- * initialize static storage from which topology can be retrieved by
- * means of the following functions.
- * @return const pointer to a shared instance.
- **/
-LIB_API const CpuTopology* cpu_topology_Detect();
+// the CPU topology, i.e. how many packages, cores and SMT units are
+// actually present and enabled, is useful for detecting SMP systems,
+// predicting performance and dimensioning thread pools.
+//
+// note: OS abstractions usually only mention "processors", which could be
+// any mix of the above.
 
 /**
  * @return number of *enabled* CPU packages / sockets.
 **/
-LIB_API size_t cpu_topology_NumPackages(const CpuTopology*);
+LIB_API size_t cpu_topology_NumPackages();
 
 /**
  * @return number of *enabled* CPU cores per package.
 * (2 on dual-core systems)
 **/
-LIB_API size_t cpu_topology_CoresPerPackage(const CpuTopology*);
+LIB_API size_t cpu_topology_CoresPerPackage();
 
 /**
  * @return number of *enabled* hyperthreading units per core.
 * (2 on P4 EE)
 **/
-LIB_API size_t cpu_topology_LogicalPerCore(const CpuTopology*);
+LIB_API size_t cpu_topology_LogicalPerCore();
 
 
 //-----------------------------------------------------------------------------
 // L2 cache
 
-/**
- * stores L2 cache topology, i.e. the mapping between processor and caches.
- * this allows cores sharing a cache to work together on the same dataset,
- * which may reduce contention and increase effective capacity.
- *
- * example: Intel Core2 micro-architectures (e.g. Intel Core2) feature
- * partitioned L2 caches shared by two cores.
- **/
-struct CacheTopology;
+// knowledge of the cache topology, i.e. which processors share which caches,
+// can be used to reduce contention and increase effective capacity by
+// assigning the partner processors to work on the same dataset.
+//
+// example: Intel Core2 micro-architectures feature L2 caches shared by
+// two cores.
 
 /**
- * initialize static storage from which topology can be retrieved by
- * means of the following functions.
- * @return const pointer to a shared instance.
- *
- * WARNING: this function must not be reentered before it has returned once.
+ * @return number of distinct L2 caches.
 **/
-LIB_API const CacheTopology* cache_topology_Detect();
-
-/**
- * @return number of distinct L2 caches
- **/
-LIB_API size_t cache_topology_NumCaches(const CacheTopology*);
+LIB_API size_t cache_topology_NumCaches();
 
 /**
 * @return L2 cache number (zero-based) to which <processor> belongs.
 **/
-LIB_API size_t cache_topology_CacheFromProcessor(const CacheTopology*, size_t processor);
+LIB_API size_t cache_topology_CacheFromProcessor(size_t processor);
 
 /**
 * @return bit-mask of all processors sharing <cache>.
 **/
-LIB_API uintptr_t cache_topology_ProcessorMaskFromCache(const CacheTopology*, size_t cache);
+LIB_API uintptr_t cache_topology_ProcessorMaskFromCache(size_t cache);
 
 #endif	// #ifndef INCLUDED_TOPOLOGY
diff --git a/source/lib/sysdep/arch/x86_x64/x86_x64.cpp b/source/lib/sysdep/arch/x86_x64/x86_x64.cpp
index a41362acc4..df8a268ada 100644
--- a/source/lib/sysdep/arch/x86_x64/x86_x64.cpp
+++ b/source/lib/sysdep/arch/x86_x64/x86_x64.cpp
@@ -285,57 +285,67 @@ size_t x86_x64_Generation()
 //-----------------------------------------------------------------------------
 // cache
 
-static const size_t maxCacheParams = 3;
-static x86_x64_CacheParameters cacheParametersStorage[maxCacheParams*2];
-static x86_x64_Cache dcache = { 0, cacheParametersStorage };
-static x86_x64_Cache icache = { 0, cacheParametersStorage+maxCacheParams };
+static const size_t maxCacheLevels = 3;
+static x86_x64_Cache cacheStorage[maxCacheLevels*2];
+static x86_x64_Caches dcaches = { 0, cacheStorage };
+static x86_x64_Caches icaches = { 0, cacheStorage+maxCacheLevels };
 
-static const size_t maxTLBParams = 15;
-static x86_x64_TLBParameters tlbParametersStorage[maxTLBParams*2];
-static x86_x64_TLB dtlb = { 0, tlbParametersStorage };
-static x86_x64_TLB itlb = { 0, tlbParametersStorage+maxTLBParams };
+static const size_t maxTLBLevels = 15;
+static x86_x64_TLB tlbStorage[maxTLBLevels*2];
+static x86_x64_TLBs dtlbs = { 0, tlbStorage };
+static x86_x64_TLBs itlbs = { 0, tlbStorage+maxTLBLevels };
 
-static void AddTLBParameters(const x86_x64_TLBParameters& params)
+static bool IsData(x86_x64_CacheType type)
 {
-	if(params.type == X86_X64_CACHE_TYPE_INSTRUCTION || params.type == X86_X64_CACHE_TYPE_UNIFIED)
+	return (type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED);
+}
+
+static bool IsInstruction(x86_x64_CacheType type)
+{
+	return (type == X86_X64_CACHE_TYPE_INSTRUCTION || type == X86_X64_CACHE_TYPE_UNIFIED);
+}
+
+static void AddTLB(const x86_x64_TLB& tlb)
+{
+	if(IsInstruction(tlb.type))
 	{
-		if(itlb.numParameters < maxTLBParams)
-			itlb.parameters[itlb.numParameters++] = params;
+		if(itlbs.numLevels < maxTLBLevels)
+			itlbs.levels[itlbs.numLevels++] = tlb;
 		else
 			debug_assert(0);
 	}
-	if(params.type == X86_X64_CACHE_TYPE_DATA || params.type == X86_X64_CACHE_TYPE_UNIFIED)
+	if(IsData(tlb.type))
 	{
-		if(dtlb.numParameters < maxTLBParams)
-			dtlb.parameters[dtlb.numParameters++] = params;
+		if(dtlbs.numLevels < maxTLBLevels)
+			dtlbs.levels[dtlbs.numLevels++] = tlb;
 		else
 			debug_assert(0);
 	}
 
 	// large page TLBs have N 2M entries or N/2 4M entries; we generate a
 	// second set of parameters for the latter from the former.
-	if(params.pageSize == 2*MiB)
+	if(tlb.pageSize == 2*MiB)
 	{
-		x86_x64_TLBParameters params4M = params;
-		params4M.pageSize = 4*MiB;
-		params4M.entries = params.entries/2;
-		AddTLBParameters(params4M);
+		x86_x64_TLB tlb4M = tlb;
+		tlb4M.pageSize = 4*MiB;
+		tlb4M.entries = tlb.entries/2;
+		AddTLB(tlb4M);
 	}
 }
 
 
 namespace AMD {
 
-static x86_x64_CacheParameters L1Parameters(u32 reg, x86_x64_CacheType type)
+static x86_x64_Cache L1Cache(u32 reg, x86_x64_CacheType type)
 {
-	x86_x64_CacheParameters params;
-	params.type = type;
-	params.level = 1;
-	params.associativity = bits(reg, 16, 23);
-	params.lineSize = bits(reg, 0, 7);
-	params.sharedBy = 1;
-	params.totalSize = bits(reg, 24, 31)*KiB;
-	return params;
+	x86_x64_Cache cache;
+	cache.type = type;
+	cache.level = 1;
+	cache.associativity = bits(reg, 16, 23);
+	cache.lineSize = bits(reg, 0, 7);
+	cache.sharedBy = 1;
+	cache.totalSize = bits(reg, 24, 31)*KiB;
+	return cache;
 }
 
 // applies to L2, L3 and TLB2
@@ -345,85 +355,85 @@
 	16, 0, 32, 48, 64, 96, 128, x86_x64_fullyAssociative
 };
 
-static x86_x64_CacheParameters L2Parameters(u32 reg, x86_x64_CacheType type)
+static x86_x64_Cache L2Cache(u32 reg, x86_x64_CacheType type)
 {
-	x86_x64_CacheParameters params;
+	x86_x64_Cache cache;
 	const size_t associativityIndex = bits(reg, 12, 15);
 	if(associativityIndex == 0)	// disabled
 	{
-		params.type = X86_X64_CACHE_TYPE_NULL;
-		params.associativity = 0;
+		cache.type = X86_X64_CACHE_TYPE_NULL;
+		cache.associativity = 0;
 	}
 	else
 	{
-		params.type = type;
-		params.associativity = associativities[associativityIndex];
-		debug_assert(params.associativity != 0);	// else: encoding is "reserved"
+		cache.type = type;
+		cache.associativity = associativities[associativityIndex];
+		debug_assert(cache.associativity != 0);	// else: encoding is "reserved"
 	}
-	params.level = 2;
-	params.lineSize = bits(reg, 0, 7);
-	params.sharedBy = 1;
-	params.totalSize = bits(reg, 16, 31)*KiB;
-	return params;
+	cache.level = 2;
+	cache.lineSize = bits(reg, 0, 7);
+	cache.sharedBy = 1;
+	cache.totalSize = bits(reg, 16, 31)*KiB;
+	return cache;
 }
 
 // (same as L2 except for the totalSize encoding)
-static x86_x64_CacheParameters L3Parameters(u32 reg, x86_x64_CacheType type)
+static x86_x64_Cache L3Cache(u32 reg, x86_x64_CacheType type)
 {
-	x86_x64_CacheParameters params = L2Parameters(reg, type);
-	params.level = 3;
-	params.totalSize = bits(reg, 18, 31)*512*KiB;	// (rounded down)
-	return params;
+	x86_x64_Cache cache = L2Cache(reg, type);
+	cache.level = 3;
+	cache.totalSize = bits(reg, 18, 31)*512*KiB;	// (rounded down)
+	return cache;
 }
 
-static x86_x64_TLBParameters TLB1Parameters(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
+static x86_x64_TLB TLB1(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
 {
-	x86_x64_TLBParameters params;
-	params.type = type;
-	params.level = 1;
-	params.associativity = bits(reg, bitOffset+8, bitOffset+15);
-	params.pageSize = pageSize;
-	params.entries = bits(reg, bitOffset, bitOffset+7);
-	return params;
+	x86_x64_TLB tlb;
+	tlb.type = type;
+	tlb.level = 1;
+	tlb.associativity = bits(reg, bitOffset+8, bitOffset+15);
+	tlb.pageSize = pageSize;
+	tlb.entries = bits(reg, bitOffset, bitOffset+7);
+	return tlb;
 }
 
-static void AddTLB1Parameters(const x86_x64_CpuidRegs& regs)
+static void AddTLB1(const x86_x64_CpuidRegs& regs)
 {
-	AddTLBParameters(TLB1Parameters(regs.eax, 0, 2*MiB, X86_X64_CACHE_TYPE_INSTRUCTION));
-	AddTLBParameters(TLB1Parameters(regs.eax, 16, 2*MiB, X86_X64_CACHE_TYPE_DATA));
-	AddTLBParameters(TLB1Parameters(regs.ebx, 0, 4*KiB, X86_X64_CACHE_TYPE_INSTRUCTION));
-	AddTLBParameters(TLB1Parameters(regs.ebx, 16, 4*KiB, X86_X64_CACHE_TYPE_DATA));
+	AddTLB(TLB1(regs.eax, 0, 2*MiB, X86_X64_CACHE_TYPE_INSTRUCTION));
+	AddTLB(TLB1(regs.eax, 16, 2*MiB, X86_X64_CACHE_TYPE_DATA));
+	AddTLB(TLB1(regs.ebx, 0, 4*KiB, X86_X64_CACHE_TYPE_INSTRUCTION));
+	AddTLB(TLB1(regs.ebx, 16, 4*KiB, X86_X64_CACHE_TYPE_DATA));
 }
 
-static x86_x64_TLBParameters TLB2Parameters(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
+static x86_x64_TLB TLB2(u32 reg, size_t bitOffset, size_t pageSize, x86_x64_CacheType type)
 {
-	x86_x64_TLBParameters params;
+	x86_x64_TLB tlb;
 	const size_t associativityIndex = bits(reg, bitOffset+12, bitOffset+15);
 	if(associativityIndex == 0)	// disabled
 	{
-		params.type = X86_X64_CACHE_TYPE_NULL;
-		params.associativity = 0;
+		tlb.type = X86_X64_CACHE_TYPE_NULL;
+		tlb.associativity = 0;
 	}
 	else
 	{
-		params.type = type;
-		params.associativity = associativities[associativityIndex];
+		tlb.type = type;
+		tlb.associativity = associativities[associativityIndex];
 	}
-	params.level = 2;
-	params.pageSize = pageSize;
-	params.entries = bits(reg, bitOffset, bitOffset+11);
-	return params;
+	tlb.level = 2;
+	tlb.pageSize = pageSize;
+	tlb.entries = bits(reg, bitOffset, bitOffset+11);
+	return tlb;
 }
 
-static void AddTLB2ParameterPair(u32 reg, size_t pageSize)
+static void AddTLB2Pair(u32 reg, size_t pageSize)
 {
 	x86_x64_CacheType type = X86_X64_CACHE_TYPE_UNIFIED;
 	if(bits(reg, 16, 31) != 0)	// not unified
 	{
-		AddTLBParameters(TLB2Parameters(reg, 16, pageSize, X86_X64_CACHE_TYPE_DATA));
+		AddTLB(TLB2(reg, 16, pageSize, X86_X64_CACHE_TYPE_DATA));
 		type = X86_X64_CACHE_TYPE_INSTRUCTION;
 	}
-	AddTLBParameters(TLB2Parameters(reg, 0, pageSize, type));
+	AddTLB(TLB2(reg, 0, pageSize, type));
 }
 
 // AMD reports maxCpuidIdFunction > 4 but consider functions 2..4 to be
@@ -435,24 +445,24 @@ static void DetectCacheAndTLB()
 	regs.eax = 0x80000005;
 	if(x86_x64_cpuid(&regs))
 	{
-		AddTLB1Parameters(regs);
+		AddTLB1(regs);
 
-		dcache.levels = icache.levels = 1;
-		dcache.parameters[0] = L1Parameters(regs.ecx, X86_X64_CACHE_TYPE_DATA);
-		icache.parameters[0] = L1Parameters(regs.edx, X86_X64_CACHE_TYPE_INSTRUCTION);
+		dcaches.numLevels = icaches.numLevels = 1;
+		dcaches.levels[0] = L1Cache(regs.ecx, X86_X64_CACHE_TYPE_DATA);
+		icaches.levels[0] = L1Cache(regs.edx, X86_X64_CACHE_TYPE_INSTRUCTION);
 	}
 
 	regs.eax = 0x80000006;
 	if(x86_x64_cpuid(&regs))
 	{
-		AddTLB2ParameterPair(regs.eax, 2*MiB);
-		AddTLB2ParameterPair(regs.ebx, 4*KiB);
+		AddTLB2Pair(regs.eax, 2*MiB);
+		AddTLB2Pair(regs.ebx, 4*KiB);
 
-		icache.levels = dcache.levels = 2;
-		icache.parameters[1] = dcache.parameters[1] = L2Parameters(regs.ecx, X86_X64_CACHE_TYPE_UNIFIED);
+		icaches.numLevels = dcaches.numLevels = 2;
+		icaches.levels[1] = dcaches.levels[1] = L2Cache(regs.ecx, X86_X64_CACHE_TYPE_UNIFIED);
 
-		icache.levels = dcache.levels = 3;
-		icache.parameters[2] = dcache.parameters[2] = L3Parameters(regs.edx, X86_X64_CACHE_TYPE_UNIFIED);
+		icaches.numLevels = dcaches.numLevels = 3;
+		icaches.levels[2] = dcaches.levels[2] = L3Cache(regs.edx, X86_X64_CACHE_TYPE_UNIFIED);
 	}
 }
 
@@ -480,27 +490,27 @@ static void DetectCache_CPUID4()
 		if(type == X86_X64_CACHE_TYPE_NULL)	// no more remaining
 			break;
 
-		x86_x64_CacheParameters params;
-		params.type = type;
-		params.level = level;
-		params.associativity = (size_t)bits(regs.ebx, 22, 31)+1;
-		params.lineSize = (size_t)bits(regs.ebx, 0, 11)+1;	// (yes, this also uses +1 encoding)
-		params.sharedBy = (size_t)bits(regs.eax, 14, 25)+1;
+		x86_x64_Cache cache;
+		cache.type = type;
+		cache.level = level;
+		cache.associativity = (size_t)bits(regs.ebx, 22, 31)+1;
+		cache.lineSize = (size_t)bits(regs.ebx, 0, 11)+1;	// (yes, this also uses +1 encoding)
+		cache.sharedBy = (size_t)bits(regs.eax, 14, 25)+1;
 		{
 			const size_t partitions = (size_t)bits(regs.ebx, 12, 21)+1;
 			const size_t sets = (size_t)bits(regs.ecx, 0, 31)+1;
-			params.totalSize = params.associativity * partitions * params.lineSize * sets;
+			cache.totalSize = cache.associativity * partitions * cache.lineSize * sets;
 		}
 
-		if(type == X86_X64_CACHE_TYPE_INSTRUCTION || type == X86_X64_CACHE_TYPE_UNIFIED)
+		if(IsInstruction(type))
 		{
-			icache.levels = std::max(icache.levels, level);
-			icache.parameters[level-1] = params;
+			icaches.numLevels = std::max(icaches.numLevels, level);
+			icaches.levels[level-1] = cache;
 		}
-		if(type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED)
+		if(IsData(type))
 		{
-			dcache.levels = std::max(dcache.levels, level);
-			dcache.parameters[level-1] = params;
+			dcaches.numLevels = std::max(dcaches.numLevels, level);
+			dcaches.levels[level-1] = cache;
 		}
 	}
 }
@@ -624,24 +634,24 @@ static void DecodeDescriptor(u8 descriptor)
 	else
 		debug_assert(0);
 
-	x86_x64_TLBParameters params;
-	params.type = type;
-	params.level = level;
-	params.associativity = properties.associativity;
-	params.pageSize = pageSize;
-	params.entries = properties.entries;
+	x86_x64_TLB tlb;
+	tlb.type = type;
+	tlb.level = level;
+	tlb.associativity = properties.associativity;
+	tlb.pageSize = pageSize;
+	tlb.entries = properties.entries;
 
-	if(type == X86_X64_CACHE_TYPE_INSTRUCTION || type == X86_X64_CACHE_TYPE_UNIFIED)
+	if(IsInstruction(type))
 	{
-		if(itlb.numParameters < maxTLBParams)
-			itlb.parameters[itlb.numParameters++] = params;
+		if(itlbs.numLevels < maxTLBLevels)
+			itlbs.levels[itlbs.numLevels++] = tlb;
 		else
 			debug_assert(0);
 	}
-	if(type == X86_X64_CACHE_TYPE_DATA || type == X86_X64_CACHE_TYPE_UNIFIED)
+	if(IsData(type))
 	{
-		if(dtlb.numParameters < maxTLBParams)
-			dtlb.parameters[dtlb.numParameters++] = params;
+		if(dtlbs.numLevels < maxTLBLevels)
+			dtlbs.levels[dtlbs.numLevels++] = tlb;
 		else
 			debug_assert(0);
 	}
@@ -694,71 +704,71 @@ static LibError DetectCacheAndTLB()
 	}
 
 	// sanity check: cache type must match that of the data structure
-	for(size_t i = 0; i < dcache.levels; i++)
-		debug_assert(dcache.parameters[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
-	for(size_t i = 0; i < icache.levels; i++)
-		debug_assert(icache.parameters[i].type != X86_X64_CACHE_TYPE_DATA);
-	for(size_t i = 0; i < dtlb.numParameters; i++)
-		debug_assert(dtlb.parameters[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
-	for(size_t i = 0; i < itlb.numParameters; i++)
-		debug_assert(itlb.parameters[i].type != X86_X64_CACHE_TYPE_DATA);
+	for(size_t i = 0; i < dcaches.numLevels; i++)
+		debug_assert(dcaches.levels[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
+	for(size_t i = 0; i < icaches.numLevels; i++)
+		debug_assert(icaches.levels[i].type != X86_X64_CACHE_TYPE_DATA);
+	for(size_t i = 0; i < dtlbs.numLevels; i++)
+		debug_assert(dtlbs.levels[i].type != X86_X64_CACHE_TYPE_INSTRUCTION);
+	for(size_t i = 0; i < itlbs.numLevels; i++)
+		debug_assert(itlbs.levels[i].type != X86_X64_CACHE_TYPE_DATA);
 
 	// ensure x86_x64_L1CacheLineSize and x86_x64_L2CacheLineSize will work
-	debug_assert(dcache.levels >= 2);
-	debug_assert(dcache.parameters[0].lineSize != 0);
-	debug_assert(dcache.parameters[1].lineSize != 0);
+	debug_assert(dcaches.numLevels >= 2);
+	debug_assert(dcaches.levels[0].lineSize != 0);
+	debug_assert(dcaches.levels[1].lineSize != 0);
 
 	return INFO::OK;
 }
 
-const x86_x64_Cache* x86_x64_ICache()
+const x86_x64_Caches* x86_x64_ICaches()
 {
 	ModuleInit(&cacheInitState, DetectCacheAndTLB);
-	return &icache;
+	return &icaches;
 }
 
-const x86_x64_Cache* x86_x64_DCache()
+const x86_x64_Caches* x86_x64_DCaches()
 {
 	ModuleInit(&cacheInitState, DetectCacheAndTLB);
-	return &dcache;
+	return &dcaches;
 }
 
 size_t x86_x64_L1CacheLineSize()
 {
-	return x86_x64_DCache()->parameters[0].lineSize;
+	return x86_x64_DCaches()->levels[0].lineSize;
 }
 
 size_t x86_x64_L2CacheLineSize()
 {
-	return x86_x64_DCache()->parameters[1].lineSize;
+	return x86_x64_DCaches()->levels[1].lineSize;
 }
 
-const x86_x64_TLB* x86_x64_ITLB()
+const x86_x64_TLBs* x86_x64_ITLBs()
 {
 	ModuleInit(&cacheInitState, DetectCacheAndTLB);
-	return &itlb;
+	return &itlbs;
 }
 
-const x86_x64_TLB* x86_x64_DTLB()
+const x86_x64_TLBs* x86_x64_DTLBs()
 {
 	ModuleInit(&cacheInitState, DetectCacheAndTLB);
-	return &dtlb;
+	return &dtlbs;
 }
 
-size_t x86_x64_TLBCoverage(const x86_x64_TLB* tlb)
+size_t x86_x64_TLBCoverage(const x86_x64_TLBs* tlbs)
 {
 	// note: receiving a TLB pointer means DetectCacheAndTLB was called.
 	const u64 pageSize = 4*KiB;
-	const u64 largePageSize = 4*MiB;	// TODO: find out if we're using 2MB or 4MB
+	const u64 largePageSize = os_cpu_LargePageSize();
 	u64 totalSize = 0;	// [bytes]
-	for(size_t i = 0; i < tlb->numParameters; i++)
+	for(size_t i = 0; i < tlbs->numLevels; i++)
 	{
-		const x86_x64_TLBParameters& params = tlb->parameters[i];
-		if(params.pageSize == pageSize)
-			totalSize += pageSize * params.entries;
-		if(params.pageSize == largePageSize)
-			totalSize += largePageSize * params.entries;
+		const x86_x64_TLB& tlb = tlbs->levels[i];
+		if(tlb.pageSize == pageSize)
+			totalSize += pageSize * tlb.entries;
+		if(tlb.pageSize == largePageSize)
+			totalSize += largePageSize * tlb.entries;
 	}
 
 	return size_t(totalSize / MiB);
@@ -1036,8 +1046,8 @@ double x86_x64_ClockFrequency()
 	// note: don't just take the lowest value! it could conceivably be
 	// too low, if background processing delays reading c1 (see above).
 	double sum = 0.0;
-	const int lo = numSamples/4, hi = 3*numSamples/4;
-	for(int i = lo; i < hi; i++)
+	const size_t lo = numSamples/4, hi = 3*numSamples/4;
+	for(size_t i = lo; i < hi; i++)
 		sum += samples[i];
 	const double clockFrequency = sum / (hi-lo);
 
diff --git a/source/lib/sysdep/arch/x86_x64/x86_x64.h b/source/lib/sysdep/arch/x86_x64/x86_x64.h
index 704bdc5739..3b525759dc 100644
--- a/source/lib/sysdep/arch/x86_x64/x86_x64.h
+++ b/source/lib/sysdep/arch/x86_x64/x86_x64.h
@@ -134,10 +134,7 @@ enum x86_x64_CacheType
 
 const u8 x86_x64_fullyAssociative = 0xFF;
 
-/**
- * describes a level of one of the caches.
- **/
-struct x86_x64_CacheParameters
+struct x86_x64_Cache
 {
 	/**
 	 * (used to determine if this cache is unified or disabled)
@@ -155,34 +152,29 @@ struct x86_x64_CacheParameters
 * instruction and data caches are returned separately by the corresponding
 * accessor function; unified cache levels are reported by both.
 **/
-struct x86_x64_Cache
+struct x86_x64_Caches
 {
-	/**
-	 * total number of levels, each of which is described by
-	 * an entry in parameters[].
-	 **/
-	size_t levels;
-
-	x86_x64_CacheParameters* parameters;
+	size_t numLevels;
+	x86_x64_Cache* levels;
 };
 
 /**
- * @return pointer to a static x86_x64_Cache describing the instruction cache.
+ * @return pointer to a static x86_x64_Caches describing the instruction caches.
 **/
-LIB_API const x86_x64_Cache* x86_x64_ICache();
+LIB_API const x86_x64_Caches* x86_x64_ICaches();
 
 /**
- * @return pointer to a static x86_x64_Cache describing the data cache.
+ * @return pointer to a static x86_x64_Caches describing the data caches.
 **/
-LIB_API const x86_x64_Cache* x86_x64_DCache();
+LIB_API const x86_x64_Caches* x86_x64_DCaches();
 
 LIB_API size_t x86_x64_L1CacheLineSize();
 LIB_API size_t x86_x64_L2CacheLineSize();
 
 /**
- * describes part of a Translation Lookaside Buffer.
+ * Translation Lookaside Buffer.
 **/
-struct x86_x64_TLBParameters
+struct x86_x64_TLB
 {
 	x86_x64_CacheType type;
 	size_t level;
@@ -192,32 +184,28 @@ struct x86_x64_TLBParameters
 };
 
 /**
- * describes all parts of a Translation Lookaside Buffer
+ * describes all levels of a TLB.
 **/
-struct x86_x64_TLB
+struct x86_x64_TLBs
 {
-	/**
-	 * total number of parts, each of which is described by
-	 * an entry in parameters[]
-	 **/
-	size_t numParameters;
-	x86_x64_TLBParameters* parameters;
+	size_t numLevels;
+	x86_x64_TLB* levels;
 };
 
 /**
- * @return pointer to a static x86_x64_TLB describing the instruction TLB.
+ * @return pointer to a static x86_x64_TLB describing the instruction TLBs.
 **/
-LIB_API const x86_x64_TLB* x86_x64_ITLB();
+LIB_API const x86_x64_TLBs* x86_x64_ITLBs();
 
 /**
 * @return pointer to a static x86_x64_TLB describing the data TLB.
 **/
-LIB_API const x86_x64_TLB* x86_x64_DTLB();
+LIB_API const x86_x64_TLBs* x86_x64_DTLBs();
 
 /**
- * @return coverage, i.e. total size [MiB] of the given TLB
+ * @return coverage, i.e. total size [MiB] of the given TLBs
 **/
-LIB_API size_t x86_x64_TLBCoverage(const x86_x64_TLB* tlb);
+LIB_API size_t x86_x64_TLBCoverage(const x86_x64_TLBs* tlb);
 
 
 //-----------------------------------------------------------------------------
diff --git a/source/lib/sysdep/cpu.cpp b/source/lib/sysdep/cpu.cpp
index 679104b17e..6e9e3cc22e 100644
--- a/source/lib/sysdep/cpu.cpp
+++ b/source/lib/sysdep/cpu.cpp
@@ -31,10 +31,24 @@ ERROR_ASSOCIATE(ERR::CPU_FEATURE_MISSING, L"This CPU doesn't support a required
 ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_OPCODE, L"Disassembly failed", -1);
 ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_VENDOR, L"CPU vendor unknown", -1);
 
-void cpu_TestAtomicAdd()
+
+static void TestCAS64()
+{
+	volatile u64 var = 1;
+	cpu_CAS64(&var, 1ull, 2ull);
+	debug_assert(var == 2ull);
+}
+
+static void TestAtomicAdd()
 {
 	volatile intptr_t i1 = 1;
 	intptr_t prev = cpu_AtomicAdd(&i1, 1);
 	debug_assert(prev == 1);
 	debug_assert(i1 == 2);
 }
+
+void cpu_Test()
+{
+	TestCAS64();
+	TestAtomicAdd();
+}
diff --git a/source/lib/sysdep/cpu.h b/source/lib/sysdep/cpu.h
index 53f5293540..6fbc669067 100644
--- a/source/lib/sysdep/cpu.h
+++ b/source/lib/sysdep/cpu.h
@@ -90,6 +90,13 @@ bool cpu_CAS(volatile T* location, T expected, T new_value)
 	return cpu_CAS((volatile intptr_t*)location, (intptr_t)expected, (intptr_t)new_value);
 }
 
+#if ARCH_AMD64
+# define cpu_CAS64 cpu_CAS
+#else
+LIB_API bool cpu_CAS64(volatile u64* location, u64 expected, u64 newValue);
+#endif
+
+
 /**
 * add a signed value to a variable without the possibility of interference
 * from other threads/CPUs.
@@ -98,7 +105,7 @@ bool cpu_CAS(volatile T* location, T expected, T new_value)
 **/
 LIB_API intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
 
-LIB_API void cpu_TestAtomicAdd();
+LIB_API void cpu_Test();
 
 /**
 * enforce strict instruction ordering in the CPU pipeline.
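[Reviewer note, not part of the patch] cpu_CAS64, declared above for 32-bit targets and mapped to the cpu_CAS template on AMD64, is intended to be consumed via a retry loop, as the timer.h changes below do. A minimal sketch of such a caller, assuming only the declarations added by this patch; AtomicAdd64 is a hypothetical helper name, not something the patch introduces:

	#include "lib/sysdep/cpu.h"

	// adds a value to a 64-bit counter without locks: keep retrying the
	// compare-and-swap until no other thread modified the value in between.
	// returns the previous value, mirroring cpu_AtomicAdd.
	static u64 AtomicAdd64(volatile u64* location, u64 increment)
	{
		for(;;)
		{
			const u64 expected = *location;
			if(cpu_CAS64(location, expected, expected + increment))
				return expected;
		}
	}
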
diff --git a/source/lib/sysdep/os/win/whrt/tsc.cpp b/source/lib/sysdep/os/win/whrt/tsc.cpp
index 89e085f2a0..d834df9e87 100644
--- a/source/lib/sysdep/os/win/whrt/tsc.cpp
+++ b/source/lib/sysdep/os/win/whrt/tsc.cpp
@@ -45,45 +45,34 @@ static bool IsUniprocessor()
 {
-	const CpuTopology* topology = cpu_topology_Detect();
-	if(cpu_topology_NumPackages(topology) != 1)
+	if(cpu_topology_NumPackages() != 1)
 		return false;
-	if(cpu_topology_CoresPerPackage(topology) != 1)
+	if(cpu_topology_CoresPerPackage() != 1)
 		return false;
 	return true;
 }
 
 
-enum AmdPowerNowFlags
-{
-	PN_FREQ_ID_CTRL = BIT(1),
-	PN_HW_THERMAL_CTRL = BIT(4),
-	PN_SW_THERMAL_CTRL = BIT(5),
-	PN_INVARIANT_TSC = BIT(8)
-};
-
 static bool IsInvariantTSC()
 {
 #if ARCH_X86_X64
+	// (we no longer need to check x86_x64_Vendor - Intel and AMD
+	// agreed on the definition of this feature check)
 	x86_x64_CpuidRegs regs = { 0 };
-	switch(x86_x64_Vendor())
+	regs.eax = 0x80000007;
+	if(x86_x64_cpuid(&regs))
 	{
-	case X86_X64_VENDOR_AMD:
-		regs.eax = 0x80000007;
-		if(x86_x64_cpuid(&regs))
-		{
-			// TSC is invariant across P-state, C-state and
-			// stop grant transitions (e.g. STPCLK)
-			if(regs.edx & PN_INVARIANT_TSC)
-				return true;
-		}
-		break;
+		// TSC is invariant across P-state, C-state, turbo, and
+		// stop grant transitions (e.g. STPCLK)
+		if(regs.edx & BIT(8))
+			return true;
 	}
 #endif
 	return false;
 }
 
+
 static bool IsThrottlingPossible()
 {
 #if ARCH_X86_X64
@@ -99,6 +88,12 @@ static bool IsThrottlingPossible()
 	regs.eax = 0x80000007;
 	if(x86_x64_cpuid(&regs))
 	{
+		enum AmdPowerNowFlags
+		{
+			PN_FREQ_ID_CTRL = BIT(1),
+			PN_HW_THERMAL_CTRL = BIT(4),
+			PN_SW_THERMAL_CTRL = BIT(5)
+		};
 		if(regs.edx & (PN_FREQ_ID_CTRL|PN_HW_THERMAL_CTRL|PN_SW_THERMAL_CTRL))
 			return true;
 	}
diff --git a/source/lib/sysdep/os/win/whrt/whrt.cpp b/source/lib/sysdep/os/win/whrt/whrt.cpp
index edd0a1ec77..513f9040b1 100644
--- a/source/lib/sysdep/os/win/whrt/whrt.cpp
+++ b/source/lib/sysdep/os/win/whrt/whrt.cpp
@@ -173,6 +173,8 @@ struct TimerState
 	// (this enables calibration, which is currently not implemented,
 	// but leaving open the possibility costs nothing)
 	double time;
+
+	u8 padding[48];
 };
 
 // how do we detect when the old TimerState is no longer in use and can be
@@ -181,10 +183,10 @@ struct TimerState
 // entered critical sections (the latching of TimerState fields) will have
 // been exited before the next update comes around; if not, TimerState.time
 // changes, the critical section notices and re-reads the new values.
-static TimerState timerStates[2];
+static __declspec(align(64)) TimerState timerStates[2];
 // note: exchanging pointers is easier than XORing an index.
-static TimerState* volatile ts = &timerStates[0];
-static TimerState* volatile ts2 = &timerStates[1];
+static volatile TimerState* volatile ts = &timerStates[0];
+static volatile TimerState* volatile ts2 = &timerStates[1];
 
 static void UpdateTimerState()
 {
@@ -201,7 +203,7 @@ static void UpdateTimerState()
 	const u64 deltaTicks = CounterDelta(ts->counter, counter);
 	ts2->counter = counter;
 	ts2->time = ts->time + deltaTicks/nominalFrequency;
-	ts = (TimerState*)InterlockedExchangePointer((volatile PVOID*)&ts2, ts);
+	ts = (volatile TimerState*)InterlockedExchangePointer((volatile PVOID*)&ts2, (PVOID)ts);
 }
 
 double whrt_Time()
 {
@@ -209,6 +211,7 @@ double whrt_Time()
 retry:
 	// latch timer state (counter and time must be from the same update)
 	const double time = ts->time;
+	cpu_MemoryBarrier();
 	const u64 counter = ts->counter;
 	// ts changed after reading time. note: don't compare counter because
 	// it _might_ have the same value after two updates.
diff --git a/source/lib/sysdep/os_cpu.h b/source/lib/sysdep/os_cpu.h
index d53d4eb4b0..c68df3124c 100644
--- a/source/lib/sysdep/os_cpu.h
+++ b/source/lib/sysdep/os_cpu.h
@@ -115,6 +115,24 @@ LIB_API size_t os_cpu_MemoryAvailable();
 **/
 LIB_API uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask);
 
+class os_cpu_ScopedSetThreadAffinityMask
+{
+public:
+	os_cpu_ScopedSetThreadAffinityMask(uintptr_t processorMask)
+		: m_previousProcessorMask(os_cpu_SetThreadAffinityMask(processorMask))
+	{
+	}
+
+	~os_cpu_ScopedSetThreadAffinityMask()
+	{
+		(void)os_cpu_SetThreadAffinityMask(m_previousProcessorMask);
+	}
+
+private:
+	uintptr_t m_previousProcessorMask;
+};
+
+
 /**
 * called by os_cpu_CallByEachCPU.
 * @param processor ID of processor running the current thread for the
diff --git a/source/lib/timer.cpp b/source/lib/timer.cpp
index fc3a76b5a1..c5c4c582be 100644
--- a/source/lib/timer.cpp
+++ b/source/lib/timer.cpp
@@ -144,7 +144,7 @@ double timer_Resolution()
 //
 // do not use std::list et al. for this! we must be callable at any time,
 // especially before NLSO ctors run or before heap init.
-static size_t num_clients;
+static size_t numClients;
 static TimerClient* clients;
 
 
@@ -157,31 +157,24 @@ TimerClient* timer_AddClient(TimerClient* tc, const wchar_t* description)
 	// insert at front of list
 	tc->next = clients;
 	clients = tc;
-	num_clients++;
+	numClients++;
 	return tc;
 }
 
-void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1)
-{
-	tc->sum.AddDifference(t0, t1);
-	tc->num_calls++;
-}
-
-
 void timer_DisplayClientTotals()
 {
-	debug_printf(L"TIMER TOTALS (%lu clients)\n", (unsigned long)num_clients);
+	debug_printf(L"TIMER TOTALS (%lu clients)\n", (unsigned long)numClients);
 	debug_printf(L"-----------------------------------------------------\n");
 
 	while(clients)
 	{
 		// (make sure list and count are consistent)
-		debug_assert(num_clients != 0);
+		debug_assert(numClients != 0);
 
 		TimerClient* tc = clients;
 		clients = tc->next;
-		num_clients--;
+		numClients--;
 
 		const std::wstring duration = tc->sum.ToString();
 		debug_printf(L"  %ls: %ls (%lux)\n", tc->description, duration.c_str(), (unsigned long)tc->num_calls);
diff --git a/source/lib/timer.h b/source/lib/timer.h
index d8430d6cd1..f77df574de 100644
--- a/source/lib/timer.h
+++ b/source/lib/timer.h
@@ -28,6 +28,7 @@
 #define INCLUDED_TIMER
 
 #include "lib/config2.h"	// CONFIG2_TIMER_ALLOW_RDTSC
+#include "lib/sysdep/cpu.h"	// cpu_AtomicAdd
 #if ARCH_X86_X64 && CONFIG2_TIMER_ALLOW_RDTSC
 # include "lib/sysdep/arch/x86_x64/x86_x64.h"	// x86_x64_rdtsc
 # include "lib/sysdep/os_cpu.h"	// os_cpu_ClockFrequency
@@ -172,6 +173,18 @@ public:
 		m_ticks += t1.m_ticks - t0.m_ticks;
 	}
 
+	void AddDifferenceAtomic(TimerUnit t0, TimerUnit t1)
+	{
+		const u64 delta = t1.m_ticks - t0.m_ticks;
+#if ARCH_AMD64
+		cpu_AtomicAdd((volatile intptr_t*)&m_ticks, (intptr_t)delta);
+#else
+retry:
+		if(!cpu_CAS64(&m_ticks, m_ticks, m_ticks+delta))
+			goto retry;
+#endif
+	}
+
 	void Subtract(TimerUnit t)
 	{
 		m_ticks -= t.m_ticks;
@@ -226,6 +239,20 @@ public:
 		m_seconds += t1.m_seconds - t0.m_seconds;
 	}
 
+	void AddDifferenceAtomic(TimerUnit t0, TimerUnit t1)
+	{
+retry:
+		u64 oldRepresentation;
+		memcpy(&oldRepresentation, &m_seconds, sizeof(oldRepresentation));
+
+		const double seconds = m_seconds + t1.m_seconds - t0.m_seconds;
+		u64 newRepresentation;
+		memcpy(&newRepresentation, &seconds, sizeof(newRepresentation));
+
+		if(!cpu_CAS64((volatile u64*)&m_seconds, oldRepresentation, newRepresentation))
+			goto retry;
+	}
+
 	void Subtract(TimerUnit t)
 	{
 		m_seconds -= t.m_seconds;
@@ -274,7 +301,7 @@ struct TimerClient
 
 	// how often timer_BillClient was called (helps measure relative
 	// performance of something that is done indeterminately often).
-	size_t num_calls;
+	intptr_t num_calls;
 };
 
 /**
@@ -304,7 +331,21 @@ LIB_API TimerClient* timer_AddClient(TimerClient* tc, const wchar_t* description
 /**
 * bill the difference between t0 and t1 to the client's total.
 **/
-LIB_API void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1);
+inline void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1)
+{
+	tc->sum.AddDifference(t0, t1);
+	tc->num_calls++;
+}
+
+/**
+ * thread-safe version of timer_BillClient
+ * (not used by default due to its higher overhead)
+ **/
+inline void timer_BillClientAtomic(TimerClient* tc, TimerUnit t0, TimerUnit t1)
+{
+	tc->sum.AddDifferenceAtomic(t0, t1);
+	cpu_AtomicAdd(&tc->num_calls, +1);
+}
 
 /**
 * display all clients' totals; does not reset them.
@@ -335,6 +376,28 @@ private:
 	TimerClient* m_tc;
 };
 
+class ScopeTimerAccrueAtomic
+{
+	NONCOPYABLE(ScopeTimerAccrueAtomic);
+public:
+	ScopeTimerAccrueAtomic(TimerClient* tc)
+		: m_tc(tc)
+	{
+		m_t0.SetFromTimer();
+	}
+
+	~ScopeTimerAccrueAtomic()
+	{
+		TimerUnit t1;
+		t1.SetFromTimer();
+		timer_BillClientAtomic(m_tc, m_t0, t1);
+	}
+
+private:
+	TimerUnit m_t0;
+	TimerClient* m_tc;
+};
+
 /**
 * Measure the time taken to execute code up until end of the current scope;
 * bill it to the given TimerClient object. Can safely be nested.
@@ -356,5 +419,6 @@ private:
 * timer_DisplayClientTotals();
 **/
 #define TIMER_ACCRUE(client) ScopeTimerAccrue UID__(client)
+#define TIMER_ACCRUE_ATOMIC(client) ScopeTimerAccrueAtomic UID__(client)
 
 #endif	// #ifndef INCLUDED_TIMER
diff --git a/source/ps/Util.cpp b/source/ps/Util.cpp
index d1e92216ca..ab35d2f5dc 100644
--- a/source/ps/Util.cpp
+++ b/source/ps/Util.cpp
@@ -99,8 +99,7 @@ void WriteSystemInfo()
 	fprintf(f, "OS : %s %s (%s)\n", un.sysname, un.release, un.version);
 
 	// CPU
-	const CpuTopology* topology = cpu_topology_Detect();
-	fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), (int)cpu_topology_NumPackages(topology), (int)cpu_topology_CoresPerPackage(topology), (int)cpu_topology_LogicalPerCore(topology));
+	fprintf(f, "CPU : %s, %s (%dx%dx%d)", un.machine, cpu_IdentifierString(), (int)cpu_topology_NumPackages(), (int)cpu_topology_CoresPerPackage(), (int)cpu_topology_LogicalPerCore());
 	const double cpu_freq = os_cpu_ClockFrequency();
 	if(cpu_freq != 0.0f)
 	{
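[Reviewer note, not part of the patch] With the opaque CpuTopology/CacheTopology objects gone, the topology accessors can now be called directly and initialize themselves on first use, as Util.cpp above does. A short sketch of the thread-pool dimensioning that topology.h's comment alludes to; the function name and the one-worker-per-core policy are illustrative only:

	#include "lib/sysdep/arch/x86_x64/topology.h"

	// one worker per enabled core; SMT units are deliberately not counted.
	static size_t ChooseWorkerCount()
	{
		return cpu_topology_NumPackages() * cpu_topology_CoresPerPackage();
	}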
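[Reviewer note, not part of the patch] The new TIMER_ACCRUE_ATOMIC macro is used exactly like the existing TIMER_ACCRUE, but is safe when the guarded code runs on several threads at once. A minimal usage sketch assuming the timer.h additions above; the client and function names are illustrative, not part of the codebase:

	#include "lib/timer.h"

	// a single client shared by all threads; timer_AddClient links it into
	// the global list so timer_DisplayClientTotals() will report it.
	static TimerClient tc_worker;
	static TimerClient* s_worker = timer_AddClient(&tc_worker, L"worker");

	void DoChunkOfWork()
	{
		// bills this scope's elapsed time to s_worker via the atomic
		// AddDifferenceAtomic / cpu_AtomicAdd path.
		TIMER_ACCRUE_ATOMIC(s_worker);
		// ... actual work ...
	}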