From c3da7f5e339cd5b075de9e8b6c4718155895432c Mon Sep 17 00:00:00 2001
From: janwas <jan@wildfiregames.com>
Date: Tue, 15 Mar 2005 18:51:54 +0000
Subject: [PATCH] statically allocated per-CPU info (simplicity), and profiler
 WIP

This was SVN commit r1995.
---
 source/lib/sysdep/win/wcpu.cpp | 261 +++++++++++++++++++++++++++++++--
 1 file changed, 246 insertions(+), 15 deletions(-)
diff --git a/source/lib/sysdep/win/wcpu.cpp b/source/lib/sysdep/win/wcpu.cpp
index 5b52b6a21c..d35f5db76a 100755
--- a/source/lib/sysdep/win/wcpu.cpp
+++ b/source/lib/sysdep/win/wcpu.cpp
@@ -18,13 +18,16 @@
 
 #include "precompiled.h"
 
-#include <stdlib.h> // for malloc, free
-
 #include "lib.h"
 #include "win_internal.h"
 
 #include "sysdep/cpu.h"
 
+// limit allows statically allocated per-CPU structures (for simplicity).
+// we're Windows-specific anyway; such systems won't foreseeably have more.
+// note: int instead of unsigned because <cpus> is also signed (tri-state).
+static const int MAX_CPUS = 32;
+
 
 // not possible with POSIX calls.
 // called from ia32.cpp check_smp
@@ -79,34 +82,29 @@ static void check_speedstep()
 	{
 		// most likely not speedstep-capable if these aren't supported
 		SYSTEM_POWER_CAPABILITIES spc;
-		if(pCNPI(SystemPowerCapabilities, 0, 0, &spc, sizeof(spc)) == STATUS_SUCCESS)
+		if(pCNPI(SystemPowerCapabilities, 0,0, &spc,sizeof(spc)) == STATUS_SUCCESS)
 			if(!spc.ProcessorThrottle || !spc.ThermalControl)
 				cpu_speedstep = 0;
 
 		// probably speedstep if cooling mode active.
 		// the documentation of PO_TZ_* is unclear, so we can't be sure.
 		SYSTEM_POWER_INFORMATION spi;
-		if(pCNPI(SystemPowerInformation, 0, 0, &spi, sizeof(spi)) == STATUS_SUCCESS)
+		if(pCNPI(SystemPowerInformation, 0,0, &spi,sizeof(spi)) == STATUS_SUCCESS)
 			if(spi.CoolingMode != PO_TZ_INVALID_MODE)
 				cpu_speedstep = 1;
 
-		// definitely speedstep if a CPU has thermal throttling active.
-		// note that we don't care about user-defined throttles
-		// (see ppi.CurrentMhz) - they don't change often.
-		const size_t ppi_buf_size = cpus * sizeof(PROCESSOR_POWER_INFORMATION);
-		void* ppi_buf = malloc(ppi_buf_size);
-		if(pCNPI(ProcessorInformation, 0, 0, ppi_buf, (ULONG)ppi_buf_size) == STATUS_SUCCESS)
+		// definitely speedstep if any throttle is less than 100%.
+		PROCESSOR_POWER_INFORMATION ppi[MAX_CPUS];
+		if(pCNPI(ProcessorInformation, 0,0, ppi,sizeof(ppi)) == STATUS_SUCCESS)
 		{
-			PROCESSOR_POWER_INFORMATION* ppi = (PROCESSOR_POWER_INFORMATION*)ppi_buf;
-			for(int i = 0; i < cpus; i++)
-				// thermal throttling currently active
-				if(ppi[i].MaxMhz != ppi[i].MhzLimit)
+			const PROCESSOR_POWER_INFORMATION* p = ppi;
+			for(int i = 0; i < MIN(cpus, MAX_CPUS); i++, p++)
+				if(p->MhzLimit != p->MaxMhz || p->CurrentMhz != p->MaxMhz)
 				{
 					cpu_speedstep = 1;
 					break;
 				}
 		}
-		free(ppi_buf);
 	}
 	FreeLibrary(hPowrprofDll);
 		// this is most likely the only reference,
@@ -157,3 +155,236 @@ int win_get_cpu_info()
 
 	return 0;
 }
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//
+//
+//////////////////////////////////////////////////////////////////////////////
+
+
+// we need a means of measuring performance, since it is hard to predict and
+// depends on many factors. to cover a wider range of configurations, this
+// must also be possible on end-user systems lacking specialized developer
+// tools. therefore, we must ship our own implementation; this complements
+// Intel VTune et al.
+//
+// there are 3 approaches to the problem:
+// - single-step analysis logs every executed instruction. very thorough, but
+//   intolerably slow (~1000x) and not suitable for performance measurement.
+// - intrusive measuring tracks execution time of explicitly marked
+//   functions or 'zones'. more complex, requires adding code, and
+//   inaccurate when thread switches are frequent.
+// - IP sampling records the current instruction pointer at regular
+//   intervals; slow sections of code will over time appear more often.
+//   not exact, but simple and low-overhead.
+//
+// we implement IP sampling due to its simplicity. an intrusive approach
+// might also be added later to account for performance per-module
+// (helps spot the culprit in case hotspots are called from multiple sites).
+
+
+// on Windows, we retrieve the current IP with GetThreadContext. the docs require
+// this to happen from another thread, and for the target to be suspended
+// (now enforced by XP SP2). this leads to all sorts of problems:
+// - if the suspended thread was dispatching an exception in the kernel,
+//   register state may be a mix between the correct values and
+//   those captured from the exception.
+// - if running on Win9x with real-mode drivers, interrupts may interfere
+//   with GetThreadContext. however, it's not supported anyway due to other
+//   deficiencies (e.g. lack of proper mmap support).
+// - the suspended thread may be holding locks; we need to be extremely
+//   careful to avoid deadlock! many win api functions acquire locks in
+//   non-obvious ways.
+
+static HANDLE prof_target_thread;	// thread being sampled; set by prof_start
+
+static pthread_t prof_thread;	// NOTE(review): appears unused (prof_start uses <thread>) - confirm
+
+// delay [ms] between samples. OS sleep timers usually provide only
+// ms resolution. increasing the interval reduces both overhead and accuracy.
+static const int PROFILE_INTERVAL_MS = 1;
+
+
+// samples the instruction pointer of prof_target_thread while it is briefly
+// suspended. returns the PC, 0 if GetThreadContext failed, or
+// (uintptr_t)-1 if the thread could not be suspended.
+static uintptr_t get_target_pc()
+{
+	HANDLE hThread = prof_target_thread;	// convenience
+	// note: one call suffices: it increments a DWORD 'suspend count', so the
+	// target is guaranteed to be suspended unless the function failed.
+	DWORD ret = SuspendThread(hThread);
+	if(ret == (DWORD)-1)
+	{
+		debug_warn("SuspendThread failed");
+		return (uintptr_t)-1;
+	}
+
+	/////////////////////////////////////////////
+
+	// be VERY CAREFUL to avoid anything that may acquire a lock until
+	// after ResumeThread! this includes locks taken by the OS,
+	// e.g. malloc -> heap or GetProcAddress -> loader.
+	// reason is, if the target thread was holding a lock we try to
+	// acquire here, a classic deadlock results.
+
+	uintptr_t pc = 0;	// => will return 0 if GetThreadContext fails
+	// ContextFlags selects what gets filled in; the PC is a control register.
+	CONTEXT context;
+	context.ContextFlags = CONTEXT_CONTROL;
+	if(GetThreadContext(hThread, &context))
+	{
+#if defined(_M_AMD64)
+		pc = context.Rip;
+#elif defined(_M_IX86)
+		pc = context.Eip;
+#else
+# error "port CONTEXT"
+#endif
+	}
+
+	/////////////////////////////////////////////
+
+	// failure is (DWORD)-1; 0 means it wasn't suspended. don't fail, but warn.
+	ret = ResumeThread(hThread);
+	assert(ret != 0 && ret != (DWORD)-1);
+	return pc;
+}
+
+
+static pthread_t thread;	// receives the id of the thread created in prof_start
+static sem_t exit_flag;	// prof_thread_func exits once it can acquire this
+
+// profiler thread entry point: samples the target thread's PC about every
+// PROFILE_INTERVAL_MS ms until exit_flag can be acquired. <data> is unused.
+static void* prof_thread_func(void* data)
+{
+	UNUSED(data);
+	const long _1e6 = 1000000;
+	const long _1e9 = 1000000000;
+
+	for(;;)
+	{
+		// calculate absolute timeout for sem_timedwait
+		struct timespec abs_timeout;
+		clock_gettime(CLOCK_REALTIME, &abs_timeout);
+		abs_timeout.tv_nsec += PROFILE_INTERVAL_MS * _1e6;
+		// .. handle nanosecond wraparound (tv_nsec must stay below 1e9)
+		if(abs_timeout.tv_nsec >= _1e9)
+		{
+			abs_timeout.tv_nsec -= _1e9;
+			abs_timeout.tv_sec++;
+		}
+
+		errno = 0;
+		// if we acquire the semaphore, exit was requested.
+		if(sem_timedwait(&exit_flag, &abs_timeout) == 0)
+			break;
+		// anything but a timeout is an actual error: warn (but still sample)
+		if(errno != ETIMEDOUT)
+			debug_warn("wcpu prof_thread_func: sem_timedwait failed");
+
+		uintptr_t pc = get_target_pc();
+		// TODO: add <pc> to list (storage format is still an open question)
+	}
+
+	return 0;
+}
+
+
+
+// starts sampling the calling thread (=> call from the thread that is
+// to be profiled). returns 0 on success, -1 if OpenThread failed.
+int prof_start()
+{
+	// we need a real HANDLE to the target thread for use with
+	// Suspend|ResumeThread and GetThreadContext.
+	// alternative: DuplicateHandle on the current thread pseudo-HANDLE.
+	// this way is a bit more obvious/simple.
+	const DWORD access = THREAD_GET_CONTEXT|THREAD_SUSPEND_RESUME;
+	HANDLE hThread = OpenThread(access, FALSE, GetCurrentThreadId());
+	if(!hThread)	// OpenThread returns NULL (not INVALID_HANDLE_VALUE)
+	{
+		debug_warn("OpenThread failed");
+		return -1;
+	}
+	prof_target_thread = hThread;
+
+	sem_init(&exit_flag, 0, 0);
+	pthread_create(&thread, 0, prof_thread_func, 0);
+	return 0;
+}
+
+int prof_shutdown()	// call once, after a successful prof_start
+{
+	sem_post(&exit_flag);	// prof_thread_func exits once it acquires this
+	pthread_join(thread, 0);	// .. wait for that before closing its handle
+	sem_destroy(&exit_flag);
+	CloseHandle(prof_target_thread);
+	return 0;
+}
+/*
+open question: how to store the EIP values returned? some background:
+the mechanism above churns out an EIP value (may be in our process, but might
+also be bogus); we need to store it somehow pending analysis.
+
+when done with the current run, we'd want to resolve EIP -> function name,
+source file etc. (rather slow, so don't do it at runtime).
+
+so, how to store it in the meantime? 2 possibilities:
+- simple array/vector of addresses (of course optimized to reduce allocs)
+- fixed size array of 'bins' (range of addresses; may be as fine as 1 byte);
+  each bin has a counter which is incremented when the bin's corresponding
+  address has been hit.
+
+it's a size tradeoff here; for simple runs of < 1 min (60,000 ms), #1
+would use 240kb of mem. #2 requires sizeof_whole_program * bytes_per_counter
+up front, and has problems measuring DLLs (we'd have to explicitly map
+the DLL address range into a bin - ugh). however, if we ever want to
+test for say an hour (improves accuracy of profiling due to larger sample size),
+#1 would guzzle 15mb of memory.
+
+hm, another idea would be to write out #1's list of addresses periodically.
+to make sure the disk I/O doesn't come at a bad time, we could have the main
+thread call into the profiler and request it write out at that time.
+this would require extreme caution to avoid the deadlock problem, but looks
+doable.
+
+-------- [2] ----------
+
+realistic profiler runs will take up to an hour.
+
+writing out to disk would work: could have main thread call back.
+that and adding EIP to list would be atomic (locked).
+BUT: large amount of data, that's bad (loading at 30mb/s => 500ms load time alone)
+
+problem with enumerating all symbols at startup: how do we enum all DLLs?
+
+hybrid idea: std::map of EIPs. we don't build the map at startup,
+but add when first seen and subsequently increment counter stored there.
+	problem: uses more memory/slower access than list.
+		would have to make sure EIPs are reused.
+		to help that, could quantize down to 4 byte (or so) bins.
+		accessing debug information at runtime to determine function length is too slow.
+
+	maybe some weird data structure: one bucket controls say 256 bytes of code
+		bucket is found by stripping off lower 8 bits. then, store only
+		the hit count for that byte. where's the savings over normal count?
+
+TODO: what if the thread is sleeping at the time we query EIP?
+can't detect that - suspend count is only set by SuspendThread
+do we want to report that point (it's good to know), or try to access other threads?
+
+TODO split off target thread / get PC into sysdep; profiler thread is portable!
+
+
+at exit: resolve list to hotspots
+probably hard; a start would be just the function in which the address is, then hit count
+
+
+==========================================
+
+
+*/
\ No newline at end of file