2006-04-12 01:59:08 +02:00
|
|
|
/**
 * =========================================================================
 * File        : wcpu.cpp
 * Project     : 0 A.D.
 * Description : Windows backend for CPU related code
 * =========================================================================
 */
|
|
|
|
|
2007-05-07 18:33:24 +02:00
|
|
|
// license: GPL; see lib/license.txt
|
2005-01-27 17:18:22 +01:00
|
|
|
|
2004-06-19 16:36:59 +02:00
|
|
|
#include "precompiled.h"
|
2007-04-25 20:19:35 +02:00
|
|
|
#include "wcpu.h"
|
2004-06-19 16:36:59 +02:00
|
|
|
|
2007-01-01 22:25:47 +01:00
|
|
|
#include "lib/posix/posix_pthread.h"
|
|
|
|
#include "lib/posix/posix_time.h"
|
2007-05-08 17:11:53 +02:00
|
|
|
#include "win.h"
|
2007-05-04 19:30:32 +02:00
|
|
|
#include "wutil.h"
|
2007-05-27 00:42:08 +02:00
|
|
|
#include "winit.h"
|
|
|
|
|
2007-06-05 00:59:14 +02:00
|
|
|
WINIT_REGISTER_EARLY_INIT(wcpu_Init); // wcpu -> whrt
|
2004-06-19 16:36:59 +02:00
|
|
|
|
2007-05-27 00:42:08 +02:00
|
|
|
// number of logical processors; 0 until DetectNumProcessors has run
// (wcpu_NumProcessors asserts this).
static uint numProcessors = 0;
|
2007-04-25 20:19:35 +02:00
|
|
|
|
2007-05-02 14:07:08 +02:00
|
|
|
/// get number of CPUs (can't fail)
|
|
|
|
uint wcpu_NumProcessors()
{
	const uint count = numProcessors;
	// wcpu_Init (-> DetectNumProcessors) must have run before anyone asks.
	debug_assert(count != 0);
	return count;
}
|
2007-05-26 17:34:10 +02:00
|
|
|
|
2007-05-27 00:42:08 +02:00
|
|
|
static void DetectNumProcessors()
|
|
|
|
{
|
2007-04-25 20:19:35 +02:00
|
|
|
SYSTEM_INFO si;
|
2007-05-27 00:42:08 +02:00
|
|
|
GetSystemInfo(&si); // can't fail
|
2007-05-26 17:34:10 +02:00
|
|
|
numProcessors = (uint)si.dwNumberOfProcessors;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-05-27 00:42:08 +02:00
|
|
|
// nominal CPU clock frequency [Hz]; -1.0 until DetectClockFrequency has run
// (wcpu_ClockFrequency asserts it is positive).
static double clockFrequency = -1.0;
|
|
|
|
|
2007-05-26 17:34:10 +02:00
|
|
|
double wcpu_ClockFrequency()
|
|
|
|
{
|
2007-05-27 00:42:08 +02:00
|
|
|
debug_assert(clockFrequency > 0.0);
|
|
|
|
return clockFrequency;
|
|
|
|
}
|
2007-05-26 17:34:10 +02:00
|
|
|
|
2007-05-27 00:42:08 +02:00
|
|
|
static void DetectClockFrequency()
|
|
|
|
{
|
|
|
|
// read from registry
|
2007-05-26 17:34:10 +02:00
|
|
|
HKEY hKey;
|
|
|
|
if(RegOpenKeyEx(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0, KEY_QUERY_VALUE, &hKey) == ERROR_SUCCESS)
|
|
|
|
{
|
|
|
|
DWORD freqMhz;
|
|
|
|
DWORD size = sizeof(freqMhz);
|
|
|
|
if(RegQueryValueEx(hKey, "~MHz", 0, 0, (LPBYTE)&freqMhz, &size) == STATUS_SUCCESS)
|
|
|
|
clockFrequency = freqMhz * 1e6;
|
|
|
|
else
|
|
|
|
debug_assert(0);
|
|
|
|
|
|
|
|
RegCloseKey(hKey);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
debug_assert(0);
|
2007-04-25 20:19:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-04-11 03:45:07 +02:00
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
// execute the specified function once on each CPU.
|
|
|
|
// this includes logical HT units and proceeds serially (function
|
|
|
|
// is never re-entered) in order of increasing OS CPU ID.
|
|
|
|
// note: implemented by switching thread affinity masks and forcing
|
|
|
|
// a reschedule, which is apparently not possible with POSIX.
|
2006-04-24 07:20:14 +02:00
|
|
|
//
|
|
|
|
// may fail if e.g. OS is preventing us from running on some CPUs.
|
|
|
|
// called from ia32.cpp get_cpu_count.
|
2007-05-02 14:07:08 +02:00
|
|
|
// execute the specified callback once on each CPU (see block comment above).
// param is passed through to each invocation of cb.
// returns INFO::OK, or ERR::CPU_RESTRICTED_AFFINITY if the OS prevents us
// from running on every CPU, or ERR::FAIL if affinity cannot be queried.
LibError wcpu_CallByEachCPU(CpuCallback cb, void* param)
{
	const HANDLE hProcess = GetCurrentProcess();
	// note: affinity masks are DWORD_PTR (64 bits on Win64);
	// GetProcessAffinityMask requires exactly that type - DWORD would
	// truncate the mask and corrupt the stack on 64-bit builds.
	DWORD_PTR process_affinity, system_affinity;
	if(!GetProcessAffinityMask(hProcess, &process_affinity, &system_affinity))
		WARN_RETURN(ERR::FAIL);
	// our affinity != system affinity: OS is limiting the CPUs that
	// this process can run on. fail (cannot call back for each CPU).
	if(process_affinity != system_affinity)
		WARN_RETURN(ERR::CPU_RESTRICTED_AFFINITY);

	// visit CPUs in order of increasing OS CPU ID (one bit per CPU).
	for(DWORD_PTR cpu_bit = 1; cpu_bit != 0 && cpu_bit <= process_affinity; cpu_bit *= 2)
	{
		// check if we can switch to target CPU ..
		if(!(process_affinity & cpu_bit))
			continue;
		// .. and do so.
		if(!SetThreadAffinityMask(GetCurrentThread(), cpu_bit))
		{
			WARN_ERR(ERR::CPU_RESTRICTED_AFFINITY);
			continue;
		}

		// reschedule to make sure we switch CPUs.
		Sleep(1);

		cb(param);
	}

	// restore to original value.
	// note: SetThreadAffinityMask requires a *thread* handle; the previous
	// code passed hProcess (a process handle), so the restore silently
	// failed and left this thread pinned to the last CPU visited.
	SetThreadAffinityMask(GetCurrentThread(), process_affinity);

	return INFO::OK;
}
|
|
|
|
|
2007-05-27 00:42:08 +02:00
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
// one-time initialization, registered via WINIT_REGISTER_EARLY_INIT.
// gathers the CPU information whose absence the accessors above assert.
static LibError wcpu_Init()
{
	DetectNumProcessors();
	DetectClockFrequency();

	return INFO::OK;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2006-04-11 03:45:07 +02:00
|
|
|
|
2005-03-15 19:51:54 +01:00
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
//
|
|
|
|
//
|
|
|
|
//
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
2007-05-26 17:34:10 +02:00
|
|
|
#if 0
|
2005-03-15 19:51:54 +01:00
|
|
|
|
|
|
|
// we need a means of measuring performance, since it is hard to predict and
|
|
|
|
// depends on many factors. to cover a wider range of configurations, this
|
|
|
|
// must also be possible on end-user systems lacking specialized developer
|
|
|
|
// tools. therefore, we must ship our own implementation; this complements
|
|
|
|
// Intel VTune et al.
|
|
|
|
//
|
|
|
|
// there are 3 approaches to the problem:
|
|
|
|
// - single-step analysis logs every executed instruction. very thorough, but
|
|
|
|
// intolerably slow (~1000x) and not suitable for performance measurement.
|
|
|
|
// - intrusive measuring tracks execution time of explicitly marked
|
|
|
|
// functions or 'zones'. more complex, requires adding code, and
|
|
|
|
// inaccurate when thread switches are frequent.
|
|
|
|
// - IP sampling records the current instruction pointer at regular
|
|
|
|
// intervals; slow sections of code will over time appear more often.
|
|
|
|
// not exact, but simple and low-overhead.
|
|
|
|
//
|
|
|
|
// we implement IP sampling due to its simplicity. an intrusive approach
|
|
|
|
// might also be added later to account for performance per-module
|
|
|
|
// (helps spot the culprit in case hotspots are called from multiple sites).
|
|
|
|
|
|
|
|
|
|
|
|
// on Windows, we retrieve the current IP with GetThreadContext. dox require
|
|
|
|
// this to happen from another thread, and for the target to be suspended
|
|
|
|
// (now enforced by XP SP2). this leads to all sorts of problems:
|
|
|
|
// - if the suspended thread was dispatching an exception in the kernel,
|
|
|
|
// register state may be a mix between the correct values and
|
|
|
|
// those captured from the exception.
|
|
|
|
// - if running on Win9x with real-mode drivers, interrupts may interfere
|
|
|
|
// with GetThreadContext. however, it's not supported anyway due to other
|
|
|
|
// deficiencies (e.g. lack of proper mmap support).
|
|
|
|
// - the suspended thread may be holding locks; we need to be extremely
|
|
|
|
// careful to avoid deadlock! many win api functions acquire locks in
|
|
|
|
// non-obvious ways.
|
|
|
|
|
|
|
|
static HANDLE prof_target_thread;
|
|
|
|
|
|
|
|
static pthread_t prof_thread;
|
|
|
|
|
|
|
|
// delay [ms] between samples. OS sleep timers usually provide only
|
|
|
|
// ms resolution. increasing interval reduces overhead and accuracy.
|
|
|
|
static const int PROFILE_INTERVAL_MS = 1;
|
|
|
|
|
|
|
|
|
|
|
|
// return the current program counter of the profiled target thread,
// or 0 on failure. the target is suspended for the duration of the
// query (required by GetThreadContext); see deadlock warnings below.
static uintptr_t get_target_pc()
{
	DWORD ret;
	HANDLE hThread = prof_target_thread;	// convenience

	ret = SuspendThread(hThread);
	if(ret == (DWORD)-1)
	{
		debug_warn("get_target_pc: SuspendThread failed");
		return 0;
	}
	// note: we don't need to call more than once: this increments a DWORD
	// 'suspend count'; target is guaranteed to be suspended unless
	// the function failed.

	/////////////////////////////////////////////

	// be VERY CAREFUL to avoid anything that may acquire a lock until
	// after ResumeThread! this includes locks taken by the OS,
	// e.g. malloc -> heap or GetProcAddress -> loader.
	// reason is, if the target thread was holding a lock we try to
	// acquire here, a classic deadlock results.

	uintptr_t pc = 0;	// => will return 0 if GetThreadContext fails

	// CONTEXT_CONTROL is sufficient: it covers the instruction pointer.
	CONTEXT context;
	context.ContextFlags = CONTEXT_CONTROL;
	if(GetThreadContext(hThread, &context))
		pc = context.PC_;	// PC_ = arch-specific IP field (e.g. Eip/Rip)

	/////////////////////////////////////////////

	ret = ResumeThread(hThread);
	debug_assert(ret != 0);
	// don't fail (we have a valid PC), but warn

	return pc;
}
|
|
|
|
|
|
|
|
|
|
|
|
static pthread_t thread;
|
|
|
|
static sem_t exit_flag;
|
|
|
|
|
2005-08-09 18:23:19 +02:00
|
|
|
// profiler thread entry point: samples the target thread's PC every
// PROFILE_INTERVAL_MS until exit_flag is posted, then returns 0.
static void* prof_thread_func(void* UNUSED(data))
{
	debug_set_thread_name("eip_sampler");

	// conversion factors: ms -> ns, and ns per second.
	const long _1e6 = 1000000;
	const long _1e9 = 1000000000;

	for(;;)
	{
		// calculate absolute timeout for sem_timedwait
		struct timespec abs_timeout;
		clock_gettime(CLOCK_REALTIME, &abs_timeout);
		abs_timeout.tv_nsec += PROFILE_INTERVAL_MS * _1e6;
		// .. handle nanosecond wraparound (must not be > 1000m)
		if(abs_timeout.tv_nsec >= _1e9)
		{
			abs_timeout.tv_nsec -= _1e9;
			abs_timeout.tv_sec++;
		}

		errno = 0;
		// if we acquire the semaphore, exit was requested.
		if(sem_timedwait(&exit_flag, &abs_timeout) == 0)
			break;
		// actual error: warn
		if(errno != ETIMEDOUT)
			debug_warn("wpcu prof_thread_func: sem_timedwait failed");

		// timeout elapsed: take one sample.
		uintptr_t pc = get_target_pc();
		UNUSED2(pc);

		// ADD TO LIST (sample storage not yet implemented; see notes below)
	}

	return 0;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// call from thread that is to be profiled
|
2005-12-11 23:23:55 +01:00
|
|
|
// begin IP-sampling profiling of the calling thread: records a real
// handle to it and spawns the sampler thread.
// must be called from the thread that is to be profiled.
LibError prof_start()
{
	// we need a real HANDLE to the target thread for use with
	// Suspend|ResumeThread and GetThreadContext.
	// alternative: DuplicateHandle on the current thread pseudo-HANDLE.
	// this way is a bit more obvious/simple.
	const DWORD access = THREAD_GET_CONTEXT|THREAD_SUSPEND_RESUME;
	HANDLE hThread = OpenThread(access, FALSE, GetCurrentThreadId());
	// note: OpenThread indicates failure by returning NULL, not
	// INVALID_HANDLE_VALUE (that convention is specific to CreateFile);
	// the previous check against INVALID_HANDLE_VALUE never triggered.
	if(hThread == 0)
		WARN_RETURN(ERR::FAIL);

	prof_target_thread = hThread;

	// create the exit semaphore (initially 0) and start the sampler.
	sem_init(&exit_flag, 0, 0);
	pthread_create(&thread, 0, prof_thread_func, 0);
	return INFO::OK;
}
|
|
|
|
|
2005-12-11 23:23:55 +01:00
|
|
|
// stop profiling: release the target-thread handle acquired by prof_start.
// NOTE(review): this only closes the handle; it does not post exit_flag or
// join the sampler thread, so that thread may keep running - confirm this
// is intended (the whole section is currently disabled via #if 0).
LibError prof_shutdown()
{
	WARN_IF_FALSE(CloseHandle(prof_target_thread));
	return INFO::OK;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
open question: how to store the EIP values returned? some background:
|
|
|
|
the mechanism above churns out an EIP value (may be in our process, but might
|
|
|
|
also be bogus); we need to store it somehow pending analysis.
|
|
|
|
|
|
|
|
when done with the current run, we'd want to resolve EIP -> function name,
|
|
|
|
source file etc. (rather slow, so don't do it at runtime).
|
|
|
|
|
|
|
|
so, how to store it in the meantime? 2 possibilities:
|
|
|
|
- simple array/vector of addresses (of course optimized to reduce allocs)
|
|
|
|
- fixed size array of 'bins' (range of addresses; may be as fine as 1 byte);
|
|
|
|
each bin has a counter which is incremented when the bin's corresponding
|
|
|
|
address has been hit.
|
|
|
|
|
|
|
|
it's a size tradeoff here; for simple runs of < 1 min (60,000 ms), #1
|
|
|
|
would use 240kb of mem. #2 requires sizeof_whole_program * bytes_per_counter
|
|
|
|
up front, and has problems measuring DLLs (we'd have to explicitly map
|
|
|
|
the DLL address range into a bin - ugh). however, if we ever want to
|
|
|
|
test for say an hour (improves accuracy of profiling due to larger sample size),
|
|
|
|
#1 would guzzle 15mb of memory.
|
|
|
|
|
|
|
|
hm, another idea would be to write out #1's list of addresses periodically.
|
|
|
|
to make sure the disk I/O doesn't come at a bad time, we could have the main
|
|
|
|
thread call into the profiler and request it write out at that time.
|
|
|
|
this would require extreme caution to avoid the deadlock problem, but looks
|
|
|
|
doable.
|
|
|
|
|
|
|
|
-------- [2] ----------
|
|
|
|
|
|
|
|
realistic profiler runs will take up to an hour.
|
|
|
|
|
|
|
|
writing out to disk would work: could have main thread call back.
|
|
|
|
that and adding EIP to list would be atomic (locked).
|
|
|
|
BUT: large amount of data, that's bad (loading at 30mb/s => 500ms load time alone)
|
|
|
|
|
|
|
|
problem with enumerating all symbols at startup: how do we enum all DLLs?
|
|
|
|
|
|
|
|
hybrid idea: std::map of EIPs. we don't build the map at startup,
|
|
|
|
but add when first seen and subsequently increment counter stored there.
|
|
|
|
problem: uses more memory/slower access than list.
|
|
|
|
would have to make sure EIPs are reused.
|
|
|
|
to help that, could quantize down to 4 byte (or so) bins.
|
|
|
|
accessing debug information at runtime to determine function length is too slow.
|
|
|
|
|
|
|
|
maybe some weird data structure: one bucket controls say 256 bytes of code
|
|
|
|
bucket is found by stripping off lower 8 bits. then, store only
|
|
|
|
the hit count for that byte. where's the savings over normal count?
|
|
|
|
|
|
|
|
TODO: what if the thread is sleeping at the time we query EIP?
|
|
|
|
can't detect that - suspend count is only set by SuspendThread
|
|
|
|
do we want to report that point (it's good to know), or try to access other threads?
|
|
|
|
|
|
|
|
TODO split off target thread / get PC into sysdep; profiler thread is portable!
|
|
|
|
|
|
|
|
|
|
|
|
at exit: resolve list to hotspots
|
|
|
|
probably hard; a start would be just the function in which the address is, then hit count
|
|
|
|
|
|
|
|
|
|
|
|
==========================================
|
|
|
|
|
|
|
|
|
2007-04-25 20:19:35 +02:00
|
|
|
*/
|
2007-05-26 17:34:10 +02:00
|
|
|
#endif
|