1
0
forked from 0ad/0ad

statically allocated per-CPU info (simplicity), and profiler WIP

This was SVN commit r1995.
This commit is contained in:
janwas 2005-03-15 18:51:54 +00:00
parent 8afc1d9daf
commit c3da7f5e33

View File

@ -18,13 +18,16 @@
#include "precompiled.h"
#include <stdlib.h> // for malloc, free
#include "lib.h"
#include "win_internal.h"
#include "sysdep/cpu.h"
// limit allows statically allocated per-CPU structures (for simplicity).
// we're Windows-specific anyway; such systems won't foreseeably have more.
// note: int instead of unsigned because <cpus> is also signed (tri-state).
static const int MAX_CPUS = 32;
// not possible with POSIX calls.
// called from ia32.cpp check_smp
@ -79,34 +82,29 @@ static void check_speedstep()
{
// most likely not speedstep-capable if these aren't supported
SYSTEM_POWER_CAPABILITIES spc;
if(pCNPI(SystemPowerCapabilities, 0, 0, &spc, sizeof(spc)) == STATUS_SUCCESS)
if(pCNPI(SystemPowerCapabilities, 0,0, &spc,sizeof(spc)) == STATUS_SUCCESS)
if(!spc.ProcessorThrottle || !spc.ThermalControl)
cpu_speedstep = 0;
// probably speedstep if cooling mode active.
// the documentation of PO_TZ_* is unclear, so we can't be sure.
SYSTEM_POWER_INFORMATION spi;
if(pCNPI(SystemPowerInformation, 0, 0, &spi, sizeof(spi)) == STATUS_SUCCESS)
if(pCNPI(SystemPowerInformation, 0,0, &spi,sizeof(spi)) == STATUS_SUCCESS)
if(spi.CoolingMode != PO_TZ_INVALID_MODE)
cpu_speedstep = 1;
// definitely speedstep if a CPU has thermal throttling active.
// note that we don't care about user-defined throttles
// (see ppi.CurrentMhz) - they don't change often.
const size_t ppi_buf_size = cpus * sizeof(PROCESSOR_POWER_INFORMATION);
void* ppi_buf = malloc(ppi_buf_size);
if(pCNPI(ProcessorInformation, 0, 0, ppi_buf, (ULONG)ppi_buf_size) == STATUS_SUCCESS)
// definitely speedstep if any throttle is less than 100%.
PROCESSOR_POWER_INFORMATION ppi[MAX_CPUS];
if(pCNPI(ProcessorInformation, 0,0, ppi,sizeof(ppi)) == STATUS_SUCCESS)
{
PROCESSOR_POWER_INFORMATION* ppi = (PROCESSOR_POWER_INFORMATION*)ppi_buf;
for(int i = 0; i < cpus; i++)
// thermal throttling currently active
if(ppi[i].MaxMhz != ppi[i].MhzLimit)
const PROCESSOR_POWER_INFORMATION* p = ppi;
for(int i = 0; i < MIN(cpus, MAX_CPUS); i++, p++)
if(p->MhzLimit != p->MaxMhz || p->CurrentMhz != p->MaxMhz)
{
cpu_speedstep = 1;
break;
}
}
free(ppi_buf);
}
FreeLibrary(hPowrprofDll);
// this is most likely the only reference,
@ -157,3 +155,236 @@ int win_get_cpu_info()
return 0;
}
//////////////////////////////////////////////////////////////////////////////
//
//
//
//////////////////////////////////////////////////////////////////////////////
// we need a means of measuring performance, since it is hard to predict and
// depends on many factors. to cover a wider range of configurations, this
// must also be possible on end-user systems lacking specialized developer
// tools. therefore, we must ship our own implementation; this complements
// Intel VTune et al.
//
// there are 3 approaches to the problem:
// - single-step analysis logs every executed instruction. very thorough, but
// intolerably slow (~1000x) and not suitable for performance measurement.
// - intrusive measuring tracks execution time of explicitly marked
// functions or 'zones'. more complex, requires adding code, and
// inaccurate when thread switches are frequent.
// - IP sampling records the current instruction pointer at regular
// intervals; slow sections of code will over time appear more often.
// not exact, but simple and low-overhead.
//
// we implement IP sampling due to its simplicity. an intrusive approach
// might also be added later to account for performance per-module
// (helps spot the culprit in case hotspots are called from multiple sites).
// on Windows, we retrieve the current IP with GetThreadContext. the docs
// require this to happen from another thread, and for the target to be
// suspended (now enforced by XP SP2). this leads to all sorts of problems:
// - if the suspended thread was dispatching an exception in the kernel,
// register state may be a mix between the correct values and
// those captured from the exception.
// - if running on Win9x with real-mode drivers, interrupts may interfere
// with GetThreadContext. however, it's not supported anyway due to other
// deficiencies (e.g. lack of proper mmap support).
// - the suspended thread may be holding locks; we need to be extremely
// careful to avoid deadlock! many win api functions acquire locks in
// non-obvious ways.
// handle to the thread being sampled. stored by prof_start, read by
// get_target_pc and closed by prof_shutdown.
static HANDLE prof_target_thread;
// NOTE(review): not referenced anywhere in the visible code - prof_start
// stores the sampler thread in <thread> instead. possibly a leftover; confirm.
static pthread_t prof_thread;
// delay [ms] between samples. OS sleep timers usually provide only
// ms resolution. increasing interval reduces overhead and accuracy.
static const int PROFILE_INTERVAL_MS = 1;
static uintptr_t get_target_pc()
{
DWORD ret;
HANDLE hThread = prof_target_thread; // convenience
ret = SuspendThread(hThread);
if(ret == (DWORD)-1)
{
debug_warn("SuspendThread failed");
return -1;
}
// note: we don't need to call more than once: this increments a DWORD
// 'suspend count'; target is guaranteed to be suspended unless
// the function failed.
/////////////////////////////////////////////
// be VERY CAREFUL to avoid anything that may acquire a lock until
// after ResumeThread! this includes locks taken by the OS,
// e.g. malloc -> heap or GetProcAddres -> loader.
// reason is, if the target thread was holding a lock we try to
// acquire here, a classic deadlock results.
uintptr_t pc = 0; // => will return 0 if GetThreadContext fails
CONTEXT context;
if(GetThreadContext(hThread, &context))
{
#if defined(_M_AMD64)
pc = context.Rip;
#elif defined(_M_IX86)
pc = context.Eip;
#else
# error "port CONTEXT"
#endif
}
/////////////////////////////////////////////
ret = ResumeThread(hThread);
assert(ret != 0);
// don't fail (we have a valid PC), but warn
return pc;
}
// sampler thread; created by prof_start and running prof_thread_func.
static pthread_t thread;
// semaphore initialized to 0 by prof_start. prof_thread_func waits on it
// (with timeout) between samples and exits as soon as it can acquire it.
static sem_t exit_flag;
// entry point of the sampler thread.
// each iteration waits on exit_flag for up to PROFILE_INTERVAL_MS:
// - wait succeeded => exit was requested, leave the loop;
// - otherwise (normally a timeout) => sample the target thread's PC.
// <data> is unused; always returns 0.
static void* prof_thread_func(void* data)
{
	UNUSED(data);

	const long ns_per_ms  = 1000000;
	const long ns_per_sec = 1000000000;

	for(;;)
	{
		// sem_timedwait takes an absolute deadline: now + sampling interval.
		struct timespec deadline;
		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_nsec += PROFILE_INTERVAL_MS * ns_per_ms;
		// .. carry excess nanoseconds over into the seconds field
		//    (tv_nsec must stay below one second).
		if(deadline.tv_nsec >= ns_per_sec)
		{
			deadline.tv_sec++;
			deadline.tv_nsec -= ns_per_sec;
		}

		// cleared so a stale value can't be mistaken for this call's result.
		errno = 0;
		const int wait_ret = sem_timedwait(&exit_flag, &deadline);

		// semaphore acquired: exit was requested.
		if(wait_ret == 0)
			break;

		// anything other than a timeout is an actual error: warn
		// (sampling continues regardless).
		if(errno != ETIMEDOUT)
			debug_warn("wpcu prof_thread_func: sem_timedwait failed");

		uintptr_t pc = get_target_pc();

		// ADD TO LIST
	}

	return 0;
}
// call from thread that is to be profiled
int prof_start()
{
// we need a real HANDLE to the target thread for use with
// Suspend|ResumeThread and GetThreadContext.
// alternative: DuplicateHandle on the current thread pseudo-HANDLE.
// this way is a bit more obvious/simple.
const DWORD access = THREAD_GET_CONTEXT|THREAD_SUSPEND_RESUME;
HANDLE hThread = OpenThread(access, FALSE, GetCurrentThreadId());
if(hThread == INVALID_HANDLE_VALUE)
{
debug_warn("OpenThread failed");
return -1;
}
prof_target_thread = hThread;
sem_init(&exit_flag, 0, 0);
pthread_create(&thread, 0, prof_thread_func, 0);
return 0;
}
int prof_shutdown()
{
CloseHandle(prof_target_thread);
return 0;
}
/*
open question: how to store the EIP values returned? some background:
the mechanism above churns out an EIP value (may be in our process, but might
also be bogus); we need to store it somehow pending analysis.
when done with the current run, we'd want to resolve EIP -> function name,
source file etc. (rather slow, so don't do it at runtime).
so, how to store it in the meantime? 2 possibilities:
- simple array/vector of addresses (of course optimized to reduce allocs)
- fixed size array of 'bins' (range of addresses; may be as fine as 1 byte);
each bin has a counter which is incremented when the bin's corresponding
address has been hit.
it's a size tradeoff here; for simple runs of < 1 min (60,000 ms), #1
would use 240kb of mem. #2 requires sizeof_whole_program * bytes_per_counter
up front, and has problems measuring DLLs (we'd have to explicitly map
the DLL address range into a bin - ugh). however, if we ever want to
test for say an hour (improves accuracy of profiling due to larger sample size),
#1 would guzzle 15mb of memory.
hm, another idea would be to write out #1's list of addresses periodically.
to make sure the disk I/O doesn't come at a bad time, we could have the main
thread call into the profiler and request it write out at that time.
this would require extreme caution to avoid the deadlock problem, but looks
doable.
-------- [2] ----------
realistic profiler runs will take up to an hour.
writing out to disk would work: could have main thread call back.
that and adding EIP to list would be atomic (locked).
BUT: large amount of data, that's bad (loading at 30mb/s => 500ms load time alone)
problem with enumerating all symbols at startup: how do we enum all DLLs?
hybrid idea: std::map of EIPs. we don't build the map at startup,
but add when first seen and subsequently increment counter stored there.
problem: uses more memory/slower access than list.
would have to make sure EIPs are reused.
to help that, could quantize down to 4 byte (or so) bins.
accessing debug information at runtime to determine function length is too slow.
maybe some weird data structure: one bucket controls say 256 bytes of code
bucket is found by stripping off lower 8 bits. then, store only
the hit count for that byte. where's the savings over normal count?
TODO: what if the thread is sleeping at the time we query EIP?
can't detect that - suspend count is only set by SuspendThread
do we want to report that point (it's good to know), or try to access other threads?
TODO split off target thread / get PC into sysdep; profiler thread is portable!
at exit: resolve list to hotspots
probably hard; a start would be just the function in which the address is, then hit count
==========================================
*/