fixes+improvements from work:
timer: cleanup, remove duplication. topology: fix incorrect param order. tsc: ensure MSRs are available before accessing them. msr: add prefix to make clear that registers are Nehalem-specific. This was SVN commit r7892.
This commit is contained in:
parent
e6c3f1c482
commit
7eebe05485
@ -29,11 +29,12 @@
|
||||
|
||||
namespace MSR {
|
||||
|
||||
bool IsSupported()
|
||||
bool IsAccessible()
|
||||
{
|
||||
if(!x86_x64_Cap(X86_X64_CAP_MSR))
|
||||
return false;
|
||||
|
||||
// only read/writable from ring 0, so we need the driver.
|
||||
if(mahaf_Init() < 0)
|
||||
return false;
|
||||
|
||||
|
@ -45,15 +45,15 @@ enum ModelSpecificRegisters
|
||||
IA32_PERF_GLOBAL_OVF_CTRL = 0x390,
|
||||
|
||||
// Nehalem (requires HasNehalem)
|
||||
PLATFORM_INFO = 0x0CE,
|
||||
UNCORE_PERF_GLOBAL_CTRL = 0x391,
|
||||
UNCORE_PERF_GLOBAL_STATUS = 0x392,
|
||||
UNCORE_PERF_GLOBAL_OVF_CTRL = 0x393,
|
||||
UNCORE_PMC0 = 0x3B0,
|
||||
UNCORE_PERFEVTSEL0 = 0x3C0
|
||||
NHM_PLATFORM_INFO = 0x0CE,
|
||||
NHM_UNCORE_PERF_GLOBAL_CTRL = 0x391,
|
||||
NHM_UNCORE_PERF_GLOBAL_STATUS = 0x392,
|
||||
NHM_UNCORE_PERF_GLOBAL_OVF_CTRL = 0x393,
|
||||
NHM_UNCORE_PMC0 = 0x3B0,
|
||||
NHM_UNCORE_PERFEVTSEL0 = 0x3C0
|
||||
};
|
||||
|
||||
LIB_API bool IsSupported();
|
||||
LIB_API bool IsAccessible();
|
||||
|
||||
LIB_API bool HasEnergyPerfBias();
|
||||
LIB_API bool HasNehalem();
|
||||
|
@ -222,7 +222,7 @@ static LibError InitCpuTopology()
|
||||
std::set<size_t> values;
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
const size_t value = ApicField(apicIds[processor], numValues, indexOfLowestBit);
|
||||
const size_t value = ApicField(apicIds[processor], indexOfLowestBit, numValues);
|
||||
values.insert(value);
|
||||
}
|
||||
return values.size();
|
||||
@ -299,19 +299,19 @@ size_t cpu_topology_LogicalPerCore()
|
||||
return cpuTopology.logicalPerCore;
|
||||
}
|
||||
|
||||
size_t cpu_topology_LogicalFromId(size_t apicId)
|
||||
// Extracts one field from the given APIC ID (judging by the name, the
// logical-processor index within its core - TODO confirm).
// InitCpuTopology is first handed to ModuleInit together with cpuInitState,
// since the cpuTopology members read below are expected to be filled in by it.
// The field is described to ApicField by cpuTopology.logicalOffset and
// cpuTopology.maxLogicalPerCore, in that order (ApicField's other caller in
// this file passes indexOfLowestBit, then numValues).
size_t cpu_topology_LogicalFromApicId(size_t apicId)
|
||||
{
|
||||
ModuleInit(&cpuInitState, InitCpuTopology);
|
||||
return ApicField(apicId, cpuTopology.logicalOffset, cpuTopology.maxLogicalPerCore);
|
||||
}
|
||||
|
||||
size_t cpu_topology_CoreFromId(size_t apicId)
|
||||
// Extracts one field from the given APIC ID (judging by the name, the
// core index within its package - TODO confirm).
// InitCpuTopology is first handed to ModuleInit together with cpuInitState,
// since the cpuTopology members read below are expected to be filled in by it.
// The field is described to ApicField by cpuTopology.coreOffset and
// cpuTopology.maxCoresPerPackage, in that order (ApicField's other caller in
// this file passes indexOfLowestBit, then numValues).
size_t cpu_topology_CoreFromApicId(size_t apicId)
|
||||
{
|
||||
ModuleInit(&cpuInitState, InitCpuTopology);
|
||||
return ApicField(apicId, cpuTopology.coreOffset, cpuTopology.maxCoresPerPackage);
|
||||
}
|
||||
|
||||
size_t cpu_topology_PackageFromId(size_t apicId)
|
||||
size_t cpu_topology_PackageFromApicId(size_t apicId)
|
||||
{
|
||||
ModuleInit(&cpuInitState, InitCpuTopology);
|
||||
return ApicField(apicId, cpuTopology.packageOffset, 256);
|
||||
|
@ -65,9 +65,9 @@ LIB_API size_t cpu_topology_CoresPerPackage();
|
||||
LIB_API size_t cpu_topology_LogicalPerCore();
|
||||
|
||||
|
||||
LIB_API size_t cpu_topology_LogicalFromId(size_t apicId);
|
||||
LIB_API size_t cpu_topology_CoreFromId(size_t apicId);
|
||||
LIB_API size_t cpu_topology_PackageFromId(size_t apicId);
|
||||
LIB_API size_t cpu_topology_LogicalFromApicId(size_t apicId);
|
||||
LIB_API size_t cpu_topology_CoreFromApicId(size_t apicId);
|
||||
LIB_API size_t cpu_topology_PackageFromApicId(size_t apicId);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
@ -219,9 +219,9 @@ public:
|
||||
// clock is subject to thermal drift and would require continual
|
||||
// recalibration anyway.
|
||||
#if ARCH_X86_X64
|
||||
if(MSR::HasNehalem())
|
||||
// IsAccessible must be *called*: the bare function name converts to a non-null
// pointer, which is always true and would skip the MSR availability check.
if(MSR::IsAccessible() && MSR::HasNehalem())
|
||||
{
|
||||
const u64 platformInfo = MSR::Read(MSR::PLATFORM_INFO);
|
||||
const u64 platformInfo = MSR::Read(MSR::NHM_PLATFORM_INFO);
|
||||
const u8 maxNonTurboRatio = bits(platformInfo, 8, 15);
|
||||
return maxNonTurboRatio * 133.33e6f;
|
||||
}
|
||||
|
@ -27,14 +27,16 @@
|
||||
#include "precompiled.h"
|
||||
#include "lib/timer.h"
|
||||
|
||||
#include <sstream> // std::stringstream
|
||||
#include <numeric>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <stdarg.h>
|
||||
#include <cmath>
|
||||
#include <cfloat>
|
||||
#include <cstdarg>
|
||||
|
||||
#include "lib/module_init.h"
|
||||
#include "lib/posix/posix_time.h"
|
||||
#if OS_WIN
|
||||
#include "lib/sysdep/os/win/whrt/whrt.h"
|
||||
# include "lib/sysdep/os/win/whrt/whrt.h"
|
||||
#endif
|
||||
#if OS_UNIX
|
||||
# include <unistd.h>
|
||||
@ -107,31 +109,33 @@ double timer_Time()
|
||||
}
|
||||
|
||||
|
||||
double timer_Resolution()
|
||||
// cached because the default implementation may take several milliseconds
|
||||
static double resolution;
|
||||
|
||||
static LibError InitResolution()
|
||||
{
|
||||
// may take a while to determine, so cache it
|
||||
static double cached_res = 0.0;
|
||||
if(cached_res != 0.0)
|
||||
return cached_res;
|
||||
|
||||
double res = 0.0;
|
||||
|
||||
#if OS_WIN
|
||||
res = whrt_Resolution();
|
||||
resolution = whrt_Resolution();
|
||||
#elif HAVE_CLOCK_GETTIME
|
||||
struct timespec ts;
|
||||
if(clock_getres(CLOCK_REALTIME, &ts) == 0)
|
||||
res = ts.tv_nsec * 1e-9;
|
||||
resolution = ts.tv_nsec * 1e-9;
|
||||
#else
|
||||
const double t0 = timer_Time();
|
||||
double t1, t2;
|
||||
do t1 = timer_Time(); while(t1 == t0);
|
||||
do t2 = timer_Time(); while(t2 == t1);
|
||||
res = t2-t1;
|
||||
resolution = t2-t1;
|
||||
#endif
|
||||
|
||||
cached_res = res;
|
||||
return res;
|
||||
return INFO::OK;
|
||||
}
|
||||
|
||||
// Returns the file-level `resolution` value.
// InitResolution (which assigns `resolution`) is handed to ModuleInit along
// with a function-local static ModuleInitState before the value is read;
// presumably this makes the potentially slow measurement run only on the
// first call - confirm against ModuleInit's contract.
double timer_Resolution()
|
||||
{
|
||||
static ModuleInitState initState;
|
||||
ModuleInit(&initState, InitResolution);
|
||||
return resolution;
|
||||
}
|
||||
|
||||
|
||||
@ -182,3 +186,39 @@ void timer_DisplayClientTotals()
|
||||
|
||||
debug_printf(L"-----------------------------------------------------\n");
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
/**
 * Format a duration for display, re-scaled to the largest unit that fits.
 *
 * @param seconds duration [s]
 * @return the re-scaled value followed by " s" (>= 1 s), " ms" (>= 1 ms)
 *   or " us" (everything else, including zero and negative input),
 *   using the stream's default floating-point formatting.
 **/
std::wstring StringForSeconds(double seconds)
{
	double scale = 1e6;
	const wchar_t* unit = L" us";
	// thresholds are inclusive so that exactly one second is reported as
	// "1 s" (not "1000 ms") and exactly one millisecond as "1 ms".
	if(seconds >= 1.0)
	{
		scale = 1.0;
		unit = L" s";
	}
	else if(seconds >= 1e-3)
	{
		scale = 1e3;
		unit = L" ms";
	}

	std::wstringstream ss;
	ss << seconds*scale;
	ss << unit;
	return ss.str();
}
|
||||
|
||||
|
||||
std::wstring StringForCycles(i64 cycles)
|
||||
{
|
||||
double scale = 1.0;
|
||||
const wchar_t* unit = L" c";
|
||||
if(cycles > 10000000000LL) // 10 Gc
|
||||
scale = 1e-9, unit = L" Gc";
|
||||
else if(cycles > 10000000) // 10 Mc
|
||||
scale = 1e-6, unit = L" Mc";
|
||||
else if(cycles > 10000) // 10 kc
|
||||
scale = 1e-3, unit = L" kc";
|
||||
|
||||
std::wstringstream ss;
|
||||
ss << cycles*scale;
|
||||
ss << unit;
|
||||
return ss.str();
|
||||
}
|
||||
|
@ -34,7 +34,6 @@
|
||||
# include "lib/sysdep/os_cpu.h" // os_cpu_ClockFrequency
|
||||
#endif
|
||||
|
||||
#include <sstream> // std::stringstream
|
||||
|
||||
/**
|
||||
* timer_Time will subsequently return values relative to the current time.
|
||||
@ -52,6 +51,14 @@ LIB_API double timer_Time();
|
||||
LIB_API double timer_Resolution();
|
||||
|
||||
|
||||
/**
|
||||
* internal helper functions for returning an easily readable
|
||||
* string (i.e. re-scaled to appropriate units)
|
||||
**/
|
||||
LIB_API std::wstring StringForSeconds(double seconds);
|
||||
LIB_API std::wstring StringForCycles(i64 cycles);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// scope timing
|
||||
|
||||
@ -67,18 +74,9 @@ public:
|
||||
|
||||
~ScopeTimer()
|
||||
{
|
||||
double t1 = timer_Time();
|
||||
double dt = t1-m_t0;
|
||||
|
||||
// determine scale factor for pretty display
|
||||
double scale = 1e6;
|
||||
const wchar_t* unit = L"us";
|
||||
if(dt > 1.0)
|
||||
scale = 1, unit = L"s";
|
||||
else if(dt > 1e-3)
|
||||
scale = 1e3, unit = L"ms";
|
||||
|
||||
debug_printf(L"TIMER| %ls: %g %ls\n", m_description, dt*scale, unit);
|
||||
const double t1 = timer_Time();
|
||||
const std::wstring elapsedTimeString = StringForSeconds(t1-m_t0);
|
||||
debug_printf(L"TIMER| %ls: %ls\n", m_description, elapsedTimeString.c_str());
|
||||
}
|
||||
|
||||
private:
|
||||
@ -137,7 +135,7 @@ private:
|
||||
|
||||
// since TIMER_ACCRUE et al. are called so often, we try to keep
|
||||
// overhead to an absolute minimum. storing raw tick counts (e.g. CPU cycles
|
||||
// returned by ia32_rdtsc) instead of absolute time has two benefits:
|
||||
// returned by x86_x64_rdtsc) instead of absolute time has two benefits:
|
||||
// - no need to convert from raw->time on every call
|
||||
// (instead, it's only done once when displaying the totals)
|
||||
// - possibly less overhead to querying the time itself
|
||||
@ -160,63 +158,49 @@ class TimerUnit
|
||||
public:
|
||||
void SetToZero()
|
||||
{
|
||||
m_ticks = 0;
|
||||
m_cycles = 0;
|
||||
}
|
||||
|
||||
void SetFromTimer()
|
||||
{
|
||||
m_ticks = x86_x64_rdtsc();
|
||||
m_cycles = x86_x64_rdtsc();
|
||||
}
|
||||
|
||||
void AddDifference(TimerUnit t0, TimerUnit t1)
|
||||
{
|
||||
m_ticks += t1.m_ticks - t0.m_ticks;
|
||||
m_cycles += t1.m_cycles - t0.m_cycles;
|
||||
}
|
||||
|
||||
void AddDifferenceAtomic(TimerUnit t0, TimerUnit t1)
|
||||
{
|
||||
const i64 delta = t1.m_ticks - t0.m_ticks;
|
||||
const i64 delta = t1.m_cycles - t0.m_cycles;
|
||||
#if ARCH_AMD64
|
||||
cpu_AtomicAdd((volatile intptr_t*)&m_ticks, (intptr_t)delta);
|
||||
cpu_AtomicAdd((volatile intptr_t*)&m_cycles, (intptr_t)delta);
|
||||
#else
|
||||
retry:
|
||||
if(!cpu_CAS64(&m_ticks, m_ticks, m_ticks+delta))
|
||||
if(!cpu_CAS64(&m_cycles, m_cycles, m_cycles+delta))
|
||||
goto retry;
|
||||
#endif
|
||||
}
|
||||
|
||||
void Subtract(TimerUnit t)
|
||||
{
|
||||
m_ticks -= t.m_ticks;
|
||||
m_cycles -= t.m_cycles;
|
||||
}
|
||||
|
||||
std::wstring ToString() const
|
||||
{
|
||||
debug_assert(m_ticks >= 0.0);
|
||||
|
||||
// determine scale factor for pretty display
|
||||
double scale = 1.0;
|
||||
const wchar_t* unit = L" c";
|
||||
if(m_ticks > 10000000000LL) // 10 Gc
|
||||
scale = 1e-9, unit = L" Gc";
|
||||
else if(m_ticks > 10000000) // 10 Mc
|
||||
scale = 1e-6, unit = L" Mc";
|
||||
else if(m_ticks > 10000) // 10 kc
|
||||
scale = 1e-3, unit = L" kc";
|
||||
|
||||
std::wstringstream ss;
|
||||
ss << m_ticks*scale;
|
||||
ss << unit;
|
||||
return ss.str();
|
||||
debug_assert(m_cycles >= 0.0);
|
||||
return StringForCycles(m_cycles);
|
||||
}
|
||||
|
||||
double ToSeconds() const
|
||||
{
|
||||
return m_ticks / os_cpu_ClockFrequency();
|
||||
return m_cycles / os_cpu_ClockFrequency();
|
||||
}
|
||||
|
||||
private:
|
||||
i64 m_ticks;
|
||||
i64 m_cycles;
|
||||
};
|
||||
|
||||
#else
|
||||
@ -261,19 +245,7 @@ retry:
|
||||
std::wstring ToString() const
|
||||
{
|
||||
debug_assert(m_seconds >= 0.0);
|
||||
|
||||
// determine scale factor for pretty display
|
||||
double scale = 1e6;
|
||||
const wchar_t* unit = L" us";
|
||||
if(m_seconds > 1.0)
|
||||
scale = 1, unit = L" s";
|
||||
else if(m_seconds > 1e-3)
|
||||
scale = 1e3, unit = L" ms";
|
||||
|
||||
std::wstringstream ss;
|
||||
ss << m_seconds*scale;
|
||||
ss << unit;
|
||||
return ss.str();
|
||||
return StringForSeconds(m_seconds);
|
||||
}
|
||||
|
||||
double ToSeconds() const
|
||||
@ -299,7 +271,7 @@ struct TimerClient
|
||||
|
||||
TimerClient* next;
|
||||
|
||||
// how often timer_BillClient was called (helps measure relative
|
||||
// how often the timer was billed (helps measure relative
|
||||
// performance of something that is done indeterminately often).
|
||||
intptr_t num_calls;
|
||||
};
|
||||
@ -307,7 +279,7 @@ struct TimerClient
|
||||
/**
|
||||
* make the given TimerClient (usually instantiated as static data)
|
||||
* ready for use. returns its address for TIMER_ADD_CLIENT's convenience.
|
||||
* this client's total (added to by timer_BillClient) will be
|
||||
* this client's total (which is increased by a BillingPolicy) will be
|
||||
* displayed by timer_DisplayClientTotals.
|
||||
* notes:
|
||||
* - may be called at any time;
|
||||
@ -331,21 +303,29 @@ LIB_API TimerClient* timer_AddClient(TimerClient* tc, const wchar_t* description
|
||||
/**
|
||||
* bill the difference between t0 and t1 to the client's total.
|
||||
**/
|
||||
inline void timer_BillClient(TimerClient* tc, TimerUnit t0, TimerUnit t1)
|
||||
struct BillingPolicy_Default
|
||||
{
|
||||
tc->sum.AddDifference(t0, t1);
|
||||
tc->num_calls++;
|
||||
}
|
||||
void operator()(TimerClient* tc, TimerUnit t0, TimerUnit t1) const
|
||||
{
|
||||
tc->sum.AddDifference(t0, t1);
|
||||
tc->num_calls++;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* thread-safe version of timer_BillClient
|
||||
* (not used by default due to its higher overhead)
|
||||
* thread-safe (not used by default due to its higher overhead)
|
||||
* note: we can't just use thread-local variables to avoid
|
||||
* synchronization overhead because we don't have control over all
|
||||
* threads (for accumulating their separate timer copies).
|
||||
**/
|
||||
inline void timer_BillClientAtomic(TimerClient* tc, TimerUnit t0, TimerUnit t1)
|
||||
struct BillingPolicy_Atomic
|
||||
{
|
||||
tc->sum.AddDifferenceAtomic(t0, t1);
|
||||
cpu_AtomicAdd(&tc->num_calls, +1);
|
||||
}
|
||||
void operator()(TimerClient* tc, TimerUnit t0, TimerUnit t1) const
|
||||
{
|
||||
tc->sum.AddDifferenceAtomic(t0, t1);
|
||||
cpu_AtomicAdd(&tc->num_calls, +1);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* display all clients' totals; does not reset them.
|
||||
@ -353,7 +333,9 @@ inline void timer_BillClientAtomic(TimerClient* tc, TimerUnit t0, TimerUnit t1)
|
||||
**/
|
||||
LIB_API void timer_DisplayClientTotals();
|
||||
|
||||
|
||||
/// used by TIMER_ACCRUE
|
||||
template<class BillingPolicy = BillingPolicy_Default>
|
||||
class ScopeTimerAccrue
|
||||
{
|
||||
NONCOPYABLE(ScopeTimerAccrue);
|
||||
@ -368,29 +350,7 @@ public:
|
||||
{
|
||||
TimerUnit t1;
|
||||
t1.SetFromTimer();
|
||||
timer_BillClient(m_tc, m_t0, t1);
|
||||
}
|
||||
|
||||
private:
|
||||
TimerUnit m_t0;
|
||||
TimerClient* m_tc;
|
||||
};
|
||||
|
||||
class ScopeTimerAccrueAtomic
|
||||
{
|
||||
NONCOPYABLE(ScopeTimerAccrueAtomic);
|
||||
public:
|
||||
ScopeTimerAccrueAtomic(TimerClient* tc)
|
||||
: m_tc(tc)
|
||||
{
|
||||
m_t0.SetFromTimer();
|
||||
}
|
||||
|
||||
~ScopeTimerAccrueAtomic()
|
||||
{
|
||||
TimerUnit t1;
|
||||
t1.SetFromTimer();
|
||||
timer_BillClientAtomic(m_tc, m_t0, t1);
|
||||
BillingPolicy()(m_tc, m_t0, t1);
|
||||
}
|
||||
|
||||
private:
|
||||
@ -403,22 +363,21 @@ private:
|
||||
* bill it to the given TimerClient object. Can safely be nested.
|
||||
* Useful for measuring total time spent in a function or basic block over the
|
||||
* entire program.
|
||||
* <description> must remain valid over the lifetime of this object;
|
||||
* a string literal is safest.
|
||||
* `client' is an identifier registered via TIMER_ADD_CLIENT.
|
||||
*
|
||||
* Example usage:
|
||||
* TIMER_ADD_CLIENT(identifier);
|
||||
*
|
||||
* TIMER_ADD_CLIENT(client);
|
||||
*
|
||||
* void func()
|
||||
* {
|
||||
* TIMER_ACCRUE(name_of_pointer_to_client);
|
||||
* TIMER_ACCRUE(client);
|
||||
* // code to be measured
|
||||
* }
|
||||
*
|
||||
* [at exit]
|
||||
* [later or at exit]
|
||||
* timer_DisplayClientTotals();
|
||||
**/
|
||||
#define TIMER_ACCRUE(client) ScopeTimerAccrue UID__(client)
|
||||
#define TIMER_ACCRUE_ATOMIC(client) ScopeTimerAccrueAtomic UID__(client)
|
||||
#define TIMER_ACCRUE(client) ScopeTimerAccrue<> UID__(client)
|
||||
#define TIMER_ACCRUE_ATOMIC(client) ScopeTimerAccrue<BillingPolicy_Atomic> UID__(client)
|
||||
|
||||
#endif // #ifndef INCLUDED_TIMER
|
||||
|
@ -182,7 +182,7 @@ JSBool StopJsTimer(JSContext* cx, JSObject*, uintN argc, jsval* argv, jsval* rva
|
||||
TimerUnit now;
|
||||
now.SetFromTimer();
|
||||
now.Subtract(js_timer_overhead);
|
||||
timer_BillClient(&js_timer_clients[slot], js_start_times[slot], now);
|
||||
BillingPolicy_Default()(&js_timer_clients[slot], js_start_times[slot], now);
|
||||
js_start_times[slot].SetToZero();
|
||||
return JS_TRUE;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user