
# finalize WHRT implementation

cpu: avoid measuring cpu freq if possible
ia32: cleanup, fix #cores detection on AMD (they're incompatible,
*sigh*), add ia32_Generation
wcpu: remove IsThrottling code (clunky and not necessary - we rely on
ia32's feature bit detection instead)
counter: move whrt's counter create code here
qpc: update comment
tsc: finalize IsSafe implementation, remove per-thread stuff (since it
cannot be made to work)
whrt: cleanup, remove calibration code (no longer needed)

This was SVN commit r5121.
janwas 2007-05-31 00:11:38 +00:00
parent d778b22d61
commit bd0d0c0026
11 changed files with 435 additions and 583 deletions

View File

@ -51,6 +51,16 @@ double cpu_ClockFrequency()
static void DetectClockFrequency()
{
#if OS_WIN
clockFrequency = wcpu_ClockFrequency();
// success; we stick with this value because it either doesn't matter
// (WHRT isn't using the TSC), or cannot be determined more accurately
// (ia32 will use WHRT's TSC to measure its own frequency).
// bonus: the wcpu function is much faster than ia32's measurement loop.
if(clockFrequency > 0.0)
return;
#endif
#if CPU_IA32
clockFrequency = ia32_ClockFrequency(); // authoritative, precise
#endif

View File

@ -34,7 +34,6 @@
// keep in sync with enum CpuCap!
static u32 ia32_caps[4];
static void ia32_cap_init()
{
u32 regs[4];
@ -63,6 +62,9 @@ bool ia32_cap(IA32Cap cap)
}
//-----------------------------------------------------------------------------
// CPU identification
static Ia32Vendor vendor;
Ia32Vendor ia32_Vendor()
@ -94,81 +96,49 @@ static void DetectVendor()
}
static uint model, family;
static uint generation;
//-----------------------------------------------------------------------------
// this RDTSC implementation writes edx:eax to a temporary and returns that.
// rationale: this insulates against changing compiler calling conventions,
// at the cost of some efficiency.
// use ia32_asm_rdtsc_edx_eax instead if the return convention is known to be
// edx:eax (should be the case on all 32-bit x86).
u64 ia32_rdtsc_safe()
uint ia32_Generation()
{
u64 c;
#if HAVE_MS_ASM
__asm
{
cpuid
rdtsc
mov dword ptr [c], eax
mov dword ptr [c+4], edx
}
#elif HAVE_GNU_ASM
// note: we save+restore EBX to avoid xcode complaining about a
// "PIC register" being clobbered, whatever that means.
__asm__ __volatile__ (
"pushl %%ebx; cpuid; popl %%ebx; rdtsc"
: "=A" (c)
: /* no input */
: "ecx" /* cpuid clobbers eax..edx, but the rest are covered */);
#endif
return c;
return generation;
}
void ia32_DebugBreak()
static void DetectSignature()
{
#if HAVE_MS_ASM
__asm int 3
// note: this probably isn't necessary, since unix_debug_break
// (SIGTRAP) is most probably available if HAVE_GNU_ASM.
// we include it for completeness, though.
#elif HAVE_GNU_ASM
__asm__ __volatile__ ("int $3");
#endif
}
u32 regs[4];
if(!ia32_asm_cpuid(1, regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
model = bits(regs[EAX], 4, 7);
family = bits(regs[EAX], 8, 11);
//-----------------------------------------------------------------------------
// support code for lock-free primitives
//-----------------------------------------------------------------------------
// enforce strong memory ordering.
void ia32_MemoryFence()
{
// Pentium IV
if(ia32_cap(IA32_CAP_SSE2))
#if HAVE_MS_ASM
__asm mfence
#elif HAVE_GNU_ASM
__asm__ __volatile__ ("mfence");
#endif
}
void ia32_Serialize()
{
#if HAVE_MS_ASM
__asm cpuid
#elif HAVE_GNU_ASM
__asm__ __volatile__ ("cpuid");
#endif
switch(family)
{
case 5:
case 6:
case 7:
generation = family;
break;
case 0xF:
generation = 8;
break;
default:
debug_assert(0);
}
}
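
To make the bit layout concrete, here is a hedged worked example of the signature decode; the CPUID.1 EAX value and the `bits()` helper are assumptions modeled on the code above (inclusive bit ranges):

```cpp
#include <cassert>

// assumed semantics of the library's bits(): extract inclusive range [lo, hi]
static unsigned bits(unsigned x, unsigned lo, unsigned hi)
{
    return (x >> lo) & ((1u << (hi - lo + 1)) - 1);
}

int main()
{
    const unsigned eax = 0x00000F48;   // hypothetical CPUID.1 signature
    assert(bits(eax, 4, 7)  == 4);     // model
    assert(bits(eax, 8, 11) == 0xF);   // family 0xF => generation 8
    return 0;
}
```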
//-----------------------------------------------------------------------------
// identifier string
// 3 calls x 4 registers x 4 bytes = 48
static char identifierString[48+1] = {'\0'};
const char* ia32_IdentifierString()
{
return identifierString;
}
/// functor to remove substrings from the CPU identifier string
class StringStripper
{
@ -197,26 +167,12 @@ public:
}
};
const char* ia32_IdentifierString()
static void DetectIdentifierString()
{
// 3 calls x 4 registers x 4 bytes = 48
static char identifier_string[48+1] = {'\0'};
// not first call, return previous result
if(identifier_string[0] != '\0')
return identifier_string;
// get processor signature
u32 regs[4];
if(!ia32_asm_cpuid(1, regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const uint model = bits(regs[EAX], 4, 7);
const uint family = bits(regs[EAX], 8, 11);
// get brand string (if available)
// note: ia32_asm_cpuid writes 4 u32s directly to identifier_string -
// note: ia32_asm_cpuid writes 4 u32s directly to identifierString -
// be very careful with pointer arithmetic!
u32* u32_string = (u32*)identifier_string;
u32* u32_string = (u32*)identifierString;
bool have_brand_string = false;
if(ia32_asm_cpuid(0x80000002, u32_string+0 ) &&
ia32_asm_cpuid(0x80000003, u32_string+4) &&
@ -232,7 +188,7 @@ const char* ia32_IdentifierString()
// - the brand string is useless, e.g. "Unknown". this happens on
// some older boards whose BIOS reprograms the string for CPUs it
// doesn't recognize.
if(!have_brand_string || strncmp(identifier_string, "Unknow", 6) == 0)
if(!have_brand_string || strncmp(identifierString, "Unknow", 6) == 0)
{
if(vendor == IA32_VENDOR_AMD)
{
@ -240,15 +196,15 @@ const char* ia32_IdentifierString()
if(family == 6)
{
if(model == 3 || model == 7)
SAFE_STRCPY(identifier_string, "IA32_VENDOR_AMD Duron");
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Duron");
else if(model <= 5)
SAFE_STRCPY(identifier_string, "IA32_VENDOR_AMD Athlon");
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Athlon");
else
{
if(ia32_cap(IA32_CAP_AMD_MP))
SAFE_STRCPY(identifier_string, "IA32_VENDOR_AMD Athlon MP");
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Athlon MP");
else
SAFE_STRCPY(identifier_string, "IA32_VENDOR_AMD Athlon XP");
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Athlon XP");
}
}
}
@ -258,29 +214,27 @@ const char* ia32_IdentifierString()
if(family == 6)
{
if(model == 1)
SAFE_STRCPY(identifier_string, "Intel Pentium Pro");
SAFE_STRCPY(identifierString, "Intel Pentium Pro");
else if(model == 3 || model == 5)
SAFE_STRCPY(identifier_string, "Intel Pentium II");
SAFE_STRCPY(identifierString, "Intel Pentium II");
else if(model == 6)
SAFE_STRCPY(identifier_string, "Intel Celeron");
SAFE_STRCPY(identifierString, "Intel Celeron");
else
SAFE_STRCPY(identifier_string, "Intel Pentium III");
SAFE_STRCPY(identifierString, "Intel Pentium III");
}
}
}
// identifier_string already holds a valid brand string; pretty it up.
// identifierString already holds a valid brand string; pretty it up.
else
{
const char* const undesired_strings[] = { "(tm)", "(TM)", "(R)", "CPU " };
std::for_each(undesired_strings, undesired_strings+ARRAY_SIZE(undesired_strings),
StringStripper(identifier_string, ARRAY_SIZE(identifier_string)));
StringStripper(identifierString, ARRAY_SIZE(identifierString)));
// note: Intel brand strings include a frequency, but we can't rely
// on it because the CPU may be overclocked. we'll leave it in the
// string to show measurement accuracy and if SpeedStep is active.
}
return identifier_string;
}
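
The prettying step can be illustrated standalone; this sketch reimplements the substring removal under assumed StringStripper semantics (remove every occurrence, shifting the tail left in place):

```cpp
#include <cstring>
#include <cstdio>

// remove every occurrence of 'unwanted' from 'str' in-place
static void StripAll(char* str, const char* unwanted)
{
    const size_t len = strlen(unwanted);
    char* pos;
    while((pos = strstr(str, unwanted)) != 0)
        memmove(pos, pos+len, strlen(pos+len)+1);  // shift tail, incl. '\0'
}

int main()
{
    char id[] = "Intel(R) Pentium(R) III CPU 1000MHz";  // hypothetical brand string
    const char* const undesired[] = { "(tm)", "(TM)", "(R)", "CPU " };
    for(size_t i = 0; i < sizeof(undesired)/sizeof(undesired[0]); i++)
        StripAll(id, undesired[i]);
    puts(id);  // "Intel Pentium III 1000MHz"
    return 0;
}
```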
@ -312,6 +266,10 @@ public:
}
};
// note: this function uses timer.cpp!get_time, which is implemented via
// whrt.cpp on Windows, which again calls ia32_Init. be careful that
// this function isn't called from there as well, else WHRT will be used
// before its init completes.
double ia32_ClockFrequency()
{
// if the TSC isn't available, there's really no good way to count the
@ -392,7 +350,7 @@ double ia32_ClockFrequency()
//-----------------------------------------------------------------------------
// detect processor types / topology
// processor topology
//-----------------------------------------------------------------------------
uint ia32_ApicId()
@ -408,44 +366,62 @@ uint ia32_ApicId()
// OSes report hyperthreading units and cores as "processors". we need to
// drill down and find out the exact counts (for thread pool dimensioning
// and cache sharing considerations).
// note: Intel Appnote 485 (CPUID) assures uniformity of CoresPerPackage and
// LogicalPerCore.
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
// logicalPerCore.
static uint CoresPerPackage()
static uint coresPerPackage = 0;
static uint logicalPerCore = 0;
static void DetectCoresPerPackage()
{
static uint coresPerPackage = 0;
if(coresPerPackage == 0)
u32 regs[4];
coresPerPackage = 1; // single-core unless..
switch(vendor)
{
u32 regs[4];
case IA32_VENDOR_INTEL:
if(ia32_asm_cpuid(4, regs))
coresPerPackage = bits(regs[EAX], 26, 31)+1;
else
coresPerPackage = 1; // single-core
}
break;
return coresPerPackage;
case IA32_VENDOR_AMD:
if(ia32_asm_cpuid(0x80000008, regs))
coresPerPackage = bits(regs[ECX], 0, 7)+1;
break;
}
}
static uint LogicalPerCore()
static bool IsHyperthreadingCapable()
{
static uint logicalPerCore = 0;
if(logicalPerCore == 0)
// definitely not
if(!ia32_cap(IA32_CAP_HT))
return false;
// AMD N-core systems falsely set the HT bit for compatibility reasons
// (don't bother resetting it, might confuse callers)
if(vendor == IA32_VENDOR_AMD && ia32_cap(IA32_CAP_AMD_CMP_LEGACY))
return false;
return true;
}
static void DetectLogicalPerCore()
{
u32 regs[4];
if(!IsHyperthreadingCapable())
{
if(ia32_cap(IA32_CAP_HT))
{
u32 regs[4];
if(!ia32_asm_cpuid(1, regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const uint logical_per_package = bits(regs[EBX], 16, 23);
// cores ought to be uniform WRT # logical processors
debug_assert(logical_per_package % CoresPerPackage() == 0);
logicalPerCore = logical_per_package / CoresPerPackage();
}
else
logicalPerCore = 1; // not Hyperthreading capable
logicalPerCore = 1;
return;
}
return logicalPerCore;
if(!ia32_asm_cpuid(1, regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const uint logicalPerPackage = bits(regs[EBX], 16, 23);
// cores ought to be uniform WRT # logical processors
debug_assert(logicalPerPackage % coresPerPackage == 0);
logicalPerCore = logicalPerPackage / coresPerPackage;
}
// the above two functions give the maximum number of cores/logical units.
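
As a hedged worked example of how the two counts combine (values are hypothetical): a dual-core, Hyperthreading-capable package reports 4 logical processors per package via CPUID.1 EBX[23:16], giving 2 logical units per core:

```cpp
#include <cassert>

int main()
{
    const unsigned logicalPerPackage = 4;  // CPUID.1 EBX bits 16..23 (hypothetical)
    const unsigned coresPerPackage   = 2;  // CPUID.4 EAX bits 26..31, plus 1
    assert(logicalPerPackage % coresPerPackage == 0);  // uniformity per Appnote 485
    const unsigned logicalPerCore = logicalPerPackage / coresPerPackage;
    assert(logicalPerCore == 2);
    return 0;
}
```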
@ -508,18 +484,18 @@ static void DetectProcessorTopology()
// extract values from all 3 ID bitfields into separate sets
uint bit_pos = 0;
IdSet logical_ids;
ExtractFieldsIntoSet(apicIds, bit_pos, LogicalPerCore(), logical_ids);
IdSet core_ids;
ExtractFieldsIntoSet(apicIds, bit_pos, CoresPerPackage(), core_ids);
IdSet package_ids;
ExtractFieldsIntoSet(apicIds, bit_pos, 0xFF, package_ids);
IdSet logicalIds;
ExtractFieldsIntoSet(apicIds, bit_pos, logicalPerCore, logicalIds);
IdSet coreIds;
ExtractFieldsIntoSet(apicIds, bit_pos, coresPerPackage, coreIds);
IdSet packageIds;
ExtractFieldsIntoSet(apicIds, bit_pos, 0xFF, packageIds);
// (the set cardinality is representative of all packages/cores since
// they are uniform.)
numPackages = std::max((uint)package_ids.size(), 1u);
enabledCoresPerPackage = std::max((uint)core_ids .size(), 1u);
enabledLogicalPerCore = std::max((uint)logical_ids.size(), 1u);
numPackages = std::max((uint)packageIds.size(), 1u);
enabledCoresPerPackage = std::max((uint)coreIds .size(), 1u);
enabledLogicalPerCore = std::max((uint)logicalIds.size(), 1u);
// note: even though APIC IDs are assigned sequentially, we can't make any
// assumptions about the values/ordering because we get them according to
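
The widths of the three ID bitfields are presumably ceil(log2(maxCount)) bits each, consumed from the least significant bit upward; a sketch of decomposing a single APIC ID under that assumption (the real ExtractFieldsIntoSet collects the field values from all processors into sets):

```cpp
#include <cstdio>

// extract the next field of ceil(log2(maxCount)) bits, starting at bitPos
static unsigned ExtractField(unsigned apicId, unsigned& bitPos, unsigned maxCount)
{
    unsigned numBits = 0;
    while((1u << numBits) < maxCount)  // ceil_log2(maxCount)
        numBits++;
    const unsigned value = (apicId >> bitPos) & ((1u << numBits) - 1);
    bitPos += numBits;
    return value;
}

int main()
{
    unsigned bitPos = 0;
    const unsigned apicId = 0x5;  // hypothetical: 2 logical/core, 2 cores/package
    const unsigned logical = ExtractField(apicId, bitPos, 2);     // bit 0    -> 1
    const unsigned core    = ExtractField(apicId, bitPos, 2);     // bit 1    -> 0
    const unsigned package = ExtractField(apicId, bitPos, 0xFF);  // bits 2..9 -> 1
    printf("logical=%u core=%u package=%u\n", logical, core, package);
    return 0;
}
```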
@ -553,6 +529,70 @@ uint ia32_LogicalPerCore()
//-----------------------------------------------------------------------------
// misc stateless functions
// this RDTSC implementation writes edx:eax to a temporary and returns that.
// rationale: this insulates against changing compiler calling conventions,
// at the cost of some efficiency.
// use ia32_asm_rdtsc_edx_eax instead if the return convention is known to be
// edx:eax (should be the case on all 32-bit x86).
u64 ia32_rdtsc_safe()
{
u64 c;
#if HAVE_MS_ASM
__asm
{
cpuid
rdtsc
mov dword ptr [c], eax
mov dword ptr [c+4], edx
}
#elif HAVE_GNU_ASM
// note: we save+restore EBX to avoid xcode complaining about a
// "PIC register" being clobbered, whatever that means.
__asm__ __volatile__ (
"pushl %%ebx; cpuid; popl %%ebx; rdtsc"
: "=A" (c)
: /* no input */
: "ecx" /* cpuid clobbers eax..edx, but the rest are covered */);
#endif
return c;
}
void ia32_DebugBreak()
{
#if HAVE_MS_ASM
__asm int 3
// note: this probably isn't necessary, since unix_debug_break
// (SIGTRAP) is most probably available if HAVE_GNU_ASM.
// we include it for completeness, though.
#elif HAVE_GNU_ASM
__asm__ __volatile__ ("int $3");
#endif
}
// enforce strong memory ordering.
void ia32_MemoryFence()
{
// Pentium IV
if(ia32_cap(IA32_CAP_SSE2))
#if HAVE_MS_ASM
__asm mfence
#elif HAVE_GNU_ASM
__asm__ __volatile__ ("mfence");
#endif
}
void ia32_Serialize()
{
#if HAVE_MS_ASM
__asm cpuid
#elif HAVE_GNU_ASM
__asm__ __volatile__ ("cpuid");
#endif
}
// checks if there is an IA-32 CALL instruction right before ret_addr.
@ -632,7 +672,11 @@ void ia32_Init()
ia32_cap_init();
DetectVendor();
DetectSignature();
DetectIdentifierString();
DetectCoresPerPackage();
DetectLogicalPerCore();
DetectProcessorTopology();
}

View File

@ -39,6 +39,13 @@ enum Ia32Vendor
extern Ia32Vendor ia32_Vendor();
/**
* @return the colloquial processor generation
* (6 = Pentium II / K6, 7 = Pentium III / Athlon, 8 = Opteron)
**/
extern uint ia32_Generation();
/**
* bit indices of CPU capability flags (128 bits).
* values are defined by IA-32 CPUID feature flags - do not change!
@ -46,25 +53,27 @@ extern Ia32Vendor ia32_Vendor();
enum IA32Cap
{
// standard (ecx) - currently only defined by Intel
IA32_CAP_SSE3 = 0+0, // Streaming SIMD Extensions 3
IA32_CAP_EST = 0+7, // Enhanced Speedstep Technology
IA32_CAP_SSE3 = 0+0, // Streaming SIMD Extensions 3
IA32_CAP_EST = 0+7, // Enhanced Speedstep Technology
// standard (edx)
IA32_CAP_FPU = 32+0, // Floating Point Unit
IA32_CAP_TSC = 32+4, // TimeStamp Counter
IA32_CAP_CMOV = 32+15, // Conditional MOVe
IA32_CAP_MMX = 32+23, // MultiMedia eXtensions
IA32_CAP_SSE = 32+25, // Streaming SIMD Extensions
IA32_CAP_SSE2 = 32+26, // Streaming SIMD Extensions 2
IA32_CAP_HT = 32+28, // HyperThreading
IA32_CAP_FPU = 32+0, // Floating Point Unit
IA32_CAP_TSC = 32+4, // TimeStamp Counter
IA32_CAP_CMOV = 32+15, // Conditional MOVe
IA32_CAP_TM_SCC = 32+22, // Thermal Monitoring and Software Controlled Clock
IA32_CAP_MMX = 32+23, // MultiMedia eXtensions
IA32_CAP_SSE = 32+25, // Streaming SIMD Extensions
IA32_CAP_SSE2 = 32+26, // Streaming SIMD Extensions 2
IA32_CAP_HT = 32+28, // HyperThreading
// extended (ecx)
IA32_CAP_AMD_CMP_LEGACY = 64+1, // N-core and IA32_CAP_HT is falsely set
// extended (edx) - currently only defined by AMD
IA32_CAP_AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
IA32_CAP_AMD_MMX_EXT = 96+22,
IA32_CAP_AMD_3DNOW_PRO = 96+30,
IA32_CAP_AMD_3DNOW = 96+31
// extended (edx)
IA32_CAP_AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
IA32_CAP_AMD_MMX_EXT = 96+22,
IA32_CAP_AMD_3DNOW_PRO = 96+30,
IA32_CAP_AMD_3DNOW = 96+31
};
/**

View File

@ -23,11 +23,6 @@ WINIT_REGISTER_FUNC(wcpu_Init);
#pragma SECTION_RESTORE
// limit allows statically allocated per-CPU structures (for simplicity).
// WinAPI only supports max. 32 CPUs anyway (due to DWORD bitfields).
static const uint MAX_CPUS = 32;
static uint numProcessors = 0;
/// get number of CPUs (can't fail)
@ -73,84 +68,6 @@ static void DetectClockFrequency()
}
static int isThrottlingPossible = -1;
int wcpu_IsThrottlingPossible()
{
debug_assert(isThrottlingPossible != -1);
return isThrottlingPossible;
}
static void CheckIfThrottlingPossible()
{
WIN_SAVE_LAST_ERROR;
// CallNtPowerInformation
// (manual import because it's not supported on Win95)
NTSTATUS (WINAPI *pCNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG) = 0;
// this is most likely the only reference, so don't free it
// (=> unload) until done with the DLL.
HMODULE hPowrprofDll = LoadLibrary("powrprof.dll");
*(void**)&pCNPI = GetProcAddress(hPowrprofDll, "CallNtPowerInformation");
if(pCNPI)
{
// most likely not speedstep-capable if these aren't supported
SYSTEM_POWER_CAPABILITIES spc;
if(pCNPI(SystemPowerCapabilities, 0,0, &spc,sizeof(spc)) == STATUS_SUCCESS)
{
if(!spc.ProcessorThrottle || !spc.ThermalControl)
isThrottlingPossible = 0;
}
// probably speedstep if cooling mode active.
// the documentation of PO_TZ_* is unclear, so we can't be sure.
SYSTEM_POWER_INFORMATION spi;
if(pCNPI(SystemPowerInformation, 0,0, &spi,sizeof(spi)) == STATUS_SUCCESS)
{
if(spi.CoolingMode != PO_TZ_INVALID_MODE)
isThrottlingPossible = 1;
}
// definitely speedstep if any throttle is less than 100%.
PROCESSOR_POWER_INFORMATION ppi[MAX_CPUS];
if(pCNPI(ProcessorInformation, 0,0, ppi,sizeof(ppi)) == STATUS_SUCCESS)
{
const PROCESSOR_POWER_INFORMATION* p = ppi;
for(uint i = 0; i < std::min(wcpu_NumProcessors(), MAX_CPUS); i++, p++)
{
if(p->MhzLimit != p->MaxMhz || p->CurrentMhz != p->MaxMhz)
{
isThrottlingPossible = 1;
break;
}
}
}
}
FreeLibrary(hPowrprofDll);
// CallNtPowerInformation not available, or none of the above apply:
// don't know yet (for certain, at least).
if(isThrottlingPossible == -1)
{
// check if running on a laptop
HW_PROFILE_INFO hi;
GetCurrentHwProfile(&hi);
const bool is_laptop = !(hi.dwDockInfo & DOCKINFO_DOCKED) ^ !(hi.dwDockInfo & DOCKINFO_UNDOCKED);
// both flags set <==> this is a desktop machine.
// both clear is unspecified; we assume it's not a laptop.
// NOTE: ! is necessary (converts expression to bool)
// we'll guess SpeedStep is active if on a laptop.
// ia32 code will get a second crack at it.
isThrottlingPossible = (is_laptop)? 1 : 0;
}
WIN_RESTORE_LAST_ERROR;
debug_assert(isThrottlingPossible == 0 || isThrottlingPossible == 1);
}
//-----------------------------------------------------------------------------
// execute the specified function once on each CPU.
@ -202,7 +119,6 @@ static LibError wcpu_Init()
{
DetectNumProcessors();
DetectClockFrequency();
CheckIfThrottlingPossible();
return INFO::OK;
}

View File

@ -15,7 +15,7 @@
extern uint wcpu_NumProcessors();
extern double wcpu_ClockFrequency();
extern int wcpu_IsThrottlingPossible();
extern LibError wcpu_CallByEachCPU(CpuCallback cb, void* param);
#endif // #ifndef INCLUDED_WCPU

View File

@ -0,0 +1,106 @@
/**
* =========================================================================
* File : counter.cpp
* Project : 0 A.D.
* Description : Interface for counter implementations
* =========================================================================
*/
// license: GPL; see lib/license.txt
#include "precompiled.h"
#include "counter.h"
#include "lib/bits.h"
#include "tsc.h"
#include "hpet.h"
#include "pmt.h"
#include "qpc.h"
#include "tgt.h"
// to add a new counter type, simply include its header here and
// insert a case in ConstructCounterAt's switch statement.
//-----------------------------------------------------------------------------
// create/destroy counters
/**
* @return pointer to a newly constructed ICounter subclass of type <id> at
* the given address, or 0 iff the ID is invalid.
* @param size receives the size [bytes] of the created instance.
**/
static ICounter* ConstructCounterAt(uint id, void* address, size_t& size)
{
// rationale for placement new: see call site.
#define CREATE(impl)\
size = sizeof(Counter##impl);\
return new(address) Counter##impl();
#include "lib/nommgr.h" // MMGR interferes with placement new
// counters are chosen according to the following order. rationale:
// - TSC must come before QPC and PMT to make sure a bug in the latter on
// Pentium systems doesn't come up.
// - PMT works, but is inexplicably slower than QPC on a PIII Mobile.
// - TGT really isn't as safe as the others, so it should be last.
// - low-overhead and high-resolution counters are preferred.
switch(id)
{
case 0:
CREATE(HPET)
case 1:
CREATE(TSC)
case 2:
CREATE(QPC)
case 3:
CREATE(PMT)
case 4:
CREATE(TGT)
default:
size = 0;
return 0;
}
#include "lib/mmgr.h"
#undef CREATE
}
ICounter* CreateCounter(uint id)
{
// we placement-new the Counter classes in a static buffer.
// this is dangerous, but we are careful to ensure alignment. it is
// unusual and thus bad, but there's also one advantage: we avoid
// using global operator new before the CRT is initialized (risky).
//
// alternatives:
// - defining as static doesn't work because the ctors (necessary for
// vptr initialization) run during _cinit, which comes after our
// first use of them.
// - using static_calloc isn't possible because we don't know the
// size until after the alloc / placement new.
static const size_t MEM_SIZE = 200; // checked below
static u8 mem[MEM_SIZE];
static u8* nextMem = mem;
u8* addr = (u8*)round_up((uintptr_t)nextMem, 16);
size_t size;
ICounter* counter = ConstructCounterAt(id, addr, size);
nextMem = addr+size;
debug_assert(nextMem < mem+MEM_SIZE); // had enough room?
return counter;
}
void DestroyCounter(ICounter*& counter)
{
if(!counter)
return;
counter->Shutdown();
counter->~ICounter(); // must be called due to placement new
counter = 0;
}
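
A hedged usage sketch (the function name and loop are assumptions modeled on whrt's GetNextBestSafeCounter): iterate over IDs until CreateCounter returns 0, and keep the first counter that activates and reports itself safe:

```cpp
// hypothetical caller; ICounter, CreateCounter and DestroyCounter as above
static ICounter* ChooseSafeCounter()
{
    for(uint id = 0; ; id++)
    {
        ICounter* counter = CreateCounter(id);
        if(!counter)
            return 0;  // all counter types exhausted
        if(counter->Activate() == INFO::OK && counter->IsSafe())
            return counter;
        DestroyCounter(counter);  // shut down and try the next type
    }
}
```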

View File

@ -2,14 +2,14 @@
* =========================================================================
* File : counter.h
* Project : 0 A.D.
* Description : Interface for timer implementations
* Description : Interface for counter implementations
* =========================================================================
*/
// license: GPL; see lib/license.txt
#ifndef INCLUDED_TICK_SOURCE
#define INCLUDED_TICK_SOURCE
#ifndef INCLUDED_COUNTER
#define INCLUDED_COUNTER
// derived implementations must be called CounterIMPL,
// where IMPL matches the WHRT_IMPL identifier. (see CREATE)
@ -60,4 +60,16 @@ public:
}
};
#endif // #ifndef INCLUDED_TICK_SOURCE
/**
* @return a newly created ICounter of type <id> or 0 iff the ID is invalid.
* @param id integer ID (0..N-1)
**/
extern ICounter* CreateCounter(uint id);
/**
* shut down the counter, free its resources and zero its pointer.
**/
extern void DestroyCounter(ICounter*& counter);
#endif // #ifndef INCLUDED_COUNTER

View File

@ -46,8 +46,8 @@ bool CounterQPC::IsSafe() const
// note: we have separate modules that directly access some of the
// counters potentially used by QPC. disabling the redundant counters
// would be ugly (increased coupling). instead, we'll make sure our
// implementations can coexist with QPC and verify the secondary
// reference timer has a different frequency.
// implementations could (if necessary) coexist with QPC, but it
// shouldn't come to that since only one counter is needed/used.
// the PMT is generally safe (see discussion in CounterPmt::IsSafe),
// but older QPC implementations had problems with 24-bit rollover.

View File

@ -13,116 +13,46 @@
#include "lib/sysdep/win/win.h"
#include "lib/sysdep/win/wcpu.h"
#include "lib/sysdep/ia32/ia32.h"
#include "lib/sysdep/cpu.h" // cpu_CAS
#include "lib/sysdep/ia32/ia32.h" // ia32_rdtsc
#include "lib/bits.h"
//-----------------------------------------------------------------------------
// per-CPU state
// detect throttling
// necessary because CPUs are initialized one-by-one and the TSC values
// differ significantly. (while at it, we also keep per-CPU frequency values
// in case the clocks aren't exactly synced)
//
// note: only reading the TSC from one CPU (possible via thread affinity)
// would work but take much longer (context switch).
struct PerCpuTscState
enum AmdPowerNowFlags
{
u64 m_lastTicks;
double m_lastTime;
double m_observedFrequency;
// mark this struct used just in case cpu_CallByEachCPU doesn't ensure
// only one thread is running. a flag is safer than a magic APIC ID value.
uintptr_t m_isInitialized;
uint m_apicId;
PN_FREQ_ID_CTRL = BIT(1),
PN_SW_THERMAL_CTRL = BIT(5),
PN_INVARIANT_TSC = BIT(8)
};
static const size_t MAX_CPUS = 32; // Win32 also imposes this limit
static PerCpuTscState cpuTscStates[MAX_CPUS];
static PerCpuTscState& NextUnusedPerCpuTscState()
static bool IsThrottlingPossible()
{
for(size_t i = 0; i < MAX_CPUS; i++)
u32 regs[4];
switch(ia32_Vendor())
{
PerCpuTscState& cpuTscState = cpuTscStates[i];
if(cpu_CAS(&cpuTscState.m_isInitialized, 0, 1))
return cpuTscState;
}
case IA32_VENDOR_INTEL:
if(ia32_cap(IA32_CAP_TM_SCC) || ia32_cap(IA32_CAP_EST))
return true;
break;
throw std::runtime_error("allocated too many PerCpuTscState");
}
static PerCpuTscState& CurrentCpuTscState()
{
const uint apicId = ia32_ApicId();
for(size_t i = 0; i < MAX_CPUS; i++)
{
PerCpuTscState& cpuTscState = cpuTscStates[i];
if(cpuTscState.m_isInitialized && cpuTscState.m_apicId == apicId)
return cpuTscState;
}
throw std::runtime_error("no matching PerCpuTscState found");
}
static void InitPerCpuTscState(void* param) // callback
{
const double cpuClockFrequency = *(double*)param;
PerCpuTscState& cpuTscState = NextUnusedPerCpuTscState();
cpuTscState.m_apicId = ia32_ApicId();
cpuTscState.m_lastTicks = ia32_rdtsc();
cpuTscState.m_lastTime = 0.0;
cpuTscState.m_observedFrequency = cpuClockFrequency;
}
static LibError InitPerCpuTscStates(double cpuClockFrequency)
{
LibError ret = cpu_CallByEachCPU(InitPerCpuTscState, &cpuClockFrequency);
CHECK_ERR(ret);
return INFO::OK;
}
//-----------------------------------------------------------------------------
/*
int ia32_IsThrottlingPossible()
{
// returned in edx by CPUID 0x80000007.
enum AmdPowerNowFlags
{
POWERNOW_FREQ_ID_CTRL = 2
};
if(vendor == IA32_VENDOR_INTEL)
{
if(ia32_cap(IA32_CAP_EST))
return 1;
}
else if(vendor == IA32_VENDOR_AMD)
{
u32 regs[4];
case IA32_VENDOR_AMD:
if(ia32_asm_cpuid(0x80000007, regs))
{
if(regs[EDX] & POWERNOW_FREQ_ID_CTRL)
return 1;
if(regs[EDX] & (PN_FREQ_ID_CTRL|PN_SW_THERMAL_CTRL))
return true;
}
break;
}
return 0; // pretty much authoritative, so don't return -1.
return false;
}
*/
//-----------------------------------------------------------------------------
// note: calibration is necessary due to long-term thermal drift
// (oscillator is usually poor quality) and inaccurate initial measurement.
//-----------------------------------------------------------------------------
LibError CounterTSC::Activate()
{
ia32_Init();
@ -130,7 +60,6 @@ LibError CounterTSC::Activate()
if(!ia32_cap(IA32_CAP_TSC))
return ERR::NO_SYS; // NOWARN (CPU doesn't support RDTSC)
// RETURN_ERR(InitPerCpuTscStates(wcpu_ClockFrequency()));
return INFO::OK;
}
@ -141,57 +70,65 @@ void CounterTSC::Shutdown()
bool CounterTSC::IsSafe() const
{
return false;
// use of the TSC for timing is subject to a litany of potential problems:
// - separate, unsynchronized counters with offset and drift;
// - frequency changes (P-state transitions and STPCLK throttling);
// - failure to increment in C3 and C4 deep-sleep states.
// we will discuss the specifics below.
u32 regs[4];
if(ia32_asm_cpuid(0x80000007, regs))
// SMP or multi-core => counters are unsynchronized. this could be
// solved by maintaining separate per-core counter states, but that
// requires atomic reads of the TSC and the current processor number.
//
// (otherwise, we have a subtle race condition: if preempted while
// reading the time and rescheduled on a different core, incorrect
// results may be returned, which would be unacceptable.)
//
// unfortunately this isn't possible without OS support or the
// as yet unavailable RDTSCP instruction => unsafe.
//
// (note: if the TSC is invariant, drift is no longer a concern.
// we could synchronize the TSC MSRs during initialization and avoid
// per-core counter state and the abovementioned race condition.
// however, we won't bother, since such platforms aren't yet widespread
// and would surely support the nice and safe HPET, anyway)
if(ia32_NumPackages() != 1 || ia32_CoresPerPackage() != 1)
return false;
// recent CPU:
if(ia32_Generation() >= 7)
{
// if(regs[EDX] & POWERNOW_FREQ_ID_CTRL)
// note: 8th generation CPUs support C1-clock ramping, which causes
// drift on multi-core systems, but those were excluded above.
u32 regs[4];
if(ia32_asm_cpuid(0x80000007, regs))
{
// TSC is invariant WRT P-state, C-state and STPCLK => safe.
if(regs[EDX] & PN_INVARIANT_TSC)
return true;
}
// in addition to P-state transitions, we're also subject to
// STPCLK throttling. this happens when the chipset thinks the
// system is dangerously overheated; the OS isn't even notified.
// it may be rare, but could cause incorrect results => unsafe.
return false;
// newer systems also support the C3 Deep Sleep state, in which
// the TSC isn't incremented. that's not nice, but irrelevant
// since STPCLK dooms the TSC on those systems anyway.
}
// we're dealing with a single older CPU; the only problem there is
// throttling, i.e. changes to the TSC frequency. we don't want to
// disable this because it may be important for cooling. the OS
// initiates changes but doesn't notify us; jumps are too frequent
// and drastic to detect and account for => unsafe.
if(IsThrottlingPossible())
return false;
/*
AMD has defined a CPUID feature bit that
software can test to determine if the TSC is
invariant. Issuing a CPUID instruction with an %eax register
value of 0x8000_0007, on a processor whose base family is
0xF, returns "Advanced Power Management Information" in the
%eax, %ebx, %ecx, and %edx registers. Bit 8 of the return
%edx is the "TscInvariant" feature flag which is set when
TSC is P-state, C-state, and STPCLK-throttling invariant; it
is clear otherwise.
*/
#if 0
if (CPUID.base_family < 0xf) {
// TSC drift doesn't exist on 7th Gen or less
// However, OS still needs to consider effects
// of P-state changes on TSC
return TRUE;
} else if (CPUID.AdvPowerMgmtInfo.TscInvariant) {
// Invariant TSC on 8th Gen or newer, use it
// (assume all cores have invariant TSC)
return TRUE;
// - deep sleep modes: TSC may not be advanced.
// not a problem though, because if the TSC is disabled, the CPU
// isn't doing any other work, either.
// - SpeedStep/'gearshift' CPUs: frequency may change.
// this happens on notebooks now, but eventually desktop systems
// will do this as well (if not to save power, for heat reasons).
// frequency changes are too often and drastic to correct,
// and we don't want to mess with the system power settings => unsafe.
if(cpu_IsThrottlingPossible() == 0)
return true;
/* But TSC doesn't tick in C3 so don't use it there */
if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 1000)
return 1;
#endif
return false;
return true;
}
u64 CounterTSC::Counter() const
@ -214,5 +151,5 @@ uint CounterTSC::CounterBits() const
**/
double CounterTSC::NominalFrequency() const
{
return wcpu_ClockFrequency();
return cpu_ClockFrequency();
}

View File

@ -20,16 +20,10 @@
#include "lib/adts.h"
#include "lib/bits.h"
#include "tsc.h"
#include "hpet.h"
#include "pmt.h"
#include "qpc.h"
#include "tgt.h"
// to add a new counter type, simply include its header here and
// insert a case in ConstructCounterAt's switch statement.
#include "counter.h"
#pragma SECTION_INIT(4) // wposix depends on us
#pragma SECTION_INIT(4) // wtime depends on us
WINIT_REGISTER_FUNC(whrt_Init);
#pragma FORCE_INCLUDE(whrt_Init)
#pragma SECTION_SHUTDOWN(8)
@ -45,94 +39,7 @@ namespace ERR
//-----------------------------------------------------------------------------
// create/destroy counters
/**
* @return pointer to a newly constructed ICounter subclass of type <id> at
* the given address, or 0 iff the ID is invalid.
* @param size receives the size [bytes] of the created instance.
**/
static ICounter* ConstructCounterAt(uint id, void* address, size_t& size)
{
// rationale for placement new: see call site.
#define CREATE(impl)\
size = sizeof(Counter##impl);\
return new(address) Counter##impl();
#include "lib/nommgr.h" // MMGR interferes with placement new
// counters are chosen according to the following order. rationale:
// - TSC must come before QPC and PMT to make sure a bug in the latter on
// Pentium systems doesn't come up.
// - PMT works, but is inexplicably slower than QPC on a PIII Mobile.
// - TGT really isn't as safe as the others, so it should be last.
// - low-overhead and high-resolution counters are preferred.
switch(id)
{
case 0:
CREATE(TSC)
case 1:
CREATE(HPET)
case 2:
CREATE(QPC)
case 3:
CREATE(PMT)
case 4:
CREATE(TGT)
default:
size = 0;
return 0;
}
#include "lib/mmgr.h"
#undef CREATE
}
/**
* @return a newly created Counter of type <id> or 0 iff the ID is invalid.
**/
static ICounter* CreateCounter(uint id)
{
// we placement-new the Counter classes in a static buffer.
// this is dangerous, but we are careful to ensure alignment. it is
// unusual and thus bad, but there's also one advantage: we avoid
// using global operator new before the CRT is initialized (risky).
//
// alternatives:
// - defining as static doesn't work because the ctors (necessary for
// vptr initialization) run during _cinit, which comes after our
// first use of them.
// - using static_calloc isn't possible because we don't know the
// size until after the alloc / placement new.
static const size_t MEM_SIZE = 200; // checked below
static u8 mem[MEM_SIZE];
static u8* nextMem = mem;
u8* addr = (u8*)round_up((uintptr_t)nextMem, 16);
size_t size;
ICounter* counter = ConstructCounterAt(id, addr, size);
nextMem = addr+size;
debug_assert(nextMem < mem+MEM_SIZE); // had enough room?
return counter;
}
static inline void DestroyCounter(ICounter*& counter)
{
if(!counter)
return;
counter->Shutdown();
counter->~ICounter(); // must be called due to placement new
counter = 0;
}
//-----------------------------------------------------------------------------
// choose best available counter
// choose best available safe counter
// (moved into a separate function to simplify error handling)
static inline LibError ActivateCounter(ICounter* counter)
@ -178,6 +85,7 @@ static ICounter* GetNextBestSafeCounter()
// counter that drives the timer
static ICounter* counter;
// (these counter properties are cached for efficiency and convenience:)
static double nominalFrequency;
static double resolution;
static uint counterBits;
@ -213,7 +121,10 @@ static inline u64 Counter()
return counter->Counter();
}
/// @return difference [ticks], taking rollover into account.
/**
* @return difference [ticks], taking rollover into account.
* (time-critical, so it's not called through ICounter.)
**/
static inline u64 CounterDelta(u64 oldCounter, u64 newCounter)
{
return (newCounter - oldCounter) & counterMask;
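
A hedged worked example of the masking (hypothetical 32-bit counter): unsigned wraparound plus the mask recovers the true delta across a rollover:

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t counterMask = 0xFFFFFFFFull;  // bit_mask(counterBits = 32)
    const uint64_t oldCounter  = 0xFFFFFFF0ull;  // just before rollover
    const uint64_t newCounter  = 0x00000010ull;  // just after rollover
    assert(((newCounter - oldCounter) & counterMask) == 0x20);
    return 0;
}
```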
@ -228,27 +139,28 @@ double whrt_Resolution()
//-----------------------------------------------------------------------------
// timer state
// we're not going to bother calibrating the counter (i.e. measuring its
// current frequency by means of a second timer). rationale:
// - all counters except the TSC are stable and run at fixed frequencies;
// - it's not clear that any other HRT or the tick count would be useful
// as a stable time reference (if it were, we should be using it instead);
// - calibration would complicate the code (we'd have to make sure the
// secondary counter is safe and can co-exist with the primary).
/**
* stores all timer state shared between readers and the update thread.
* (must be POD because it's used before static ctors run.)
**/
struct TimerState
{
// current value of the counter.
// value of the counter at last update.
u64 counter;
// sum of all counter ticks since first update.
// rollover is not an issue (even at a high frequency of 10 GHz,
// it'd only happen after 58 years)
u64 ticks;
// total elapsed time [seconds] since first update.
// converted from tick deltas with the *then current* frequency
// (avoids retroactive changes when then frequency changes)
// (this enables calibration, which is currently not implemented,
// but leaving open the possibility costs nothing)
double time;
// current frequency that will be used to convert ticks to seconds.
double frequency;
};
// how do we detect when the old TimerState is no longer in use and can be
@ -276,9 +188,7 @@ static void UpdateTimerState()
const u64 counter = Counter();
const u64 deltaTicks = CounterDelta(ts->counter, counter);
ts2->counter = counter;
ts2->frequency = nominalFrequency;
ts2->ticks = ts->ticks + deltaTicks;
ts2->time = ts->time + deltaTicks/ts2->frequency;
ts2->time = ts->time + deltaTicks/nominalFrequency;
ts = (TimerState*)InterlockedExchangePointer(&ts2, ts);
}
@ -294,117 +204,25 @@ retry:
goto retry;
const u64 deltaTicks = CounterDelta(counter, Counter());
return (time + deltaTicks/ts->frequency);
return (time + deltaTicks/nominalFrequency);
}
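
For clarity, a simplified std::atomic rendering of the double-buffer publication scheme; this is an assumption-laden sketch, not the actual code, which uses InterlockedExchangePointer and relies on the long update interval to guarantee readers have left the old state before it is reused:

```cpp
#include <atomic>
#include <cstdint>

struct TimerState
{
    uint64_t counter;  // counter value at last update
    double   time;     // seconds since first update
};

static TimerState states[2];
static std::atomic<TimerState*> currentState{&states[0]};

// updater: fill the spare TimerState, then publish it atomically.
// (rollover masking omitted for brevity)
static void Update(uint64_t newCounter, double nominalFrequency)
{
    TimerState* old   = currentState.load(std::memory_order_acquire);
    TimerState* spare = (old == &states[0])? &states[1] : &states[0];
    spare->counter = newCounter;
    spare->time    = old->time + (newCounter - old->counter)/nominalFrequency;
    currentState.store(spare, std::memory_order_release);
}

// reader: snapshot the pointer; retry if an update raced with us.
// (assumes updates are far apart, so the old buffer isn't rewritten
// while a reader is still inside - as with whrt's 1 s interval)
static double Time(uint64_t nowCounter, double nominalFrequency)
{
retry:
    TimerState* ts = currentState.load(std::memory_order_acquire);
    const double time = ts->time + (nowCounter - ts->counter)/nominalFrequency;
    if(ts != currentState.load(std::memory_order_acquire))
        goto retry;
    return time;
}
```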
#if 0
class Calibrator
{
double LastFreqs[8]; // ring buffer
// current ticks per second; average of last few values measured in
// calibrate(). needed to prevent long-term drift, and because
// hrt_nominal_freq isn't necessarily correct. only affects the ticks since
// last calibration - don't want to retroactively change the time.
double CurFreq;
};
calibrationCounter = DetermineBestSafeCounter(counter);
IsSimilarMagnitude(counter->NominalFrequency(), counter2->NominalFrequency()
// measure current HRT freq - prevents long-term drift; also useful because
// hrt_nominal_freq isn't necessarily exact.
static void calibrate_lk()
{
debug_assert(hrt_cal_ticks > 0);
// we're called from a WinMM event or after thread wakeup,
// so the timer has just been updated. no need to determine tick / compensate.
// get elapsed HRT ticks
const i64 hrt_cur = ticks_lk();
const i64 hrt_d = hrt_cur - hrt_cal_ticks;
hrt_cal_ticks = hrt_cur;
hrt_cal_time += hrt_d / hrt_cur_freq;
// get elapsed time from safe millisecond timer
static long safe_last = LONG_MAX;
// chosen so that dt and therefore hrt_est_freq will be negative
// on first call => it won't be added to buffer
const long safe_cur = safe_time();
const double dt = (safe_cur - safe_last) / safe_timer_freq;
safe_last = safe_cur;
double hrt_est_freq = hrt_d / dt;
// past couple of calculated hrt freqs, for averaging
typedef RingBuf<double, 8> SampleBuf;
static SampleBuf samples;
// only add to buffer if within 10% of nominal
// (don't want to pollute buffer with flukes / incorrect results)
if(fabs(hrt_est_freq/hrt_nominal_freq - 1.0) < 0.10)
{
samples.push_back(hrt_est_freq);
// average all samples in buffer
double freq_sum = std::accumulate(samples.begin(), samples.end(), 0.0);
hrt_cur_freq = freq_sum / (int)samples.size();
}
else
{
samples.clear();
hrt_cur_freq = hrt_nominal_freq;
}
debug_assert(hrt_cur_freq > 0.0);
}
#endif
//-----------------------------------------------------------------------------
// update thread
// note: we used to discipline the HRT timestamp to the system time, so it
// was advantageous to perform updates triggered by a WinMM event
// (reducing instances where we're called in the middle of a scheduler tick).
// was advantageous to trigger updates via WinMM event (thus reducing
// instances where we're called in the middle of a scheduler tick).
// since that's no longer relevant, we prefer using a thread, because that
// avoids the dependency on WinMM and its lengthy startup time.
// rationale: (+ and - are reasons for longer and shorter lengths)
// + minimize CPU usage
// + tolerate possibly low secondary counter resolution
// + ensure all threads currently using TimerState return from those
// functions before the next interval
// - notice frequency drift quickly enough
// - ensure there's no more than 1 counter rollover per interval (this is
// checked via RolloversPerCalibrationInterval)
// - avoid more than 1 counter rollover per interval (InitUpdateThread makes
// sure our interval is shorter than the current counter's rollover rate)
static const DWORD UPDATE_INTERVAL_MS = 1000;
static HANDLE hExitEvent;
@ -430,7 +248,7 @@ static unsigned __stdcall UpdateThread(void* UNUSED(data))
static inline LibError InitUpdateThread()
{
// make sure our interval isn't too long
// (counterBits can be 64 => BIT64 would overflow => calculate period/2
// (counterBits can be 64 => BIT64 would overflow => calculate period/2)
const double period_2 = BIT64(counterBits-1) / nominalFrequency;
const uint rolloversPerInterval = UPDATE_INTERVAL_MS / cpu_i64FromDouble(period_2*2.0*1000.0);
debug_assert(rolloversPerInterval <= 1);
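
A hedged worked example of the check (hypothetical ACPI-PM-timer-like counter, 24 bits at 3.579545 MHz): the half-period trick avoids overflow when counterBits is 64, and a 1 s interval comfortably sees at most one rollover:

```cpp
#include <cassert>
#include <cmath>

int main()
{
    const unsigned counterBits        = 24;        // hypothetical counter width
    const double   nominalFrequency   = 3579545.0; // [Hz]
    const unsigned UPDATE_INTERVAL_MS = 1000;

    // period/2 in seconds; computed as 2^(bits-1) to survive counterBits = 64
    const double period_2 = std::ldexp(1.0, (int)counterBits-1) / nominalFrequency;
    const unsigned rolloversPerInterval =
        (unsigned)(UPDATE_INTERVAL_MS / (period_2*2.0*1000.0));  // ~1000/4687 => 0
    assert(rolloversPerInterval <= 1);
    return 0;
}
```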

View File

@ -55,7 +55,7 @@ static void CallFunctionPointers(PfnLibErrorVoid* begin, PfnLibErrorVoid* end)
}
const DWORD t1 = GetTickCount();
debug_printf("WINIT/ total elapsed time in callbacks %d ms (+-10)\n", t1-t0);
debug_printf("WINIT| total elapsed time in callbacks %d ms (+-10)\n", t1-t0);
}