# finalize WHRT implementation
cpu: avoid measuring cpu freq if possible. ia32: cleanup, fix #cores detection on AMD (they're incompatible, *sigh*), add ia32_Generation. wcpu: remove IsThrottling code (clunky and not necessary - we rely on ia32's feature bit detection instead). counter: move whrt's counter create code here. qpc: update comment. tsc: finalize IsSafe implementation, remove per-thread stuff (since it cannot be made to work). whrt: cleanup, remove calibration code (no longer needed). This was SVN commit r5121.
This commit is contained in:
parent
d778b22d61
commit
bd0d0c0026
@ -51,6 +51,16 @@ double cpu_ClockFrequency()
|
||||
|
||||
static void DetectClockFrequency()
|
||||
{
|
||||
#if OS_WIN
|
||||
clockFrequency = wcpu_ClockFrequency();
|
||||
// success; we stick with this value because it either doesn't matter
|
||||
// (WHRT isn't using the TSC), or cannot be determined more accurately
|
||||
// (ia32 will use WHRT's TSC to measure its own frequency).
|
||||
// bonus: the wcpu function is much faster than ia32's measurement loop.
|
||||
if(clockFrequency > 0.0)
|
||||
return;
|
||||
#endif
|
||||
|
||||
#if CPU_IA32
|
||||
clockFrequency = ia32_ClockFrequency(); // authoritative, precise
|
||||
#endif
|
||||
|
@ -34,7 +34,6 @@
|
||||
// keep in sync with enum CpuCap!
|
||||
static u32 ia32_caps[4];
|
||||
|
||||
|
||||
static void ia32_cap_init()
|
||||
{
|
||||
u32 regs[4];
|
||||
@ -63,6 +62,9 @@ bool ia32_cap(IA32Cap cap)
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU identification
|
||||
|
||||
static Ia32Vendor vendor;
|
||||
|
||||
Ia32Vendor ia32_Vendor()
|
||||
@ -94,81 +96,49 @@ static void DetectVendor()
|
||||
}
|
||||
|
||||
|
||||
static uint model, family;
|
||||
static uint generation;
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// this RDTSC implementation writes edx:eax to a temporary and returns that.
|
||||
// rationale: this insulates against changing compiler calling conventions,
|
||||
// at the cost of some efficiency.
|
||||
// use ia32_asm_rdtsc_edx_eax instead if the return convention is known to be
|
||||
// edx:eax (should be the case on all 32-bit x86).
|
||||
u64 ia32_rdtsc_safe()
|
||||
uint ia32_Generation()
|
||||
{
|
||||
u64 c;
|
||||
#if HAVE_MS_ASM
|
||||
__asm
|
||||
{
|
||||
cpuid
|
||||
rdtsc
|
||||
mov dword ptr [c], eax
|
||||
mov dword ptr [c+4], edx
|
||||
}
|
||||
#elif HAVE_GNU_ASM
|
||||
// note: we save+restore EBX to avoid xcode complaining about a
|
||||
// "PIC register" being clobbered, whatever that means.
|
||||
__asm__ __volatile__ (
|
||||
"pushl %%ebx; cpuid; popl %%ebx; rdtsc"
|
||||
: "=A" (c)
|
||||
: /* no input */
|
||||
: "ecx" /* cpuid clobbers eax..edx, but the rest are covered */);
|
||||
#endif
|
||||
return c;
|
||||
return generation;
|
||||
}
|
||||
|
||||
|
||||
void ia32_DebugBreak()
|
||||
static void DetectSignature()
|
||||
{
|
||||
#if HAVE_MS_ASM
|
||||
__asm int 3
|
||||
// note: this probably isn't necessary, since unix_debug_break
|
||||
// (SIGTRAP) is most probably available if HAVE_GNU_ASM.
|
||||
// we include it for completeness, though.
|
||||
#elif HAVE_GNU_ASM
|
||||
__asm__ __volatile__ ("int $3");
|
||||
#endif
|
||||
}
|
||||
u32 regs[4];
|
||||
if(!ia32_asm_cpuid(1, regs))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
model = bits(regs[EAX], 4, 7);
|
||||
family = bits(regs[EAX], 8, 11);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// support code for lock-free primitives
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// enforce strong memory ordering.
|
||||
void ia32_MemoryFence()
|
||||
{
|
||||
// Pentium IV
|
||||
if(ia32_cap(IA32_CAP_SSE2))
|
||||
#if HAVE_MS_ASM
|
||||
__asm mfence
|
||||
#elif HAVE_GNU_ASM
|
||||
__asm__ __volatile__ ("mfence");
|
||||
#endif
|
||||
}
|
||||
|
||||
void ia32_Serialize()
|
||||
{
|
||||
#if HAVE_MS_ASM
|
||||
__asm cpuid
|
||||
#elif HAVE_GNU_ASM
|
||||
__asm__ __volatile__ ("cpuid");
|
||||
#endif
|
||||
switch(family)
|
||||
{
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
generation = family;
|
||||
break;
|
||||
case 0xF:
|
||||
generation = 8;
|
||||
break;
|
||||
default:
|
||||
debug_assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// identifier string
|
||||
|
||||
// 3 calls x 4 registers x 4 bytes = 48
|
||||
static char identifierString[48+1] = {'\0'};
|
||||
|
||||
const char* ia32_IdentifierString()
|
||||
{
|
||||
return identifierString;
|
||||
}
|
||||
|
||||
/// functor to remove substrings from the CPU identifier string
|
||||
class StringStripper
|
||||
{
|
||||
@ -197,26 +167,12 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
const char* ia32_IdentifierString()
|
||||
static void DetectIdentifierString()
|
||||
{
|
||||
// 3 calls x 4 registers x 4 bytes = 48
|
||||
static char identifier_string[48+1] = {'\0'};
|
||||
|
||||
// not first call, return previous result
|
||||
if(identifier_string[0] != '\0')
|
||||
return identifier_string;
|
||||
|
||||
// get processor signature
|
||||
u32 regs[4];
|
||||
if(!ia32_asm_cpuid(1, regs))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
const uint model = bits(regs[EAX], 4, 7);
|
||||
const uint family = bits(regs[EAX], 8, 11);
|
||||
|
||||
// get brand string (if available)
|
||||
// note: ia32_asm_cpuid writes 4 u32s directly to identifier_string -
|
||||
// note: ia32_asm_cpuid writes 4 u32s directly to identifierString -
|
||||
// be very careful with pointer arithmetic!
|
||||
u32* u32_string = (u32*)identifier_string;
|
||||
u32* u32_string = (u32*)identifierString;
|
||||
bool have_brand_string = false;
|
||||
if(ia32_asm_cpuid(0x80000002, u32_string+0 ) &&
|
||||
ia32_asm_cpuid(0x80000003, u32_string+4) &&
|
||||
@ -232,7 +188,7 @@ const char* ia32_IdentifierString()
|
||||
// - the brand string is useless, e.g. "Unknown". this happens on
|
||||
// some older boards whose BIOS reprograms the string for CPUs it
|
||||
// doesn't recognize.
|
||||
if(!have_brand_string || strncmp(identifier_string, "Unknow", 6) == 0)
|
||||
if(!have_brand_string || strncmp(identifierString, "Unknow", 6) == 0)
|
||||
{
|
||||
if(vendor == IA32_VENDOR_AMD)
|
||||
{
|
||||
@ -240,15 +196,15 @@ const char* ia32_IdentifierString()
|
||||
if(family == 6)
|
||||
{
|
||||
if(model == 3 || model == 7)
|
||||
SAFE_STRCPY(identifier_string, "IA32_VENDOR_AMD Duron");
|
||||
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Duron");
|
||||
else if(model <= 5)
|
||||
SAFE_STRCPY(identifier_string, "IA32_VENDOR_AMD Athlon");
|
||||
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Athlon");
|
||||
else
|
||||
{
|
||||
if(ia32_cap(IA32_CAP_AMD_MP))
|
||||
SAFE_STRCPY(identifier_string, "IA32_VENDOR_AMD Athlon MP");
|
||||
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Athlon MP");
|
||||
else
|
||||
SAFE_STRCPY(identifier_string, "IA32_VENDOR_AMD Athlon XP");
|
||||
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Athlon XP");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -258,29 +214,27 @@ const char* ia32_IdentifierString()
|
||||
if(family == 6)
|
||||
{
|
||||
if(model == 1)
|
||||
SAFE_STRCPY(identifier_string, "Intel Pentium Pro");
|
||||
SAFE_STRCPY(identifierString, "Intel Pentium Pro");
|
||||
else if(model == 3 || model == 5)
|
||||
SAFE_STRCPY(identifier_string, "Intel Pentium II");
|
||||
SAFE_STRCPY(identifierString, "Intel Pentium II");
|
||||
else if(model == 6)
|
||||
SAFE_STRCPY(identifier_string, "Intel Celeron");
|
||||
SAFE_STRCPY(identifierString, "Intel Celeron");
|
||||
else
|
||||
SAFE_STRCPY(identifier_string, "Intel Pentium III");
|
||||
SAFE_STRCPY(identifierString, "Intel Pentium III");
|
||||
}
|
||||
}
|
||||
}
|
||||
// identifier_string already holds a valid brand string; pretty it up.
|
||||
// identifierString already holds a valid brand string; pretty it up.
|
||||
else
|
||||
{
|
||||
const char* const undesired_strings[] = { "(tm)", "(TM)", "(R)", "CPU " };
|
||||
std::for_each(undesired_strings, undesired_strings+ARRAY_SIZE(undesired_strings),
|
||||
StringStripper(identifier_string, ARRAY_SIZE(identifier_string)));
|
||||
StringStripper(identifierString, ARRAY_SIZE(identifierString)));
|
||||
|
||||
// note: Intel brand strings include a frequency, but we can't rely
|
||||
// on it because the CPU may be overclocked. we'll leave it in the
|
||||
// string to show measurement accuracy and if SpeedStep is active.
|
||||
}
|
||||
|
||||
return identifier_string;
|
||||
}
|
||||
|
||||
|
||||
@ -312,6 +266,10 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// note: this function uses timer.cpp!get_time, which is implemented via
|
||||
// whrt.cpp on Windows, which again calls ia32_Init. be careful that
|
||||
// this function isn't called from there as well, else WHRT will be used
|
||||
// before its init completes.
|
||||
double ia32_ClockFrequency()
|
||||
{
|
||||
// if the TSC isn't available, there's really no good way to count the
|
||||
@ -392,7 +350,7 @@ double ia32_ClockFrequency()
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// detect processor types / topology
|
||||
// processor topology
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
uint ia32_ApicId()
|
||||
@ -408,44 +366,62 @@ uint ia32_ApicId()
|
||||
// OSes report hyperthreading units and cores as "processors". we need to
|
||||
// drill down and find out the exact counts (for thread pool dimensioning
|
||||
// and cache sharing considerations).
|
||||
// note: Intel Appnote 485 (CPUID) assures uniformity of CoresPerPackage and
|
||||
// LogicalPerCore.
|
||||
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
|
||||
// logicalPerCore.
|
||||
|
||||
static uint CoresPerPackage()
|
||||
static uint coresPerPackage = 0;
|
||||
static uint logicalPerCore = 0;
|
||||
|
||||
static void DetectCoresPerPackage()
|
||||
{
|
||||
static uint coresPerPackage = 0;
|
||||
if(coresPerPackage == 0)
|
||||
u32 regs[4];
|
||||
|
||||
coresPerPackage = 1; // single-core unless..
|
||||
|
||||
switch(vendor)
|
||||
{
|
||||
u32 regs[4];
|
||||
case IA32_VENDOR_INTEL:
|
||||
if(ia32_asm_cpuid(4, regs))
|
||||
coresPerPackage = bits(regs[EAX], 26, 31)+1;
|
||||
else
|
||||
coresPerPackage = 1; // single-core
|
||||
}
|
||||
break;
|
||||
|
||||
return coresPerPackage;
|
||||
case IA32_VENDOR_AMD:
|
||||
if(ia32_asm_cpuid(0x80000008, regs))
|
||||
coresPerPackage = bits(regs[ECX], 0, 7)+1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static uint LogicalPerCore()
|
||||
static bool IsHyperthreadingCapable()
|
||||
{
|
||||
static uint logicalPerCore = 0;
|
||||
if(logicalPerCore == 0)
|
||||
// definitely not
|
||||
if(!ia32_cap(IA32_CAP_HT))
|
||||
return false;
|
||||
|
||||
// AMD N-core systems falsely set the HT bit for compatibility reasons
|
||||
// (don't bother resetting it, might confuse callers)
|
||||
if(vendor == IA32_VENDOR_AMD && ia32_cap(IA32_CAP_AMD_CMP_LEGACY))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void DetectLogicalPerCore()
|
||||
{
|
||||
u32 regs[4];
|
||||
|
||||
if(!IsHyperthreadingCapable())
|
||||
{
|
||||
if(ia32_cap(IA32_CAP_HT))
|
||||
{
|
||||
u32 regs[4];
|
||||
if(!ia32_asm_cpuid(1, regs))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
const uint logical_per_package = bits(regs[EBX], 16, 23);
|
||||
// cores ought to be uniform WRT # logical processors
|
||||
debug_assert(logical_per_package % CoresPerPackage() == 0);
|
||||
logicalPerCore = logical_per_package / CoresPerPackage();
|
||||
}
|
||||
else
|
||||
logicalPerCore = 1; // not Hyperthreading capable
|
||||
logicalPerCore = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
return logicalPerCore;
|
||||
if(!ia32_asm_cpuid(1, regs))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
const uint logicalPerPackage = bits(regs[EBX], 16, 23);
|
||||
// cores ought to be uniform WRT # logical processors
|
||||
debug_assert(logicalPerPackage % coresPerPackage == 0);
|
||||
logicalPerCore = logicalPerPackage / coresPerPackage;
|
||||
}
|
||||
|
||||
// the above two functions give the maximum number of cores/logical units.
|
||||
@ -508,18 +484,18 @@ static void DetectProcessorTopology()
|
||||
|
||||
// extract values from all 3 ID bitfields into separate sets
|
||||
uint bit_pos = 0;
|
||||
IdSet logical_ids;
|
||||
ExtractFieldsIntoSet(apicIds, bit_pos, LogicalPerCore(), logical_ids);
|
||||
IdSet core_ids;
|
||||
ExtractFieldsIntoSet(apicIds, bit_pos, CoresPerPackage(), core_ids);
|
||||
IdSet package_ids;
|
||||
ExtractFieldsIntoSet(apicIds, bit_pos, 0xFF, package_ids);
|
||||
IdSet logicalIds;
|
||||
ExtractFieldsIntoSet(apicIds, bit_pos, logicalPerCore, logicalIds);
|
||||
IdSet coreIds;
|
||||
ExtractFieldsIntoSet(apicIds, bit_pos, coresPerPackage, coreIds);
|
||||
IdSet packageIds;
|
||||
ExtractFieldsIntoSet(apicIds, bit_pos, 0xFF, packageIds);
|
||||
|
||||
// (the set cardinality is representative of all packages/cores since
|
||||
// they are uniform.)
|
||||
numPackages = std::max((uint)package_ids.size(), 1u);
|
||||
enabledCoresPerPackage = std::max((uint)core_ids .size(), 1u);
|
||||
enabledLogicalPerCore = std::max((uint)logical_ids.size(), 1u);
|
||||
numPackages = std::max((uint)packageIds.size(), 1u);
|
||||
enabledCoresPerPackage = std::max((uint)coreIds .size(), 1u);
|
||||
enabledLogicalPerCore = std::max((uint)logicalIds.size(), 1u);
|
||||
|
||||
// note: even though APIC IDs are assigned sequentially, we can't make any
|
||||
// assumptions about the values/ordering because we get them according to
|
||||
@ -553,6 +529,70 @@ uint ia32_LogicalPerCore()
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// misc stateless functions
|
||||
|
||||
// this RDTSC implementation writes edx:eax to a temporary and returns that.
|
||||
// rationale: this insulates against changing compiler calling conventions,
|
||||
// at the cost of some efficiency.
|
||||
// use ia32_asm_rdtsc_edx_eax instead if the return convention is known to be
|
||||
// edx:eax (should be the case on all 32-bit x86).
|
||||
u64 ia32_rdtsc_safe()
|
||||
{
|
||||
u64 c;
|
||||
#if HAVE_MS_ASM
|
||||
__asm
|
||||
{
|
||||
cpuid
|
||||
rdtsc
|
||||
mov dword ptr [c], eax
|
||||
mov dword ptr [c+4], edx
|
||||
}
|
||||
#elif HAVE_GNU_ASM
|
||||
// note: we save+restore EBX to avoid xcode complaining about a
|
||||
// "PIC register" being clobbered, whatever that means.
|
||||
__asm__ __volatile__ (
|
||||
"pushl %%ebx; cpuid; popl %%ebx; rdtsc"
|
||||
: "=A" (c)
|
||||
: /* no input */
|
||||
: "ecx" /* cpuid clobbers eax..edx, but the rest are covered */);
|
||||
#endif
|
||||
return c;
|
||||
}
|
||||
|
||||
|
||||
// raise a breakpoint exception by executing INT 3.
// note: the GNU variant probably isn't necessary, since unix_debug_break
// (SIGTRAP) is most probably available if HAVE_GNU_ASM.
// we include it for completeness, though.
void ia32_DebugBreak()
{
#if HAVE_MS_ASM
	__asm
	{
		int 3
	}
#elif HAVE_GNU_ASM
	__asm__ __volatile__ ("int $3");
#endif
}
|
||||
|
||||
|
||||
// enforce strong memory ordering.
|
||||
void ia32_MemoryFence()
|
||||
{
|
||||
// Pentium IV
|
||||
if(ia32_cap(IA32_CAP_SSE2))
|
||||
#if HAVE_MS_ASM
|
||||
__asm mfence
|
||||
#elif HAVE_GNU_ASM
|
||||
__asm__ __volatile__ ("mfence");
|
||||
#endif
|
||||
}
|
||||
|
||||
// execute CPUID, which is a serializing instruction.
// the values CPUID returns are discarded.
void ia32_Serialize()
{
#if HAVE_MS_ASM
	__asm cpuid
#elif HAVE_GNU_ASM
	// CPUID overwrites eax..edx, so the compiler must be told; otherwise
	// it may keep live values in those registers across the statement.
	// EBX is saved+restored by hand rather than listed as a clobber
	// (same approach as ia32_rdtsc_safe: it is the PIC register).
	// "memory" additionally makes this a compiler-level barrier.
	__asm__ __volatile__ (
		"pushl %%ebx; cpuid; popl %%ebx"
		: /* no output */
		: /* no input */
		: "eax", "ecx", "edx", "memory");
#endif
}
|
||||
|
||||
|
||||
// checks if there is an IA-32 CALL instruction right before ret_addr.
|
||||
@ -632,7 +672,11 @@ void ia32_Init()
|
||||
ia32_cap_init();
|
||||
|
||||
DetectVendor();
|
||||
DetectSignature();
|
||||
DetectIdentifierString();
|
||||
|
||||
DetectCoresPerPackage();
|
||||
DetectLogicalPerCore();
|
||||
DetectProcessorTopology();
|
||||
}
|
||||
|
||||
|
@ -39,6 +39,13 @@ enum Ia32Vendor
|
||||
extern Ia32Vendor ia32_Vendor();
|
||||
|
||||
|
||||
/**
|
||||
* @return the colloquial processor generation
|
||||
* (6 = Pentium II / K6, 7 = Pentium III / Athlon, 8 = Opteron)
|
||||
**/
|
||||
extern uint ia32_Generation();
|
||||
|
||||
|
||||
/**
|
||||
* bit indices of CPU capability flags (128 bits).
|
||||
* values are defined by IA-32 CPUID feature flags - do not change!
|
||||
@ -46,25 +53,27 @@ extern Ia32Vendor ia32_Vendor();
|
||||
enum IA32Cap
|
||||
{
|
||||
// standard (ecx) - currently only defined by Intel
|
||||
IA32_CAP_SSE3 = 0+0, // Streaming SIMD Extensions 3
|
||||
IA32_CAP_EST = 0+7, // Enhanced Speedstep Technology
|
||||
IA32_CAP_SSE3 = 0+0, // Streaming SIMD Extensions 3
|
||||
IA32_CAP_EST = 0+7, // Enhanced Speedstep Technology
|
||||
|
||||
// standard (edx)
|
||||
IA32_CAP_FPU = 32+0, // Floating Point Unit
|
||||
IA32_CAP_TSC = 32+4, // TimeStamp Counter
|
||||
IA32_CAP_CMOV = 32+15, // Conditional MOVe
|
||||
IA32_CAP_MMX = 32+23, // MultiMedia eXtensions
|
||||
IA32_CAP_SSE = 32+25, // Streaming SIMD Extensions
|
||||
IA32_CAP_SSE2 = 32+26, // Streaming SIMD Extensions 2
|
||||
IA32_CAP_HT = 32+28, // HyperThreading
|
||||
IA32_CAP_FPU = 32+0, // Floating Point Unit
|
||||
IA32_CAP_TSC = 32+4, // TimeStamp Counter
|
||||
IA32_CAP_CMOV = 32+15, // Conditional MOVe
|
||||
IA32_CAP_TM_SCC = 32+22, // Thermal Monitoring and Software Controlled Clock
|
||||
IA32_CAP_MMX = 32+23, // MultiMedia eXtensions
|
||||
IA32_CAP_SSE = 32+25, // Streaming SIMD Extensions
|
||||
IA32_CAP_SSE2 = 32+26, // Streaming SIMD Extensions 2
|
||||
IA32_CAP_HT = 32+28, // HyperThreading
|
||||
|
||||
// extended (ecx)
|
||||
IA32_CAP_AMD_CMP_LEGACY = 64+1, // N-core and IA32_CAP_HT is falsely set
|
||||
|
||||
// extended (edx) - currently only defined by AMD
|
||||
IA32_CAP_AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
|
||||
IA32_CAP_AMD_MMX_EXT = 96+22,
|
||||
IA32_CAP_AMD_3DNOW_PRO = 96+30,
|
||||
IA32_CAP_AMD_3DNOW = 96+31
|
||||
// extended (edx)
|
||||
IA32_CAP_AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
|
||||
IA32_CAP_AMD_MMX_EXT = 96+22,
|
||||
IA32_CAP_AMD_3DNOW_PRO = 96+30,
|
||||
IA32_CAP_AMD_3DNOW = 96+31
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -23,11 +23,6 @@ WINIT_REGISTER_FUNC(wcpu_Init);
|
||||
#pragma SECTION_RESTORE
|
||||
|
||||
|
||||
// limit allows statically allocated per-CPU structures (for simplicity).
|
||||
// WinAPI only supports max. 32 CPUs anyway (due to DWORD bitfields).
|
||||
static const uint MAX_CPUS = 32;
|
||||
|
||||
|
||||
static uint numProcessors = 0;
|
||||
|
||||
/// get number of CPUs (can't fail)
|
||||
@ -73,84 +68,6 @@ static void DetectClockFrequency()
|
||||
}
|
||||
|
||||
|
||||
static int isThrottlingPossible = -1;
|
||||
|
||||
int wcpu_IsThrottlingPossible()
|
||||
{
|
||||
debug_assert(isThrottlingPossible != -1);
|
||||
return isThrottlingPossible;
|
||||
}
|
||||
|
||||
static void CheckIfThrottlingPossible()
|
||||
{
|
||||
WIN_SAVE_LAST_ERROR;
|
||||
|
||||
// CallNtPowerInformation
|
||||
// (manual import because it's not supported on Win95)
|
||||
NTSTATUS (WINAPI *pCNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG) = 0;
|
||||
// this is most likely the only reference, so don't free it
|
||||
// (=> unload) until done with the DLL.
|
||||
HMODULE hPowrprofDll = LoadLibrary("powrprof.dll");
|
||||
*(void**)&pCNPI = GetProcAddress(hPowrprofDll, "CallNtPowerInformation");
|
||||
if(pCNPI)
|
||||
{
|
||||
// most likely not speedstep-capable if these aren't supported
|
||||
SYSTEM_POWER_CAPABILITIES spc;
|
||||
if(pCNPI(SystemPowerCapabilities, 0,0, &spc,sizeof(spc)) == STATUS_SUCCESS)
|
||||
{
|
||||
if(!spc.ProcessorThrottle || !spc.ThermalControl)
|
||||
isThrottlingPossible = 0;
|
||||
}
|
||||
|
||||
// probably speedstep if cooling mode active.
|
||||
// the documentation of PO_TZ_* is unclear, so we can't be sure.
|
||||
SYSTEM_POWER_INFORMATION spi;
|
||||
if(pCNPI(SystemPowerInformation, 0,0, &spi,sizeof(spi)) == STATUS_SUCCESS)
|
||||
{
|
||||
if(spi.CoolingMode != PO_TZ_INVALID_MODE)
|
||||
isThrottlingPossible = 1;
|
||||
}
|
||||
|
||||
// definitely speedstep if any throttle is less than 100%.
|
||||
PROCESSOR_POWER_INFORMATION ppi[MAX_CPUS];
|
||||
if(pCNPI(ProcessorInformation, 0,0, ppi,sizeof(ppi)) == STATUS_SUCCESS)
|
||||
{
|
||||
const PROCESSOR_POWER_INFORMATION* p = ppi;
|
||||
for(uint i = 0; i < std::min(wcpu_NumProcessors(), MAX_CPUS); i++, p++)
|
||||
{
|
||||
if(p->MhzLimit != p->MaxMhz || p->CurrentMhz != p->MaxMhz)
|
||||
{
|
||||
isThrottlingPossible = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
FreeLibrary(hPowrprofDll);
|
||||
|
||||
// CallNtPowerInformation not available, or none of the above apply:
|
||||
// don't know yet (for certain, at least).
|
||||
if(isThrottlingPossible == -1)
|
||||
{
|
||||
// check if running on a laptop
|
||||
HW_PROFILE_INFO hi;
|
||||
GetCurrentHwProfile(&hi);
|
||||
const bool is_laptop = !(hi.dwDockInfo & DOCKINFO_DOCKED) ^ !(hi.dwDockInfo & DOCKINFO_UNDOCKED);
|
||||
// both flags set <==> this is a desktop machine.
|
||||
// both clear is unspecified; we assume it's not a laptop.
|
||||
// NOTE: ! is necessary (converts expression to bool)
|
||||
|
||||
// we'll guess SpeedStep is active if on a laptop.
|
||||
// ia32 code will get a second crack at it.
|
||||
isThrottlingPossible = (is_laptop)? 1 : 0;
|
||||
}
|
||||
|
||||
WIN_RESTORE_LAST_ERROR;
|
||||
|
||||
debug_assert(isThrottlingPossible == 0 || isThrottlingPossible == 1);
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// execute the specified function once on each CPU.
|
||||
@ -202,7 +119,6 @@ static LibError wcpu_Init()
|
||||
{
|
||||
DetectNumProcessors();
|
||||
DetectClockFrequency();
|
||||
CheckIfThrottlingPossible();
|
||||
|
||||
return INFO::OK;
|
||||
}
|
||||
|
@ -15,7 +15,7 @@
|
||||
|
||||
extern uint wcpu_NumProcessors();
|
||||
extern double wcpu_ClockFrequency();
|
||||
extern int wcpu_IsThrottlingPossible();
|
||||
|
||||
extern LibError wcpu_CallByEachCPU(CpuCallback cb, void* param);
|
||||
|
||||
#endif // #ifndef INCLUDED_WCPU
|
||||
|
106
source/lib/sysdep/win/whrt/counter.cpp
Normal file
106
source/lib/sysdep/win/whrt/counter.cpp
Normal file
@ -0,0 +1,106 @@
|
||||
/**
|
||||
* =========================================================================
|
||||
* File : counter.cpp
|
||||
* Project : 0 A.D.
|
||||
* Description : Interface for counter implementations
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#include "precompiled.h"
|
||||
#include "counter.h"
|
||||
|
||||
#include "lib/bits.h"
|
||||
|
||||
#include "tsc.h"
|
||||
#include "hpet.h"
|
||||
#include "pmt.h"
|
||||
#include "qpc.h"
|
||||
#include "tgt.h"
|
||||
// to add a new counter type, simply include its header here and
|
||||
// insert a case in ConstructCounterAt's switch statement.
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// create/destroy counters
|
||||
|
||||
/**
 * @return pointer to a newly constructed ICounter subclass of type <id> at
 * the given address, or 0 if the ID is invalid or the instance would not
 * fit into <capacity> bytes (nothing is constructed in either case).
 * @param capacity number of bytes available at <address>.
 * @param size receives the size [bytes] of the requested counter type;
 * 0 iff the ID is invalid.
 **/
static ICounter* ConstructCounterAt(uint id, void* address, size_t capacity, size_t& size)
{
	// rationale for placement new: see call site.
	// the capacity check comes BEFORE construction so that a too-small
	// buffer is never written past its end.
#define CREATE(impl)\
	size = sizeof(Counter##impl);\
	if(size > capacity)\
		return 0;\
	return new(address) Counter##impl();

#include "lib/nommgr.h"	// MMGR interferes with placement new

	// counters are chosen according to the following order. rationale:
	// - TSC must come before QPC and PMT to make sure a bug in the latter on
	//   Pentium systems doesn't come up.
	// - PMT works, but is inexplicably slower than QPC on a PIII Mobile.
	// - TGT really isn't as safe as the others, so it should be last.
	// - low-overhead and high-resolution counters are preferred.
	switch(id)
	{
	case 0:
		CREATE(HPET)
	case 1:
		CREATE(TSC)
	case 2:
		CREATE(QPC)
	case 3:
		CREATE(PMT)
	case 4:
		CREATE(TGT)
	default:
		size = 0;
		return 0;
	}

#include "lib/mmgr.h"

#undef CREATE
}


// create a counter of type <id> inside a static buffer.
// @return the new counter, or 0 if the ID is invalid (or - after a
// debug_assert - if the static buffer is exhausted).
ICounter* CreateCounter(uint id)
{
	// we placement-new the Counter classes in a static buffer.
	// this is dangerous, but we are careful to ensure alignment. it is
	// unusual and thus bad, but there's also one advantage: we avoid
	// using global operator new before the CRT is initialized (risky).
	//
	// alternatives:
	// - defining as static doesn't work because the ctors (necessary for
	//   vptr initialization) run during _cinit, which comes after our
	//   first use of them.
	// - using static_calloc isn't possible because we don't know the
	//   size until after the alloc / placement new.
	static const size_t MEM_SIZE = 200;	// checked below
	static u8 mem[MEM_SIZE];
	static u8* nextMem = mem;

	u8* const end = mem+MEM_SIZE;
	u8* addr = (u8*)round_up((uintptr_t)nextMem, 16);
	// (alignment padding alone may already exceed the remaining space)
	const size_t capacity = (addr < end)? (size_t)(end-addr) : 0;

	size_t size;
	ICounter* counter = ConstructCounterAt(id, addr, capacity, size);
	if(!counter)
	{
		// size == 0 <==> invalid ID. otherwise, the ID was valid but the
		// instance didn't fit => MEM_SIZE needs to be increased.
		debug_assert(size == 0);	// had enough room?
		return 0;
	}

	// only advance once an instance actually occupies the memory.
	// (an exact fit, i.e. nextMem == end, is fine.)
	nextMem = addr+size;
	return counter;
}
|
||||
|
||||
|
||||
// shut down the given counter, run its destructor and zero the caller's
// pointer. a null pointer is ignored.
void DestroyCounter(ICounter*& counter)
{
	if(counter)
	{
		counter->Shutdown();
		// explicit dtor call: the instance was created via placement new,
		// so there is no matching delete.
		counter->~ICounter();
		counter = 0;
	}
}
|
@ -2,14 +2,14 @@
|
||||
* =========================================================================
|
||||
* File : counter.h
|
||||
* Project : 0 A.D.
|
||||
* Description : Interface for timer implementations
|
||||
* Description : Interface for counter implementations
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#ifndef INCLUDED_TICK_SOURCE
|
||||
#define INCLUDED_TICK_SOURCE
|
||||
#ifndef INCLUDED_COUNTER
|
||||
#define INCLUDED_COUNTER
|
||||
|
||||
// derived implementations must be called CounterIMPL,
|
||||
// where IMPL matches the WHRT_IMPL identifier. (see CREATE)
|
||||
@ -60,4 +60,16 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
#endif // #ifndef INCLUDED_TICK_SOURCE
|
||||
|
||||
/**
|
||||
* @return a newly created ICounter of type <id> or 0 iff the ID is invalid.
|
||||
* @param id integer ID (0..N-1)
|
||||
**/
|
||||
extern ICounter* CreateCounter(uint id);
|
||||
|
||||
/**
|
||||
* shut down the counter, free its resources and zero its pointer.
|
||||
**/
|
||||
extern void DestroyCounter(ICounter*& counter);
|
||||
|
||||
#endif // #ifndef INCLUDED_COUNTER
|
||||
|
@ -46,8 +46,8 @@ bool CounterQPC::IsSafe() const
|
||||
// note: we have separate modules that directly access some of the
|
||||
// counters potentially used by QPC. disabling the redundant counters
|
||||
// would be ugly (increased coupling). instead, we'll make sure our
|
||||
// implementations can coexist with QPC and verify the secondary
|
||||
// reference timer has a different frequency.
|
||||
// implementations could (if necessary) coexist with QPC, but it
|
||||
// shouldn't come to that since only one counter is needed/used.
|
||||
|
||||
// the PMT is generally safe (see discussion in CounterPmt::IsSafe),
|
||||
// but older QPC implementations had problems with 24-bit rollover.
|
||||
|
@ -13,116 +13,46 @@
|
||||
|
||||
#include "lib/sysdep/win/win.h"
|
||||
#include "lib/sysdep/win/wcpu.h"
|
||||
#include "lib/sysdep/ia32/ia32.h"
|
||||
#include "lib/sysdep/cpu.h" // cpu_CAS
|
||||
#include "lib/sysdep/ia32/ia32.h" // ia32_rdtsc
|
||||
#include "lib/bits.h"
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// per-CPU state
|
||||
// detect throttling
|
||||
|
||||
// necessary because CPUs are initialized one-by-one and the TSC values
|
||||
// differ significantly. (while at it, we also keep per-CPU frequency values
|
||||
// in case the clocks aren't exactly synced)
|
||||
//
|
||||
// note: only reading the TSC from one CPU (possible via thread affinity)
|
||||
// would work but take much longer (context switch).
|
||||
|
||||
struct PerCpuTscState
|
||||
enum AmdPowerNowFlags
|
||||
{
|
||||
u64 m_lastTicks;
|
||||
double m_lastTime;
|
||||
double m_observedFrequency;
|
||||
// mark this struct used just in case cpu_CallByEachCPU doesn't ensure
|
||||
// only one thread is running. a flag is safer than a magic APIC ID value.
|
||||
uintptr_t m_isInitialized;
|
||||
uint m_apicId;
|
||||
PN_FREQ_ID_CTRL = BIT(1),
|
||||
PN_SW_THERMAL_CTRL = BIT(5),
|
||||
PN_INVARIANT_TSC = BIT(8)
|
||||
};
|
||||
|
||||
static const size_t MAX_CPUS = 32; // Win32 also imposes this limit
|
||||
static PerCpuTscState cpuTscStates[MAX_CPUS];
|
||||
|
||||
static PerCpuTscState& NextUnusedPerCpuTscState()
|
||||
static bool IsThrottlingPossible()
|
||||
{
|
||||
for(size_t i = 0; i < MAX_CPUS; i++)
|
||||
u32 regs[4];
|
||||
|
||||
switch(ia32_Vendor())
|
||||
{
|
||||
PerCpuTscState& cpuTscState = cpuTscStates[i];
|
||||
if(cpu_CAS(&cpuTscState.m_isInitialized, 0, 1))
|
||||
return cpuTscState;
|
||||
}
|
||||
case IA32_VENDOR_INTEL:
|
||||
if(ia32_cap(IA32_CAP_TM_SCC) || ia32_cap(IA32_CAP_EST))
|
||||
return true;
|
||||
break;
|
||||
|
||||
throw std::runtime_error("allocated too many PerCpuTscState");
|
||||
}
|
||||
|
||||
static PerCpuTscState& CurrentCpuTscState()
|
||||
{
|
||||
const uint apicId = ia32_ApicId();
|
||||
for(size_t i = 0; i < MAX_CPUS; i++)
|
||||
{
|
||||
PerCpuTscState& cpuTscState = cpuTscStates[i];
|
||||
if(cpuTscState.m_isInitialized && cpuTscState.m_apicId == apicId)
|
||||
return cpuTscState;
|
||||
}
|
||||
|
||||
throw std::runtime_error("no matching PerCpuTscState found");
|
||||
}
|
||||
|
||||
static void InitPerCpuTscState(void* param) // callback
|
||||
{
|
||||
const double cpuClockFrequency = *(double*)param;
|
||||
|
||||
PerCpuTscState& cpuTscState = NextUnusedPerCpuTscState();
|
||||
cpuTscState.m_apicId = ia32_ApicId();
|
||||
cpuTscState.m_lastTicks = ia32_rdtsc();
|
||||
cpuTscState.m_lastTime = 0.0;
|
||||
cpuTscState.m_observedFrequency = cpuClockFrequency;
|
||||
}
|
||||
|
||||
static LibError InitPerCpuTscStates(double cpuClockFrequency)
|
||||
{
|
||||
LibError ret = cpu_CallByEachCPU(InitPerCpuTscState, &cpuClockFrequency);
|
||||
CHECK_ERR(ret);
|
||||
return INFO::OK;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
/*
|
||||
int ia32_IsThrottlingPossible()
|
||||
{
|
||||
|
||||
// returned in edx by CPUID 0x80000007.
|
||||
enum AmdPowerNowFlags
|
||||
{
|
||||
POWERNOW_FREQ_ID_CTRL = 2
|
||||
};
|
||||
|
||||
|
||||
if(vendor == IA32_VENDOR_INTEL)
|
||||
{
|
||||
if(ia32_cap(IA32_CAP_EST))
|
||||
return 1;
|
||||
}
|
||||
else if(vendor == IA32_VENDOR_AMD)
|
||||
{
|
||||
u32 regs[4];
|
||||
case IA32_VENDOR_AMD:
|
||||
if(ia32_asm_cpuid(0x80000007, regs))
|
||||
{
|
||||
if(regs[EDX] & POWERNOW_FREQ_ID_CTRL)
|
||||
return 1;
|
||||
if(regs[EDX] & (PN_FREQ_ID_CTRL|PN_SW_THERMAL_CTRL))
|
||||
return true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
return 0; // pretty much authoritative, so don't return -1.
|
||||
return false;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// note: calibration is necessary due to long-term thermal drift
|
||||
// (oscillator is usually poor quality) and inaccurate initial measurement.
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
|
||||
LibError CounterTSC::Activate()
|
||||
{
|
||||
ia32_Init();
|
||||
@ -130,7 +60,6 @@ LibError CounterTSC::Activate()
|
||||
if(!ia32_cap(IA32_CAP_TSC))
|
||||
return ERR::NO_SYS; // NOWARN (CPU doesn't support RDTSC)
|
||||
|
||||
// RETURN_ERR(InitPerCpuTscStates(wcpu_ClockFrequency()));
|
||||
return INFO::OK;
|
||||
}
|
||||
|
||||
@ -141,57 +70,65 @@ void CounterTSC::Shutdown()
|
||||
|
||||
bool CounterTSC::IsSafe() const
|
||||
{
|
||||
return false;
|
||||
// use of the TSC for timing is subject to a litany of potential problems:
|
||||
// - separate, unsynchronized counters with offset and drift;
|
||||
// - frequency changes (P-state transitions and STPCLK throttling);
|
||||
// - failure to increment in C3 and C4 deep-sleep states.
|
||||
// we will discuss the specifics below.
|
||||
|
||||
u32 regs[4];
|
||||
if(ia32_asm_cpuid(0x80000007, regs))
|
||||
// SMP or multi-core => counters are unsynchronized. this could be
|
||||
// solved by maintaining separate per-core counter states, but that
|
||||
// requires atomic reads of the TSC and the current processor number.
|
||||
//
|
||||
// (otherwise, we have a subtle race condition: if preempted while
|
||||
// reading the time and rescheduled on a different core, incorrect
|
||||
// results may be returned, which would be unacceptable.)
|
||||
//
|
||||
// unfortunately this isn't possible without OS support or the
|
||||
// as yet unavailable RDTSCP instruction => unsafe.
|
||||
//
|
||||
// (note: if the TSC is invariant, drift is no longer a concern.
|
||||
// we could synchronize the TSC MSRs during initialization and avoid
|
||||
// per-core counter state and the abovementioned race condition.
|
||||
// however, we won't bother, since such platforms aren't yet widespread
|
||||
// and would surely support the nice and safe HPET, anyway)
|
||||
if(ia32_NumPackages() != 1 || ia32_CoresPerPackage() != 1)
|
||||
return false;
|
||||
|
||||
// recent CPU:
|
||||
if(ia32_Generation() >= 7)
|
||||
{
|
||||
// if(regs[EDX] & POWERNOW_FREQ_ID_CTRL)
|
||||
// note: 8th generation CPUs support C1-clock ramping, which causes
|
||||
// drift on multi-core systems, but those were excluded above.
|
||||
|
||||
u32 regs[4];
|
||||
if(ia32_asm_cpuid(0x80000007, regs))
|
||||
{
|
||||
// TSC is invariant WRT P-state, C-state and STPCLK => safe.
|
||||
if(regs[EDX] & PN_INVARIANT_TSC)
|
||||
return true;
|
||||
}
|
||||
|
||||
// in addition to P-state transitions, we're also subject to
|
||||
// STPCLK throttling. this happens when the chipset thinks the
|
||||
// system is dangerously overheated; the OS isn't even notified.
|
||||
// it may be rare, but could cause incorrect results => unsafe.
|
||||
return false;
|
||||
|
||||
// newer systems also support the C3 Deep Sleep state, in which
|
||||
// the TSC isn't incremented. that's not nice, but irrelevant
|
||||
// since STPCLK dooms the TSC on those systems anyway.
|
||||
}
|
||||
|
||||
// we're dealing with a single older CPU; the only problem there is
|
||||
// throttling, i.e. changes to the TSC frequency. we don't want to
|
||||
// disable this because it may be important for cooling. the OS
|
||||
// initiates changes but doesn't notify us; jumps are too frequent
|
||||
// and drastic to detect and account for => unsafe.
|
||||
if(IsThrottlingPossible())
|
||||
return false;
|
||||
|
||||
/*
|
||||
AMD has defined a CPUID feature bit that
|
||||
software can test to determine if the TSC is
|
||||
invariant. Issuing a CPUID instruction with an %eax register
|
||||
value of 0x8000_0007, on a processor whose base family is
|
||||
0xF, returns "Advanced Power Management Information" in the
|
||||
%eax, %ebx, %ecx, and %edx registers. Bit 8 of the return
|
||||
%edx is the "TscInvariant" feature flag which is set when
|
||||
TSC is P-state, C-state, and STPCLK-throttling invariant; it
|
||||
is clear otherwise.
|
||||
*/
|
||||
|
||||
#if 0
|
||||
if (CPUID.base_family < 0xf) {
|
||||
// TSC drift doesn't exist on 7th Gen or less
|
||||
// However, OS still needs to consider effects
|
||||
// of P-state changes on TSC
|
||||
return TRUE;
|
||||
|
||||
} else if (CPUID.AdvPowerMgmtInfo.TscInvariant) {
|
||||
// Invariant TSC on 8th Gen or newer, use it
|
||||
// (assume all cores have invariant TSC)
|
||||
return TRUE;
|
||||
|
||||
// - deep sleep modes: TSC may not be advanced.
|
||||
// not a problem though, because if the TSC is disabled, the CPU
|
||||
// isn't doing any other work, either.
|
||||
// - SpeedStep/'gearshift' CPUs: frequency may change.
|
||||
// this happens on notebooks now, but eventually desktop systems
|
||||
// will do this as well (if not to save power, for heat reasons).
|
||||
// frequency changes are too frequent and drastic to correct,
|
||||
// and we don't want to mess with the system power settings => unsafe.
|
||||
if(cpu_IsThrottlingPossible() == 0)
|
||||
return true;
|
||||
|
||||
|
||||
/* But TSC doesn't tick in C3 so don't use it there */
|
||||
957 if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 1000)
|
||||
958 return 1;
|
||||
|
||||
#endif
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
u64 CounterTSC::Counter() const
|
||||
@ -214,5 +151,5 @@ uint CounterTSC::CounterBits() const
|
||||
**/
|
||||
double CounterTSC::NominalFrequency() const
|
||||
{
|
||||
return wcpu_ClockFrequency();
|
||||
return cpu_ClockFrequency();
|
||||
}
|
||||
|
@ -20,16 +20,10 @@
|
||||
#include "lib/adts.h"
|
||||
#include "lib/bits.h"
|
||||
|
||||
#include "tsc.h"
|
||||
#include "hpet.h"
|
||||
#include "pmt.h"
|
||||
#include "qpc.h"
|
||||
#include "tgt.h"
|
||||
// to add a new counter type, simply include its header here and
|
||||
// insert a case in ConstructCounterAt's switch statement.
|
||||
#include "counter.h"
|
||||
|
||||
|
||||
#pragma SECTION_INIT(4) // wposix depends on us
|
||||
#pragma SECTION_INIT(4) // wtime depends on us
|
||||
WINIT_REGISTER_FUNC(whrt_Init);
|
||||
#pragma FORCE_INCLUDE(whrt_Init)
|
||||
#pragma SECTION_SHUTDOWN(8)
|
||||
@ -45,94 +39,7 @@ namespace ERR
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// create/destroy counters
|
||||
|
||||
/**
|
||||
* @return pointer to a newly constructed ICounter subclass of type <id> at
|
||||
* the given address, or 0 iff the ID is invalid.
|
||||
* @param size receives the size [bytes] of the created instance.
|
||||
**/
|
||||
static ICounter* ConstructCounterAt(uint id, void* address, size_t& size)
|
||||
{
|
||||
// rationale for placement new: see call site.
|
||||
#define CREATE(impl)\
|
||||
size = sizeof(Counter##impl);\
|
||||
return new(address) Counter##impl();
|
||||
|
||||
#include "lib/nommgr.h" // MMGR interferes with placement new
|
||||
|
||||
// counters are chosen according to the following order. rationale:
|
||||
// - TSC must come before QPC and PMT to make sure a bug in the latter on
|
||||
// Pentium systems doesn't come up.
|
||||
// - PMT works, but is inexplicably slower than QPC on a PIII Mobile.
|
||||
// - TGT really isn't as safe as the others, so it should be last.
|
||||
// - low-overhead and high-resolution counters are preferred.
|
||||
switch(id)
|
||||
{
|
||||
case 0:
|
||||
CREATE(TSC)
|
||||
case 1:
|
||||
CREATE(HPET)
|
||||
case 2:
|
||||
CREATE(QPC)
|
||||
case 3:
|
||||
CREATE(PMT)
|
||||
case 4:
|
||||
CREATE(TGT)
|
||||
default:
|
||||
size = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#include "lib/mmgr.h"
|
||||
|
||||
#undef CREATE
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a newly created Counter of type <id> or 0 iff the ID is invalid.
|
||||
**/
|
||||
static ICounter* CreateCounter(uint id)
|
||||
{
|
||||
// we placement-new the Counter classes in a static buffer.
|
||||
// this is dangerous, but we are careful to ensure alignment. it is
|
||||
// unusual and thus bad, but there's also one advantage: we avoid
|
||||
// using global operator new before the CRT is initialized (risky).
|
||||
//
|
||||
// alternatives:
|
||||
// - defining as static doesn't work because the ctors (necessary for
|
||||
// vptr initialization) run during _cinit, which comes after our
|
||||
// first use of them.
|
||||
// - using static_calloc isn't possible because we don't know the
|
||||
// size until after the alloc / placement new.
|
||||
static const size_t MEM_SIZE = 200; // checked below
|
||||
static u8 mem[MEM_SIZE];
|
||||
static u8* nextMem = mem;
|
||||
|
||||
u8* addr = (u8*)round_up((uintptr_t)nextMem, 16);
|
||||
size_t size;
|
||||
ICounter* counter = ConstructCounterAt(id, addr, size);
|
||||
|
||||
nextMem = addr+size;
|
||||
debug_assert(nextMem < mem+MEM_SIZE); // had enough room?
|
||||
|
||||
return counter;
|
||||
}
|
||||
|
||||
|
||||
static inline void DestroyCounter(ICounter*& counter)
|
||||
{
|
||||
if(!counter)
|
||||
return;
|
||||
|
||||
counter->Shutdown();
|
||||
counter->~ICounter(); // must be called due to placement new
|
||||
counter = 0;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// choose best available counter
|
||||
// choose best available safe counter
|
||||
|
||||
// (moved into a separate function to simplify error handling)
|
||||
static inline LibError ActivateCounter(ICounter* counter)
|
||||
@ -178,6 +85,7 @@ static ICounter* GetNextBestSafeCounter()
|
||||
// counter that drives the timer
|
||||
|
||||
static ICounter* counter;
|
||||
// (these counter properties are cached for efficiency and convenience:)
|
||||
static double nominalFrequency;
|
||||
static double resolution;
|
||||
static uint counterBits;
|
||||
@ -213,7 +121,10 @@ static inline u64 Counter()
|
||||
return counter->Counter();
|
||||
}
|
||||
|
||||
/// @return difference [ticks], taking rollover into account.
|
||||
/**
|
||||
* @return difference [ticks], taking rollover into account.
|
||||
* (time-critical, so it's not called through ICounter.)
|
||||
**/
|
||||
static inline u64 CounterDelta(u64 oldCounter, u64 newCounter)
|
||||
{
|
||||
return (newCounter - oldCounter) & counterMask;
|
||||
@ -228,27 +139,28 @@ double whrt_Resolution()
|
||||
//-----------------------------------------------------------------------------
|
||||
// timer state
|
||||
|
||||
// we're not going to bother calibrating the counter (i.e. measuring its
|
||||
// current frequency by means of a second timer). rationale:
|
||||
// - all counters except the TSC are stable and run at fixed frequencies;
|
||||
// - it's not clear that any other HRT or the tick count would be useful
|
||||
// as a stable time reference (if it were, we should be using it instead);
|
||||
// - calibration would complicate the code (we'd have to make sure the
|
||||
// secondary counter is safe and can co-exist with the primary).
|
||||
|
||||
/**
|
||||
* stores all timer state shared between readers and the update thread.
|
||||
* (must be POD because it's used before static ctors run.)
|
||||
**/
|
||||
struct TimerState
|
||||
{
|
||||
// current value of the counter.
|
||||
// value of the counter at last update.
|
||||
u64 counter;
|
||||
|
||||
// sum of all counter ticks since first update.
|
||||
// rollover is not an issue (even at a high frequency of 10 GHz,
|
||||
// it'd only happen after 58 years)
|
||||
u64 ticks;
|
||||
|
||||
// total elapsed time [seconds] since first update.
|
||||
// converted from tick deltas with the *then current* frequency
|
||||
// (avoids retroactive changes when the frequency changes)
|
||||
// (this enables calibration, which is currently not implemented,
|
||||
// but leaving open the possibility costs nothing)
|
||||
double time;
|
||||
|
||||
// current frequency that will be used to convert ticks to seconds.
|
||||
double frequency;
|
||||
};
|
||||
|
||||
// how do we detect when the old TimerState is no longer in use and can be
|
||||
@ -276,9 +188,7 @@ static void UpdateTimerState()
|
||||
const u64 counter = Counter();
|
||||
const u64 deltaTicks = CounterDelta(ts->counter, counter);
|
||||
ts2->counter = counter;
|
||||
ts2->frequency = nominalFrequency;
|
||||
ts2->ticks = ts->ticks + deltaTicks;
|
||||
ts2->time = ts->time + deltaTicks/ts2->frequency;
|
||||
ts2->time = ts->time + deltaTicks/nominalFrequency;
|
||||
ts = (TimerState*)InterlockedExchangePointer(&ts2, ts);
|
||||
}
|
||||
|
||||
@ -294,117 +204,25 @@ retry:
|
||||
goto retry;
|
||||
|
||||
const u64 deltaTicks = CounterDelta(counter, Counter());
|
||||
return (time + deltaTicks/ts->frequency);
|
||||
return (time + deltaTicks/nominalFrequency);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#if 0
|
||||
|
||||
|
||||
|
||||
class Calibrator
|
||||
{
|
||||
double LastFreqs[8]; // ring buffer
|
||||
|
||||
// current ticks per second; average of last few values measured in
|
||||
// calibrate(). needed to prevent long-term drift, and because
|
||||
// hrt_nominal_freq isn't necessarily correct. only affects the ticks since
|
||||
// last calibration - don't want to retroactively change the time.
|
||||
double CurFreq;
|
||||
};
|
||||
|
||||
calibrationCounter = DetermineBestSafeCounter(counter);
|
||||
IsSimilarMagnitude(counter->NominalFrequency(), counter2->NominalFrequency()
|
||||
|
||||
// measure current HRT freq - prevents long-term drift; also useful because
|
||||
// hrt_nominal_freq isn't necessarily exact.
|
||||
static void calibrate_lk()
|
||||
{
|
||||
debug_assert(hrt_cal_ticks > 0);
|
||||
|
||||
// we're called from a WinMM event or after thread wakeup,
|
||||
// so the timer has just been updated. no need to determine tick / compensate.
|
||||
|
||||
// get elapsed HRT ticks
|
||||
const i64 hrt_cur = ticks_lk();
|
||||
const i64 hrt_d = hrt_cur - hrt_cal_ticks;
|
||||
hrt_cal_ticks = hrt_cur;
|
||||
|
||||
hrt_cal_time += hrt_d / hrt_cur_freq;
|
||||
|
||||
// get elapsed time from safe millisecond timer
|
||||
static long safe_last = LONG_MAX;
|
||||
// chosen so that dt and therefore hrt_est_freq will be negative
|
||||
// on first call => it won't be added to buffer
|
||||
const long safe_cur = safe_time();
|
||||
const double dt = (safe_cur - safe_last) / safe_timer_freq;
|
||||
safe_last = safe_cur;
|
||||
|
||||
double hrt_est_freq = hrt_d / dt;
|
||||
|
||||
// past couple of calculated hrt freqs, for averaging
|
||||
typedef RingBuf<double, 8> SampleBuf;
|
||||
static SampleBuf samples;
|
||||
|
||||
// only add to buffer if within 10% of nominal
|
||||
// (don't want to pollute buffer with flukes / incorrect results)
|
||||
if(fabs(hrt_est_freq/hrt_nominal_freq - 1.0) < 0.10)
|
||||
{
|
||||
samples.push_back(hrt_est_freq);
|
||||
|
||||
// average all samples in buffer
|
||||
double freq_sum = std::accumulate(samples.begin(), samples.end(), 0.0);
|
||||
hrt_cur_freq = freq_sum / (int)samples.size();
|
||||
}
|
||||
else
|
||||
{
|
||||
samples.clear();
|
||||
|
||||
hrt_cur_freq = hrt_nominal_freq;
|
||||
}
|
||||
|
||||
debug_assert(hrt_cur_freq > 0.0);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// update thread
|
||||
|
||||
// note: we used to discipline the HRT timestamp to the system time, so it
|
||||
// was advantageous to perform updates triggered by a WinMM event
|
||||
// (reducing instances where we're called in the middle of a scheduler tick).
|
||||
// was advantageous to trigger updates via WinMM event (thus reducing
|
||||
// instances where we're called in the middle of a scheduler tick).
|
||||
// since that's no longer relevant, we prefer using a thread, because that
|
||||
// avoids the dependency on WinMM and its lengthy startup time.
|
||||
|
||||
// rationale: (+ and - are reasons for longer and shorter lengths)
|
||||
// + minimize CPU usage
|
||||
// + tolerate possibly low secondary counter resolution
|
||||
// + ensure all threads currently using TimerState return from those
|
||||
// functions before the next interval
|
||||
// - notice frequency drift quickly enough
|
||||
// - ensure there's no more than 1 counter rollover per interval (this is
|
||||
// checked via RolloversPerCalibrationInterval)
|
||||
// - avoid more than 1 counter rollover per interval (InitUpdateThread makes
|
||||
// sure our interval is shorter than the current counter's rollover rate)
|
||||
static const DWORD UPDATE_INTERVAL_MS = 1000;
|
||||
|
||||
static HANDLE hExitEvent;
|
||||
@ -430,7 +248,7 @@ static unsigned __stdcall UpdateThread(void* UNUSED(data))
|
||||
static inline LibError InitUpdateThread()
|
||||
{
|
||||
// make sure our interval isn't too long
|
||||
// (counterBits can be 64 => BIT64 would overflow => calculate period/2
|
||||
// (counterBits can be 64 => BIT64 would overflow => calculate period/2)
|
||||
const double period_2 = BIT64(counterBits-1) / nominalFrequency;
|
||||
const uint rolloversPerInterval = UPDATE_INTERVAL_MS / cpu_i64FromDouble(period_2*2.0*1000.0);
|
||||
debug_assert(rolloversPerInterval <= 1);
|
||||
|
@ -55,7 +55,7 @@ static void CallFunctionPointers(PfnLibErrorVoid* begin, PfnLibErrorVoid* end)
|
||||
}
|
||||
|
||||
const DWORD t1 = GetTickCount();
|
||||
debug_printf("WINIT/ total elapsed time in callbacks %d ms (+-10)\n", t1-t0);
|
||||
debug_printf("WINIT| total elapsed time in callbacks %d ms (+-10)\n", t1-t0);
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user