add NUMA and shared-L2-cache detect code (required at work)
enable most of IA-32 specific code to be used in amd64 (resides in directory lib/sysdep/x86_x64) bits: add IsBitSet remove mem_PageSize (use os_cpu_PageSize instead) cpuid: change interface to allow gracefully supporting later subfunctions that require input parameters amd64_asm.asm: add amd64 implementation of cpuid cpu: move functions provided by OS to sysdep/os_cpu.cpp cpu topology: avoid trouble when process affinity is set by remapping processor numbers to 0..PopulationCount(processAffinity) topology.cpp: move ex-ia32 topology code here. This was SVN commit r5945.
This commit is contained in:
parent
7152e4a3e6
commit
ffdff6888d
@ -12,7 +12,7 @@
|
||||
|
||||
#include "maths/MathUtil.h"
|
||||
#include "graphics/SColor.h"
|
||||
#include "lib/sysdep/ia32/ia32.h"
|
||||
#include "lib/sysdep/x86_x64/x86_x64.h"
|
||||
|
||||
static u32 fallback_ConvertRGBColorTo4ub(const RGBColor& src)
|
||||
{
|
||||
@ -39,7 +39,7 @@ void ColorActivateFastImpl()
|
||||
{
|
||||
}
|
||||
#if ARCH_IA32
|
||||
else if (ia32_cap(IA32_CAP_SSE))
|
||||
else if (x86_x64_cap(X86_X64_CAP_SSE))
|
||||
{
|
||||
ConvertRGBColorTo4ub = sse_ConvertRGBColorTo4ub;
|
||||
}
|
||||
|
@ -13,23 +13,17 @@
|
||||
|
||||
#include "lib/bits.h" // round_up
|
||||
#include "lib/posix/posix_mman.h"
|
||||
#include "lib/sysdep/cpu.h" // cpu_PageSize
|
||||
#include "lib/sysdep/os_cpu.h" // os_cpu_PageSize
|
||||
|
||||
|
||||
size_t mem_PageSize()
|
||||
{
|
||||
static const size_t page_size = cpu_PageSize();
|
||||
return page_size;
|
||||
}
|
||||
|
||||
bool mem_IsPageMultiple(uintptr_t x)
|
||||
{
|
||||
return (x & (mem_PageSize()-1)) == 0;
|
||||
return (x & (os_cpu_PageSize()-1)) == 0;
|
||||
}
|
||||
|
||||
size_t mem_RoundUpToPage(size_t size)
|
||||
{
|
||||
return round_up(size, mem_PageSize());
|
||||
return round_up(size, os_cpu_PageSize());
|
||||
}
|
||||
|
||||
size_t mem_RoundUpToAlignment(size_t size)
|
||||
|
@ -11,14 +11,6 @@
|
||||
#ifndef INCLUDED_MEM_UTIL
|
||||
#define INCLUDED_MEM_UTIL
|
||||
|
||||
|
||||
/**
|
||||
* @return page size
|
||||
*
|
||||
* (this routine caches the result of cpu_PageSize and ensures the value
|
||||
* is available before static initializers have run.)
|
||||
**/
|
||||
extern size_t mem_PageSize();
|
||||
extern bool mem_IsPageMultiple(uintptr_t x);
|
||||
|
||||
extern size_t mem_RoundUpToPage(size_t size);
|
||||
|
@ -25,6 +25,13 @@
|
||||
**/
|
||||
#define BIT64(n) (1ull << (n))
|
||||
|
||||
template<typename T>
|
||||
bool IsBitSet(T value, size_t index)
|
||||
{
|
||||
const T bit = T(1) << index;
|
||||
return (value & bit) != 0;
|
||||
}
|
||||
|
||||
|
||||
// these are declared in the header and inlined to aid compiler optimizations
|
||||
// (they can easily end up being time-critical).
|
||||
|
@ -18,7 +18,7 @@
|
||||
#include "lib/allocators/allocators.h"
|
||||
#include "lib/allocators/shared_ptr.h"
|
||||
#include "lib/allocators/headerless.h"
|
||||
#include "lib/allocators/mem_util.h" // mem_PageSize
|
||||
#include "lib/sysdep/os_cpu.h" // os_cpu_PageSize
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
36
source/lib/sysdep/amd64/amd64_asm.asm
Normal file
36
source/lib/sysdep/amd64/amd64_asm.asm
Normal file
@ -0,0 +1,36 @@
|
||||
; =========================================================================
|
||||
; File : amd64_asm.asm
|
||||
; Project : 0 A.D.
|
||||
; Description :
|
||||
; =========================================================================
|
||||
|
||||
; license: GPL; see lib/license.txt
|
||||
|
||||
; extern "C" void __cdecl amd64_asm_cpuid(Ia32CpuidRegs* reg);
|
||||
; reference: http://softwarecommunity.intel.com/articles/eng/2669.htm
|
||||
PUBLIC amd64_asm_cpuid
|
||||
.CODE
|
||||
ALIGN 8
|
||||
amd64_asm_cpuid PROC FRAME
|
||||
sub rsp, 32
|
||||
.allocstack 32
|
||||
push rbx
|
||||
.pushreg rbx
|
||||
.endprolog
|
||||
|
||||
mov r8, rcx
|
||||
mov eax, DWORD PTR [r8+0]
|
||||
mov ecx, DWORD PTR [r8+8]
|
||||
cpuid
|
||||
mov DWORD PTR [r8+0], eax
|
||||
mov DWORD PTR [r8+4], ebx
|
||||
mov DWORD PTR [r8+8], ecx
|
||||
mov DWORD PTR [r8+12], edx
|
||||
|
||||
pop rbx
|
||||
add rsp, 32
|
||||
|
||||
ret
|
||||
ALIGN 8
|
||||
amd64_asm_cpuid ENDP
|
||||
_TEXT ENDS
|
@ -14,4 +14,3 @@
|
||||
ERROR_ASSOCIATE(ERR::CPU_FEATURE_MISSING, "This CPU doesn't support a required feature", -1);
|
||||
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_OPCODE, "Disassembly failed", -1);
|
||||
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_VENDOR, "CPU vendor unknown", -1);
|
||||
ERROR_ASSOCIATE(ERR::CPU_RESTRICTED_AFFINITY, "Cannot set desired CPU affinity", -1);
|
||||
|
@ -16,15 +16,9 @@ namespace ERR
|
||||
const LibError CPU_FEATURE_MISSING = -130000;
|
||||
const LibError CPU_UNKNOWN_OPCODE = -130001;
|
||||
const LibError CPU_UNKNOWN_VENDOR = -130002;
|
||||
const LibError CPU_RESTRICTED_AFFINITY = -130003;
|
||||
|
||||
}
|
||||
|
||||
// (some of these functions may be implemented in external asm files)
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU detection
|
||||
|
||||
@ -44,52 +38,6 @@ LIB_API const char* cpu_IdentifierString();
|
||||
**/
|
||||
LIB_API double cpu_ClockFrequency();
|
||||
|
||||
/**
|
||||
* @return the number of what the OS deems "processors" or -1 on failure.
|
||||
*
|
||||
* this is used by ia32 when it cannot determine the number via APIC IDs.
|
||||
* in other situations, the cpu_NumPackages function is preferable since
|
||||
* it is more specific.
|
||||
*
|
||||
* note: this function is necessary because POSIX sysconf _SC_NPROCESSORS_CONF
|
||||
* is not supported on MacOSX, else we would use that.
|
||||
**/
|
||||
LIB_API size_t cpu_NumProcessors();
|
||||
|
||||
/**
|
||||
* @return number of *enabled* CPU packages / sockets.
|
||||
**/
|
||||
LIB_API size_t cpu_NumPackages();
|
||||
|
||||
/**
|
||||
* @return number of *enabled* CPU cores per package.
|
||||
* (2 on dual-core systems)
|
||||
**/
|
||||
LIB_API size_t cpu_CoresPerPackage();
|
||||
|
||||
/**
|
||||
* @return number of *enabled* hyperthreading units per core.
|
||||
* (2 on P4 EE)
|
||||
**/
|
||||
LIB_API size_t cpu_LogicalPerCore();
|
||||
|
||||
/**
|
||||
* @return the size [bytes] of a MMU page.
|
||||
* (4096 on most IA-32 systems)
|
||||
**/
|
||||
LIB_API size_t cpu_PageSize();
|
||||
|
||||
enum CpuMemoryIndicators
|
||||
{
|
||||
CPU_MEM_TOTAL,
|
||||
CPU_MEM_AVAILABLE
|
||||
};
|
||||
|
||||
/**
|
||||
* @return the amount [bytes] of available or total physical memory.
|
||||
**/
|
||||
LIB_API size_t cpu_MemorySize(CpuMemoryIndicators mem_type);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// lock-free support routines
|
||||
@ -105,6 +53,16 @@ LIB_API size_t cpu_MemorySize(CpuMemoryIndicators mem_type);
|
||||
**/
|
||||
LIB_API bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t newValue);
|
||||
|
||||
/**
|
||||
* specialization of cpu_CAS for pointer types. this avoids error-prone
|
||||
* casting in user code.
|
||||
**/
|
||||
template<typename T>
|
||||
bool cpu_CAS(volatile T* location, T expected, T new_value)
|
||||
{
|
||||
return cpu_CAS((volatile uintptr_t*)location, (uintptr_t)expected, (uintptr_t)new_value);
|
||||
}
|
||||
|
||||
/**
|
||||
* add a signed value to a variable without the possibility of interference
|
||||
* from other threads/CPUs.
|
||||
@ -130,17 +88,6 @@ LIB_API void cpu_MemoryFence();
|
||||
**/
|
||||
LIB_API void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size);
|
||||
|
||||
/**
|
||||
* execute the specified function once on each CPU.
|
||||
* this includes logical HT units and proceeds serially (function
|
||||
* is never re-entered) in order of increasing OS CPU ID.
|
||||
* note: implemented by switching thread affinity masks and forcing
|
||||
* a reschedule, which is apparently not possible with POSIX.
|
||||
*
|
||||
* may fail if e.g. OS is preventing us from running on some CPUs.
|
||||
**/
|
||||
typedef void (*CpuCallback)(void* param);
|
||||
LIB_API LibError cpu_CallByEachCPU(CpuCallback cb, void* param);
|
||||
|
||||
/**
|
||||
* set the FPU control word to "desirable" values (see implementation)
|
||||
@ -155,19 +102,4 @@ LIB_API void cpu_ConfigureFloatingPoint();
|
||||
#define cpu_i32FromDouble(d) ((i32)d)
|
||||
#define cpu_i64FromDouble(d) ((i64)d)
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* specialization of cpu_CAS for pointer types. this avoids error-prone
|
||||
* casting in user code.
|
||||
**/
|
||||
template<typename T>
|
||||
bool cpu_CAS(volatile T* location, T expected, T new_value)
|
||||
{
|
||||
return cpu_CAS((volatile uintptr_t*)location, (uintptr_t)expected, (uintptr_t)new_value);
|
||||
}
|
||||
|
||||
#endif // #ifndef INCLUDED_CPU
|
||||
|
@ -2,7 +2,7 @@
|
||||
* =========================================================================
|
||||
* File : ia32.cpp
|
||||
* Project : 0 A.D.
|
||||
* Description : C++ and inline asm implementations of IA-32 functions
|
||||
* Description : routines specific to IA-32
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
@ -11,715 +11,11 @@
|
||||
#include "precompiled.h"
|
||||
#include "ia32.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <algorithm>
|
||||
|
||||
#include "lib/posix/posix.h" // pthread
|
||||
#include "lib/bits.h"
|
||||
#include "lib/timer.h"
|
||||
#include "lib/sysdep/cpu.h"
|
||||
#include "ia32_memcpy.h"
|
||||
#include "ia32_asm.h"
|
||||
#include "../amd64/amd64_asm.h"
|
||||
#include <intrin.h>
|
||||
|
||||
#if !MSC_VERSION && !GCC_VERSION
|
||||
# error we currently only support MSC/ICC or GCC
|
||||
#endif
|
||||
|
||||
|
||||
// note: unfortunately the MSC __cpuid intrinsic does not allow passing
|
||||
// additional inputs (e.g. ecx = count), so we need to implement this
|
||||
// in assembly for both IA-32 and AMD64.
|
||||
static void cpuid_impl(Ia32CpuidRegs* regs)
|
||||
{
|
||||
#if ARCH_IA32
|
||||
ia32_asm_cpuid(regs);
|
||||
#else // i.e. ARCH_AMD64
|
||||
amd64_asm_cpuid(regs);
|
||||
#endif
|
||||
}
|
||||
|
||||
bool ia32_cpuid(Ia32CpuidRegs* regs)
|
||||
{
|
||||
static u32 maxFunction;
|
||||
static u32 maxExtendedFunction;
|
||||
if(!maxFunction)
|
||||
{
|
||||
regs->eax = 0;
|
||||
cpuid_impl(regs);
|
||||
maxFunction = regs->eax;
|
||||
regs->eax = 0x80000000;
|
||||
cpuid_impl(regs);
|
||||
maxExtendedFunction = regs->eax;
|
||||
}
|
||||
|
||||
const u32 function = regs->eax;
|
||||
if(function > maxExtendedFunction)
|
||||
return false;
|
||||
if(function < 0x80000000 && function > maxFunction)
|
||||
return false;
|
||||
|
||||
cpuid_impl(regs);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// capability bits
|
||||
|
||||
static void DetectFeatureFlags(u32 caps[4])
|
||||
{
|
||||
Ia32CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
if(ia32_cpuid(&regs))
|
||||
{
|
||||
caps[0] = regs.ecx;
|
||||
caps[1] = regs.edx;
|
||||
}
|
||||
regs.eax = 0x80000001;
|
||||
if(ia32_cpuid(&regs))
|
||||
{
|
||||
caps[2] = regs.ecx;
|
||||
caps[3] = regs.edx;
|
||||
}
|
||||
}
|
||||
|
||||
bool ia32_cap(IA32Cap cap)
|
||||
{
|
||||
// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
|
||||
// keep in sync with enum CpuCap!
|
||||
static u32 ia32_caps[4];
|
||||
|
||||
// (since relevant CPUs will surely advertise at least one standard flag,
|
||||
// they are zero iff we haven't been initialized yet)
|
||||
if(!ia32_caps[1])
|
||||
DetectFeatureFlags(ia32_caps);
|
||||
|
||||
const size_t tbl_idx = cap >> 5;
|
||||
const size_t bit_idx = cap & 0x1f;
|
||||
if(tbl_idx > 3)
|
||||
{
|
||||
DEBUG_WARN_ERR(ERR::INVALID_PARAM);
|
||||
return false;
|
||||
}
|
||||
return (ia32_caps[tbl_idx] & BIT(bit_idx)) != 0;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU identification
|
||||
|
||||
static Ia32Vendor DetectVendor()
|
||||
{
|
||||
Ia32CpuidRegs regs;
|
||||
regs.eax = 0;
|
||||
if(!ia32_cpuid(&regs))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
|
||||
// copy regs to string
|
||||
// note: 'strange' ebx,edx,ecx reg order is due to ModR/M encoding order.
|
||||
char vendor_str[13];
|
||||
u32* vendor_str_u32 = (u32*)vendor_str;
|
||||
vendor_str_u32[0] = regs.ebx;
|
||||
vendor_str_u32[1] = regs.edx;
|
||||
vendor_str_u32[2] = regs.ecx;
|
||||
vendor_str[12] = '\0'; // 0-terminate
|
||||
|
||||
if(!strcmp(vendor_str, "AuthenticAMD"))
|
||||
return IA32_VENDOR_AMD;
|
||||
else if(!strcmp(vendor_str, "GenuineIntel"))
|
||||
return IA32_VENDOR_INTEL;
|
||||
else
|
||||
{
|
||||
DEBUG_WARN_ERR(ERR::CPU_UNKNOWN_VENDOR);
|
||||
return IA32_VENDOR_UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
Ia32Vendor ia32_Vendor()
|
||||
{
|
||||
static Ia32Vendor vendor = IA32_VENDOR_UNKNOWN;
|
||||
if(vendor == IA32_VENDOR_UNKNOWN)
|
||||
vendor = DetectVendor();
|
||||
return vendor;
|
||||
}
|
||||
|
||||
|
||||
static void DetectSignature(size_t* model, size_t* family)
|
||||
{
|
||||
Ia32CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
if(!ia32_cpuid(&regs))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
*model = bits(regs.eax, 4, 7);
|
||||
*family = bits(regs.eax, 8, 11);
|
||||
}
|
||||
|
||||
|
||||
static size_t DetectGeneration()
|
||||
{
|
||||
size_t model, family;
|
||||
DetectSignature(&model, &family);
|
||||
|
||||
switch(ia32_Vendor())
|
||||
{
|
||||
case IA32_VENDOR_AMD:
|
||||
switch(family)
|
||||
{
|
||||
case 5:
|
||||
if(model < 6)
|
||||
return 5; // K5
|
||||
else
|
||||
return 6; // K6
|
||||
|
||||
case 6:
|
||||
return 7; // K7 (Athlon)
|
||||
|
||||
case 0xF:
|
||||
return 8; // K8 (Opteron)
|
||||
}
|
||||
break;
|
||||
|
||||
case IA32_VENDOR_INTEL:
|
||||
switch(family)
|
||||
{
|
||||
case 5:
|
||||
return 5; // Pentium
|
||||
|
||||
case 6:
|
||||
if(model <= 0xD)
|
||||
return 6; // Pentium Pro/II/III/M
|
||||
else
|
||||
return 8; // Core2Duo
|
||||
|
||||
case 0xF:
|
||||
if(model <= 6)
|
||||
return 7; // Pentium 4/D
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
debug_assert(0); // unknown CPU generation
|
||||
return family;
|
||||
}
|
||||
|
||||
size_t ia32_Generation()
|
||||
{
|
||||
static size_t generation;
|
||||
if(!generation)
|
||||
generation = DetectGeneration();
|
||||
return generation;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// identifier string
|
||||
|
||||
/// functor to remove substrings from the CPU identifier string
|
||||
class StringStripper
|
||||
{
|
||||
char* m_string;
|
||||
size_t m_max_chars;
|
||||
|
||||
public:
|
||||
StringStripper(char* string, size_t max_chars)
|
||||
: m_string(string), m_max_chars(max_chars)
|
||||
{
|
||||
}
|
||||
|
||||
// remove all instances of substring from m_string
|
||||
void operator()(const char* substring)
|
||||
{
|
||||
const size_t substring_length = strlen(substring);
|
||||
for(;;)
|
||||
{
|
||||
char* substring_pos = strstr(m_string, substring);
|
||||
if(!substring_pos)
|
||||
break;
|
||||
const size_t substring_ofs = substring_pos - m_string;
|
||||
const size_t num_chars = m_max_chars - substring_ofs - substring_length;
|
||||
memmove(substring_pos, substring_pos+substring_length, num_chars);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static void DetectIdentifierString(char* identifierString, size_t maxChars)
|
||||
{
|
||||
// get brand string (if available)
|
||||
char* pos = identifierString;
|
||||
bool have_brand_string = true;
|
||||
for(u32 function = 0x80000002; function <= 0x80000004; function++)
|
||||
{
|
||||
Ia32CpuidRegs regs;
|
||||
regs.eax = function;
|
||||
have_brand_string &= ia32_cpuid(&regs);
|
||||
memcpy(pos, &regs, 16);
|
||||
pos += 16;
|
||||
}
|
||||
|
||||
// fall back to manual detect of CPU type because either:
|
||||
// - CPU doesn't support brand string (we use a flag to indicate this
|
||||
// rather than comparing against a default value because it is safer);
|
||||
// - the brand string is useless, e.g. "Unknown". this happens on
|
||||
// some older boards whose BIOS reprograms the string for CPUs it
|
||||
// doesn't recognize.
|
||||
if(!have_brand_string || strncmp(identifierString, "Unknow", 6) == 0)
|
||||
{
|
||||
size_t model, family;
|
||||
DetectSignature(&model, &family);
|
||||
|
||||
switch(ia32_Vendor())
|
||||
{
|
||||
case IA32_VENDOR_AMD:
|
||||
// everything else is either too old, or should have a brand string.
|
||||
if(family == 6)
|
||||
{
|
||||
if(model == 3 || model == 7)
|
||||
strcpy_s(identifierString, maxChars, "AMD Duron");
|
||||
else if(model <= 5)
|
||||
strcpy_s(identifierString, maxChars, "AMD Athlon");
|
||||
else
|
||||
{
|
||||
if(ia32_cap(IA32_CAP_AMD_MP))
|
||||
strcpy_s(identifierString, maxChars, "AMD Athlon MP");
|
||||
else
|
||||
strcpy_s(identifierString, maxChars, "AMD Athlon XP");
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case IA32_VENDOR_INTEL:
|
||||
// everything else is either too old, or should have a brand string.
|
||||
if(family == 6)
|
||||
{
|
||||
if(model == 1)
|
||||
strcpy_s(identifierString, maxChars, "Intel Pentium Pro");
|
||||
else if(model == 3 || model == 5)
|
||||
strcpy_s(identifierString, maxChars, "Intel Pentium II");
|
||||
else if(model == 6)
|
||||
strcpy_s(identifierString, maxChars, "Intel Celeron");
|
||||
else
|
||||
strcpy_s(identifierString, maxChars, "Intel Pentium III");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
// identifierString already holds a valid brand string; pretty it up.
|
||||
else
|
||||
{
|
||||
const char* const undesired_strings[] = { "(tm)", "(TM)", "(R)", "CPU " };
|
||||
std::for_each(undesired_strings, undesired_strings+ARRAY_SIZE(undesired_strings),
|
||||
StringStripper(identifierString, strlen(identifierString)+1));
|
||||
|
||||
// note: Intel brand strings include a frequency, but we can't rely
|
||||
// on it because the CPU may be overclocked. we'll leave it in the
|
||||
// string to show measurement accuracy and if SpeedStep is active.
|
||||
}
|
||||
}
|
||||
|
||||
const char* cpu_IdentifierString()
|
||||
{
|
||||
// 3 calls x 4 registers x 4 bytes = 48
|
||||
static char identifierString[48+1] = {'\0'};
|
||||
if(identifierString[0] == '\0')
|
||||
DetectIdentifierString(identifierString, ARRAY_SIZE(identifierString));
|
||||
return identifierString;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU frequency
|
||||
|
||||
// set scheduling priority and restore when going out of scope.
|
||||
class ScopedSetPriority
|
||||
{
|
||||
int m_old_policy;
|
||||
sched_param m_old_param;
|
||||
|
||||
public:
|
||||
ScopedSetPriority(int new_priority)
|
||||
{
|
||||
// get current scheduling policy and priority
|
||||
pthread_getschedparam(pthread_self(), &m_old_policy, &m_old_param);
|
||||
|
||||
// set new priority
|
||||
sched_param new_param = {0};
|
||||
new_param.sched_priority = new_priority;
|
||||
pthread_setschedparam(pthread_self(), SCHED_FIFO, &new_param);
|
||||
}
|
||||
|
||||
~ScopedSetPriority()
|
||||
{
|
||||
// restore previous policy and priority.
|
||||
pthread_setschedparam(pthread_self(), m_old_policy, &m_old_param);
|
||||
}
|
||||
};
|
||||
|
||||
// note: this function uses timer.cpp!timer_Time, which is implemented via
|
||||
// whrt.cpp on Windows, which again calls ia32_Init. be careful that
|
||||
// this function isn't called from there as well, else WHRT will be used
|
||||
// before its init completes.
|
||||
double ia32_ClockFrequency()
|
||||
{
|
||||
// if the TSC isn't available, there's really no good way to count the
|
||||
// actual CPU clocks per known time interval, so bail.
|
||||
// note: loop iterations ("bogomips") are not a reliable measure due
|
||||
// to differing IPC and compiler optimizations.
|
||||
if(!ia32_cap(IA32_CAP_TSC))
|
||||
return -1.0; // impossible value
|
||||
|
||||
// increase priority to reduce interference while measuring.
|
||||
const int priority = sched_get_priority_max(SCHED_FIFO)-1;
|
||||
ScopedSetPriority ssp(priority);
|
||||
|
||||
// note: no need to "warm up" cpuid - it will already have been
|
||||
// called several times by the time this code is reached.
|
||||
// (background: it's used in ia32_rdtsc() to serialize instruction flow;
|
||||
// the first call is documented to be slower on Intel CPUs)
|
||||
|
||||
int num_samples = 16;
|
||||
// if clock is low-res, do less samples so it doesn't take too long.
|
||||
// balance measuring time (~ 10 ms) and accuracy (< 1 0/00 error -
|
||||
// ok for using the TSC as a time reference)
|
||||
if(timer_Resolution() >= 1e-3)
|
||||
num_samples = 8;
|
||||
std::vector<double> samples(num_samples);
|
||||
|
||||
for(int i = 0; i < num_samples; i++)
|
||||
{
|
||||
double dt;
|
||||
i64 dc; // i64 because VC6 can't convert u64 -> double,
|
||||
// and we don't need all 64 bits.
|
||||
|
||||
// count # of clocks in max{1 tick, 1 ms}:
|
||||
// .. wait for start of tick.
|
||||
const double t0 = timer_Time();
|
||||
u64 c1; double t1;
|
||||
do
|
||||
{
|
||||
// note: timer_Time effectively has a long delay (up to 5 us)
|
||||
// before returning the time. we call it before ia32_rdtsc to
|
||||
// minimize the delay between actually sampling time / TSC,
|
||||
// thus decreasing the chance for interference.
|
||||
// (if unavoidable background activity, e.g. interrupts,
|
||||
// delays the second reading, inaccuracy is introduced).
|
||||
t1 = timer_Time();
|
||||
c1 = ia32_rdtsc();
|
||||
}
|
||||
while(t1 == t0);
|
||||
// .. wait until start of next tick and at least 1 ms elapsed.
|
||||
do
|
||||
{
|
||||
const double t2 = timer_Time();
|
||||
const u64 c2 = ia32_rdtsc();
|
||||
dc = (i64)(c2 - c1);
|
||||
dt = t2 - t1;
|
||||
}
|
||||
while(dt < 1e-3);
|
||||
|
||||
// .. freq = (delta_clocks) / (delta_seconds);
|
||||
// ia32_rdtsc/timer overhead is negligible.
|
||||
const double freq = dc / dt;
|
||||
samples[i] = freq;
|
||||
}
|
||||
|
||||
std::sort(samples.begin(), samples.end());
|
||||
|
||||
// median filter (remove upper and lower 25% and average the rest).
|
||||
// note: don't just take the lowest value! it could conceivably be
|
||||
// too low, if background processing delays reading c1 (see above).
|
||||
double sum = 0.0;
|
||||
const int lo = num_samples/4, hi = 3*num_samples/4;
|
||||
for(int i = lo; i < hi; i++)
|
||||
sum += samples[i];
|
||||
|
||||
const double clock_frequency = sum / (hi-lo);
|
||||
return clock_frequency;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// processor topology
|
||||
|
||||
u8 ia32_ApicId()
|
||||
{
|
||||
Ia32CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
if(!ia32_cpuid(&regs))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
const u8 apicId = (u8)bits(regs.ebx, 24, 31);
|
||||
return apicId;
|
||||
}
|
||||
|
||||
|
||||
// OSes report hyperthreading units and cores as "processors". we need to
|
||||
// drill down and find out the exact counts (for thread pool dimensioning
|
||||
// and cache sharing considerations).
|
||||
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
|
||||
// logicalPerCore.
|
||||
|
||||
static size_t DetectCoresPerPackage()
|
||||
{
|
||||
Ia32CpuidRegs regs;
|
||||
switch(ia32_Vendor())
|
||||
{
|
||||
case IA32_VENDOR_INTEL:
|
||||
regs.eax = 4;
|
||||
if(ia32_cpuid(&regs))
|
||||
return bits(regs.eax, 26, 31)+1;
|
||||
break;
|
||||
|
||||
case IA32_VENDOR_AMD:
|
||||
regs.eax = 0x80000008;
|
||||
if(ia32_cpuid(&regs))
|
||||
return bits(regs.ecx, 0, 7)+1;
|
||||
break;
|
||||
}
|
||||
|
||||
return 1; // else: the CPU is single-core.
|
||||
}
|
||||
|
||||
static size_t CoresPerPackage()
|
||||
{
|
||||
static size_t coresPerPackage = 0;
|
||||
if(!coresPerPackage)
|
||||
coresPerPackage = DetectCoresPerPackage();
|
||||
return coresPerPackage;
|
||||
}
|
||||
|
||||
|
||||
static bool IsHyperthreadingCapable()
|
||||
{
|
||||
// definitely not
|
||||
if(!ia32_cap(IA32_CAP_HT))
|
||||
return false;
|
||||
|
||||
// AMD N-core systems falsely set the HT bit for compatibility reasons
|
||||
// (don't bother resetting it, might confuse callers)
|
||||
if(ia32_Vendor() == IA32_VENDOR_AMD && ia32_cap(IA32_CAP_AMD_CMP_LEGACY))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static size_t DetectLogicalPerCore()
|
||||
{
|
||||
if(!IsHyperthreadingCapable())
|
||||
return 1;
|
||||
|
||||
Ia32CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
if(!ia32_cpuid(&regs))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
const size_t logicalPerPackage = bits(regs.ebx, 16, 23);
|
||||
|
||||
// cores ought to be uniform WRT # logical processors
|
||||
debug_assert(logicalPerPackage % CoresPerPackage() == 0);
|
||||
|
||||
return logicalPerPackage / CoresPerPackage();
|
||||
}
|
||||
|
||||
static size_t LogicalPerCore()
|
||||
{
|
||||
static size_t logicalPerCore = 0;
|
||||
if(!logicalPerCore)
|
||||
logicalPerCore = DetectLogicalPerCore();
|
||||
return logicalPerCore;
|
||||
}
|
||||
|
||||
|
||||
// the above two functions give the maximum number of cores/logical units.
|
||||
// however, some of them may actually be disabled by the BIOS!
|
||||
// what we can do is to analyze the APIC IDs. they are allocated sequentially
|
||||
// for all "processors". treating the IDs as variable-width bitfields
|
||||
// (according to the number of cores/logical units present) allows
|
||||
// determining the exact topology as well as number of packages.
|
||||
|
||||
// these are set by DetectProcessorTopology.
|
||||
static size_t numPackages = 0; // i.e. sockets; > 1 => true SMP system
|
||||
static size_t enabledCoresPerPackage = 0;
|
||||
static size_t enabledLogicalPerCore = 0; // hyperthreading units
|
||||
|
||||
typedef std::vector<u8> Ids;
|
||||
typedef std::set<u8> IdSet;
|
||||
|
||||
// add the currently running processor's APIC ID to a list of IDs.
|
||||
static void StoreApicId(void* param)
|
||||
{
|
||||
Ids* apicIds = (Ids*)param;
|
||||
apicIds->push_back(ia32_ApicId());
|
||||
}
|
||||
|
||||
|
||||
// field := a range of bits sufficient to represent <num_values> integers.
|
||||
// for each id in apicIds: extract the value of the field at offset bit_pos
|
||||
// and insert it into ids. afterwards, adjust bit_pos to the next field.
|
||||
// used to gather e.g. all core IDs from all APIC IDs.
|
||||
static void ExtractFieldsIntoSet(const Ids& apicIds, size_t& bit_pos, size_t num_values, IdSet& ids)
|
||||
{
|
||||
const size_t id_bits = ceil_log2(num_values);
|
||||
if(id_bits == 0)
|
||||
return;
|
||||
|
||||
const u8 mask = bit_mask<u8>(id_bits);
|
||||
|
||||
for(size_t i = 0; i < apicIds.size(); i++)
|
||||
{
|
||||
const u8 apic_id = apicIds[i];
|
||||
const u8 field = u8(apic_id >> bit_pos) & mask;
|
||||
ids.insert(field);
|
||||
}
|
||||
|
||||
bit_pos += id_bits;
|
||||
}
|
||||
|
||||
|
||||
// @return false if unavailable / no information can be returned.
|
||||
static bool DetectProcessorTopologyViaApicIds()
|
||||
{
|
||||
// old APIC (see ia32_ApicId for details)
|
||||
if(ia32_Generation() < 8)
|
||||
return false;
|
||||
|
||||
// get the set of all APIC IDs
|
||||
Ids apicIds;
|
||||
// .. OS affinity support is missing or excludes us from some processors
|
||||
if(cpu_CallByEachCPU(StoreApicId, &apicIds) != INFO::OK)
|
||||
return false;
|
||||
// .. if IDs aren't unique, cpu_CallByEachCPU is broken.
|
||||
std::sort(apicIds.begin(), apicIds.end());
|
||||
debug_assert(std::unique(apicIds.begin(), apicIds.end()) == apicIds.end());
|
||||
|
||||
// extract values from all 3 ID bitfields into separate sets
|
||||
size_t bit_pos = 0;
|
||||
IdSet logicalIds;
|
||||
ExtractFieldsIntoSet(apicIds, bit_pos, LogicalPerCore(), logicalIds);
|
||||
IdSet coreIds;
|
||||
ExtractFieldsIntoSet(apicIds, bit_pos, CoresPerPackage(), coreIds);
|
||||
IdSet packageIds;
|
||||
ExtractFieldsIntoSet(apicIds, bit_pos, 0xFF, packageIds);
|
||||
|
||||
// (the set cardinality is representative of all packages/cores since
|
||||
// their numbers are uniform across the system.)
|
||||
numPackages = std::max((size_t)packageIds.size(), 1u);
|
||||
enabledCoresPerPackage = std::max((size_t)coreIds .size(), 1u);
|
||||
enabledLogicalPerCore = std::max((size_t)logicalIds.size(), 1u);
|
||||
|
||||
// note: even though APIC IDs are assigned sequentially, we can't make any
|
||||
// assumptions about the values/ordering because we get them according to
|
||||
// the CPU affinity mask, which is unknown.
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static void GuessProcessorTopologyViaOsCount()
|
||||
{
|
||||
const size_t numProcessors = cpu_NumProcessors();
|
||||
|
||||
// note: we cannot hope to always return correct results since disabled
|
||||
// cores/logical units cannot be distinguished from the situation of the
|
||||
// OS simply not reporting them as "processors". unfortunately this
|
||||
// function won't always only be called for older (#core = #logical = 1)
|
||||
// systems because DetectProcessorTopologyViaApicIds may fail due to
|
||||
// lack of OS support. what we'll do is assume nothing is disabled; this
|
||||
// is reasonable because we care most about #packages. it's fine to assume
|
||||
// more cores (without inflating the total #processors) because that
|
||||
// count only indicates memory barriers etc. ought to be used.
|
||||
enabledCoresPerPackage = CoresPerPackage();
|
||||
enabledLogicalPerCore = LogicalPerCore();
|
||||
|
||||
const size_t numPackagesTimesLogical = numProcessors / CoresPerPackage();
|
||||
debug_assert(numPackagesTimesLogical != 0); // otherwise processors didn't include cores, which would be stupid
|
||||
|
||||
numPackages = numPackagesTimesLogical / LogicalPerCore();
|
||||
if(!numPackages) // processors didn't include logical units (reasonable)
|
||||
numPackages = numPackagesTimesLogical;
|
||||
}
|
||||
|
||||
|
||||
// determine how many CoresPerPackage and LogicalPerCore are
|
||||
// actually enabled and also count numPackages.
|
||||
static void DetectProcessorTopology()
|
||||
{
|
||||
// authoritative, but requires newer CPU, and OS support.
|
||||
if(DetectProcessorTopologyViaApicIds())
|
||||
return; // success, we're done.
|
||||
|
||||
GuessProcessorTopologyViaOsCount();
|
||||
}
|
||||
|
||||
|
||||
size_t cpu_NumPackages()
|
||||
{
|
||||
if(!numPackages)
|
||||
DetectProcessorTopology();
|
||||
return (size_t)numPackages;
|
||||
}
|
||||
|
||||
size_t cpu_CoresPerPackage()
|
||||
{
|
||||
if(!enabledCoresPerPackage)
|
||||
DetectProcessorTopology();
|
||||
return (size_t)enabledCoresPerPackage;
|
||||
}
|
||||
|
||||
size_t cpu_LogicalPerCore()
|
||||
{
|
||||
if(!enabledLogicalPerCore)
|
||||
DetectProcessorTopology();
|
||||
return (size_t)enabledLogicalPerCore;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// misc stateless functions
|
||||
|
||||
u64 ia32_rdtsc()
|
||||
{
|
||||
#if MSC_VERSION
|
||||
return (u64)__rdtsc();
|
||||
#elif GCC_VERSION
|
||||
// GCC supports "portable" assembly for both x86 and x86_64
|
||||
volatile u32 lo, hi;
|
||||
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
|
||||
return u64_from_u32(hi, lo);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void ia32_DebugBreak()
|
||||
{
|
||||
#if MSC_VERSION
|
||||
__debugbreak();
|
||||
#elif GCC_VERSION
|
||||
// note: this probably isn't necessary, since unix_debug_break
|
||||
// (SIGTRAP) is most probably available if GCC_VERSION.
|
||||
// we include it for completeness, though.
|
||||
__asm__ __volatile__ ("int $3");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// enforce strong memory ordering.
|
||||
void cpu_MemoryFence()
|
||||
{
|
||||
if(ia32_cap(IA32_CAP_SSE2))
|
||||
_mm_mfence();
|
||||
}
|
||||
|
||||
|
||||
// checks if there is an IA-32 CALL instruction right before ret_addr.
|
||||
// returns INFO::OK if so and ERR::FAIL if not.
|
||||
// also attempts to determine the call target. if that is possible
|
||||
// (directly addressed relative or indirect jumps), it is stored in
|
||||
// target, which is otherwise 0.
|
||||
//
|
||||
// this is useful for walking the stack manually.
|
||||
LibError ia32_GetCallTarget(void* ret_addr, void** target)
|
||||
{
|
||||
*target = 0;
|
||||
@ -799,25 +95,17 @@ void cpu_ConfigureFloatingPoint()
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// thunk functions for ia32_asm to allow DLL export
|
||||
|
||||
void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
|
||||
{
|
||||
ia32_asm_AtomicAdd(location, increment);
|
||||
}
|
||||
|
||||
|
||||
bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value)
|
||||
{
|
||||
return ia32_asm_CAS(location, expected, new_value);
|
||||
}
|
||||
|
||||
void cpu_Serialize()
|
||||
{
|
||||
Ia32CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
ia32_cpuid(®s); // CPUID serializes execution.
|
||||
}
|
||||
|
||||
void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size)
|
||||
{
|
||||
|
@ -2,7 +2,7 @@
|
||||
* =========================================================================
|
||||
* File : ia32.h
|
||||
* Project : 0 A.D.
|
||||
* Description : C++ and inline asm implementations of IA-32 functions
|
||||
* Description : routines specific to IA-32
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
@ -11,106 +11,10 @@
|
||||
#ifndef INCLUDED_IA32
|
||||
#define INCLUDED_IA32
|
||||
|
||||
#if !ARCH_IA32 && !ARCH_AMD64
|
||||
#error "including ia32.h without ARCH_IA32=1 or ARCH_AMD64=1"
|
||||
#if !ARCH_IA32
|
||||
# error "including ia32.h without ARCH_IA32=1"
|
||||
#endif
|
||||
|
||||
/**
|
||||
* registers used/returned by ia32_cpuid
|
||||
**/
|
||||
struct Ia32CpuidRegs
|
||||
{
|
||||
u32 eax;
|
||||
u32 ebx;
|
||||
u32 ecx;
|
||||
u32 edx;
|
||||
};
|
||||
|
||||
/**
|
||||
* invoke CPUID instruction.
|
||||
* @param regs input/output registers.
|
||||
* regs->eax must be set to the desired function.
|
||||
* some functions (e.g. 4) require regs->ecx to be set as well.
|
||||
* rationale: this interface (input/output structure vs. function parameters)
|
||||
* avoids unnecessary copying/initialization if some inputs aren't needed
|
||||
* and allows graceful expansion to functions that require further inputs.
|
||||
* @return true on success or false if the sub-function isn't supported.
|
||||
**/
|
||||
extern bool ia32_cpuid(Ia32CpuidRegs* regs);
|
||||
|
||||
/**
|
||||
* CPU vendor.
|
||||
* (this is exposed because some CPUID functions are vendor-specific.)
|
||||
* (an enum is easier to compare than the original string values.)
|
||||
**/
|
||||
enum Ia32Vendor
|
||||
{
|
||||
IA32_VENDOR_UNKNOWN,
|
||||
IA32_VENDOR_INTEL,
|
||||
IA32_VENDOR_AMD,
|
||||
};
|
||||
|
||||
LIB_API Ia32Vendor ia32_Vendor();
|
||||
|
||||
|
||||
/**
|
||||
* @return the colloquial processor generation
|
||||
* (5 = Pentium, 6 = Pentium Pro/II/III / K6, 7 = Pentium4 / Athlon, 8 = Core / Opteron)
|
||||
**/
|
||||
LIB_API size_t ia32_Generation();
|
||||
|
||||
|
||||
/**
|
||||
* bit indices of CPU capability flags (128 bits).
|
||||
* values are defined by IA-32 CPUID feature flags - do not change!
|
||||
**/
|
||||
enum IA32Cap
|
||||
{
|
||||
// standard (ecx) - currently only defined by Intel
|
||||
IA32_CAP_SSE3 = 0+0, // Streaming SIMD Extensions 3
|
||||
IA32_CAP_EST = 0+7, // Enhanced Speedstep Technology
|
||||
|
||||
// standard (edx)
|
||||
IA32_CAP_FPU = 32+0, // Floating Point Unit
|
||||
IA32_CAP_TSC = 32+4, // TimeStamp Counter
|
||||
IA32_CAP_CMOV = 32+15, // Conditional MOVe
|
||||
IA32_CAP_TM_SCC = 32+22, // Thermal Monitoring and Software Controlled Clock
|
||||
IA32_CAP_MMX = 32+23, // MultiMedia eXtensions
|
||||
IA32_CAP_SSE = 32+25, // Streaming SIMD Extensions
|
||||
IA32_CAP_SSE2 = 32+26, // Streaming SIMD Extensions 2
|
||||
IA32_CAP_HT = 32+28, // HyperThreading
|
||||
|
||||
// extended (ecx)
|
||||
IA32_CAP_AMD_CMP_LEGACY = 64+1, // N-core and IA32_CAP_HT is falsely set
|
||||
|
||||
// extended (edx)
|
||||
IA32_CAP_AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
|
||||
IA32_CAP_AMD_MMX_EXT = 96+22,
|
||||
IA32_CAP_AMD_3DNOW_PRO = 96+30,
|
||||
IA32_CAP_AMD_3DNOW = 96+31
|
||||
};
|
||||
|
||||
/**
|
||||
* @return whether the CPU supports the indicated IA32Cap / feature flag.
|
||||
**/
|
||||
LIB_API bool ia32_cap(IA32Cap cap);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// stateless
|
||||
|
||||
/**
|
||||
* @return APIC ID of the currently executing processor.
|
||||
*
|
||||
* the implementation uses CPUID.1 and only works on >= 8th generation CPUs;
|
||||
* (P4/Athlon XP); otherwise it returns 0. the alternative of accessing the
|
||||
* APIC mmio registers is not feasible - mahaf_MapPhysicalMemory only works
|
||||
* reliably on WinXP. also, the OS already has the APIC registers mapped and
|
||||
* in constant use, and we don't want to interfere.
|
||||
**/
|
||||
LIB_API u8 ia32_ApicId();
|
||||
|
||||
|
||||
/**
|
||||
* check if there is an IA-32 CALL instruction right before ret_addr.
|
||||
* @return INFO::OK if so and ERR::FAIL if not.
|
||||
@ -123,45 +27,4 @@ LIB_API u8 ia32_ApicId();
|
||||
**/
|
||||
LIB_API LibError ia32_GetCallTarget(void* ret_addr, void** target);
|
||||
|
||||
|
||||
/**
|
||||
* @return the current value of the TimeStampCounter (a counter of
|
||||
* CPU cycles since power-on, which is useful for high-resolution timing
|
||||
* but potentially differs between multiple CPUs)
|
||||
**/
|
||||
LIB_API u64 ia32_rdtsc();
|
||||
|
||||
/**
|
||||
* trigger a breakpoint inside this function when it is called.
|
||||
**/
|
||||
LIB_API void ia32_DebugBreak(void);
|
||||
|
||||
|
||||
|
||||
/// fpclassify return values
|
||||
#define IA32_FP_NAN 0x0100
|
||||
#define IA32_FP_NORMAL 0x0400
|
||||
#define IA32_FP_INFINITE (IA32_FP_NAN | IA32_FP_NORMAL)
|
||||
#define IA32_FP_ZERO 0x4000
|
||||
#define IA32_FP_SUBNORMAL (IA32_FP_NORMAL | IA32_FP_ZERO)
|
||||
|
||||
// FPU control word (for ia32_asm_control87)
|
||||
// .. Precision Control:
|
||||
#define IA32_MCW_PC 0x0300
|
||||
#define IA32_PC_24 0x0000
|
||||
// .. Rounding Control:
|
||||
#define IA32_MCW_RC 0x0C00
|
||||
#define IA32_RC_NEAR 0x0000
|
||||
#define IA32_RC_DOWN 0x0400
|
||||
#define IA32_RC_UP 0x0800
|
||||
#define IA32_RC_CHOP 0x0C00
|
||||
// .. Exception Mask:
|
||||
#define IA32_MCW_EM 0x003f
|
||||
#define IA32_EM_INVALID BIT(0)
|
||||
#define IA32_EM_DENORMAL BIT(1)
|
||||
#define IA32_EM_ZERODIVIDE BIT(2)
|
||||
#define IA32_EM_OVERFLOW BIT(3)
|
||||
#define IA32_EM_UNDERFLOW BIT(4)
|
||||
#define IA32_EM_INEXACT BIT(5)
|
||||
|
||||
#endif // #ifndef INCLUDED_IA32
|
||||
|
@ -17,7 +17,7 @@
|
||||
; CPUID support
|
||||
;-------------------------------------------------------------------------------
|
||||
|
||||
; extern "C" void __cdecl ia32_asm_cpuid(Ia32CpuidRegs* regs);
|
||||
; extern "C" void __cdecl ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
|
||||
global sym(ia32_asm_cpuid)
|
||||
sym(ia32_asm_cpuid):
|
||||
push ebx ; (clobbered by CPUID)
|
||||
@ -90,7 +90,7 @@ round_bias dd 0.4999999
|
||||
|
||||
__SECT__
|
||||
|
||||
; extern "C" size_t __cdecl ia32_asm_control87(size_t new_cw, size_t mask);
|
||||
; extern "C" u32 __cdecl ia32_asm_control87(u32 new_cw, u32 mask);
|
||||
global sym(ia32_asm_control87)
|
||||
sym(ia32_asm_control87):
|
||||
push eax
|
||||
|
@ -15,29 +15,52 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct Ia32CpuidRegs;
|
||||
extern void CALL_CONV ia32_asm_cpuid(Ia32CpuidRegs* regs);
|
||||
struct x86_x64_CpuidRegs;
|
||||
extern void CALL_CONV ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
|
||||
|
||||
extern void CALL_CONV ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
|
||||
extern bool CALL_CONV ia32_asm_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
|
||||
|
||||
|
||||
/// control87
|
||||
// FPU control word
|
||||
// .. Precision Control:
|
||||
const u32 IA32_MCW_PC = 0x0300;
|
||||
const u32 IA32_PC_24 = 0x0000;
|
||||
// .. Rounding Control:
|
||||
const u32 IA32_MCW_RC = 0x0C00;
|
||||
const u32 IA32_RC_NEAR = 0x0000;
|
||||
const u32 IA32_RC_DOWN = 0x0400;
|
||||
const u32 IA32_RC_UP = 0x0800;
|
||||
const u32 IA32_RC_CHOP = 0x0C00;
|
||||
// .. Exception Mask:
|
||||
const u32 IA32_MCW_EM = 0x3F;
|
||||
const u32 IA32_EM_INVALID = 0x01;
|
||||
const u32 IA32_EM_DENORMAL = 0x02;
|
||||
const u32 IA32_EM_ZERODIVIDE = 0x04;
|
||||
const u32 IA32_EM_OVERFLOW = 0x08;
|
||||
const u32 IA32_EM_UNDERFLOW = 0x10;
|
||||
const u32 IA32_EM_INEXACT = 0x20;
|
||||
/**
|
||||
* for all 1-bits in mask, update the corresponding FPU control word bits
|
||||
* with the bit values in new_val.
|
||||
* @return 0 to indicate success.
|
||||
**/
|
||||
extern size_t CALL_CONV ia32_asm_control87(size_t new_val, size_t mask);
|
||||
extern u32 CALL_CONV ia32_asm_control87(u32 new_val, u32 mask);
|
||||
|
||||
/// see POSIX fpclassify
|
||||
/// POSIX fpclassify
|
||||
#define IA32_FP_NAN 0x0100
|
||||
#define IA32_FP_NORMAL 0x0400
|
||||
#define IA32_FP_INFINITE (IA32_FP_NAN | IA32_FP_NORMAL)
|
||||
#define IA32_FP_ZERO 0x4000
|
||||
#define IA32_FP_SUBNORMAL (IA32_FP_NORMAL | IA32_FP_ZERO)
|
||||
extern size_t CALL_CONV ia32_asm_fpclassifyd(double d);
|
||||
extern size_t CALL_CONV ia32_asm_fpclassifyf(float f);
|
||||
|
||||
/// see POSIX rintf
|
||||
/// POSIX rintf
|
||||
extern float CALL_CONV ia32_asm_rintf(float);
|
||||
extern double CALL_CONV ia32_asm_rint(double);
|
||||
|
||||
/// see POSIX fminf
|
||||
/// POSIX fminf
|
||||
extern float CALL_CONV ia32_asm_fminf(float, float);
|
||||
extern float CALL_CONV ia32_asm_fmaxf(float, float);
|
||||
|
||||
@ -45,7 +68,6 @@ extern i32 CALL_CONV ia32_asm_i32FromFloat(float f);
|
||||
extern i32 CALL_CONV ia32_asm_i32FromDouble(double d);
|
||||
extern i64 CALL_CONV ia32_asm_i64FromDouble(double d);
|
||||
|
||||
|
||||
/**
|
||||
* write the current execution state (e.g. all register values) into
|
||||
* (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
|
||||
|
87
source/lib/sysdep/numa.h
Normal file
87
source/lib/sysdep/numa.h
Normal file
@ -0,0 +1,87 @@
|
||||
#ifndef INCLUDED_NUMA
|
||||
#define INCLUDED_NUMA
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// node topology
|
||||
|
||||
/**
|
||||
* @return number of NUMA "nodes" (i.e. groups of CPUs with local memory).
|
||||
**/
|
||||
LIB_API size_t numa_NumNodes();
|
||||
|
||||
/**
|
||||
* @return node number (zero-based) to which <processor> belongs.
|
||||
**/
|
||||
LIB_API size_t numa_NodeFromProcessor(size_t processor);
|
||||
|
||||
/**
|
||||
* @return bit-mask of all processors constituting <node>.
|
||||
**/
|
||||
LIB_API uintptr_t numa_ProcessorMaskFromNode(size_t node);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// memory
|
||||
|
||||
/**
|
||||
* @return bytes of memory available for allocation on <node>.
|
||||
**/
|
||||
LIB_API size_t numa_AvailableMemory(size_t node);
|
||||
|
||||
/**
|
||||
* @return the ratio between maximum and minimum times that one processor
|
||||
* from each node required to fill a globally allocated array.
|
||||
* in other words, this is the maximum slowdown for NUMA-oblivious
|
||||
* memory accesses. Microsoft guidelines require it to be <= 3.
|
||||
**/
|
||||
LIB_API double numa_Factor();
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// allocator
|
||||
|
||||
/**
|
||||
* simple allocator that "does the right thing" on NUMA systems - page frames
|
||||
* will be taken from the node that first accesses them.
|
||||
**/
|
||||
LIB_API void* numa_Allocate(size_t size);
|
||||
|
||||
enum LargePageDisposition
|
||||
{
|
||||
LPD_DEFAULT,
|
||||
LPD_ALWAYS,
|
||||
LPD_NEVER
|
||||
};
|
||||
|
||||
/**
|
||||
* allocate memory from a specific node.
|
||||
*
|
||||
* @param node node number (zero-based)
|
||||
* @param largePageDisposition - allows forcibly enabling/disabling the use
|
||||
* of large pages; the default decision involves a heuristic.
|
||||
* @param pageSize if non-zero, receives the size [bytes] of a single page
|
||||
* out of those used to map the memory.
|
||||
**/
|
||||
LIB_API void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* pageSize = 0);
|
||||
|
||||
/**
|
||||
* release memory that had been handed out by one of the above allocators.
|
||||
**/
|
||||
LIB_API void numa_Deallocate(void* mem);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
// for use with shared_ptr
|
||||
template<typename T>
|
||||
struct numa_Deleter
|
||||
{
|
||||
void operator()(T* p) const
|
||||
{
|
||||
numa_Deallocate(p);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#endif // #ifndef INCLUDED_NUMA
|
14
source/lib/sysdep/os_cpu.cpp
Normal file
14
source/lib/sysdep/os_cpu.cpp
Normal file
@ -0,0 +1,14 @@
|
||||
/**
|
||||
* =========================================================================
|
||||
* File : os_cpu.cpp
|
||||
* Project : 0 A.D.
|
||||
* Description : OS-specific support functions relating to CPU and memory
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#include "precompiled.h"
|
||||
#include "os_cpu.h"
|
||||
|
||||
ERROR_ASSOCIATE(ERR::OS_CPU_RESTRICTED_AFFINITY, "Cannot set desired CPU affinity", -1);
|
117
source/lib/sysdep/os_cpu.h
Normal file
117
source/lib/sysdep/os_cpu.h
Normal file
@ -0,0 +1,117 @@
|
||||
/**
|
||||
* =========================================================================
|
||||
* File : os_cpu.h
|
||||
* Project : 0 A.D.
|
||||
* Description : OS-specific support functions relating to CPU and memory
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#ifndef INCLUDED_OS_CPU
|
||||
#define INCLUDED_OS_CPU
|
||||
|
||||
namespace ERR
|
||||
{
|
||||
const LibError OS_CPU_RESTRICTED_AFFINITY = -130100;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// processor topology
|
||||
|
||||
// processor ID = [0, os_cpu_NumProcessors())
|
||||
// they are a numbering of the bits of the process affinity mask where the
|
||||
// least significant nonzero bit corresponds to ID 0.
|
||||
// rationale: this spares users from having to deal with noncontiguous IDs,
|
||||
// e.g. when administrative tools are used to restrict process affinity.
|
||||
|
||||
/**
|
||||
* @return bit mask of processors that exist and are available to
|
||||
* this process.
|
||||
* its population count is by definition equal to os_cpu_NumProcessors().
|
||||
**/
|
||||
LIB_API uintptr_t os_cpu_ProcessorMask();
|
||||
|
||||
/**
|
||||
* @return the number of processors available to this process.
|
||||
*
|
||||
* note: this function is necessary because POSIX sysconf _SC_NPROCESSORS_CONF
|
||||
* is not suppored on MacOSX, else we would use that.
|
||||
**/
|
||||
LIB_API size_t os_cpu_NumProcessors();
|
||||
|
||||
// note: we do not provide an os_cpu_CurrentProcessor routine. that would
|
||||
// require Windows 2003 or a lot of work. worse, its results would be
|
||||
// worthless because they may change immediately afterwards. instead,
|
||||
// the recommended approach is to pin OpenMP threads (whose ID can be
|
||||
// queried) to the processor with the same number.
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU and memory characteristics
|
||||
|
||||
/**
|
||||
* @return a rough estimate of the CPU clock frequency.
|
||||
* this is usually accurate to a few MHz and is faster than measurement loops.
|
||||
**/
|
||||
LIB_API double os_cpu_ClockFrequency();
|
||||
|
||||
/**
|
||||
* @return the size [bytes] of a MMU page (4096 on most IA-32 systems)
|
||||
**/
|
||||
LIB_API size_t os_cpu_PageSize();
|
||||
|
||||
/**
|
||||
* @return the size [bytes] of a large MMU page (4 MiB on most IA-32 systems)
|
||||
* or zero if they are not supported.
|
||||
**/
|
||||
LIB_API size_t os_cpu_LargePageSize();
|
||||
|
||||
/**
|
||||
* @return the size [bytes] of physical memory.
|
||||
**/
|
||||
LIB_API size_t os_cpu_MemorySize();
|
||||
|
||||
/**
|
||||
* @return the size [bytes] of currently available memory.
|
||||
**/
|
||||
LIB_API size_t os_cpu_MemoryAvailable();
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// scheduling
|
||||
|
||||
/**
|
||||
* restrict the current thread to a set of processors.
|
||||
* it will not be rescheduled until a subsequent os_cpu_SetThreadAffinity*.
|
||||
*
|
||||
* @param processorMask a bit mask of acceptable processors
|
||||
* (bit index i corresponds to processor i)
|
||||
* @return the previous mask
|
||||
**/
|
||||
LIB_API uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask);
|
||||
|
||||
/**
|
||||
* restrict the current thread to a single processor.
|
||||
* it will not be rescheduled until a subsequent os_cpu_SetThreadAffinity*.
|
||||
**/
|
||||
LIB_API void os_cpu_SetThreadAffinity(size_t processor);
|
||||
|
||||
/**
|
||||
* called by os_cpu_CallByEachCPU.
|
||||
* @param processor ID of processor running the current thread for the
|
||||
* duration of this function.
|
||||
* @param cbData user-specified data passed through os_cpu_CallByEachCPU.
|
||||
**/
|
||||
typedef void (*OsCpuCallback)(size_t processor, uintptr_t cbData);
|
||||
|
||||
/**
|
||||
* execute the specified function once on each processor.
|
||||
* this proceeds serially (the callback is never reentered) in increasing
|
||||
* order of processor ID.
|
||||
* fails if process affinity prevents running on all processors.
|
||||
**/
|
||||
LIB_API LibError os_cpu_CallByEachCPU(OsCpuCallback cb, uintptr_t cbData);
|
||||
|
||||
#endif // #ifndef INCLUDED_OS_CPU
|
@ -1,6 +1,6 @@
|
||||
#include "lib/self_test.h"
|
||||
|
||||
#include "lib/sysdep/ia32/ia32.h"
|
||||
#include "lib/sysdep/x86_x64/x86_x64.h"
|
||||
|
||||
// note: ia32_i??_from_*, ia32_rint*, ia32_fm??f are all tested within
|
||||
// sysdep to avoid test duplication (both the ia32 versions and
|
||||
@ -12,17 +12,17 @@ public:
|
||||
void test_rdtsc()
|
||||
{
|
||||
// must increase monotonously
|
||||
const u64 c1 = ia32_rdtsc();
|
||||
const u64 c2 = ia32_rdtsc();
|
||||
const u64 c3 = ia32_rdtsc();
|
||||
const u64 c1 = x86_x64_rdtsc();
|
||||
const u64 c2 = x86_x64_rdtsc();
|
||||
const u64 c3 = x86_x64_rdtsc();
|
||||
TS_ASSERT(c1 < c2 && c2 < c3);
|
||||
}
|
||||
|
||||
void test_ia32_cap()
|
||||
{
|
||||
// make sure the really common/basic caps end up reported as true
|
||||
TS_ASSERT(ia32_cap(IA32_CAP_FPU));
|
||||
TS_ASSERT(ia32_cap(IA32_CAP_TSC));
|
||||
TS_ASSERT(ia32_cap(IA32_CAP_MMX));
|
||||
TS_ASSERT(x86_x64_cap(X86_X64_CAP_FPU));
|
||||
TS_ASSERT(x86_x64_cap(X86_X64_CAP_TSC));
|
||||
TS_ASSERT(x86_x64_cap(X86_X64_CAP_MMX));
|
||||
}
|
||||
};
|
||||
|
@ -9,20 +9,62 @@
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#include "precompiled.h"
|
||||
#include "../cpu.h"
|
||||
#include "lib/sysdep/os_cpu.h"
|
||||
|
||||
#include "win.h"
|
||||
#include "lib/bits.h"
|
||||
#include "lib/module_init.h"
|
||||
|
||||
#ifdef _OPENMP
|
||||
# include <omp.h>
|
||||
#endif
|
||||
|
||||
|
||||
static LibError ReadFrequencyFromRegistry(DWORD* freqMhz)
|
||||
uintptr_t os_cpu_ProcessorMask()
|
||||
{
|
||||
static uintptr_t processorMask;
|
||||
|
||||
if(!processorMask)
|
||||
{
|
||||
const HANDLE hProcess = GetCurrentProcess();
|
||||
DWORD_PTR processAffinity, systemAffinity;
|
||||
const BOOL ok = GetProcessAffinityMask(hProcess, &processAffinity, &systemAffinity);
|
||||
debug_assert(ok);
|
||||
processorMask = processAffinity;
|
||||
}
|
||||
|
||||
return processorMask;
|
||||
}
|
||||
|
||||
|
||||
size_t os_cpu_NumProcessors()
|
||||
{
|
||||
static size_t numProcessors;
|
||||
|
||||
if(!numProcessors)
|
||||
{
|
||||
numProcessors = PopulationCount(os_cpu_ProcessorMask());
|
||||
|
||||
// sanity check
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si); // guaranteed to succeed
|
||||
debug_assert(numProcessors <= (size_t)si.dwNumberOfProcessors);
|
||||
}
|
||||
|
||||
return numProcessors;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
static LibError ReadFrequencyFromRegistry(DWORD& freqMhz)
|
||||
{
|
||||
HKEY hKey;
|
||||
if(RegOpenKeyEx(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0, KEY_QUERY_VALUE, &hKey) != ERROR_SUCCESS)
|
||||
return ERR::NO_SYS;
|
||||
|
||||
DWORD size = sizeof(*freqMhz);
|
||||
LONG ret = RegQueryValueEx(hKey, "~MHz", 0, 0, (LPBYTE)freqMhz, &size);
|
||||
DWORD size = sizeof(&freqMhz);
|
||||
LONG ret = RegQueryValueEx(hKey, "~MHz", 0, 0, (LPBYTE)&freqMhz, &size);
|
||||
|
||||
RegCloseKey(hKey);
|
||||
|
||||
@ -32,95 +74,232 @@ static LibError ReadFrequencyFromRegistry(DWORD* freqMhz)
|
||||
return INFO::OK;
|
||||
}
|
||||
|
||||
double cpu_ClockFrequency()
|
||||
double os_cpu_ClockFrequency()
|
||||
{
|
||||
DWORD freqMhz;
|
||||
if(ReadFrequencyFromRegistry(&freqMhz) < 0)
|
||||
return -1.0;
|
||||
static double clockFrequency;
|
||||
|
||||
if(clockFrequency == 0.0)
|
||||
{
|
||||
DWORD freqMhz;
|
||||
if(ReadFrequencyFromRegistry(freqMhz) == INFO::OK)
|
||||
clockFrequency = freqMhz * 1e6;
|
||||
else
|
||||
clockFrequency = -1.0;
|
||||
}
|
||||
|
||||
const double clockFrequency = freqMhz * 1e6;
|
||||
return clockFrequency;
|
||||
}
|
||||
|
||||
|
||||
size_t cpu_NumProcessors()
|
||||
size_t os_cpu_PageSize()
|
||||
{
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si); // can't fail
|
||||
const size_t numProcessors = (size_t)si.dwNumberOfProcessors;
|
||||
return numProcessors;
|
||||
static size_t systemPageSize;
|
||||
|
||||
if(!systemPageSize)
|
||||
{
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si); // guaranteed to succeed
|
||||
systemPageSize = (size_t)si.dwPageSize;
|
||||
}
|
||||
|
||||
return systemPageSize;
|
||||
}
|
||||
|
||||
|
||||
size_t cpu_PageSize()
|
||||
size_t os_cpu_LargePageSize()
|
||||
{
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si); // can't fail
|
||||
const size_t pageSize = (size_t)si.dwPageSize;
|
||||
return pageSize;
|
||||
static size_t largePageSize = ~(size_t)0; // "0" has special significance
|
||||
|
||||
if(largePageSize == ~(size_t)0)
|
||||
{
|
||||
typedef SIZE_T (WINAPI *PGetLargePageMinimum)(void);
|
||||
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
||||
const PGetLargePageMinimum pGetLargePageMinimum = (PGetLargePageMinimum)GetProcAddress(hKernel32, "GetLargePageMinimum");
|
||||
if(pGetLargePageMinimum)
|
||||
{
|
||||
largePageSize = pGetLargePageMinimum();
|
||||
debug_assert(largePageSize != 0); // IA-32 and AMD64 definitely support large pages
|
||||
debug_assert(largePageSize > os_cpu_PageSize());
|
||||
}
|
||||
// no OS support for large pages
|
||||
else
|
||||
largePageSize = 0;
|
||||
}
|
||||
|
||||
return largePageSize;
|
||||
}
|
||||
|
||||
|
||||
size_t cpu_MemorySize(CpuMemoryIndicators mem_type)
|
||||
static void GetMemoryStatus(MEMORYSTATUSEX& mse)
|
||||
{
|
||||
// note: we no longer bother dynamically importing GlobalMemoryStatusEx -
|
||||
// it's available on Win2k and above. this function safely handles
|
||||
// systems with > 4 GB of memory.
|
||||
MEMORYSTATUSEX mse = { sizeof(mse) };
|
||||
BOOL ok = GlobalMemoryStatusEx(&mse);
|
||||
mse.dwLength = sizeof(mse);
|
||||
const BOOL ok = GlobalMemoryStatusEx(&mse);
|
||||
WARN_IF_FALSE(ok);
|
||||
}
|
||||
|
||||
if(mem_type == CPU_MEM_TOTAL)
|
||||
size_t os_cpu_MemorySize()
|
||||
{
|
||||
static size_t memorySize;
|
||||
|
||||
if(memorySize == 0)
|
||||
{
|
||||
size_t memoryTotal = (size_t)mse.ullTotalPhys;
|
||||
MEMORYSTATUSEX mse;
|
||||
GetMemoryStatus(mse);
|
||||
memorySize = (size_t)mse.ullTotalPhys;
|
||||
|
||||
// Richter, "Programming Applications for Windows": the reported
|
||||
// value doesn't include non-paged pool reserved during boot;
|
||||
// it's not considered available to the kernel. (the amount is
|
||||
// 528 KiB on a 512 MiB WinXP/Win2k machine). we'll round up
|
||||
// to the nearest megabyte to fix this.
|
||||
memoryTotal = round_up(memoryTotal, 1*MiB);
|
||||
return memoryTotal;
|
||||
memorySize = round_up(memorySize, 1*MiB);
|
||||
}
|
||||
|
||||
return memorySize;
|
||||
}
|
||||
|
||||
size_t os_cpu_MemoryAvailable()
|
||||
{
|
||||
MEMORYSTATUSEX mse;
|
||||
GetMemoryStatus(mse);
|
||||
const size_t memoryAvailable = (size_t)mse.ullAvailPhys;
|
||||
return memoryAvailable;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* maximum number of processors supported by the OS (determined by the
|
||||
* number of bits in an affinity mask)
|
||||
**/
|
||||
static const DWORD maxProcessorNumber = sizeof(DWORD_PTR)*CHAR_BIT-1;
|
||||
|
||||
DWORD_PTR wcpu_AffinityFromProcessorMask(DWORD_PTR processAffinity, uintptr_t processorMask)
|
||||
{
|
||||
DWORD_PTR affinity = 0;
|
||||
|
||||
size_t processor = (size_t)-1;
|
||||
for(DWORD processorNumber = 0; processorNumber <= maxProcessorNumber; processorNumber++)
|
||||
{
|
||||
if(IsBitSet(processAffinity, processorNumber))
|
||||
{
|
||||
++processor; // now corresponds to processorNumber
|
||||
|
||||
if(IsBitSet(processorMask, processor))
|
||||
affinity |= DWORD_PTR(1) << processorNumber;
|
||||
}
|
||||
}
|
||||
|
||||
return affinity;
|
||||
}
|
||||
|
||||
uintptr_t wcpu_ProcessorMaskFromAffinity(DWORD_PTR processAffinity, DWORD_PTR affinity)
|
||||
{
|
||||
uintptr_t processorMask = 0;
|
||||
|
||||
size_t processor = (size_t)-1;
|
||||
for(DWORD processorNumber = 0; processorNumber <= maxProcessorNumber; processorNumber++)
|
||||
{
|
||||
if(IsBitSet(processAffinity, processorNumber))
|
||||
{
|
||||
++processor; // now corresponds to processorNumber
|
||||
|
||||
if(IsBitSet(affinity, processorNumber))
|
||||
processorMask |= uintptr_t(1) << processor;
|
||||
}
|
||||
}
|
||||
|
||||
return processorMask;
|
||||
}
|
||||
|
||||
|
||||
static const DWORD invalidProcessorNumber = (DWORD)-1;
|
||||
|
||||
static DWORD CurrentProcessorNumber()
|
||||
{
|
||||
typedef DWORD (WINAPI *PGetCurrentProcessorNumber)(void);
|
||||
static PGetCurrentProcessorNumber pGetCurrentProcessorNumber;
|
||||
|
||||
static bool initialized;
|
||||
if(!initialized)
|
||||
{
|
||||
initialized = true;
|
||||
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
||||
// note: NtGetCurrentProcessorNumber and RtlGetCurrentProcessorNumber aren't
|
||||
// implemented on WinXP SP2, so we can't use those either.
|
||||
pGetCurrentProcessorNumber = (PGetCurrentProcessorNumber)GetProcAddress(hKernel32, "GetCurrentProcessorNumber");
|
||||
}
|
||||
|
||||
if(pGetCurrentProcessorNumber)
|
||||
return pGetCurrentProcessorNumber();
|
||||
else
|
||||
{
|
||||
const size_t memoryAvailable = (size_t)mse.ullAvailPhys;
|
||||
return memoryAvailable;
|
||||
// note: we won't bother mapping APIC IDs to processor numbers or
|
||||
// using LSL to re-implement GetCurrentProcessorNumber because
|
||||
// this routine is just a debug aid.
|
||||
return invalidProcessorNumber;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
LibError cpu_CallByEachCPU(CpuCallback cb, void* param)
|
||||
uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask)
|
||||
{
|
||||
const HANDLE hProcess = GetCurrentProcess();
|
||||
DWORD_PTR process_affinity, system_affinity;
|
||||
if(!GetProcessAffinityMask(hProcess, &process_affinity, &system_affinity))
|
||||
WARN_RETURN(ERR::FAIL);
|
||||
// our affinity != system affinity: OS is limiting the CPUs that
|
||||
// this process can run on. fail (cannot call back for each CPU).
|
||||
if(process_affinity != system_affinity)
|
||||
WARN_RETURN(ERR::CPU_RESTRICTED_AFFINITY);
|
||||
debug_assert((processorMask >> os_cpu_NumProcessors()) == 0);
|
||||
|
||||
for(DWORD_PTR cpu_bit = 1; cpu_bit != 0 && cpu_bit <= process_affinity; cpu_bit *= 2)
|
||||
DWORD_PTR processAffinity, systemAffinity;
|
||||
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
|
||||
debug_assert(ok);
|
||||
|
||||
const DWORD_PTR affinity = wcpu_AffinityFromProcessorMask(processAffinity, processorMask);
|
||||
const DWORD_PTR previousAffinity = SetThreadAffinityMask(GetCurrentThread(), affinity);
|
||||
debug_assert(previousAffinity != 0); // ensure function didn't fail
|
||||
|
||||
// hopefully reschedule our thread
|
||||
Sleep(0);
|
||||
|
||||
// verify we're running on the correct processor
|
||||
const DWORD currentProcessorNumber = CurrentProcessorNumber();
|
||||
if(currentProcessorNumber != invalidProcessorNumber)
|
||||
debug_assert(IsBitSet(affinity, currentProcessorNumber));
|
||||
|
||||
const uintptr_t previousProcessorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, previousAffinity);
|
||||
return previousProcessorMask;
|
||||
}
|
||||
|
||||
|
||||
void os_cpu_SetThreadAffinity(size_t processor)
|
||||
{
|
||||
debug_assert(processor < os_cpu_NumProcessors());
|
||||
|
||||
const uintptr_t processorMask = uintptr_t(1) << processor;
|
||||
(void)os_cpu_SetThreadAffinityMask(processorMask);
|
||||
}
|
||||
|
||||
|
||||
LibError os_cpu_CallByEachCPU(OsCpuCallback cb, uintptr_t cbData)
|
||||
{
|
||||
// ensure we are able to run on all system processors
|
||||
DWORD_PTR processAffinity, systemAffinity;
|
||||
{
|
||||
// check if we can switch to target CPU
|
||||
if(!(process_affinity & cpu_bit))
|
||||
continue;
|
||||
// .. and do so.
|
||||
if(!SetThreadAffinityMask(GetCurrentThread(), cpu_bit))
|
||||
{
|
||||
WARN_ERR(ERR::CPU_RESTRICTED_AFFINITY);
|
||||
continue;
|
||||
}
|
||||
|
||||
// reschedule to make sure we switch CPUs.
|
||||
Sleep(1);
|
||||
|
||||
cb(param);
|
||||
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
|
||||
debug_assert(ok);
|
||||
if(processAffinity != systemAffinity)
|
||||
WARN_RETURN(ERR::OS_CPU_RESTRICTED_AFFINITY);
|
||||
}
|
||||
|
||||
// restore to original value
|
||||
SetThreadAffinityMask(hProcess, process_affinity);
|
||||
const uintptr_t previousAffinity = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());
|
||||
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
os_cpu_SetThreadAffinity(processor);
|
||||
cb(processor, cbData);
|
||||
}
|
||||
|
||||
(void)os_cpu_SetThreadAffinityMask(previousAffinity);
|
||||
|
||||
return INFO::OK;
|
||||
}
|
||||
|
25
source/lib/sysdep/win/wcpu.h
Normal file
25
source/lib/sysdep/win/wcpu.h
Normal file
@ -0,0 +1,25 @@
|
||||
/**
|
||||
* =========================================================================
|
||||
* File : wcpu.h
|
||||
* Project : 0 A.D.
|
||||
* Description : Windows backend of os_cpu
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#ifndef INCLUDED_WCPU
|
||||
#define INCLUDED_WCPU
|
||||
|
||||
#include "win.h"
|
||||
|
||||
// "affinity" and "processorNumber" are what Windows sees.
|
||||
// "processorMask" and "processor" are the idealized representation we expose
|
||||
// to users. the latter insulates them from process affinity restrictions by
|
||||
// defining IDs as indices of the nonzero bits within the process affinity.
|
||||
// these routines are provided for the benefit of wnuma.
|
||||
|
||||
extern DWORD_PTR wcpu_AffinityFromProcessorMask(DWORD_PTR processAffinity, uintptr_t processorMask);
|
||||
extern uintptr_t wcpu_ProcessorMaskFromAffinity(DWORD_PTR processAffinity, DWORD_PTR affinity);
|
||||
|
||||
#endif // #ifndef INCLUDED_WCPU
|
@ -15,14 +15,9 @@
|
||||
#include "lib/sysdep/win/win.h"
|
||||
#include "lib/bits.h"
|
||||
|
||||
#if MSC_VERSION
|
||||
# include <intrin.h>
|
||||
# if !ICC_VERSION
|
||||
# pragma intrinsic(__rdtsc)
|
||||
# endif
|
||||
#endif
|
||||
#if ARCH_IA32
|
||||
# include "lib/sysdep/ia32/ia32.h" // ia32_rdtsc
|
||||
#if ARCH_IA32 || ARCH_AMD64
|
||||
# include "lib/sysdep/x86_x64/x86_x64.h" // x86_x64_rdtsc
|
||||
# include "lib/sysdep/x86_x64/topology.h"
|
||||
#endif
|
||||
|
||||
|
||||
@ -38,18 +33,18 @@ enum AmdPowerNowFlags
|
||||
|
||||
static bool IsThrottlingPossible()
|
||||
{
|
||||
#if ARCH_IA32
|
||||
Ia32CpuidRegs regs;
|
||||
switch(ia32_Vendor())
|
||||
#if ARCH_IA32 || ARCH_AMD64
|
||||
x86_x64_CpuidRegs regs;
|
||||
switch(x86_x64_Vendor())
|
||||
{
|
||||
case IA32_VENDOR_INTEL:
|
||||
if(ia32_cap(IA32_CAP_TM_SCC) || ia32_cap(IA32_CAP_EST))
|
||||
case X86_X64_VENDOR_INTEL:
|
||||
if(x86_x64_cap(X86_X64_CAP_TM_SCC) || x86_x64_cap(X86_X64_CAP_EST))
|
||||
return true;
|
||||
break;
|
||||
|
||||
case IA32_VENDOR_AMD:
|
||||
case X86_X64_VENDOR_AMD:
|
||||
regs.eax = 0x80000007;
|
||||
if(ia32_cpuid(®s))
|
||||
if(x86_x64_cpuid(®s))
|
||||
{
|
||||
if(regs.edx & (PN_FREQ_ID_CTRL|PN_SW_THERMAL_CTRL))
|
||||
return true;
|
||||
@ -57,9 +52,6 @@ static bool IsThrottlingPossible()
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
#elif ARCH_AMD64
|
||||
// not yet implemented - consider it unsafe.
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -68,8 +60,8 @@ static bool IsThrottlingPossible()
|
||||
|
||||
LibError CounterTSC::Activate()
|
||||
{
|
||||
#if ARCH_IA32
|
||||
if(!ia32_cap(IA32_CAP_TSC))
|
||||
#if ARCH_IA32 || ARCH_AMD64
|
||||
if(!x86_x64_cap(X86_X64_CAP_TSC))
|
||||
return ERR::NO_SYS; // NOWARN (CPU doesn't support RDTSC)
|
||||
#endif
|
||||
|
||||
@ -107,16 +99,16 @@ bool CounterTSC::IsSafe() const
|
||||
if(cpu_NumPackages() != 1 || cpu_CoresPerPackage() != 1)
|
||||
return false;
|
||||
|
||||
#if ARCH_IA32
|
||||
#if ARCH_IA32 || ARCH_AMD64
|
||||
// recent CPU:
|
||||
if(ia32_Generation() >= 7)
|
||||
if(x86_x64_Generation() >= 7)
|
||||
{
|
||||
// note: 8th generation CPUs support C1-clock ramping, which causes
|
||||
// drift on multi-core systems, but those were excluded above.
|
||||
|
||||
Ia32CpuidRegs regs;
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 0x80000007;
|
||||
if(ia32_cpuid(®s))
|
||||
if(x86_x64_cpuid(®s))
|
||||
{
|
||||
// TSC is invariant WRT P-state, C-state and STPCLK => safe.
|
||||
if(regs.edx & PN_INVARIANT_TSC)
|
||||
@ -148,11 +140,7 @@ bool CounterTSC::IsSafe() const
|
||||
|
||||
u64 CounterTSC::Counter() const
|
||||
{
|
||||
#if MSC_VERSION
|
||||
return __rdtsc();
|
||||
#else
|
||||
return ia32_rdtsc();
|
||||
#endif
|
||||
return x86_x64_rdtsc();
|
||||
}
|
||||
|
||||
/**
|
||||
|
359
source/lib/sysdep/win/wnuma.cpp
Normal file
359
source/lib/sysdep/win/wnuma.cpp
Normal file
@ -0,0 +1,359 @@
|
||||
#include "precompiled.h"
|
||||
#include "lib/sysdep/numa.h"
|
||||
|
||||
#include "lib/bits.h" // round_up, PopulationCount
|
||||
#include "lib/timer.h"
|
||||
#include "lib/sysdep/os_cpu.h"
|
||||
#include "win.h"
|
||||
#include "wutil.h"
|
||||
#include "wcpu.h"
|
||||
#include <Psapi.h>
|
||||
|
||||
#ifdef _OPENMP
|
||||
# include <omp.h>
|
||||
#endif
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// node topology
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
size_t numa_NumNodes()
|
||||
{
|
||||
static size_t numNodes;
|
||||
|
||||
if(!numNodes)
|
||||
{
|
||||
typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
|
||||
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
||||
const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");
|
||||
if(pGetNumaHighestNodeNumber)
|
||||
{
|
||||
ULONG highestNode;
|
||||
const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
|
||||
debug_assert(ok);
|
||||
debug_assert(highestNode < os_cpu_NumProcessors()); // #nodes <= #processors
|
||||
numNodes = highestNode+1;
|
||||
}
|
||||
// NUMA not supported
|
||||
else
|
||||
numNodes = 1;
|
||||
}
|
||||
|
||||
return numNodes;
|
||||
}
|
||||
|
||||
|
||||
// note: it is easier to implement this in terms of numa_ProcessorMaskFromNode
|
||||
// rather than the other way around because wcpu provides the
|
||||
// wcpu_ProcessorMaskFromAffinity helper. there is no similar function to
|
||||
// convert processor to processorNumber.
|
||||
size_t numa_NodeFromProcessor(size_t processor)
|
||||
{
|
||||
debug_assert(processor < os_cpu_NumProcessors());
|
||||
|
||||
static std::vector<size_t> processorsNode;
|
||||
#ifdef _OPENMP
|
||||
#pragma omp critical
|
||||
#endif
|
||||
if(processorsNode.empty())
|
||||
{
|
||||
processorsNode.resize(os_cpu_NumProcessors(), 0);
|
||||
for(size_t node = 0; node < numa_NumNodes(); node++)
|
||||
{
|
||||
const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
if(IsBitSet(processorMask, processor))
|
||||
processorsNode[processor] = node;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return processorsNode.at(processor);
|
||||
}
|
||||
|
||||
|
||||
uintptr_t numa_ProcessorMaskFromNode(size_t node)
|
||||
{
|
||||
debug_assert(node < numa_NumNodes());
|
||||
|
||||
static std::vector<uintptr_t> nodesProcessorMask;
|
||||
#ifdef _OPENMP
|
||||
#pragma omp critical
|
||||
#endif
|
||||
if(nodesProcessorMask.empty())
|
||||
{
|
||||
typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
|
||||
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
||||
const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");
|
||||
if(pGetNumaNodeProcessorMask)
|
||||
{
|
||||
DWORD_PTR processAffinity, systemAffinity;
|
||||
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
|
||||
debug_assert(ok);
|
||||
|
||||
for(size_t node = 0; node < numa_NumNodes(); node++)
|
||||
{
|
||||
ULONGLONG affinity;
|
||||
const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
|
||||
debug_assert(ok);
|
||||
const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
|
||||
nodesProcessorMask.push_back(processorMask);
|
||||
}
|
||||
}
|
||||
// NUMA not supported - consider node 0 to consist of all system processors
|
||||
else
|
||||
nodesProcessorMask.push_back(os_cpu_ProcessorMask());
|
||||
}
|
||||
|
||||
return nodesProcessorMask.at(node);
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// memory info
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
size_t numa_AvailableMemory(size_t node)
|
||||
{
|
||||
debug_assert(node < numa_NumNodes());
|
||||
|
||||
// note: it is said that GetNumaAvailableMemoryNode sometimes incorrectly
|
||||
// reports zero bytes. the actual cause may however be unexpected
|
||||
// RAM configuration, e.g. not all slots filled.
|
||||
typedef BOOL (WINAPI *PGetNumaAvailableMemoryNode)(UCHAR node, PULONGLONG availableBytes);
|
||||
static PGetNumaAvailableMemoryNode pGetNumaAvailableMemoryNode;
|
||||
if(!pGetNumaAvailableMemoryNode)
|
||||
{
|
||||
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
||||
pGetNumaAvailableMemoryNode = (PGetNumaAvailableMemoryNode)GetProcAddress(hKernel32, "GetNumaAvailableMemoryNode");
|
||||
}
|
||||
|
||||
if(pGetNumaAvailableMemoryNode)
|
||||
{
|
||||
ULONGLONG availableBytes;
|
||||
const BOOL ok = pGetNumaAvailableMemoryNode((UCHAR)node, &availableBytes);
|
||||
debug_assert(ok);
|
||||
return (size_t)availableBytes;
|
||||
}
|
||||
// NUMA not supported - return available system memory
|
||||
else
|
||||
return os_cpu_MemoryAvailable();
|
||||
}
|
||||
|
||||
|
||||
double numa_Factor()
|
||||
{
|
||||
static double factor;
|
||||
|
||||
static bool initialized;
|
||||
#ifdef _OPENMP
|
||||
#pragma omp critical
|
||||
#endif
|
||||
if(!initialized)
|
||||
{
|
||||
initialized = true;
|
||||
|
||||
// if non-NUMA, skip the (expensive) measurements below.
|
||||
if(numa_NumNodes() == 1)
|
||||
factor = 1.0;
|
||||
else
|
||||
{
|
||||
// allocate memory on one node
|
||||
const size_t size = 16*MiB;
|
||||
shared_ptr<u8> buffer((u8*)numa_AllocateOnNode(size, 0), numa_Deleter<u8>());
|
||||
|
||||
const uintptr_t previousProcessorMask = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());
|
||||
|
||||
// measure min/max fill times required by a processor from each node
|
||||
double minTime = 1e10, maxTime = 0.0;
|
||||
for(size_t node = 0; node < numa_NumNodes(); node++)
|
||||
{
|
||||
const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
|
||||
os_cpu_SetThreadAffinityMask(processorMask);
|
||||
|
||||
const double startTime = timer_Time();
|
||||
memset(buffer.get(), 0, size);
|
||||
const double elapsedTime = timer_Time() - startTime;
|
||||
|
||||
minTime = std::min(minTime, elapsedTime);
|
||||
maxTime = std::max(maxTime, elapsedTime);
|
||||
}
|
||||
|
||||
(void)os_cpu_SetThreadAffinityMask(previousProcessorMask);
|
||||
|
||||
factor = maxTime / minTime;
|
||||
}
|
||||
|
||||
debug_assert(factor >= 1.0);
|
||||
debug_assert(factor <= 3.0); // (Microsoft guideline for NUMA systems)
|
||||
}
|
||||
|
||||
return factor;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// allocator
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
void* numa_Allocate(size_t size)
|
||||
{
|
||||
void* const mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
|
||||
if(!mem)
|
||||
throw std::bad_alloc();
|
||||
return mem;
|
||||
}
|
||||
|
||||
|
||||
static bool largePageAllocationTookTooLong = false;
|
||||
|
||||
static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize, size_t node)
|
||||
{
|
||||
// can't, OS does not support large pages
|
||||
if(os_cpu_LargePageSize() == 0)
|
||||
return false;
|
||||
|
||||
// overrides
|
||||
if(disposition == LPD_NEVER)
|
||||
return false;
|
||||
if(disposition == LPD_ALWAYS)
|
||||
return true;
|
||||
|
||||
// default disposition: use a heuristic
|
||||
{
|
||||
// a previous attempt already took too long (Windows is apparently
|
||||
// shoveling aside lots of memory).
|
||||
if(largePageAllocationTookTooLong)
|
||||
return false;
|
||||
|
||||
// allocation is rather small and would "only" use half of the
|
||||
// TLBs for its pages.
|
||||
if(allocationSize < 64/2 * os_cpu_PageSize())
|
||||
return false;
|
||||
|
||||
// we want there to be plenty of memory available, otherwise the
|
||||
// page frames are going to be terribly fragmented and even a
|
||||
// single allocation would take SECONDS.
|
||||
if(numa_AvailableMemory(node) < 2*GiB)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node)
|
||||
{
|
||||
typedef BOOL (WINAPI *PQueryWorkingSetEx)(HANDLE hProcess, PVOID buffer, DWORD bufferSize);
|
||||
static PQueryWorkingSetEx pQueryWorkingSetEx;
|
||||
if(!pQueryWorkingSetEx)
|
||||
{
|
||||
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
|
||||
pQueryWorkingSetEx = (PQueryWorkingSetEx)GetProcAddress(hKernel32, "QueryWorkingSetEx");
|
||||
if(!pQueryWorkingSetEx)
|
||||
return true; // can't do anything
|
||||
}
|
||||
|
||||
#if WINVER >= 0x600
|
||||
// retrieve attributes of all pages constituting mem
|
||||
const size_t numPages = (size + pageSize-1) / pageSize;
|
||||
PSAPI_WORKING_SET_EX_INFORMATION* wsi = new PSAPI_WORKING_SET_EX_INFORMATION[numPages];
|
||||
for(size_t i = 0; i < numPages; i++)
|
||||
wsi[i].VirtualAddress = (u8*)mem + i*pageSize;
|
||||
pQueryWorkingSetEx(GetCurrentProcess(), wsi, sizeof(PSAPI_WORKING_SET_EX_INFORMATION)*numPages);
|
||||
|
||||
// ensure each is valid and allocated on the correct node
|
||||
for(size_t i = 0; i < numPages; i++)
|
||||
{
|
||||
const PSAPI_WORKING_SET_EX_BLOCK& attributes = wsi[i].VirtualAttributes;
|
||||
if(!attributes.valid)
|
||||
return false;
|
||||
if(attributes.LargePage != (pageSize == LargePageSize()))
|
||||
{
|
||||
debug_printf("NUMA: is not a large page\n");
|
||||
return false;
|
||||
}
|
||||
if(attributes.node != node)
|
||||
{
|
||||
debug_printf("NUMA: allocated from remote node\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
delete[] wsi;
|
||||
#else
|
||||
UNUSED2(mem);
|
||||
UNUSED2(size);
|
||||
UNUSED2(pageSize);
|
||||
UNUSED2(node);
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition, size_t* ppageSize)
|
||||
{
|
||||
debug_assert(node < numa_NumNodes());
|
||||
|
||||
// see if there will be enough memory (non-authoritative, for debug purposes only)
|
||||
{
|
||||
const size_t availableBytes = numa_AvailableMemory(node);
|
||||
if(availableBytes < size)
|
||||
debug_printf("NUMA: warning: node reports insufficient memory (%d vs %d)\n", availableBytes, size);
|
||||
}
|
||||
|
||||
void* mem = 0;
|
||||
size_t pageSize = 0;
|
||||
|
||||
// try allocating with large pages (reduces TLB misses)
|
||||
if(ShouldUseLargePages(largePageDisposition, size, node))
|
||||
{
|
||||
const size_t largePageSize = os_cpu_LargePageSize();
|
||||
const size_t paddedSize = round_up(size, largePageSize); // required by MEM_LARGE_PAGES
|
||||
// note: this call can take SECONDS, which is why several checks are
|
||||
// undertaken before we even try. these aren't authoritative, so we
|
||||
// at least prevent future attempts if it takes too long.
|
||||
const double startTime = timer_Time();
|
||||
mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
|
||||
pageSize = largePageSize;
|
||||
const double elapsedTime = timer_Time() - startTime;
|
||||
debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime);
|
||||
if(elapsedTime > 1.0)
|
||||
largePageAllocationTookTooLong = true;
|
||||
}
|
||||
|
||||
// try (again) with regular pages
|
||||
if(!mem)
|
||||
{
|
||||
mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
|
||||
pageSize = os_cpu_PageSize();
|
||||
}
|
||||
|
||||
// all attempts failed - we're apparently out of memory.
|
||||
if(!mem)
|
||||
throw std::bad_alloc();
|
||||
|
||||
// we can't use VirtualAllocExNuma - it's only available in Vista and Server 2008.
|
||||
// workaround: fault in all pages now to ensure they are allocated from the
|
||||
// current node, then verify page attributes.
|
||||
// (note: VirtualAlloc's MEM_COMMIT only maps virtual pages and does not
|
||||
// actually allocate page frames. Windows uses a first-touch heuristic -
|
||||
// the page will be taken from the node whose processor caused the fault.)
|
||||
memset(mem, 0, size);
|
||||
|
||||
VerifyPages(mem, size, pageSize, node);
|
||||
|
||||
if(ppageSize)
|
||||
*ppageSize = pageSize;
|
||||
|
||||
return mem;
|
||||
}
|
||||
|
||||
|
||||
void numa_Deallocate(void* mem)
|
||||
{
|
||||
VirtualFree(mem, 0, MEM_RELEASE);
|
||||
}
|
@ -405,7 +405,7 @@ int aio_suspend(const struct aiocb* const cbs[], int n, const struct timespec* t
|
||||
const BOOL waitAll = FALSE;
|
||||
// convert timespec to milliseconds (ts == 0 => no timeout)
|
||||
const DWORD timeout = ts? (DWORD)(ts->tv_sec*1000 + ts->tv_nsec/1000000) : INFINITE;
|
||||
DWORD result = WaitForMultipleObjects(numPendingIos, hEvents, waitAll, timeout);
|
||||
DWORD result = WaitForMultipleObjects((DWORD)numPendingIos, hEvents, waitAll, timeout);
|
||||
|
||||
for(size_t i = 0; i < numPendingIos; i++)
|
||||
ResetEvent(hEvents[i]);
|
||||
|
442
source/lib/sysdep/x86_x64/topology.cpp
Normal file
442
source/lib/sysdep/x86_x64/topology.cpp
Normal file
@ -0,0 +1,442 @@
|
||||
/**
|
||||
* =========================================================================
|
||||
* File : topology.cpp
|
||||
* Project : 0 A.D.
|
||||
* Description : detection of CPU and cache topology
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#include "precompiled.h"
|
||||
#include "topology.h"
|
||||
|
||||
#include "lib/bits.h"
|
||||
#include "lib/sysdep/cpu.h"
|
||||
#include "lib/sysdep/os_cpu.h"
|
||||
#include "x86_x64.h"
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
|
||||
// logicalPerCore across all packages.
|
||||
|
||||
static size_t DetectCoresPerPackage()
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
switch(x86_x64_Vendor())
|
||||
{
|
||||
case X86_X64_VENDOR_INTEL:
|
||||
regs.eax = 4;
|
||||
regs.ecx = 0;
|
||||
if(x86_x64_cpuid(®s))
|
||||
return bits(regs.eax, 26, 31)+1;
|
||||
break;
|
||||
|
||||
case X86_X64_VENDOR_AMD:
|
||||
regs.eax = 0x80000008;
|
||||
if(x86_x64_cpuid(®s))
|
||||
return bits(regs.ecx, 0, 7)+1;
|
||||
break;
|
||||
}
|
||||
|
||||
return 1; // else: the CPU is single-core.
|
||||
}
|
||||
|
||||
static size_t CoresPerPackage()
|
||||
{
|
||||
static size_t coresPerPackage = 0;
|
||||
if(!coresPerPackage)
|
||||
coresPerPackage = DetectCoresPerPackage();
|
||||
return coresPerPackage;
|
||||
}
|
||||
|
||||
|
||||
static bool IsHyperthreadingCapable()
|
||||
{
|
||||
// definitely not
|
||||
if(!x86_x64_cap(X86_X64_CAP_HT))
|
||||
return false;
|
||||
|
||||
// AMD N-core systems falsely set the HT bit for compatibility reasons
|
||||
// (don't bother resetting it, might confuse callers)
|
||||
if(x86_x64_Vendor() == X86_X64_VENDOR_AMD && x86_x64_cap(X86_X64_CAP_AMD_CMP_LEGACY))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static size_t DetectLogicalPerCore()
|
||||
{
|
||||
if(!IsHyperthreadingCapable())
|
||||
return 1;
|
||||
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
if(!x86_x64_cpuid(®s))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
const size_t logicalPerPackage = bits(regs.ebx, 16, 23);
|
||||
|
||||
// cores ought to be uniform WRT # logical processors
|
||||
debug_assert(logicalPerPackage % CoresPerPackage() == 0);
|
||||
|
||||
return logicalPerPackage / CoresPerPackage();
|
||||
}
|
||||
|
||||
static size_t LogicalPerCore()
|
||||
{
|
||||
static size_t logicalPerCore = 0;
|
||||
if(!logicalPerCore)
|
||||
logicalPerCore = DetectLogicalPerCore();
|
||||
return logicalPerCore;
|
||||
}
|
||||
|
||||
enum CacheType
|
||||
{
|
||||
CT_NONE = 0,
|
||||
CT_DATA = 1,
|
||||
CT_INSTRUCTION = 2,
|
||||
CT_UNIFIED = 3
|
||||
};
|
||||
|
||||
static bool IsL2DataCache(CacheType type, size_t level)
|
||||
{
|
||||
if(type != CT_DATA && type != CT_UNIFIED)
|
||||
return false;
|
||||
if(level != 2)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static size_t DetectLogicalPerCache()
|
||||
{
|
||||
// note: Intel Appnote 485 says the order in which caches are returned is
|
||||
// undefined, so we need to loop through all of them.
|
||||
for(u32 count = 0; ; count++)
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 4;
|
||||
regs.ecx = count;
|
||||
x86_x64_cpuid(®s);
|
||||
|
||||
const CacheType type = (CacheType)bits(regs.eax, 0, 4);
|
||||
// no more caches left
|
||||
if(type == CT_NONE)
|
||||
{
|
||||
debug_assert(0); // we somehow didn't find the L2d
|
||||
return 1;
|
||||
}
|
||||
|
||||
const size_t level = bits(regs.eax, 5, 7);
|
||||
if(IsL2DataCache(type, level))
|
||||
{
|
||||
const size_t logicalPerCache = bits(regs.eax, 14, 25)+1;
|
||||
return logicalPerCache;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static size_t LogicalPerCache()
|
||||
{
|
||||
static size_t logicalPerCache;
|
||||
if(!logicalPerCache)
|
||||
logicalPerCache = DetectLogicalPerCache();
|
||||
return logicalPerCache;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// the above functions give the maximum number of cores/logical units.
|
||||
// however, some of them may actually be disabled by the BIOS!
|
||||
// what we can do is to analyze the APIC IDs. they are allocated sequentially
|
||||
// for all "processors". treating the IDs as variable-width bit fields
|
||||
// (according to the number of cores/logical units present) allows
|
||||
// determining the exact topology as well as number of packages.
|
||||
|
||||
// these are set by DetectProcessorTopology.
|
||||
static size_t numPackages = 0; // i.e. sockets; > 1 => true SMP system
|
||||
static size_t enabledCoresPerPackage = 0;
|
||||
static size_t enabledLogicalPerCore = 0; // hyperthreading units
|
||||
|
||||
typedef std::vector<u8> Ids;
|
||||
|
||||
// add the currently running processor's APIC ID to a list of IDs.
|
||||
static void StoreApicId(size_t UNUSED(processor), uintptr_t cbData)
|
||||
{
|
||||
Ids* const apicIds = (Ids*)cbData;
|
||||
apicIds->push_back(x86_x64_ApicId());
|
||||
}
|
||||
|
||||
// if successful, apicIds[i] contains the unique ID of OS processor i.
|
||||
static bool GatherApicIds(Ids& apicIds)
|
||||
{
|
||||
// old APIC (see x86_x64_ApicId for details)
|
||||
if(x86_x64_Generation() < 8)
|
||||
return false;
|
||||
|
||||
// process affinity prevents us from seeing all APIC IDs
|
||||
if(PopulationCount(os_cpu_ProcessorMask()) != os_cpu_NumProcessors())
|
||||
return false;
|
||||
|
||||
const LibError ret = os_cpu_CallByEachCPU(StoreApicId, (uintptr_t)&apicIds);
|
||||
debug_assert(ret == INFO::OK);
|
||||
|
||||
// ensure we got a unique ID for every processor
|
||||
{
|
||||
Ids tmp(apicIds);
|
||||
Ids::iterator end = tmp.end();
|
||||
std::sort(tmp.begin(), end);
|
||||
debug_assert(std::unique(tmp.begin(), end) == end);
|
||||
debug_assert(std::distance(tmp.begin(), end) == (ptrdiff_t)os_cpu_NumProcessors());
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
typedef std::set<u8> IdSet;
|
||||
|
||||
/**
|
||||
* "field" := a range of bits sufficient to represent <numValues> integers.
|
||||
* for each id in <apicIds>: extract the value of the field starting at
|
||||
* <offset> and insert it into <ids>. afterwards, adjust <offset> to the
|
||||
* next field.
|
||||
*
|
||||
* used to gather e.g. all core IDs from all APIC IDs.
|
||||
**/
|
||||
static void ExtractFieldIntoSet(const Ids& apicIds, size_t& offset, size_t numValues, IdSet& ids)
|
||||
{
|
||||
const size_t numBits = ceil_log2(numValues);
|
||||
if(numBits == 0)
|
||||
return;
|
||||
const u8 mask = bit_mask<u8>(numBits);
|
||||
|
||||
for(size_t i = 0; i < apicIds.size(); i++)
|
||||
{
|
||||
const u8 apicId = apicIds[i];
|
||||
const u8 field = u8(apicId >> offset) & mask;
|
||||
ids.insert(field);
|
||||
}
|
||||
|
||||
offset += numBits;
|
||||
}
|
||||
|
||||
static size_t numCaches = 0; // L2d
|
||||
static std::vector<size_t> processorsCache;
|
||||
static std::vector<uintptr_t> cachesProcessorMask;
|
||||
|
||||
|
||||
|
||||
class CacheManager
|
||||
{
|
||||
public:
|
||||
void Add(u8 id, size_t processor)
|
||||
{
|
||||
SharedCache* cache = Find(id);
|
||||
if(!cache)
|
||||
{
|
||||
m_caches.push_back(id);
|
||||
cache = &m_caches.back();
|
||||
}
|
||||
cache->Add(processor);
|
||||
}
|
||||
|
||||
void StoreProcessorMasks(std::vector<uintptr_t>& processorMasks)
|
||||
{
|
||||
processorMasks.resize(m_caches.size());
|
||||
for(size_t i = 0; i < m_caches.size(); i++)
|
||||
processorMasks[i] = m_caches[i].ProcessorMask();
|
||||
}
|
||||
|
||||
private:
|
||||
class SharedCache
|
||||
{
|
||||
public:
|
||||
SharedCache(u8 id)
|
||||
: m_id(id), m_processorMask(0)
|
||||
{
|
||||
}
|
||||
|
||||
bool Matches(u8 id) const
|
||||
{
|
||||
return m_id == id;
|
||||
}
|
||||
|
||||
void Add(size_t processor)
|
||||
{
|
||||
m_processorMask |= uintptr_t(1) << processor;
|
||||
}
|
||||
|
||||
uintptr_t ProcessorMask() const
|
||||
{
|
||||
return m_processorMask;
|
||||
}
|
||||
|
||||
private:
|
||||
u8 m_id;
|
||||
uintptr_t m_processorMask;
|
||||
};
|
||||
|
||||
SharedCache* Find(u8 id)
|
||||
{
|
||||
for(size_t i = 0; i < m_caches.size(); i++)
|
||||
{
|
||||
if(m_caches[i].Matches(id))
|
||||
return &m_caches[i];
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::vector<SharedCache> m_caches;
|
||||
};
|
||||
|
||||
static void DetectCacheTopology(const Ids& apicIds)
|
||||
{
|
||||
const size_t numBits = ceil_log2(LogicalPerCache());
|
||||
const u8 cacheIdMask = u8(0xFF << numBits);
|
||||
|
||||
CacheManager cacheManager;
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
const u8 apicId = apicIds[processor];
|
||||
const u8 cacheId = apicId & cacheIdMask;
|
||||
cacheManager.Add(cacheId, processor);
|
||||
}
|
||||
cacheManager.StoreProcessorMasks(cachesProcessorMask);
|
||||
numCaches = cachesProcessorMask.size();
|
||||
|
||||
const size_t invalidCache = ~(size_t)0;
|
||||
processorsCache.resize(os_cpu_NumProcessors(), invalidCache);
|
||||
for(size_t cache = 0; cache < numCaches; cache++)
|
||||
{
|
||||
const uintptr_t processorMask = cachesProcessorMask[cache];
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
if(IsBitSet(processorMask, processor))
|
||||
processorsCache[processor] = cache;
|
||||
}
|
||||
}
|
||||
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
|
||||
{
|
||||
debug_assert(processorsCache[processor] != invalidCache);
|
||||
debug_assert(processorsCache[processor] < numCaches);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// @return false if unavailable / no information can be returned.
|
||||
static bool DetectProcessorTopologyViaApicIds()
|
||||
{
|
||||
Ids apicIds;
|
||||
if(!GatherApicIds(apicIds))
|
||||
return false;
|
||||
|
||||
// extract values from all 3 ID bit fields into separate sets
|
||||
size_t offset = 0;
|
||||
IdSet logicalIds;
|
||||
ExtractFieldIntoSet(apicIds, offset, LogicalPerCore(), logicalIds);
|
||||
IdSet coreIds;
|
||||
ExtractFieldIntoSet(apicIds, offset, CoresPerPackage(), coreIds);
|
||||
IdSet packageIds;
|
||||
ExtractFieldIntoSet(apicIds, offset, 0xFF, packageIds);
|
||||
|
||||
numPackages = std::max(packageIds.size(), size_t(1));
|
||||
enabledCoresPerPackage = std::max(coreIds .size(), size_t(1));
|
||||
enabledLogicalPerCore = std::max(logicalIds.size(), size_t(1));
|
||||
|
||||
// note: cache ID possibly overlaps the other fields. we also want to
|
||||
// retrieve more information (mappings between processor and cache ID),
|
||||
// so this needs to be handled separately.
|
||||
DetectCacheTopology(apicIds);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static void GuessProcessorTopologyViaOsCount()
|
||||
{
|
||||
const size_t numProcessors = os_cpu_NumProcessors();
|
||||
|
||||
// note: we cannot hope to always return correct results since disabled
|
||||
// cores/logical units cannot be distinguished from the situation of the
|
||||
// OS simply not reporting them as "processors". unfortunately this
|
||||
// function won't always only be called for older (#core = #logical = 1)
|
||||
// systems because DetectProcessorTopologyViaApicIds may fail due to
|
||||
// lack of OS support. what we'll do is assume nothing is disabled; this
|
||||
// is reasonable because we care most about #packages. it's fine to assume
|
||||
// more cores (without inflating the total #processors) because that
|
||||
// count only indicates memory barriers etc. ought to be used.
|
||||
enabledCoresPerPackage = CoresPerPackage();
|
||||
enabledLogicalPerCore = LogicalPerCore();
|
||||
|
||||
const size_t numPackagesTimesLogical = numProcessors / CoresPerPackage();
|
||||
debug_assert(numPackagesTimesLogical != 0); // otherwise processors didn't include cores, which would be stupid
|
||||
|
||||
numPackages = numPackagesTimesLogical / LogicalPerCore();
|
||||
if(!numPackages) // processors didn't include logical units (reasonable)
|
||||
numPackages = numPackagesTimesLogical;
|
||||
}
|
||||
|
||||
|
||||
// determine how many CoresPerPackage and LogicalPerCore are
|
||||
// actually enabled and also count numPackages.
|
||||
static void DetectProcessorTopology()
|
||||
{
|
||||
// authoritative, but requires OS support and fairly recent CPUs
|
||||
if(DetectProcessorTopologyViaApicIds())
|
||||
return; // success, we're done.
|
||||
|
||||
GuessProcessorTopologyViaOsCount();
|
||||
}
|
||||
|
||||
|
||||
size_t cpu_NumPackages()
|
||||
{
|
||||
if(!numPackages)
|
||||
DetectProcessorTopology();
|
||||
return numPackages;
|
||||
}
|
||||
|
||||
size_t cpu_CoresPerPackage()
|
||||
{
|
||||
if(!enabledCoresPerPackage)
|
||||
DetectProcessorTopology();
|
||||
return enabledCoresPerPackage;
|
||||
}
|
||||
|
||||
size_t cpu_LogicalPerCore()
|
||||
{
|
||||
if(!enabledLogicalPerCore)
|
||||
DetectProcessorTopology();
|
||||
return enabledLogicalPerCore;
|
||||
}
|
||||
|
||||
size_t cpu_NumCaches()
|
||||
{
|
||||
if(!numCaches)
|
||||
DetectProcessorTopology();
|
||||
return numCaches;
|
||||
}
|
||||
|
||||
|
||||
size_t cpu_CacheFromProcessor(size_t processor)
|
||||
{
|
||||
debug_assert(processor < os_cpu_NumProcessors());
|
||||
DetectProcessorTopology();
|
||||
return processorsCache.at(processor);
|
||||
}
|
||||
|
||||
uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
|
||||
{
|
||||
debug_assert(cache < cpu_NumCaches());
|
||||
DetectProcessorTopology();
|
||||
return cachesProcessorMask.at(cache);
|
||||
}
|
||||
|
||||
|
||||
// note: Windows 2003 GetLogicalProcessorInformation returns incorrect
|
||||
// information, claiming all cores in an Intel Core2 Quad processor
|
||||
// share an L2 cache.
|
54
source/lib/sysdep/x86_x64/topology.h
Normal file
54
source/lib/sysdep/x86_x64/topology.h
Normal file
@ -0,0 +1,54 @@
|
||||
/**
|
||||
* =========================================================================
|
||||
* File : topology.cpp
|
||||
* Project : 0 A.D.
|
||||
* Description : detection of CPU and cache topology
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#ifndef INCLUDED_TOPOLOGY
|
||||
#define INCLUDED_TOPOLOGY
|
||||
|
||||
// OSes report hyperthreading units and cores as "processors". we need to
|
||||
// drill down and find out the exact counts (for thread pool dimensioning
|
||||
// and cache sharing considerations).
|
||||
|
||||
/**
|
||||
* @return number of *enabled* CPU packages / sockets.
|
||||
**/
|
||||
LIB_API size_t cpu_NumPackages();
|
||||
|
||||
/**
|
||||
* @return number of *enabled* CPU cores per package.
|
||||
* (2 on dual-core systems)
|
||||
**/
|
||||
LIB_API size_t cpu_CoresPerPackage();
|
||||
|
||||
/**
|
||||
* @return number of *enabled* hyperthreading units per core.
|
||||
* (2 on P4 EE)
|
||||
**/
|
||||
LIB_API size_t cpu_LogicalPerCore();
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// L2 cache
|
||||
|
||||
/**
|
||||
* @return number of distinct L2 caches
|
||||
**/
|
||||
LIB_API size_t cpu_NumCaches();
|
||||
|
||||
/**
|
||||
* @return L2 cache number (zero-based) to which <processor> belongs.
|
||||
**/
|
||||
LIB_API size_t cpu_CacheFromProcessor(size_t processor);
|
||||
|
||||
/**
|
||||
* @return bit-mask of all processors sharing <cache>.
|
||||
**/
|
||||
LIB_API uintptr_t cpu_ProcessorMaskFromCache(size_t cache);
|
||||
|
||||
#endif // #ifndef INCLUDED_TOPOLOGY
|
505
source/lib/sysdep/x86_x64/x86_x64.cpp
Normal file
505
source/lib/sysdep/x86_x64/x86_x64.cpp
Normal file
@ -0,0 +1,505 @@
|
||||
/**
|
||||
* =========================================================================
|
||||
* File : x86_x64.cpp
|
||||
* Project : 0 A.D.
|
||||
* Description : CPU-specific routines common to 32 and 64-bit x86
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#include "precompiled.h"
|
||||
#include "x86_x64.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <algorithm>
|
||||
|
||||
#include "lib/posix/posix.h" // pthread
|
||||
#include "lib/bits.h"
|
||||
#include "lib/timer.h"
|
||||
#include "lib/sysdep/cpu.h"
|
||||
#include "lib/sysdep/os_cpu.h"
|
||||
|
||||
#if ARCH_IA32
|
||||
# include "../ia32/ia32_asm.h"
|
||||
#else
|
||||
#include "../amd64/amd64_asm.h"
|
||||
# endif
|
||||
|
||||
#if MSC_VERSION
|
||||
# include <intrin.h>
|
||||
#elif GCC_VERSION
|
||||
#else
|
||||
# error compiler not supported
|
||||
#endif
|
||||
|
||||
|
||||
// note: unfortunately the MSC __cpuid intrinsic does not allow passing
|
||||
// additional inputs (e.g. ecx = count), so we need to implement this
|
||||
// in assembly for both IA-32 and AMD64.
|
||||
static void cpuid_impl(x86_x64_CpuidRegs* regs)
|
||||
{
|
||||
#if ARCH_IA32
|
||||
ia32_asm_cpuid(regs);
|
||||
#else
|
||||
amd64_asm_cpuid(regs);
|
||||
#endif
|
||||
}
|
||||
|
||||
bool x86_x64_cpuid(x86_x64_CpuidRegs* regs)
|
||||
{
|
||||
static u32 maxFunction;
|
||||
static u32 maxExtendedFunction;
|
||||
if(!maxFunction)
|
||||
{
|
||||
x86_x64_CpuidRegs regs2;
|
||||
regs2.eax = 0;
|
||||
cpuid_impl(®s2);
|
||||
maxFunction = regs2.eax;
|
||||
regs2.eax = 0x80000000;
|
||||
cpuid_impl(®s2);
|
||||
maxExtendedFunction = regs2.eax;
|
||||
}
|
||||
|
||||
const u32 function = regs->eax;
|
||||
if(function > maxExtendedFunction)
|
||||
return false;
|
||||
if(function < 0x80000000 && function > maxFunction)
|
||||
return false;
|
||||
|
||||
cpuid_impl(regs);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// capability bits
|
||||
|
||||
static void DetectFeatureFlags(u32 caps[4])
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
if(x86_x64_cpuid(®s))
|
||||
{
|
||||
caps[0] = regs.ecx;
|
||||
caps[1] = regs.edx;
|
||||
}
|
||||
regs.eax = 0x80000001;
|
||||
if(x86_x64_cpuid(®s))
|
||||
{
|
||||
caps[2] = regs.ecx;
|
||||
caps[3] = regs.edx;
|
||||
}
|
||||
}
|
||||
|
||||
bool x86_x64_cap(x86_x64_Cap cap)
|
||||
{
|
||||
// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
|
||||
// keep in sync with enum CpuCap!
|
||||
static u32 x86_x64_caps[4];
|
||||
|
||||
// (since relevant CPUs will surely advertise at least one standard flag,
|
||||
// they are zero iff we haven't been initialized yet)
|
||||
if(!x86_x64_caps[1])
|
||||
DetectFeatureFlags(x86_x64_caps);
|
||||
|
||||
const size_t tbl_idx = cap >> 5;
|
||||
const size_t bit_idx = cap & 0x1f;
|
||||
if(tbl_idx > 3)
|
||||
{
|
||||
DEBUG_WARN_ERR(ERR::INVALID_PARAM);
|
||||
return false;
|
||||
}
|
||||
return (x86_x64_caps[tbl_idx] & BIT(bit_idx)) != 0;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU identification
|
||||
|
||||
static x86_x64_Vendors DetectVendor()
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 0;
|
||||
if(!x86_x64_cpuid(®s))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
|
||||
// copy regs to string
|
||||
// note: 'strange' ebx,edx,ecx reg order is due to ModR/M encoding order.
|
||||
char vendor_str[13];
|
||||
u32* vendor_str_u32 = (u32*)vendor_str;
|
||||
vendor_str_u32[0] = regs.ebx;
|
||||
vendor_str_u32[1] = regs.edx;
|
||||
vendor_str_u32[2] = regs.ecx;
|
||||
vendor_str[12] = '\0'; // 0-terminate
|
||||
|
||||
if(!strcmp(vendor_str, "AuthenticAMD"))
|
||||
return X86_X64_VENDOR_AMD;
|
||||
else if(!strcmp(vendor_str, "GenuineIntel"))
|
||||
return X86_X64_VENDOR_INTEL;
|
||||
else
|
||||
{
|
||||
DEBUG_WARN_ERR(ERR::CPU_UNKNOWN_VENDOR);
|
||||
return X86_X64_VENDOR_UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
x86_x64_Vendors x86_x64_Vendor()
|
||||
{
|
||||
static x86_x64_Vendors vendor = X86_X64_VENDOR_UNKNOWN;
|
||||
if(vendor == X86_X64_VENDOR_UNKNOWN)
|
||||
vendor = DetectVendor();
|
||||
return vendor;
|
||||
}
|
||||
|
||||
|
||||
static void DetectSignature(size_t* model, size_t* family)
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
if(!x86_x64_cpuid(®s))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
*model = bits(regs.eax, 4, 7);
|
||||
*family = bits(regs.eax, 8, 11);
|
||||
}
|
||||
|
||||
|
||||
static size_t DetectGeneration()
|
||||
{
|
||||
size_t model, family;
|
||||
DetectSignature(&model, &family);
|
||||
|
||||
switch(x86_x64_Vendor())
|
||||
{
|
||||
case X86_X64_VENDOR_AMD:
|
||||
switch(family)
|
||||
{
|
||||
case 5:
|
||||
if(model < 6)
|
||||
return 5; // K5
|
||||
else
|
||||
return 6; // K6
|
||||
|
||||
case 6:
|
||||
return 7; // K7 (Athlon)
|
||||
|
||||
case 0xF:
|
||||
return 8; // K8 (Opteron)
|
||||
}
|
||||
break;
|
||||
|
||||
case X86_X64_VENDOR_INTEL:
|
||||
switch(family)
|
||||
{
|
||||
case 5:
|
||||
return 5; // Pentium
|
||||
|
||||
case 6:
|
||||
if(model <= 0xD)
|
||||
return 6; // Pentium Pro/II/III/M
|
||||
else
|
||||
return 8; // Core2Duo
|
||||
|
||||
case 0xF:
|
||||
if(model <= 6)
|
||||
return 7; // Pentium 4/D
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
debug_assert(0); // unknown CPU generation
|
||||
return family;
|
||||
}
|
||||
|
||||
size_t x86_x64_Generation()
|
||||
{
|
||||
static size_t generation;
|
||||
if(!generation)
|
||||
generation = DetectGeneration();
|
||||
return generation;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// identifier string
|
||||
|
||||
/// functor to remove substrings from the CPU identifier string
|
||||
class StringStripper
|
||||
{
|
||||
char* m_string;
|
||||
size_t m_max_chars;
|
||||
|
||||
public:
|
||||
StringStripper(char* string, size_t max_chars)
|
||||
: m_string(string), m_max_chars(max_chars)
|
||||
{
|
||||
}
|
||||
|
||||
// remove all instances of substring from m_string
|
||||
void operator()(const char* substring)
|
||||
{
|
||||
const size_t substring_length = strlen(substring);
|
||||
for(;;)
|
||||
{
|
||||
char* substring_pos = strstr(m_string, substring);
|
||||
if(!substring_pos)
|
||||
break;
|
||||
const size_t substring_ofs = substring_pos - m_string;
|
||||
const size_t num_chars = m_max_chars - substring_ofs - substring_length;
|
||||
memmove(substring_pos, substring_pos+substring_length, num_chars);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static void DetectIdentifierString(char* identifierString, size_t maxChars)
|
||||
{
|
||||
// get brand string (if available)
|
||||
char* pos = identifierString;
|
||||
bool have_brand_string = true;
|
||||
for(u32 function = 0x80000002; function <= 0x80000004; function++)
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = function;
|
||||
have_brand_string &= x86_x64_cpuid(®s);
|
||||
memcpy(pos, ®s, 16);
|
||||
pos += 16;
|
||||
}
|
||||
|
||||
// fall back to manual detect of CPU type because either:
|
||||
// - CPU doesn't support brand string (we use a flag to indicate this
|
||||
// rather than comparing against a default value because it is safer);
|
||||
// - the brand string is useless, e.g. "Unknown". this happens on
|
||||
// some older boards whose BIOS reprograms the string for CPUs it
|
||||
// doesn't recognize.
|
||||
if(!have_brand_string || strncmp(identifierString, "Unknow", 6) == 0)
|
||||
{
|
||||
size_t model, family;
|
||||
DetectSignature(&model, &family);
|
||||
|
||||
switch(x86_x64_Vendor())
|
||||
{
|
||||
case X86_X64_VENDOR_AMD:
|
||||
// everything else is either too old, or should have a brand string.
|
||||
if(family == 6)
|
||||
{
|
||||
if(model == 3 || model == 7)
|
||||
strcpy_s(identifierString, maxChars, "AMD Duron");
|
||||
else if(model <= 5)
|
||||
strcpy_s(identifierString, maxChars, "AMD Athlon");
|
||||
else
|
||||
{
|
||||
if(x86_x64_cap(X86_X64_CAP_AMD_MP))
|
||||
strcpy_s(identifierString, maxChars, "AMD Athlon MP");
|
||||
else
|
||||
strcpy_s(identifierString, maxChars, "AMD Athlon XP");
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case X86_X64_VENDOR_INTEL:
|
||||
// everything else is either too old, or should have a brand string.
|
||||
if(family == 6)
|
||||
{
|
||||
if(model == 1)
|
||||
strcpy_s(identifierString, maxChars, "Intel Pentium Pro");
|
||||
else if(model == 3 || model == 5)
|
||||
strcpy_s(identifierString, maxChars, "Intel Pentium II");
|
||||
else if(model == 6)
|
||||
strcpy_s(identifierString, maxChars, "Intel Celeron");
|
||||
else
|
||||
strcpy_s(identifierString, maxChars, "Intel Pentium III");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
// identifierString already holds a valid brand string; pretty it up.
|
||||
else
|
||||
{
|
||||
const char* const undesired_strings[] = { "(tm)", "(TM)", "(R)", "CPU " };
|
||||
std::for_each(undesired_strings, undesired_strings+ARRAY_SIZE(undesired_strings),
|
||||
StringStripper(identifierString, strlen(identifierString)+1));
|
||||
|
||||
// note: Intel brand strings include a frequency, but we can't rely
|
||||
// on it because the CPU may be overclocked. we'll leave it in the
|
||||
// string to show measurement accuracy and if SpeedStep is active.
|
||||
}
|
||||
}
|
||||
|
||||
const char* cpu_IdentifierString()
|
||||
{
|
||||
// 3 calls x 4 registers x 4 bytes = 48
|
||||
static char identifierString[48+1] = {'\0'};
|
||||
if(identifierString[0] == '\0')
|
||||
DetectIdentifierString(identifierString, ARRAY_SIZE(identifierString));
|
||||
return identifierString;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// CPU frequency
|
||||
|
||||
// set scheduling priority and restore when going out of scope.
|
||||
class ScopedSetPriority
|
||||
{
|
||||
int m_old_policy;
|
||||
sched_param m_old_param;
|
||||
|
||||
public:
|
||||
ScopedSetPriority(int new_priority)
|
||||
{
|
||||
// get current scheduling policy and priority
|
||||
pthread_getschedparam(pthread_self(), &m_old_policy, &m_old_param);
|
||||
|
||||
// set new priority
|
||||
sched_param new_param = {0};
|
||||
new_param.sched_priority = new_priority;
|
||||
pthread_setschedparam(pthread_self(), SCHED_FIFO, &new_param);
|
||||
}
|
||||
|
||||
~ScopedSetPriority()
|
||||
{
|
||||
// restore previous policy and priority.
|
||||
pthread_setschedparam(pthread_self(), m_old_policy, &m_old_param);
|
||||
}
|
||||
};
|
||||
|
||||
// note: this function uses timer.cpp!timer_Time, which is implemented via
|
||||
// whrt.cpp on Windows, which again calls x86_x64_Init. be careful that
|
||||
// this function isn't called from there as well, else WHRT will be used
|
||||
// before its init completes.
|
||||
double cpu_ClockFrequency()
|
||||
{
|
||||
// if the TSC isn't available, there's really no good way to count the
|
||||
// actual CPU clocks per known time interval, so bail.
|
||||
// note: loop iterations ("bogomips") are not a reliable measure due
|
||||
// to differing IPC and compiler optimizations.
|
||||
if(!x86_x64_cap(X86_X64_CAP_TSC))
|
||||
return -1.0; // impossible value
|
||||
|
||||
// increase priority to reduce interference while measuring.
|
||||
const int priority = sched_get_priority_max(SCHED_FIFO)-1;
|
||||
ScopedSetPriority ssp(priority);
|
||||
|
||||
// note: no need to "warm up" cpuid - it will already have been
|
||||
// called several times by the time this code is reached.
|
||||
// (background: it's used in x86_x64_rdtsc() to serialize instruction flow;
|
||||
// the first call is documented to be slower on Intel CPUs)
|
||||
|
||||
int num_samples = 16;
|
||||
// if clock is low-res, do less samples so it doesn't take too long.
|
||||
// balance measuring time (~ 10 ms) and accuracy (< 1 0/00 error -
|
||||
// ok for using the TSC as a time reference)
|
||||
if(timer_Resolution() >= 1e-3)
|
||||
num_samples = 8;
|
||||
std::vector<double> samples(num_samples);
|
||||
|
||||
for(int i = 0; i < num_samples; i++)
|
||||
{
|
||||
double dt;
|
||||
i64 dc; // i64 because VC6 can't convert u64 -> double,
|
||||
// and we don't need all 64 bits.
|
||||
|
||||
// count # of clocks in max{1 tick, 1 ms}:
|
||||
// .. wait for start of tick.
|
||||
const double t0 = timer_Time();
|
||||
u64 c1; double t1;
|
||||
do
|
||||
{
|
||||
// note: timer_Time effectively has a long delay (up to 5 us)
|
||||
// before returning the time. we call it before x86_x64_rdtsc to
|
||||
// minimize the delay between actually sampling time / TSC,
|
||||
// thus decreasing the chance for interference.
|
||||
// (if unavoidable background activity, e.g. interrupts,
|
||||
// delays the second reading, inaccuracy is introduced).
|
||||
t1 = timer_Time();
|
||||
c1 = x86_x64_rdtsc();
|
||||
}
|
||||
while(t1 == t0);
|
||||
// .. wait until start of next tick and at least 1 ms elapsed.
|
||||
do
|
||||
{
|
||||
const double t2 = timer_Time();
|
||||
const u64 c2 = x86_x64_rdtsc();
|
||||
dc = (i64)(c2 - c1);
|
||||
dt = t2 - t1;
|
||||
}
|
||||
while(dt < 1e-3);
|
||||
|
||||
// .. freq = (delta_clocks) / (delta_seconds);
|
||||
// x86_x64_rdtsc/timer overhead is negligible.
|
||||
const double freq = dc / dt;
|
||||
samples[i] = freq;
|
||||
}
|
||||
|
||||
std::sort(samples.begin(), samples.end());
|
||||
|
||||
// median filter (remove upper and lower 25% and average the rest).
|
||||
// note: don't just take the lowest value! it could conceivably be
|
||||
// too low, if background processing delays reading c1 (see above).
|
||||
double sum = 0.0;
|
||||
const int lo = num_samples/4, hi = 3*num_samples/4;
|
||||
for(int i = lo; i < hi; i++)
|
||||
sum += samples[i];
|
||||
|
||||
const double clock_frequency = sum / (hi-lo);
|
||||
return clock_frequency;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// misc stateless functions
|
||||
|
||||
u8 x86_x64_ApicId()
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
if(!x86_x64_cpuid(®s))
|
||||
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
|
||||
const u8 apicId = (u8)bits(regs.ebx, 24, 31);
|
||||
return apicId;
|
||||
}
|
||||
|
||||
|
||||
u64 x86_x64_rdtsc()
|
||||
{
|
||||
#if MSC_VERSION
|
||||
return (u64)__rdtsc();
|
||||
#elif GCC_VERSION
|
||||
// GCC supports "portable" assembly for both x86 and x64
|
||||
volatile u32 lo, hi;
|
||||
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
|
||||
return u64_from_u32(hi, lo);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void x86_x64_DebugBreak()
|
||||
{
|
||||
#if MSC_VERSION
|
||||
__debugbreak();
|
||||
#elif GCC_VERSION
|
||||
// note: this probably isn't necessary, since unix_debug_break
|
||||
// (SIGTRAP) is most probably available if GCC_VERSION.
|
||||
// we include it for completeness, though.
|
||||
__asm__ __volatile__ ("int $3");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// enforce strong memory ordering.
|
||||
void cpu_MemoryFence()
|
||||
{
|
||||
if(x86_x64_cap(X86_X64_CAP_SSE2))
|
||||
_mm_mfence();
|
||||
}
|
||||
|
||||
|
||||
void cpu_Serialize()
|
||||
{
|
||||
x86_x64_CpuidRegs regs;
|
||||
regs.eax = 1;
|
||||
x86_x64_cpuid(®s); // CPUID serializes execution.
|
||||
}
|
125
source/lib/sysdep/x86_x64/x86_x64.h
Normal file
125
source/lib/sysdep/x86_x64/x86_x64.h
Normal file
@ -0,0 +1,125 @@
|
||||
/**
|
||||
* =========================================================================
|
||||
* File : x86_x64.h
|
||||
* Project : 0 A.D.
|
||||
* Description : CPU-specific routines common to 32 and 64-bit x86
|
||||
* =========================================================================
|
||||
*/
|
||||
|
||||
// license: GPL; see lib/license.txt
|
||||
|
||||
#ifndef INCLUDED_X86_X64
|
||||
#define INCLUDED_X86_X64
|
||||
|
||||
#if !ARCH_IA32 && !ARCH_AMD64
|
||||
#error "including x86_x64.h without ARCH_IA32=1 or ARCH_AMD64=1"
|
||||
#endif
|
||||
|
||||
/**
|
||||
* registers used/returned by x86_x64_cpuid
|
||||
**/
|
||||
struct x86_x64_CpuidRegs
|
||||
{
|
||||
u32 eax;
|
||||
u32 ebx;
|
||||
u32 ecx;
|
||||
u32 edx;
|
||||
};
|
||||
|
||||
/**
|
||||
* invoke CPUID instruction.
|
||||
* @param regs input/output registers.
|
||||
* regs->eax must be set to the desired function.
|
||||
* some functions (e.g. 4) require regs->ecx to be set as well.
|
||||
* rationale: this interface (input/output structure vs. function parameters)
|
||||
* avoids unnecessary copying/initialization if some inputs aren't needed
|
||||
* and allows graceful expansion to functions that require further inputs.
|
||||
* @return true on success or false if the sub-function isn't supported.
|
||||
**/
|
||||
extern bool x86_x64_cpuid(x86_x64_CpuidRegs* regs);
|
||||
|
||||
/**
|
||||
* CPU vendor.
|
||||
* (this is exposed because some CPUID functions are vendor-specific.)
|
||||
* (an enum is easier to compare than the original string values.)
|
||||
**/
|
||||
enum x86_x64_Vendors
|
||||
{
|
||||
X86_X64_VENDOR_UNKNOWN,
|
||||
X86_X64_VENDOR_INTEL,
|
||||
X86_X64_VENDOR_AMD,
|
||||
};
|
||||
|
||||
LIB_API x86_x64_Vendors x86_x64_Vendor();
|
||||
|
||||
|
||||
/**
|
||||
* @return the colloquial processor generation
|
||||
* (5 = Pentium, 6 = Pentium Pro/II/III / K6, 7 = Pentium4 / Athlon, 8 = Core / Opteron)
|
||||
**/
|
||||
LIB_API size_t x86_x64_Generation();
|
||||
|
||||
|
||||
/**
|
||||
* bit indices of CPU capability flags (128 bits).
|
||||
* values are defined by IA-32 CPUID feature flags - do not change!
|
||||
**/
|
||||
enum x86_x64_Cap
|
||||
{
|
||||
// standard (ecx) - currently only defined by Intel
|
||||
X86_X64_CAP_SSE3 = 0+0, // Streaming SIMD Extensions 3
|
||||
X86_X64_CAP_EST = 0+7, // Enhanced Speedstep Technology
|
||||
|
||||
// standard (edx)
|
||||
X86_X64_CAP_FPU = 32+0, // Floating Point Unit
|
||||
X86_X64_CAP_TSC = 32+4, // TimeStamp Counter
|
||||
X86_X64_CAP_CMOV = 32+15, // Conditional MOVe
|
||||
X86_X64_CAP_TM_SCC = 32+22, // Thermal Monitoring and Software Controlled Clock
|
||||
X86_X64_CAP_MMX = 32+23, // MultiMedia eXtensions
|
||||
X86_X64_CAP_SSE = 32+25, // Streaming SIMD Extensions
|
||||
X86_X64_CAP_SSE2 = 32+26, // Streaming SIMD Extensions 2
|
||||
X86_X64_CAP_HT = 32+28, // HyperThreading
|
||||
|
||||
// extended (ecx)
|
||||
X86_X64_CAP_AMD_CMP_LEGACY = 64+1, // N-core and X86_X64_CAP_HT is falsely set
|
||||
|
||||
// extended (edx)
|
||||
X86_X64_CAP_AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
|
||||
X86_X64_CAP_AMD_MMX_EXT = 96+22,
|
||||
X86_X64_CAP_AMD_3DNOW_PRO = 96+30,
|
||||
X86_X64_CAP_AMD_3DNOW = 96+31
|
||||
};
|
||||
|
||||
/**
|
||||
* @return whether the CPU supports the indicated x86_x64_Cap / feature flag.
|
||||
**/
|
||||
LIB_API bool x86_x64_cap(x86_x64_Cap cap);
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// stateless
|
||||
|
||||
/**
|
||||
* @return APIC ID of the currently executing processor.
|
||||
*
|
||||
* the implementation uses CPUID.1 and only works on >= 8th generation CPUs;
|
||||
* (P4/Athlon XP); otherwise it returns 0. the alternative of accessing the
|
||||
* APIC mmio registers is not feasible - mahaf_MapPhysicalMemory only works
|
||||
* reliably on WinXP. also, the OS already has the APIC registers mapped and
|
||||
* in constant use, and we don't want to interfere.
|
||||
**/
|
||||
LIB_API u8 x86_x64_ApicId();
|
||||
|
||||
/**
|
||||
* @return the current value of the TimeStampCounter (a counter of
|
||||
* CPU cycles since power-on, which is useful for high-resolution timing
|
||||
* but potentially differs between multiple CPUs)
|
||||
**/
|
||||
LIB_API u64 x86_x64_rdtsc();
|
||||
|
||||
/**
|
||||
* trigger a breakpoint inside this function when it is called.
|
||||
**/
|
||||
LIB_API void x86_x64_DebugBreak(void);
|
||||
|
||||
#endif // #ifndef INCLUDED_X86_X64
|
@ -186,7 +186,7 @@ class TestMultithread : public CxxTest::TestSuite
|
||||
break;
|
||||
|
||||
case TA_SLEEP:
|
||||
usleep(sleep_duration_ms*1000);
|
||||
usleep(useconds_t(sleep_duration_ms*1000));
|
||||
break;
|
||||
|
||||
default:
|
||||
|
@ -36,7 +36,7 @@ public:
|
||||
if(x == 1) ones++;
|
||||
if(x == 2) twos++;
|
||||
}
|
||||
TS_ASSERT_EQUALS(ones+twos, 100);
|
||||
TS_ASSERT_EQUALS(ones+twos, size_t(100));
|
||||
TS_ASSERT(ones > 10 && twos > 10);
|
||||
}
|
||||
};
|
||||
|
@ -25,8 +25,8 @@
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include "lib/config2.h" // CONFIG2_TIMER_ALLOW_RDTSC
|
||||
#if ARCH_IA32 && CONFIG2_TIMER_ALLOW_RDTSC
|
||||
# include "lib/sysdep/ia32/ia32.h" // ia32_rdtsc
|
||||
#if (ARCH_IA32 || ARCH_AMD64) && CONFIG2_TIMER_ALLOW_RDTSC
|
||||
# include "lib/sysdep/x86_x64/x86_x64.h" // x86_x64_rdtsc
|
||||
#endif
|
||||
|
||||
#if OS_UNIX || OS_WIN
|
||||
@ -177,7 +177,7 @@ void TimerUnit::SetToZero()
|
||||
|
||||
void TimerUnit::SetFromTimer()
|
||||
{
|
||||
m_ticks = ia32_rdtsc();
|
||||
m_ticks = x86_x64_rdtsc();
|
||||
}
|
||||
|
||||
void TimerUnit::AddDifference(TimerUnit t0, TimerUnit t1)
|
||||
|
@ -8,7 +8,8 @@
|
||||
#include "lib/allocators/shared_ptr.h"
|
||||
#include "lib/sysdep/gfx.h"
|
||||
#include "lib/sysdep/snd.h"
|
||||
#include "lib/sysdep/cpu.h"
|
||||
#include "lib/sysdep/os_cpu.h"
|
||||
#include "lib/sysdep/x86_x64/topology.h"
|
||||
#include "lib/tex/tex.h"
|
||||
#include "lib/file/io/io_align.h" // BLOCK_SIZE
|
||||
|
||||
@ -87,7 +88,7 @@ void WriteSystemInfo()
|
||||
fprintf(f, "\n");
|
||||
|
||||
// memory
|
||||
fprintf(f, "Memory : %lu MiB; %lu MiB free\n", cpu_MemorySize(CPU_MEM_TOTAL)/MiB, cpu_MemorySize(CPU_MEM_AVAILABLE)/MiB);
|
||||
fprintf(f, "Memory : %lu MiB; %lu MiB free\n", os_cpu_MemorySize()/MiB, os_cpu_MemoryAvailable()/MiB);
|
||||
|
||||
// graphics
|
||||
fprintf(f, "Graphics Card : %s\n", gfx_card);
|
||||
|
Loading…
Reference in New Issue
Block a user