
add NUMA and shared-L2-cache detection code (required at work)

enable most of the IA-32-specific code to be used on AMD64 (it now
resides in the directory lib/sysdep/x86_x64)

bits: add IsBitSet
remove mem_PageSize (use os_cpu_PageSize instead)
cpuid: change interface so that later subfunctions requiring input
parameters can be supported gracefully (see the sketch after this list)
amd64_asm.asm: add amd64 implementation of cpuid
cpu: move functions provided by OS to sysdep/os_cpu.cpp
cpu topology: avoid trouble when process affinity is restricted, by
remapping processor numbers to [0, PopulationCount(processAffinity))
topology.cpp: move ex-ia32 topology code here.
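
(Illustrative sketch, not part of the commit itself: the reworked interface takes a register block in which the caller fills eax and, for subfunctions such as CPUID.4, also ecx; the function returns false if that subfunction is unsupported. The names below are the x86_x64 counterparts of the Ia32 declarations visible further down in this diff.)

x86_x64_CpuidRegs regs;
regs.eax = 4;	// deterministic cache parameters (Intel only)
regs.ecx = 0;	// subfunction: first cache level
if(x86_x64_cpuid(&regs))
{
	const size_t coresPerPackage = bits(regs.eax, 26, 31)+1;	// cf. DetectCoresPerPackage
}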

This was SVN commit r5945.
janwas 2008-05-12 18:15:08 +00:00
parent 7152e4a3e6
commit ffdff6888d
29 changed files with 2091 additions and 1062 deletions

View File

@ -12,7 +12,7 @@
#include "maths/MathUtil.h"
#include "graphics/SColor.h"
#include "lib/sysdep/ia32/ia32.h"
#include "lib/sysdep/x86_x64/x86_x64.h"
static u32 fallback_ConvertRGBColorTo4ub(const RGBColor& src)
{
@ -39,7 +39,7 @@ void ColorActivateFastImpl()
{
}
#if ARCH_IA32
else if (ia32_cap(IA32_CAP_SSE))
else if (x86_x64_cap(X86_X64_CAP_SSE))
{
ConvertRGBColorTo4ub = sse_ConvertRGBColorTo4ub;
}

View File

@ -13,23 +13,17 @@
#include "lib/bits.h" // round_up
#include "lib/posix/posix_mman.h"
#include "lib/sysdep/cpu.h" // cpu_PageSize
#include "lib/sysdep/os_cpu.h" // os_cpu_PageSize
size_t mem_PageSize()
{
static const size_t page_size = cpu_PageSize();
return page_size;
}
bool mem_IsPageMultiple(uintptr_t x)
{
return (x & (mem_PageSize()-1)) == 0;
return (x & (os_cpu_PageSize()-1)) == 0;
}
size_t mem_RoundUpToPage(size_t size)
{
return round_up(size, mem_PageSize());
return round_up(size, os_cpu_PageSize());
}
size_t mem_RoundUpToAlignment(size_t size)

View File

@ -11,14 +11,6 @@
#ifndef INCLUDED_MEM_UTIL
#define INCLUDED_MEM_UTIL
/**
* @return page size
*
* (this routine caches the result of cpu_PageSize and ensures the value
* is available before static initializers have run.)
**/
extern size_t mem_PageSize();
extern bool mem_IsPageMultiple(uintptr_t x);
extern size_t mem_RoundUpToPage(size_t size);

View File

@ -25,6 +25,13 @@
**/
#define BIT64(n) (1ull << (n))
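/**
 * @return whether bit number <index> of <value> is set,
 * e.g. IsBitSet(0x05u, 2) is true while IsBitSet(0x05u, 1) is false.
 **/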
template<typename T>
bool IsBitSet(T value, size_t index)
{
const T bit = T(1) << index;
return (value & bit) != 0;
}
// these are declared in the header and inlined to aid compiler optimizations
// (they can easily end up being time-critical).

View File

@ -18,7 +18,7 @@
#include "lib/allocators/allocators.h"
#include "lib/allocators/shared_ptr.h"
#include "lib/allocators/headerless.h"
#include "lib/allocators/mem_util.h" // mem_PageSize
#include "lib/sysdep/os_cpu.h" // os_cpu_PageSize
//-----------------------------------------------------------------------------

View File

@ -0,0 +1,36 @@
; =========================================================================
; File : amd64_asm.asm
; Project : 0 A.D.
; Description :
; =========================================================================
; license: GPL; see lib/license.txt
; extern "C" void __cdecl amd64_asm_cpuid(Ia32CpuidRegs* reg);
; reference: http://softwarecommunity.intel.com/articles/eng/2669.htm
PUBLIC amd64_asm_cpuid
.CODE
ALIGN 8
amd64_asm_cpuid PROC FRAME
sub rsp, 32
.allocstack 32
push rbx
.pushreg rbx
.endprolog
mov r8, rcx
mov eax, DWORD PTR [r8+0]
mov ecx, DWORD PTR [r8+8]
cpuid
mov DWORD PTR [r8+0], eax
mov DWORD PTR [r8+4], ebx
mov DWORD PTR [r8+8], ecx
mov DWORD PTR [r8+12], edx
pop rbx
add rsp, 32
ret
ALIGN 8
amd64_asm_cpuid ENDP
_TEXT ENDS

View File

@ -14,4 +14,3 @@
ERROR_ASSOCIATE(ERR::CPU_FEATURE_MISSING, "This CPU doesn't support a required feature", -1);
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_OPCODE, "Disassembly failed", -1);
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_VENDOR, "CPU vendor unknown", -1);
ERROR_ASSOCIATE(ERR::CPU_RESTRICTED_AFFINITY, "Cannot set desired CPU affinity", -1);

View File

@ -16,15 +16,9 @@ namespace ERR
const LibError CPU_FEATURE_MISSING = -130000;
const LibError CPU_UNKNOWN_OPCODE = -130001;
const LibError CPU_UNKNOWN_VENDOR = -130002;
const LibError CPU_RESTRICTED_AFFINITY = -130003;
}
// (some of these functions may be implemented in external asm files)
#ifdef __cplusplus
extern "C" {
#endif
//-----------------------------------------------------------------------------
// CPU detection
@ -44,52 +38,6 @@ LIB_API const char* cpu_IdentifierString();
**/
LIB_API double cpu_ClockFrequency();
/**
* @return the number of what the OS deems "processors" or -1 on failure.
*
* this is used by ia32 when it cannot determine the number via APIC IDs.
* in other situations, the cpu_NumPackages function is preferable since
* it is more specific.
*
* note: this function is necessary because POSIX sysconf _SC_NPROCESSORS_CONF
* is not supported on MacOSX, else we would use that.
**/
LIB_API size_t cpu_NumProcessors();
/**
* @return number of *enabled* CPU packages / sockets.
**/
LIB_API size_t cpu_NumPackages();
/**
* @return number of *enabled* CPU cores per package.
* (2 on dual-core systems)
**/
LIB_API size_t cpu_CoresPerPackage();
/**
* @return number of *enabled* hyperthreading units per core.
* (2 on P4 EE)
**/
LIB_API size_t cpu_LogicalPerCore();
/**
* @return the size [bytes] of a MMU page.
* (4096 on most IA-32 systems)
**/
LIB_API size_t cpu_PageSize();
enum CpuMemoryIndicators
{
CPU_MEM_TOTAL,
CPU_MEM_AVAILABLE
};
/**
* @return the amount [bytes] of available or total physical memory.
**/
LIB_API size_t cpu_MemorySize(CpuMemoryIndicators mem_type);
//-----------------------------------------------------------------------------
// lock-free support routines
@ -105,6 +53,16 @@ LIB_API size_t cpu_MemorySize(CpuMemoryIndicators mem_type);
**/
LIB_API bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t newValue);
/**
* specialization of cpu_CAS for pointer types. this avoids error-prone
* casting in user code.
**/
template<typename T>
bool cpu_CAS(volatile T* location, T expected, T new_value)
{
return cpu_CAS((volatile uintptr_t*)location, (uintptr_t)expected, (uintptr_t)new_value);
}
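// illustrative usage (not part of this diff): given a hypothetical 'Node* volatile head',
// cpu_CAS(&head, oldHead, newHead) swaps in newHead without manual uintptr_t casts.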
/**
* add a signed value to a variable without the possibility of interference
* from other threads/CPUs.
@ -130,17 +88,6 @@ LIB_API void cpu_MemoryFence();
**/
LIB_API void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size);
/**
* execute the specified function once on each CPU.
* this includes logical HT units and proceeds serially (function
* is never re-entered) in order of increasing OS CPU ID.
* note: implemented by switching thread affinity masks and forcing
* a reschedule, which is apparently not possible with POSIX.
*
* may fail if e.g. OS is preventing us from running on some CPUs.
**/
typedef void (*CpuCallback)(void* param);
LIB_API LibError cpu_CallByEachCPU(CpuCallback cb, void* param);
/**
* set the FPU control word to "desirable" values (see implementation)
@ -155,19 +102,4 @@ LIB_API void cpu_ConfigureFloatingPoint();
#define cpu_i32FromDouble(d) ((i32)d)
#define cpu_i64FromDouble(d) ((i64)d)
#ifdef __cplusplus
}
#endif
/**
* specialization of cpu_CAS for pointer types. this avoids error-prone
* casting in user code.
**/
template<typename T>
bool cpu_CAS(volatile T* location, T expected, T new_value)
{
return cpu_CAS((volatile uintptr_t*)location, (uintptr_t)expected, (uintptr_t)new_value);
}
#endif // #ifndef INCLUDED_CPU

View File

@ -2,7 +2,7 @@
* =========================================================================
* File : ia32.cpp
* Project : 0 A.D.
* Description : C++ and inline asm implementations of IA-32 functions
* Description : routines specific to IA-32
* =========================================================================
*/
@ -11,715 +11,11 @@
#include "precompiled.h"
#include "ia32.h"
#include <string.h>
#include <stdio.h>
#include <vector>
#include <set>
#include <algorithm>
#include "lib/posix/posix.h" // pthread
#include "lib/bits.h"
#include "lib/timer.h"
#include "lib/sysdep/cpu.h"
#include "ia32_memcpy.h"
#include "ia32_asm.h"
#include "../amd64/amd64_asm.h"
#include <intrin.h>
#if !MSC_VERSION && !GCC_VERSION
# error we currently only support MSC/ICC or GCC
#endif
// note: unfortunately the MSC __cpuid intrinsic does not allow passing
// additional inputs (e.g. ecx = count), so we need to implement this
// in assembly for both IA-32 and AMD64.
static void cpuid_impl(Ia32CpuidRegs* regs)
{
#if ARCH_IA32
ia32_asm_cpuid(regs);
#else // i.e. ARCH_AMD64
amd64_asm_cpuid(regs);
#endif
}
bool ia32_cpuid(Ia32CpuidRegs* regs)
{
static u32 maxFunction;
static u32 maxExtendedFunction;
if(!maxFunction)
{
regs->eax = 0;
cpuid_impl(regs);
maxFunction = regs->eax;
regs->eax = 0x80000000;
cpuid_impl(regs);
maxExtendedFunction = regs->eax;
}
const u32 function = regs->eax;
if(function > maxExtendedFunction)
return false;
if(function < 0x80000000 && function > maxFunction)
return false;
cpuid_impl(regs);
return true;
}
//-----------------------------------------------------------------------------
// capability bits
static void DetectFeatureFlags(u32 caps[4])
{
Ia32CpuidRegs regs;
regs.eax = 1;
if(ia32_cpuid(&regs))
{
caps[0] = regs.ecx;
caps[1] = regs.edx;
}
regs.eax = 0x80000001;
if(ia32_cpuid(&regs))
{
caps[2] = regs.ecx;
caps[3] = regs.edx;
}
}
bool ia32_cap(IA32Cap cap)
{
// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
// keep in sync with enum CpuCap!
static u32 ia32_caps[4];
// (since relevant CPUs will surely advertise at least one standard flag,
// they are zero iff we haven't been initialized yet)
if(!ia32_caps[1])
DetectFeatureFlags(ia32_caps);
const size_t tbl_idx = cap >> 5;
const size_t bit_idx = cap & 0x1f;
if(tbl_idx > 3)
{
DEBUG_WARN_ERR(ERR::INVALID_PARAM);
return false;
}
return (ia32_caps[tbl_idx] & BIT(bit_idx)) != 0;
}
//-----------------------------------------------------------------------------
// CPU identification
static Ia32Vendor DetectVendor()
{
Ia32CpuidRegs regs;
regs.eax = 0;
if(!ia32_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
// copy regs to string
// note: 'strange' ebx,edx,ecx reg order is due to ModR/M encoding order.
char vendor_str[13];
u32* vendor_str_u32 = (u32*)vendor_str;
vendor_str_u32[0] = regs.ebx;
vendor_str_u32[1] = regs.edx;
vendor_str_u32[2] = regs.ecx;
vendor_str[12] = '\0'; // 0-terminate
if(!strcmp(vendor_str, "AuthenticAMD"))
return IA32_VENDOR_AMD;
else if(!strcmp(vendor_str, "GenuineIntel"))
return IA32_VENDOR_INTEL;
else
{
DEBUG_WARN_ERR(ERR::CPU_UNKNOWN_VENDOR);
return IA32_VENDOR_UNKNOWN;
}
}
Ia32Vendor ia32_Vendor()
{
static Ia32Vendor vendor = IA32_VENDOR_UNKNOWN;
if(vendor == IA32_VENDOR_UNKNOWN)
vendor = DetectVendor();
return vendor;
}
static void DetectSignature(size_t* model, size_t* family)
{
Ia32CpuidRegs regs;
regs.eax = 1;
if(!ia32_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
*model = bits(regs.eax, 4, 7);
*family = bits(regs.eax, 8, 11);
}
static size_t DetectGeneration()
{
size_t model, family;
DetectSignature(&model, &family);
switch(ia32_Vendor())
{
case IA32_VENDOR_AMD:
switch(family)
{
case 5:
if(model < 6)
return 5; // K5
else
return 6; // K6
case 6:
return 7; // K7 (Athlon)
case 0xF:
return 8; // K8 (Opteron)
}
break;
case IA32_VENDOR_INTEL:
switch(family)
{
case 5:
return 5; // Pentium
case 6:
if(model <= 0xD)
return 6; // Pentium Pro/II/III/M
else
return 8; // Core2Duo
case 0xF:
if(model <= 6)
return 7; // Pentium 4/D
}
break;
}
debug_assert(0); // unknown CPU generation
return family;
}
size_t ia32_Generation()
{
static size_t generation;
if(!generation)
generation = DetectGeneration();
return generation;
}
//-----------------------------------------------------------------------------
// identifier string
/// functor to remove substrings from the CPU identifier string
class StringStripper
{
char* m_string;
size_t m_max_chars;
public:
StringStripper(char* string, size_t max_chars)
: m_string(string), m_max_chars(max_chars)
{
}
// remove all instances of substring from m_string
void operator()(const char* substring)
{
const size_t substring_length = strlen(substring);
for(;;)
{
char* substring_pos = strstr(m_string, substring);
if(!substring_pos)
break;
const size_t substring_ofs = substring_pos - m_string;
const size_t num_chars = m_max_chars - substring_ofs - substring_length;
memmove(substring_pos, substring_pos+substring_length, num_chars);
}
}
};
static void DetectIdentifierString(char* identifierString, size_t maxChars)
{
// get brand string (if available)
char* pos = identifierString;
bool have_brand_string = true;
for(u32 function = 0x80000002; function <= 0x80000004; function++)
{
Ia32CpuidRegs regs;
regs.eax = function;
have_brand_string &= ia32_cpuid(&regs);
memcpy(pos, &regs, 16);
pos += 16;
}
// fall back to manual detect of CPU type because either:
// - CPU doesn't support brand string (we use a flag to indicate this
// rather than comparing against a default value because it is safer);
// - the brand string is useless, e.g. "Unknown". this happens on
// some older boards whose BIOS reprograms the string for CPUs it
// doesn't recognize.
if(!have_brand_string || strncmp(identifierString, "Unknow", 6) == 0)
{
size_t model, family;
DetectSignature(&model, &family);
switch(ia32_Vendor())
{
case IA32_VENDOR_AMD:
// everything else is either too old, or should have a brand string.
if(family == 6)
{
if(model == 3 || model == 7)
strcpy_s(identifierString, maxChars, "AMD Duron");
else if(model <= 5)
strcpy_s(identifierString, maxChars, "AMD Athlon");
else
{
if(ia32_cap(IA32_CAP_AMD_MP))
strcpy_s(identifierString, maxChars, "AMD Athlon MP");
else
strcpy_s(identifierString, maxChars, "AMD Athlon XP");
}
}
break;
case IA32_VENDOR_INTEL:
// everything else is either too old, or should have a brand string.
if(family == 6)
{
if(model == 1)
strcpy_s(identifierString, maxChars, "Intel Pentium Pro");
else if(model == 3 || model == 5)
strcpy_s(identifierString, maxChars, "Intel Pentium II");
else if(model == 6)
strcpy_s(identifierString, maxChars, "Intel Celeron");
else
strcpy_s(identifierString, maxChars, "Intel Pentium III");
}
break;
}
}
// identifierString already holds a valid brand string; pretty it up.
else
{
const char* const undesired_strings[] = { "(tm)", "(TM)", "(R)", "CPU " };
std::for_each(undesired_strings, undesired_strings+ARRAY_SIZE(undesired_strings),
StringStripper(identifierString, strlen(identifierString)+1));
// note: Intel brand strings include a frequency, but we can't rely
// on it because the CPU may be overclocked. we'll leave it in the
// string to show measurement accuracy and if SpeedStep is active.
}
}
const char* cpu_IdentifierString()
{
// 3 calls x 4 registers x 4 bytes = 48
static char identifierString[48+1] = {'\0'};
if(identifierString[0] == '\0')
DetectIdentifierString(identifierString, ARRAY_SIZE(identifierString));
return identifierString;
}
//-----------------------------------------------------------------------------
// CPU frequency
// set scheduling priority and restore when going out of scope.
class ScopedSetPriority
{
int m_old_policy;
sched_param m_old_param;
public:
ScopedSetPriority(int new_priority)
{
// get current scheduling policy and priority
pthread_getschedparam(pthread_self(), &m_old_policy, &m_old_param);
// set new priority
sched_param new_param = {0};
new_param.sched_priority = new_priority;
pthread_setschedparam(pthread_self(), SCHED_FIFO, &new_param);
}
~ScopedSetPriority()
{
// restore previous policy and priority.
pthread_setschedparam(pthread_self(), m_old_policy, &m_old_param);
}
};
// note: this function uses timer.cpp!timer_Time, which is implemented via
// whrt.cpp on Windows, which again calls ia32_Init. be careful that
// this function isn't called from there as well, else WHRT will be used
// before its init completes.
double ia32_ClockFrequency()
{
// if the TSC isn't available, there's really no good way to count the
// actual CPU clocks per known time interval, so bail.
// note: loop iterations ("bogomips") are not a reliable measure due
// to differing IPC and compiler optimizations.
if(!ia32_cap(IA32_CAP_TSC))
return -1.0; // impossible value
// increase priority to reduce interference while measuring.
const int priority = sched_get_priority_max(SCHED_FIFO)-1;
ScopedSetPriority ssp(priority);
// note: no need to "warm up" cpuid - it will already have been
// called several times by the time this code is reached.
// (background: it's used in ia32_rdtsc() to serialize instruction flow;
// the first call is documented to be slower on Intel CPUs)
int num_samples = 16;
// if the clock is low-res, take fewer samples so this doesn't take too long.
// balance measuring time (~ 10 ms) against accuracy (< 0.1% error -
// ok for using the TSC as a time reference)
if(timer_Resolution() >= 1e-3)
num_samples = 8;
std::vector<double> samples(num_samples);
for(int i = 0; i < num_samples; i++)
{
double dt;
i64 dc; // i64 because VC6 can't convert u64 -> double,
// and we don't need all 64 bits.
// count # of clocks in max{1 tick, 1 ms}:
// .. wait for start of tick.
const double t0 = timer_Time();
u64 c1; double t1;
do
{
// note: timer_Time effectively has a long delay (up to 5 us)
// before returning the time. we call it before ia32_rdtsc to
// minimize the delay between actually sampling time / TSC,
// thus decreasing the chance for interference.
// (if unavoidable background activity, e.g. interrupts,
// delays the second reading, inaccuracy is introduced).
t1 = timer_Time();
c1 = ia32_rdtsc();
}
while(t1 == t0);
// .. wait until start of next tick and at least 1 ms elapsed.
do
{
const double t2 = timer_Time();
const u64 c2 = ia32_rdtsc();
dc = (i64)(c2 - c1);
dt = t2 - t1;
}
while(dt < 1e-3);
// .. freq = (delta_clocks) / (delta_seconds);
// ia32_rdtsc/timer overhead is negligible.
const double freq = dc / dt;
samples[i] = freq;
}
std::sort(samples.begin(), samples.end());
// median filter (remove upper and lower 25% and average the rest).
// note: don't just take the lowest value! it could conceivably be
// too low, if background processing delays reading c1 (see above).
double sum = 0.0;
const int lo = num_samples/4, hi = 3*num_samples/4;
for(int i = lo; i < hi; i++)
sum += samples[i];
const double clock_frequency = sum / (hi-lo);
return clock_frequency;
}
//-----------------------------------------------------------------------------
// processor topology
u8 ia32_ApicId()
{
Ia32CpuidRegs regs;
regs.eax = 1;
if(!ia32_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const u8 apicId = (u8)bits(regs.ebx, 24, 31);
return apicId;
}
// OSes report hyperthreading units and cores as "processors". we need to
// drill down and find out the exact counts (for thread pool dimensioning
// and cache sharing considerations).
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
// logicalPerCore.
static size_t DetectCoresPerPackage()
{
Ia32CpuidRegs regs;
switch(ia32_Vendor())
{
case IA32_VENDOR_INTEL:
regs.eax = 4;
if(ia32_cpuid(&regs))
return bits(regs.eax, 26, 31)+1;
break;
case IA32_VENDOR_AMD:
regs.eax = 0x80000008;
if(ia32_cpuid(&regs))
return bits(regs.ecx, 0, 7)+1;
break;
}
return 1; // else: the CPU is single-core.
}
static size_t CoresPerPackage()
{
static size_t coresPerPackage = 0;
if(!coresPerPackage)
coresPerPackage = DetectCoresPerPackage();
return coresPerPackage;
}
static bool IsHyperthreadingCapable()
{
// definitely not
if(!ia32_cap(IA32_CAP_HT))
return false;
// AMD N-core systems falsely set the HT bit for compatibility reasons
// (don't bother resetting it, might confuse callers)
if(ia32_Vendor() == IA32_VENDOR_AMD && ia32_cap(IA32_CAP_AMD_CMP_LEGACY))
return false;
return true;
}
static size_t DetectLogicalPerCore()
{
if(!IsHyperthreadingCapable())
return 1;
Ia32CpuidRegs regs;
regs.eax = 1;
if(!ia32_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const size_t logicalPerPackage = bits(regs.ebx, 16, 23);
// cores ought to be uniform WRT # logical processors
debug_assert(logicalPerPackage % CoresPerPackage() == 0);
return logicalPerPackage / CoresPerPackage();
}
static size_t LogicalPerCore()
{
static size_t logicalPerCore = 0;
if(!logicalPerCore)
logicalPerCore = DetectLogicalPerCore();
return logicalPerCore;
}
// the above two functions give the maximum number of cores/logical units.
// however, some of them may actually be disabled by the BIOS!
// what we can do is to analyze the APIC IDs. they are allocated sequentially
// for all "processors". treating the IDs as variable-width bitfields
// (according to the number of cores/logical units present) allows
// determining the exact topology as well as number of packages.
// these are set by DetectProcessorTopology.
static size_t numPackages = 0; // i.e. sockets; > 1 => true SMP system
static size_t enabledCoresPerPackage = 0;
static size_t enabledLogicalPerCore = 0; // hyperthreading units
typedef std::vector<u8> Ids;
typedef std::set<u8> IdSet;
// add the currently running processor's APIC ID to a list of IDs.
static void StoreApicId(void* param)
{
Ids* apicIds = (Ids*)param;
apicIds->push_back(ia32_ApicId());
}
// field := a range of bits sufficient to represent <num_values> integers.
// for each id in apicIds: extract the value of the field at offset bit_pos
// and insert it into ids. afterwards, adjust bit_pos to the next field.
// used to gather e.g. all core IDs from all APIC IDs.
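// worked example (illustrative addition): with LogicalPerCore() == 2 and CoresPerPackage() == 2,
// each APIC ID is split as [package bits | core (1 bit) | logical (1 bit)], so the IDs 0..7
// yield 2 distinct logical IDs, 2 core IDs and 2 package IDs.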
static void ExtractFieldsIntoSet(const Ids& apicIds, size_t& bit_pos, size_t num_values, IdSet& ids)
{
const size_t id_bits = ceil_log2(num_values);
if(id_bits == 0)
return;
const u8 mask = bit_mask<u8>(id_bits);
for(size_t i = 0; i < apicIds.size(); i++)
{
const u8 apic_id = apicIds[i];
const u8 field = u8(apic_id >> bit_pos) & mask;
ids.insert(field);
}
bit_pos += id_bits;
}
// @return false if unavailable / no information can be returned.
static bool DetectProcessorTopologyViaApicIds()
{
// old APIC (see ia32_ApicId for details)
if(ia32_Generation() < 8)
return false;
// get the set of all APIC IDs
Ids apicIds;
// .. OS affinity support is missing or excludes us from some processors
if(cpu_CallByEachCPU(StoreApicId, &apicIds) != INFO::OK)
return false;
// .. if IDs aren't unique, cpu_CallByEachCPU is broken.
std::sort(apicIds.begin(), apicIds.end());
debug_assert(std::unique(apicIds.begin(), apicIds.end()) == apicIds.end());
// extract values from all 3 ID bitfields into separate sets
size_t bit_pos = 0;
IdSet logicalIds;
ExtractFieldsIntoSet(apicIds, bit_pos, LogicalPerCore(), logicalIds);
IdSet coreIds;
ExtractFieldsIntoSet(apicIds, bit_pos, CoresPerPackage(), coreIds);
IdSet packageIds;
ExtractFieldsIntoSet(apicIds, bit_pos, 0xFF, packageIds);
// (the set cardinality is representative of all packages/cores since
// their numbers are uniform across the system.)
numPackages = std::max((size_t)packageIds.size(), 1u);
enabledCoresPerPackage = std::max((size_t)coreIds .size(), 1u);
enabledLogicalPerCore = std::max((size_t)logicalIds.size(), 1u);
// note: even though APIC IDs are assigned sequentially, we can't make any
// assumptions about the values/ordering because we get them according to
// the CPU affinity mask, which is unknown.
return true;
}
static void GuessProcessorTopologyViaOsCount()
{
const size_t numProcessors = cpu_NumProcessors();
// note: we cannot hope to always return correct results since disabled
// cores/logical units cannot be distinguished from the situation of the
// OS simply not reporting them as "processors". unfortunately this
// function won't always only be called for older (#core = #logical = 1)
// systems because DetectProcessorTopologyViaApicIds may fail due to
// lack of OS support. what we'll do is assume nothing is disabled; this
// is reasonable because we care most about #packages. it's fine to assume
// more cores (without inflating the total #processors) because that
// count only indicates memory barriers etc. ought to be used.
enabledCoresPerPackage = CoresPerPackage();
enabledLogicalPerCore = LogicalPerCore();
const size_t numPackagesTimesLogical = numProcessors / CoresPerPackage();
debug_assert(numPackagesTimesLogical != 0); // otherwise processors didn't include cores, which would be stupid
numPackages = numPackagesTimesLogical / LogicalPerCore();
if(!numPackages) // processors didn't include logical units (reasonable)
numPackages = numPackagesTimesLogical;
}
// determine how many CoresPerPackage and LogicalPerCore are
// actually enabled and also count numPackages.
static void DetectProcessorTopology()
{
// authoritative, but requires newer CPU, and OS support.
if(DetectProcessorTopologyViaApicIds())
return; // success, we're done.
GuessProcessorTopologyViaOsCount();
}
size_t cpu_NumPackages()
{
if(!numPackages)
DetectProcessorTopology();
return (size_t)numPackages;
}
size_t cpu_CoresPerPackage()
{
if(!enabledCoresPerPackage)
DetectProcessorTopology();
return (size_t)enabledCoresPerPackage;
}
size_t cpu_LogicalPerCore()
{
if(!enabledLogicalPerCore)
DetectProcessorTopology();
return (size_t)enabledLogicalPerCore;
}
//-----------------------------------------------------------------------------
// misc stateless functions
u64 ia32_rdtsc()
{
#if MSC_VERSION
return (u64)__rdtsc();
#elif GCC_VERSION
// GCC supports "portable" assembly for both x86 and x86_64
volatile u32 lo, hi;
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
return u64_from_u32(hi, lo);
#endif
}
void ia32_DebugBreak()
{
#if MSC_VERSION
__debugbreak();
#elif GCC_VERSION
// note: this probably isn't necessary, since unix_debug_break
// (SIGTRAP) is most probably available if GCC_VERSION.
// we include it for completeness, though.
__asm__ __volatile__ ("int $3");
#endif
}
// enforce strong memory ordering.
void cpu_MemoryFence()
{
if(ia32_cap(IA32_CAP_SSE2))
_mm_mfence();
}
// checks if there is an IA-32 CALL instruction right before ret_addr.
// returns INFO::OK if so and ERR::FAIL if not.
// also attempts to determine the call target. if that is possible
// (directly addressed relative or indirect jumps), it is stored in
// target, which is otherwise 0.
//
// this is useful for walking the stack manually.
LibError ia32_GetCallTarget(void* ret_addr, void** target)
{
*target = 0;
@ -799,25 +95,17 @@ void cpu_ConfigureFloatingPoint()
}
//-----------------------------------------------------------------------------
// thunk functions for ia32_asm to allow DLL export
void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
{
ia32_asm_AtomicAdd(location, increment);
}
bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value)
{
return ia32_asm_CAS(location, expected, new_value);
}
void cpu_Serialize()
{
Ia32CpuidRegs regs;
regs.eax = 1;
ia32_cpuid(&regs); // CPUID serializes execution.
}
void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size)
{

View File

@ -2,7 +2,7 @@
* =========================================================================
* File : ia32.h
* Project : 0 A.D.
* Description : C++ and inline asm implementations of IA-32 functions
* Description : routines specific to IA-32
* =========================================================================
*/
@ -11,106 +11,10 @@
#ifndef INCLUDED_IA32
#define INCLUDED_IA32
#if !ARCH_IA32 && !ARCH_AMD64
#error "including ia32.h without ARCH_IA32=1 or ARCH_AMD64=1"
#if !ARCH_IA32
# error "including ia32.h without ARCH_IA32=1"
#endif
/**
* registers used/returned by ia32_cpuid
**/
struct Ia32CpuidRegs
{
u32 eax;
u32 ebx;
u32 ecx;
u32 edx;
};
/**
* invoke CPUID instruction.
* @param regs input/output registers.
* regs->eax must be set to the desired function.
* some functions (e.g. 4) require regs->ecx to be set as well.
* rationale: this interface (input/output structure vs. function parameters)
* avoids unnecessary copying/initialization if some inputs aren't needed
* and allows graceful expansion to functions that require further inputs.
* @return true on success or false if the sub-function isn't supported.
**/
extern bool ia32_cpuid(Ia32CpuidRegs* regs);
/**
* CPU vendor.
* (this is exposed because some CPUID functions are vendor-specific.)
* (an enum is easier to compare than the original string values.)
**/
enum Ia32Vendor
{
IA32_VENDOR_UNKNOWN,
IA32_VENDOR_INTEL,
IA32_VENDOR_AMD,
};
LIB_API Ia32Vendor ia32_Vendor();
/**
* @return the colloquial processor generation
* (5 = Pentium, 6 = Pentium Pro/II/III / K6, 7 = Pentium4 / Athlon, 8 = Core / Opteron)
**/
LIB_API size_t ia32_Generation();
/**
* bit indices of CPU capability flags (128 bits).
* values are defined by IA-32 CPUID feature flags - do not change!
**/
enum IA32Cap
{
// standard (ecx) - currently only defined by Intel
IA32_CAP_SSE3 = 0+0, // Streaming SIMD Extensions 3
IA32_CAP_EST = 0+7, // Enhanced Speedstep Technology
// standard (edx)
IA32_CAP_FPU = 32+0, // Floating Point Unit
IA32_CAP_TSC = 32+4, // TimeStamp Counter
IA32_CAP_CMOV = 32+15, // Conditional MOVe
IA32_CAP_TM_SCC = 32+22, // Thermal Monitoring and Software Controlled Clock
IA32_CAP_MMX = 32+23, // MultiMedia eXtensions
IA32_CAP_SSE = 32+25, // Streaming SIMD Extensions
IA32_CAP_SSE2 = 32+26, // Streaming SIMD Extensions 2
IA32_CAP_HT = 32+28, // HyperThreading
// extended (ecx)
IA32_CAP_AMD_CMP_LEGACY = 64+1, // N-core and IA32_CAP_HT is falsely set
// extended (edx)
IA32_CAP_AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
IA32_CAP_AMD_MMX_EXT = 96+22,
IA32_CAP_AMD_3DNOW_PRO = 96+30,
IA32_CAP_AMD_3DNOW = 96+31
};
/**
* @return whether the CPU supports the indicated IA32Cap / feature flag.
**/
LIB_API bool ia32_cap(IA32Cap cap);
//-----------------------------------------------------------------------------
// stateless
/**
* @return APIC ID of the currently executing processor.
*
* the implementation uses CPUID.1 and only works on >= 8th generation CPUs;
* (P4/Athlon XP); otherwise it returns 0. the alternative of accessing the
* APIC mmio registers is not feasible - mahaf_MapPhysicalMemory only works
* reliably on WinXP. also, the OS already has the APIC registers mapped and
* in constant use, and we don't want to interfere.
**/
LIB_API u8 ia32_ApicId();
/**
* check if there is an IA-32 CALL instruction right before ret_addr.
* @return INFO::OK if so and ERR::FAIL if not.
@ -123,45 +27,4 @@ LIB_API u8 ia32_ApicId();
**/
LIB_API LibError ia32_GetCallTarget(void* ret_addr, void** target);
/**
* @return the current value of the TimeStampCounter (a counter of
* CPU cycles since power-on, which is useful for high-resolution timing
* but potentially differs between multiple CPUs)
**/
LIB_API u64 ia32_rdtsc();
/**
* trigger a breakpoint inside this function when it is called.
**/
LIB_API void ia32_DebugBreak(void);
/// fpclassify return values
#define IA32_FP_NAN 0x0100
#define IA32_FP_NORMAL 0x0400
#define IA32_FP_INFINITE (IA32_FP_NAN | IA32_FP_NORMAL)
#define IA32_FP_ZERO 0x4000
#define IA32_FP_SUBNORMAL (IA32_FP_NORMAL | IA32_FP_ZERO)
// FPU control word (for ia32_asm_control87)
// .. Precision Control:
#define IA32_MCW_PC 0x0300
#define IA32_PC_24 0x0000
// .. Rounding Control:
#define IA32_MCW_RC 0x0C00
#define IA32_RC_NEAR 0x0000
#define IA32_RC_DOWN 0x0400
#define IA32_RC_UP 0x0800
#define IA32_RC_CHOP 0x0C00
// .. Exception Mask:
#define IA32_MCW_EM 0x003f
#define IA32_EM_INVALID BIT(0)
#define IA32_EM_DENORMAL BIT(1)
#define IA32_EM_ZERODIVIDE BIT(2)
#define IA32_EM_OVERFLOW BIT(3)
#define IA32_EM_UNDERFLOW BIT(4)
#define IA32_EM_INEXACT BIT(5)
#endif // #ifndef INCLUDED_IA32

View File

@ -17,7 +17,7 @@
; CPUID support
;-------------------------------------------------------------------------------
; extern "C" void __cdecl ia32_asm_cpuid(Ia32CpuidRegs* regs);
; extern "C" void __cdecl ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
global sym(ia32_asm_cpuid)
sym(ia32_asm_cpuid):
push ebx ; (clobbered by CPUID)
@ -90,7 +90,7 @@ round_bias dd 0.4999999
__SECT__
; extern "C" size_t __cdecl ia32_asm_control87(size_t new_cw, size_t mask);
; extern "C" u32 __cdecl ia32_asm_control87(u32 new_cw, u32 mask);
global sym(ia32_asm_control87)
sym(ia32_asm_control87):
push eax

View File

@ -15,29 +15,52 @@
extern "C" {
#endif
struct Ia32CpuidRegs;
extern void CALL_CONV ia32_asm_cpuid(Ia32CpuidRegs* regs);
struct x86_x64_CpuidRegs;
extern void CALL_CONV ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
extern void CALL_CONV ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
extern bool CALL_CONV ia32_asm_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
/// control87
// FPU control word
// .. Precision Control:
const u32 IA32_MCW_PC = 0x0300;
const u32 IA32_PC_24 = 0x0000;
// .. Rounding Control:
const u32 IA32_MCW_RC = 0x0C00;
const u32 IA32_RC_NEAR = 0x0000;
const u32 IA32_RC_DOWN = 0x0400;
const u32 IA32_RC_UP = 0x0800;
const u32 IA32_RC_CHOP = 0x0C00;
// .. Exception Mask:
const u32 IA32_MCW_EM = 0x3F;
const u32 IA32_EM_INVALID = 0x01;
const u32 IA32_EM_DENORMAL = 0x02;
const u32 IA32_EM_ZERODIVIDE = 0x04;
const u32 IA32_EM_OVERFLOW = 0x08;
const u32 IA32_EM_UNDERFLOW = 0x10;
const u32 IA32_EM_INEXACT = 0x20;
/**
* for all 1-bits in mask, update the corresponding FPU control word bits
* with the bit values in new_val.
* @return 0 to indicate success.
**/
extern size_t CALL_CONV ia32_asm_control87(size_t new_val, size_t mask);
extern u32 CALL_CONV ia32_asm_control87(u32 new_val, u32 mask);
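// illustrative usage (not part of this diff): ia32_asm_control87(IA32_RC_CHOP, IA32_MCW_RC)
// selects truncation as the rounding mode and leaves all other control-word bits unchanged.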
/// see POSIX fpclassify
/// POSIX fpclassify
#define IA32_FP_NAN 0x0100
#define IA32_FP_NORMAL 0x0400
#define IA32_FP_INFINITE (IA32_FP_NAN | IA32_FP_NORMAL)
#define IA32_FP_ZERO 0x4000
#define IA32_FP_SUBNORMAL (IA32_FP_NORMAL | IA32_FP_ZERO)
extern size_t CALL_CONV ia32_asm_fpclassifyd(double d);
extern size_t CALL_CONV ia32_asm_fpclassifyf(float f);
/// see POSIX rintf
/// POSIX rintf
extern float CALL_CONV ia32_asm_rintf(float);
extern double CALL_CONV ia32_asm_rint(double);
/// see POSIX fminf
/// POSIX fminf
extern float CALL_CONV ia32_asm_fminf(float, float);
extern float CALL_CONV ia32_asm_fmaxf(float, float);
@ -45,7 +68,6 @@ extern i32 CALL_CONV ia32_asm_i32FromFloat(float f);
extern i32 CALL_CONV ia32_asm_i32FromDouble(double d);
extern i64 CALL_CONV ia32_asm_i64FromDouble(double d);
/**
* write the current execution state (e.g. all register values) into
* (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).

87
source/lib/sysdep/numa.h Normal file
View File

@ -0,0 +1,87 @@
#ifndef INCLUDED_NUMA
#define INCLUDED_NUMA
//-----------------------------------------------------------------------------
// node topology
/**
* @return number of NUMA "nodes" (i.e. groups of CPUs with local memory).
**/
LIB_API size_t numa_NumNodes();
/**
* @return node number (zero-based) to which <processor> belongs.
**/
LIB_API size_t numa_NodeFromProcessor(size_t processor);
/**
* @return bit-mask of all processors constituting <node>.
**/
LIB_API uintptr_t numa_ProcessorMaskFromNode(size_t node);
//-----------------------------------------------------------------------------
// memory
/**
* @return bytes of memory available for allocation on <node>.
**/
LIB_API size_t numa_AvailableMemory(size_t node);
/**
* @return the ratio between maximum and minimum times that one processor
* from each node required to fill a globally allocated array.
* in other words, this is the maximum slowdown for NUMA-oblivious
* memory accesses. Microsoft guidelines require it to be <= 3.
**/
LIB_API double numa_Factor();
//-----------------------------------------------------------------------------
// allocator
/**
* simple allocator that "does the right thing" on NUMA systems - page frames
* will be taken from the node that first accesses them.
**/
LIB_API void* numa_Allocate(size_t size);
enum LargePageDisposition
{
LPD_DEFAULT,
LPD_ALWAYS,
LPD_NEVER
};
/**
* allocate memory from a specific node.
*
* @param node node number (zero-based)
* @param largePageDisposition - allows forcibly enabling/disabling the use
* of large pages; the default decision involves a heuristic.
* @param pageSize if non-zero, receives the size [bytes] of a single page
* out of those used to map the memory.
**/
LIB_API void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition = LPD_DEFAULT, size_t* pageSize = 0);
/**
* release memory that had been handed out by one of the above allocators.
**/
LIB_API void numa_Deallocate(void* mem);
#ifdef __cplusplus
// for use with shared_ptr
template<typename T>
struct numa_Deleter
{
void operator()(T* p) const
{
numa_Deallocate(p);
}
};
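// illustrative usage (not part of this diff), mirroring numa_Factor in wnuma.cpp:
//   shared_ptr<u8> buffer((u8*)numa_AllocateOnNode(16*MiB, 0), numa_Deleter<u8>());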
#endif
#endif // #ifndef INCLUDED_NUMA

View File

@ -0,0 +1,14 @@
/**
* =========================================================================
* File : os_cpu.cpp
* Project : 0 A.D.
* Description : OS-specific support functions relating to CPU and memory
* =========================================================================
*/
// license: GPL; see lib/license.txt
#include "precompiled.h"
#include "os_cpu.h"
ERROR_ASSOCIATE(ERR::OS_CPU_RESTRICTED_AFFINITY, "Cannot set desired CPU affinity", -1);

source/lib/sysdep/os_cpu.h (new file, 117 lines added)
View File

@ -0,0 +1,117 @@
/**
* =========================================================================
* File : os_cpu.h
* Project : 0 A.D.
* Description : OS-specific support functions relating to CPU and memory
* =========================================================================
*/
// license: GPL; see lib/license.txt
#ifndef INCLUDED_OS_CPU
#define INCLUDED_OS_CPU
namespace ERR
{
const LibError OS_CPU_RESTRICTED_AFFINITY = -130100;
}
//-----------------------------------------------------------------------------
// processor topology
// processor ID = [0, os_cpu_NumProcessors())
// they are a numbering of the bits of the process affinity mask where the
// least significant nonzero bit corresponds to ID 0.
// rationale: this spares users from having to deal with noncontiguous IDs,
// e.g. when administrative tools are used to restrict process affinity.
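// example (illustrative): a process affinity mask of 0x0A exposes OS processors 1 and 3,
// so os_cpu_NumProcessors() is 2 and processor IDs 0 and 1 refer to OS processors 1 and 3.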
/**
* @return bit mask of processors that exist and are available to
* this process.
* its population count is by definition equal to os_cpu_NumProcessors().
**/
LIB_API uintptr_t os_cpu_ProcessorMask();
/**
* @return the number of processors available to this process.
*
* note: this function is necessary because POSIX sysconf _SC_NPROCESSORS_CONF
* is not supported on MacOSX, else we would use that.
**/
LIB_API size_t os_cpu_NumProcessors();
// note: we do not provide an os_cpu_CurrentProcessor routine. that would
// require Windows 2003 or a lot of work. worse, its results would be
// worthless because they may change immediately afterwards. instead,
// the recommended approach is to pin OpenMP threads (whose ID can be
// queried) to the processor with the same number.
//-----------------------------------------------------------------------------
// CPU and memory characteristics
/**
* @return a rough estimate of the CPU clock frequency.
* this is usually accurate to a few MHz and is faster than measurement loops.
**/
LIB_API double os_cpu_ClockFrequency();
/**
* @return the size [bytes] of a MMU page (4096 on most IA-32 systems)
**/
LIB_API size_t os_cpu_PageSize();
/**
* @return the size [bytes] of a large MMU page (4 MiB on most IA-32 systems)
* or zero if they are not supported.
**/
LIB_API size_t os_cpu_LargePageSize();
/**
* @return the size [bytes] of physical memory.
**/
LIB_API size_t os_cpu_MemorySize();
/**
* @return the size [bytes] of currently available memory.
**/
LIB_API size_t os_cpu_MemoryAvailable();
//-----------------------------------------------------------------------------
// scheduling
/**
* restrict the current thread to a set of processors.
* it will not be rescheduled until a subsequent os_cpu_SetThreadAffinity*.
*
* @param processorMask a bit mask of acceptable processors
* (bit index i corresponds to processor i)
* @return the previous mask
**/
LIB_API uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask);
/**
* restrict the current thread to a single processor.
* it will not be rescheduled until a subsequent os_cpu_SetThreadAffinity*.
**/
LIB_API void os_cpu_SetThreadAffinity(size_t processor);
/**
* called by os_cpu_CallByEachCPU.
* @param processor ID of processor running the current thread for the
* duration of this function.
* @param cbData user-specified data passed through os_cpu_CallByEachCPU.
**/
typedef void (*OsCpuCallback)(size_t processor, uintptr_t cbData);
/**
* execute the specified function once on each processor.
* this proceeds serially (the callback is never reentered) in increasing
* order of processor ID.
* fails if process affinity prevents running on all processors.
**/
LIB_API LibError os_cpu_CallByEachCPU(OsCpuCallback cb, uintptr_t cbData);
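// illustrative usage (not part of this diff):
//   static void LogProcessor(size_t processor, uintptr_t cbData)
//   {
//       (void)cbData;
//       printf("now running on processor %d\n", (int)processor);
//   }
//   os_cpu_CallByEachCPU(LogProcessor, 0);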
#endif // #ifndef INCLUDED_OS_CPU

View File

@ -1,6 +1,6 @@
#include "lib/self_test.h"
#include "lib/sysdep/ia32/ia32.h"
#include "lib/sysdep/x86_x64/x86_x64.h"
// note: ia32_i??_from_*, ia32_rint*, ia32_fm??f are all tested within
// sysdep to avoid test duplication (both the ia32 versions and
@ -12,17 +12,17 @@ public:
void test_rdtsc()
{
// must increase monotonously
const u64 c1 = ia32_rdtsc();
const u64 c2 = ia32_rdtsc();
const u64 c3 = ia32_rdtsc();
const u64 c1 = x86_x64_rdtsc();
const u64 c2 = x86_x64_rdtsc();
const u64 c3 = x86_x64_rdtsc();
TS_ASSERT(c1 < c2 && c2 < c3);
}
void test_ia32_cap()
{
// make sure the really common/basic caps end up reported as true
TS_ASSERT(ia32_cap(IA32_CAP_FPU));
TS_ASSERT(ia32_cap(IA32_CAP_TSC));
TS_ASSERT(ia32_cap(IA32_CAP_MMX));
TS_ASSERT(x86_x64_cap(X86_X64_CAP_FPU));
TS_ASSERT(x86_x64_cap(X86_X64_CAP_TSC));
TS_ASSERT(x86_x64_cap(X86_X64_CAP_MMX));
}
};

View File

@ -9,20 +9,62 @@
// license: GPL; see lib/license.txt
#include "precompiled.h"
#include "../cpu.h"
#include "lib/sysdep/os_cpu.h"
#include "win.h"
#include "lib/bits.h"
#include "lib/module_init.h"
#ifdef _OPENMP
# include <omp.h>
#endif
static LibError ReadFrequencyFromRegistry(DWORD* freqMhz)
uintptr_t os_cpu_ProcessorMask()
{
static uintptr_t processorMask;
if(!processorMask)
{
const HANDLE hProcess = GetCurrentProcess();
DWORD_PTR processAffinity, systemAffinity;
const BOOL ok = GetProcessAffinityMask(hProcess, &processAffinity, &systemAffinity);
debug_assert(ok);
processorMask = processAffinity;
}
return processorMask;
}
size_t os_cpu_NumProcessors()
{
static size_t numProcessors;
if(!numProcessors)
{
numProcessors = PopulationCount(os_cpu_ProcessorMask());
// sanity check
SYSTEM_INFO si;
GetSystemInfo(&si); // guaranteed to succeed
debug_assert(numProcessors <= (size_t)si.dwNumberOfProcessors);
}
return numProcessors;
}
//-----------------------------------------------------------------------------
static LibError ReadFrequencyFromRegistry(DWORD& freqMhz)
{
HKEY hKey;
if(RegOpenKeyEx(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0, KEY_QUERY_VALUE, &hKey) != ERROR_SUCCESS)
return ERR::NO_SYS;
DWORD size = sizeof(*freqMhz);
LONG ret = RegQueryValueEx(hKey, "~MHz", 0, 0, (LPBYTE)freqMhz, &size);
DWORD size = sizeof(freqMhz);
LONG ret = RegQueryValueEx(hKey, "~MHz", 0, 0, (LPBYTE)&freqMhz, &size);
RegCloseKey(hKey);
@ -32,95 +74,232 @@ static LibError ReadFrequencyFromRegistry(DWORD* freqMhz)
return INFO::OK;
}
double cpu_ClockFrequency()
double os_cpu_ClockFrequency()
{
DWORD freqMhz;
if(ReadFrequencyFromRegistry(&freqMhz) < 0)
return -1.0;
static double clockFrequency;
if(clockFrequency == 0.0)
{
DWORD freqMhz;
if(ReadFrequencyFromRegistry(freqMhz) == INFO::OK)
clockFrequency = freqMhz * 1e6;
else
clockFrequency = -1.0;
}
const double clockFrequency = freqMhz * 1e6;
return clockFrequency;
}
size_t cpu_NumProcessors()
size_t os_cpu_PageSize()
{
SYSTEM_INFO si;
GetSystemInfo(&si); // can't fail
const size_t numProcessors = (size_t)si.dwNumberOfProcessors;
return numProcessors;
static size_t systemPageSize;
if(!systemPageSize)
{
SYSTEM_INFO si;
GetSystemInfo(&si); // guaranteed to succeed
systemPageSize = (size_t)si.dwPageSize;
}
return systemPageSize;
}
size_t cpu_PageSize()
size_t os_cpu_LargePageSize()
{
SYSTEM_INFO si;
GetSystemInfo(&si); // can't fail
const size_t pageSize = (size_t)si.dwPageSize;
return pageSize;
static size_t largePageSize = ~(size_t)0; // "0" has special significance
if(largePageSize == ~(size_t)0)
{
typedef SIZE_T (WINAPI *PGetLargePageMinimum)(void);
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
const PGetLargePageMinimum pGetLargePageMinimum = (PGetLargePageMinimum)GetProcAddress(hKernel32, "GetLargePageMinimum");
if(pGetLargePageMinimum)
{
largePageSize = pGetLargePageMinimum();
debug_assert(largePageSize != 0); // IA-32 and AMD64 definitely support large pages
debug_assert(largePageSize > os_cpu_PageSize());
}
// no OS support for large pages
else
largePageSize = 0;
}
return largePageSize;
}
size_t cpu_MemorySize(CpuMemoryIndicators mem_type)
static void GetMemoryStatus(MEMORYSTATUSEX& mse)
{
// note: we no longer bother dynamically importing GlobalMemoryStatusEx -
// it's available on Win2k and above. this function safely handles
// systems with > 4 GB of memory.
MEMORYSTATUSEX mse = { sizeof(mse) };
BOOL ok = GlobalMemoryStatusEx(&mse);
mse.dwLength = sizeof(mse);
const BOOL ok = GlobalMemoryStatusEx(&mse);
WARN_IF_FALSE(ok);
}
if(mem_type == CPU_MEM_TOTAL)
size_t os_cpu_MemorySize()
{
static size_t memorySize;
if(memorySize == 0)
{
size_t memoryTotal = (size_t)mse.ullTotalPhys;
MEMORYSTATUSEX mse;
GetMemoryStatus(mse);
memorySize = (size_t)mse.ullTotalPhys;
// Richter, "Programming Applications for Windows": the reported
// value doesn't include non-paged pool reserved during boot;
// it's not considered available to the kernel. (the amount is
// 528 KiB on a 512 MiB WinXP/Win2k machine). we'll round up
// to the nearest megabyte to fix this.
memoryTotal = round_up(memoryTotal, 1*MiB);
return memoryTotal;
memorySize = round_up(memorySize, 1*MiB);
}
return memorySize;
}
size_t os_cpu_MemoryAvailable()
{
MEMORYSTATUSEX mse;
GetMemoryStatus(mse);
const size_t memoryAvailable = (size_t)mse.ullAvailPhys;
return memoryAvailable;
}
//-----------------------------------------------------------------------------
/**
* maximum number of processors supported by the OS (determined by the
* number of bits in an affinity mask)
**/
static const DWORD maxProcessorNumber = sizeof(DWORD_PTR)*CHAR_BIT-1;
DWORD_PTR wcpu_AffinityFromProcessorMask(DWORD_PTR processAffinity, uintptr_t processorMask)
{
DWORD_PTR affinity = 0;
size_t processor = (size_t)-1;
for(DWORD processorNumber = 0; processorNumber <= maxProcessorNumber; processorNumber++)
{
if(IsBitSet(processAffinity, processorNumber))
{
++processor; // now corresponds to processorNumber
if(IsBitSet(processorMask, processor))
affinity |= DWORD_PTR(1) << processorNumber;
}
}
return affinity;
}
uintptr_t wcpu_ProcessorMaskFromAffinity(DWORD_PTR processAffinity, DWORD_PTR affinity)
{
uintptr_t processorMask = 0;
size_t processor = (size_t)-1;
for(DWORD processorNumber = 0; processorNumber <= maxProcessorNumber; processorNumber++)
{
if(IsBitSet(processAffinity, processorNumber))
{
++processor; // now corresponds to processorNumber
if(IsBitSet(affinity, processorNumber))
processorMask |= uintptr_t(1) << processor;
}
}
return processorMask;
}
static const DWORD invalidProcessorNumber = (DWORD)-1;
static DWORD CurrentProcessorNumber()
{
typedef DWORD (WINAPI *PGetCurrentProcessorNumber)(void);
static PGetCurrentProcessorNumber pGetCurrentProcessorNumber;
static bool initialized;
if(!initialized)
{
initialized = true;
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
// note: NtGetCurrentProcessorNumber and RtlGetCurrentProcessorNumber aren't
// implemented on WinXP SP2, so we can't use those either.
pGetCurrentProcessorNumber = (PGetCurrentProcessorNumber)GetProcAddress(hKernel32, "GetCurrentProcessorNumber");
}
if(pGetCurrentProcessorNumber)
return pGetCurrentProcessorNumber();
else
{
const size_t memoryAvailable = (size_t)mse.ullAvailPhys;
return memoryAvailable;
// note: we won't bother mapping APIC IDs to processor numbers or
// using LSL to re-implement GetCurrentProcessorNumber because
// this routine is just a debug aid.
return invalidProcessorNumber;
}
}
LibError cpu_CallByEachCPU(CpuCallback cb, void* param)
uintptr_t os_cpu_SetThreadAffinityMask(uintptr_t processorMask)
{
const HANDLE hProcess = GetCurrentProcess();
DWORD_PTR process_affinity, system_affinity;
if(!GetProcessAffinityMask(hProcess, &process_affinity, &system_affinity))
WARN_RETURN(ERR::FAIL);
// our affinity != system affinity: OS is limiting the CPUs that
// this process can run on. fail (cannot call back for each CPU).
if(process_affinity != system_affinity)
WARN_RETURN(ERR::CPU_RESTRICTED_AFFINITY);
debug_assert((processorMask >> os_cpu_NumProcessors()) == 0);
for(DWORD_PTR cpu_bit = 1; cpu_bit != 0 && cpu_bit <= process_affinity; cpu_bit *= 2)
DWORD_PTR processAffinity, systemAffinity;
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
debug_assert(ok);
const DWORD_PTR affinity = wcpu_AffinityFromProcessorMask(processAffinity, processorMask);
const DWORD_PTR previousAffinity = SetThreadAffinityMask(GetCurrentThread(), affinity);
debug_assert(previousAffinity != 0); // ensure function didn't fail
// hopefully reschedule our thread
Sleep(0);
// verify we're running on the correct processor
const DWORD currentProcessorNumber = CurrentProcessorNumber();
if(currentProcessorNumber != invalidProcessorNumber)
debug_assert(IsBitSet(affinity, currentProcessorNumber));
const uintptr_t previousProcessorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, previousAffinity);
return previousProcessorMask;
}
void os_cpu_SetThreadAffinity(size_t processor)
{
debug_assert(processor < os_cpu_NumProcessors());
const uintptr_t processorMask = uintptr_t(1) << processor;
(void)os_cpu_SetThreadAffinityMask(processorMask);
}
LibError os_cpu_CallByEachCPU(OsCpuCallback cb, uintptr_t cbData)
{
// ensure we are able to run on all system processors
DWORD_PTR processAffinity, systemAffinity;
{
// check if we can switch to target CPU
if(!(process_affinity & cpu_bit))
continue;
// .. and do so.
if(!SetThreadAffinityMask(GetCurrentThread(), cpu_bit))
{
WARN_ERR(ERR::CPU_RESTRICTED_AFFINITY);
continue;
}
// reschedule to make sure we switch CPUs.
Sleep(1);
cb(param);
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
debug_assert(ok);
if(processAffinity != systemAffinity)
WARN_RETURN(ERR::OS_CPU_RESTRICTED_AFFINITY);
}
// restore to original value
SetThreadAffinityMask(hProcess, process_affinity);
const uintptr_t previousAffinity = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
os_cpu_SetThreadAffinity(processor);
cb(processor, cbData);
}
(void)os_cpu_SetThreadAffinityMask(previousAffinity);
return INFO::OK;
}

View File

@ -0,0 +1,25 @@
/**
* =========================================================================
* File : wcpu.h
* Project : 0 A.D.
* Description : Windows backend of os_cpu
* =========================================================================
*/
// license: GPL; see lib/license.txt
#ifndef INCLUDED_WCPU
#define INCLUDED_WCPU
#include "win.h"
// "affinity" and "processorNumber" are what Windows sees.
// "processorMask" and "processor" are the idealized representation we expose
// to users. the latter insulates them from process affinity restrictions by
// defining IDs as indices of the nonzero bits within the process affinity.
// these routines are provided for the benefit of wnuma.
extern DWORD_PTR wcpu_AffinityFromProcessorMask(DWORD_PTR processAffinity, uintptr_t processorMask);
extern uintptr_t wcpu_ProcessorMaskFromAffinity(DWORD_PTR processAffinity, DWORD_PTR affinity);
#endif // #ifndef INCLUDED_WCPU

View File

@ -15,14 +15,9 @@
#include "lib/sysdep/win/win.h"
#include "lib/bits.h"
#if MSC_VERSION
# include <intrin.h>
# if !ICC_VERSION
# pragma intrinsic(__rdtsc)
# endif
#endif
#if ARCH_IA32
# include "lib/sysdep/ia32/ia32.h" // ia32_rdtsc
#if ARCH_IA32 || ARCH_AMD64
# include "lib/sysdep/x86_x64/x86_x64.h" // x86_x64_rdtsc
# include "lib/sysdep/x86_x64/topology.h"
#endif
@ -38,18 +33,18 @@ enum AmdPowerNowFlags
static bool IsThrottlingPossible()
{
#if ARCH_IA32
Ia32CpuidRegs regs;
switch(ia32_Vendor())
#if ARCH_IA32 || ARCH_AMD64
x86_x64_CpuidRegs regs;
switch(x86_x64_Vendor())
{
case IA32_VENDOR_INTEL:
if(ia32_cap(IA32_CAP_TM_SCC) || ia32_cap(IA32_CAP_EST))
case X86_X64_VENDOR_INTEL:
if(x86_x64_cap(X86_X64_CAP_TM_SCC) || x86_x64_cap(X86_X64_CAP_EST))
return true;
break;
case IA32_VENDOR_AMD:
case X86_X64_VENDOR_AMD:
regs.eax = 0x80000007;
if(ia32_cpuid(&regs))
if(x86_x64_cpuid(&regs))
{
if(regs.edx & (PN_FREQ_ID_CTRL|PN_SW_THERMAL_CTRL))
return true;
@ -57,9 +52,6 @@ static bool IsThrottlingPossible()
break;
}
return false;
#elif ARCH_AMD64
// not yet implemented - consider it unsafe.
return true;
#endif
}
@ -68,8 +60,8 @@ static bool IsThrottlingPossible()
LibError CounterTSC::Activate()
{
#if ARCH_IA32
if(!ia32_cap(IA32_CAP_TSC))
#if ARCH_IA32 || ARCH_AMD64
if(!x86_x64_cap(X86_X64_CAP_TSC))
return ERR::NO_SYS; // NOWARN (CPU doesn't support RDTSC)
#endif
@ -107,16 +99,16 @@ bool CounterTSC::IsSafe() const
if(cpu_NumPackages() != 1 || cpu_CoresPerPackage() != 1)
return false;
#if ARCH_IA32
#if ARCH_IA32 || ARCH_AMD64
// recent CPU:
if(ia32_Generation() >= 7)
if(x86_x64_Generation() >= 7)
{
// note: 8th generation CPUs support C1-clock ramping, which causes
// drift on multi-core systems, but those were excluded above.
Ia32CpuidRegs regs;
x86_x64_CpuidRegs regs;
regs.eax = 0x80000007;
if(ia32_cpuid(&regs))
if(x86_x64_cpuid(&regs))
{
// TSC is invariant WRT P-state, C-state and STPCLK => safe.
if(regs.edx & PN_INVARIANT_TSC)
@ -148,11 +140,7 @@ bool CounterTSC::IsSafe() const
u64 CounterTSC::Counter() const
{
#if MSC_VERSION
return __rdtsc();
#else
return ia32_rdtsc();
#endif
return x86_x64_rdtsc();
}
/**

View File

@ -0,0 +1,359 @@
#include "precompiled.h"
#include "lib/sysdep/numa.h"
#include "lib/bits.h" // round_up, PopulationCount
#include "lib/timer.h"
#include "lib/sysdep/os_cpu.h"
#include "win.h"
#include "wutil.h"
#include "wcpu.h"
#include <Psapi.h>
#ifdef _OPENMP
# include <omp.h>
#endif
//-----------------------------------------------------------------------------
// node topology
//-----------------------------------------------------------------------------
size_t numa_NumNodes()
{
static size_t numNodes;
if(!numNodes)
{
typedef BOOL (WINAPI *PGetNumaHighestNodeNumber)(PULONG highestNode);
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
const PGetNumaHighestNodeNumber pGetNumaHighestNodeNumber = (PGetNumaHighestNodeNumber)GetProcAddress(hKernel32, "GetNumaHighestNodeNumber");
if(pGetNumaHighestNodeNumber)
{
ULONG highestNode;
const BOOL ok = pGetNumaHighestNodeNumber(&highestNode);
debug_assert(ok);
debug_assert(highestNode < os_cpu_NumProcessors()); // #nodes <= #processors
numNodes = highestNode+1;
}
// NUMA not supported
else
numNodes = 1;
}
return numNodes;
}
// note: it is easier to implement this in terms of numa_ProcessorMaskFromNode
// rather than the other way around because wcpu provides the
// wcpu_ProcessorMaskFromAffinity helper. there is no similar function to
// convert processor to processorNumber.
size_t numa_NodeFromProcessor(size_t processor)
{
debug_assert(processor < os_cpu_NumProcessors());
static std::vector<size_t> processorsNode;
#ifdef _OPENMP
#pragma omp critical
#endif
if(processorsNode.empty())
{
processorsNode.resize(os_cpu_NumProcessors(), 0);
for(size_t node = 0; node < numa_NumNodes(); node++)
{
const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
if(IsBitSet(processorMask, processor))
processorsNode[processor] = node;
}
}
}
return processorsNode.at(processor);
}
uintptr_t numa_ProcessorMaskFromNode(size_t node)
{
debug_assert(node < numa_NumNodes());
static std::vector<uintptr_t> nodesProcessorMask;
#ifdef _OPENMP
#pragma omp critical
#endif
if(nodesProcessorMask.empty())
{
typedef BOOL (WINAPI *PGetNumaNodeProcessorMask)(UCHAR node, PULONGLONG affinity);
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
const PGetNumaNodeProcessorMask pGetNumaNodeProcessorMask = (PGetNumaNodeProcessorMask)GetProcAddress(hKernel32, "GetNumaNodeProcessorMask");
if(pGetNumaNodeProcessorMask)
{
DWORD_PTR processAffinity, systemAffinity;
const BOOL ok = GetProcessAffinityMask(GetCurrentProcess(), &processAffinity, &systemAffinity);
debug_assert(ok);
for(size_t node = 0; node < numa_NumNodes(); node++)
{
ULONGLONG affinity;
const BOOL ok = pGetNumaNodeProcessorMask((UCHAR)node, &affinity);
debug_assert(ok);
const uintptr_t processorMask = wcpu_ProcessorMaskFromAffinity(processAffinity, (DWORD_PTR)affinity);
nodesProcessorMask.push_back(processorMask);
}
}
// NUMA not supported - consider node 0 to consist of all system processors
else
nodesProcessorMask.push_back(os_cpu_ProcessorMask());
}
return nodesProcessorMask.at(node);
}
//-----------------------------------------------------------------------------
// memory info
//-----------------------------------------------------------------------------
size_t numa_AvailableMemory(size_t node)
{
debug_assert(node < numa_NumNodes());
// note: it is said that GetNumaAvailableMemoryNode sometimes incorrectly
// reports zero bytes. the actual cause may however be unexpected
// RAM configuration, e.g. not all slots filled.
typedef BOOL (WINAPI *PGetNumaAvailableMemoryNode)(UCHAR node, PULONGLONG availableBytes);
static PGetNumaAvailableMemoryNode pGetNumaAvailableMemoryNode;
if(!pGetNumaAvailableMemoryNode)
{
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
pGetNumaAvailableMemoryNode = (PGetNumaAvailableMemoryNode)GetProcAddress(hKernel32, "GetNumaAvailableMemoryNode");
}
if(pGetNumaAvailableMemoryNode)
{
ULONGLONG availableBytes;
const BOOL ok = pGetNumaAvailableMemoryNode((UCHAR)node, &availableBytes);
debug_assert(ok);
return (size_t)availableBytes;
}
// NUMA not supported - return available system memory
else
return os_cpu_MemoryAvailable();
}
double numa_Factor()
{
static double factor;
static bool initialized;
#ifdef _OPENMP
#pragma omp critical
#endif
if(!initialized)
{
initialized = true;
// if non-NUMA, skip the (expensive) measurements below.
if(numa_NumNodes() == 1)
factor = 1.0;
else
{
// allocate memory on one node
const size_t size = 16*MiB;
shared_ptr<u8> buffer((u8*)numa_AllocateOnNode(size, 0), numa_Deleter<u8>());
const uintptr_t previousProcessorMask = os_cpu_SetThreadAffinityMask(os_cpu_ProcessorMask());
// measure min/max fill times required by a processor from each node
double minTime = 1e10, maxTime = 0.0;
for(size_t node = 0; node < numa_NumNodes(); node++)
{
const uintptr_t processorMask = numa_ProcessorMaskFromNode(node);
os_cpu_SetThreadAffinityMask(processorMask);
const double startTime = timer_Time();
memset(buffer.get(), 0, size);
const double elapsedTime = timer_Time() - startTime;
minTime = std::min(minTime, elapsedTime);
maxTime = std::max(maxTime, elapsedTime);
}
(void)os_cpu_SetThreadAffinityMask(previousProcessorMask);
factor = maxTime / minTime;
}
debug_assert(factor >= 1.0);
debug_assert(factor <= 3.0); // (Microsoft guideline for NUMA systems)
}
return factor;
}
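// usage sketch (illustrative; the 1.3 threshold is an arbitrary example value,
// not a measured guideline): callers can use the factor to decide whether
// node-local allocation is worth the extra bookkeeping.
//
//	const bool preferNodeLocal = (numa_NumNodes() > 1 && numa_Factor() > 1.3);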
//-----------------------------------------------------------------------------
// allocator
//-----------------------------------------------------------------------------
void* numa_Allocate(size_t size)
{
void* const mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
if(!mem)
throw std::bad_alloc();
return mem;
}
static bool largePageAllocationTookTooLong = false;
static bool ShouldUseLargePages(LargePageDisposition disposition, size_t allocationSize, size_t node)
{
// can't, OS does not support large pages
if(os_cpu_LargePageSize() == 0)
return false;
// overrides
if(disposition == LPD_NEVER)
return false;
if(disposition == LPD_ALWAYS)
return true;
// default disposition: use a heuristic
{
// a previous attempt already took too long (Windows is apparently
// shoveling aside lots of memory).
if(largePageAllocationTookTooLong)
return false;
// allocation is rather small and would "only" use half of the
// TLBs for its pages.
if(allocationSize < 64/2 * os_cpu_PageSize())
return false;
// we want there to be plenty of memory available, otherwise the
// page frames are going to be terribly fragmented and even a
// single allocation would take SECONDS.
if(numa_AvailableMemory(node) < 2*GiB)
return false;
}
return true;
}
static bool VerifyPages(void* mem, size_t size, size_t pageSize, size_t node)
{
typedef BOOL (WINAPI *PQueryWorkingSetEx)(HANDLE hProcess, PVOID buffer, DWORD bufferSize);
static PQueryWorkingSetEx pQueryWorkingSetEx;
if(!pQueryWorkingSetEx)
{
const HMODULE hKernel32 = GetModuleHandle("kernel32.dll");
pQueryWorkingSetEx = (PQueryWorkingSetEx)GetProcAddress(hKernel32, "QueryWorkingSetEx");
if(!pQueryWorkingSetEx)
return true; // can't do anything
}
#if WINVER >= 0x600
// retrieve attributes of all pages constituting mem
const size_t numPages = (size + pageSize-1) / pageSize;
std::vector<PSAPI_WORKING_SET_EX_INFORMATION> wsi(numPages);
for(size_t i = 0; i < numPages; i++)
wsi[i].VirtualAddress = (u8*)mem + i*pageSize;
pQueryWorkingSetEx(GetCurrentProcess(), &wsi[0], (DWORD)(sizeof(PSAPI_WORKING_SET_EX_INFORMATION)*numPages));
// ensure each is valid and allocated on the correct node
for(size_t i = 0; i < numPages; i++)
{
const PSAPI_WORKING_SET_EX_BLOCK& attributes = wsi[i].VirtualAttributes;
if(!attributes.valid)
return false;
if(attributes.LargePage != (pageSize == os_cpu_LargePageSize()))
{
debug_printf("NUMA: is not a large page\n");
return false;
}
if(attributes.node != node)
{
debug_printf("NUMA: allocated from remote node\n");
return false;
}
}
#else
UNUSED2(mem);
UNUSED2(size);
UNUSED2(pageSize);
UNUSED2(node);
#endif
return true;
}
void* numa_AllocateOnNode(size_t size, size_t node, LargePageDisposition largePageDisposition, size_t* ppageSize)
{
debug_assert(node < numa_NumNodes());
// see if there will be enough memory (non-authoritative, for debug purposes only)
{
const size_t availableBytes = numa_AvailableMemory(node);
if(availableBytes < size)
debug_printf("NUMA: warning: node reports insufficient memory (%d vs %d)\n", availableBytes, size);
}
void* mem = 0;
size_t pageSize = 0;
// try allocating with large pages (reduces TLB misses)
if(ShouldUseLargePages(largePageDisposition, size, node))
{
const size_t largePageSize = os_cpu_LargePageSize();
const size_t paddedSize = round_up(size, largePageSize); // required by MEM_LARGE_PAGES
// note: this call can take SECONDS, which is why several checks are
// undertaken before we even try. these aren't authoritative, so we
// at least prevent future attempts if it takes too long.
const double startTime = timer_Time();
mem = VirtualAlloc(0, paddedSize, MEM_RESERVE|MEM_COMMIT|MEM_LARGE_PAGES, PAGE_READWRITE);
pageSize = largePageSize;
const double elapsedTime = timer_Time() - startTime;
debug_printf("TIMER| NUMA large page allocation: %g\n", elapsedTime);
if(elapsedTime > 1.0)
largePageAllocationTookTooLong = true;
}
// try (again) with regular pages
if(!mem)
{
mem = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
pageSize = os_cpu_PageSize();
}
// all attempts failed - we're apparently out of memory.
if(!mem)
throw std::bad_alloc();
// we can't use VirtualAllocExNuma - it's only available in Vista and Server 2008.
// workaround: fault in all pages now to ensure they are allocated from the
// current node, then verify page attributes.
// (note: VirtualAlloc's MEM_COMMIT only maps virtual pages and does not
// actually allocate page frames. Windows uses a first-touch heuristic -
// the page will be taken from the node whose processor caused the fault.)
memset(mem, 0, size);
VerifyPages(mem, size, pageSize, node);
if(ppageSize)
*ppageSize = pageSize;
return mem;
}
void numa_Deallocate(void* mem)
{
VirtualFree(mem, 0, MEM_RELEASE);
}
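// usage sketch (illustrative; <node> stands for the desired target node):
// since placement relies on first-touch from the calling thread (see note in
// numa_AllocateOnNode), the thread should already be running on that node.
// LPD_NEVER avoids the potentially slow large-page path.
//
//	(void)os_cpu_SetThreadAffinityMask(numa_ProcessorMaskFromNode(node));
//	size_t pageSize = 0;
//	void* buffer = numa_AllocateOnNode(16*MiB, node, LPD_NEVER, &pageSize);
//	// ... use buffer (its pages have already been faulted in) ...
//	numa_Deallocate(buffer);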

View File

@ -405,7 +405,7 @@ int aio_suspend(const struct aiocb* const cbs[], int n, const struct timespec* t
const BOOL waitAll = FALSE;
// convert timespec to milliseconds (ts == 0 => no timeout)
const DWORD timeout = ts? (DWORD)(ts->tv_sec*1000 + ts->tv_nsec/1000000) : INFINITE;
DWORD result = WaitForMultipleObjects(numPendingIos, hEvents, waitAll, timeout);
DWORD result = WaitForMultipleObjects((DWORD)numPendingIos, hEvents, waitAll, timeout);
for(size_t i = 0; i < numPendingIos; i++)
ResetEvent(hEvents[i]);

View File

@ -0,0 +1,442 @@
/**
* =========================================================================
* File : topology.cpp
* Project : 0 A.D.
* Description : detection of CPU and cache topology
* =========================================================================
*/
// license: GPL; see lib/license.txt
#include "precompiled.h"
#include "topology.h"
#include "lib/bits.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/os_cpu.h"
#include "x86_x64.h"
//-----------------------------------------------------------------------------
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
// logicalPerCore across all packages.
static size_t DetectCoresPerPackage()
{
x86_x64_CpuidRegs regs;
switch(x86_x64_Vendor())
{
case X86_X64_VENDOR_INTEL:
regs.eax = 4;
regs.ecx = 0;
if(x86_x64_cpuid(&regs))
return bits(regs.eax, 26, 31)+1;
break;
case X86_X64_VENDOR_AMD:
regs.eax = 0x80000008;
if(x86_x64_cpuid(&regs))
return bits(regs.ecx, 0, 7)+1;
break;
}
return 1; // else: the CPU is single-core.
}
static size_t CoresPerPackage()
{
static size_t coresPerPackage = 0;
if(!coresPerPackage)
coresPerPackage = DetectCoresPerPackage();
return coresPerPackage;
}
static bool IsHyperthreadingCapable()
{
// definitely not
if(!x86_x64_cap(X86_X64_CAP_HT))
return false;
// AMD N-core systems falsely set the HT bit for compatibility reasons
// (don't bother resetting it, might confuse callers)
if(x86_x64_Vendor() == X86_X64_VENDOR_AMD && x86_x64_cap(X86_X64_CAP_AMD_CMP_LEGACY))
return false;
return true;
}
static size_t DetectLogicalPerCore()
{
if(!IsHyperthreadingCapable())
return 1;
x86_x64_CpuidRegs regs;
regs.eax = 1;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const size_t logicalPerPackage = bits(regs.ebx, 16, 23);
// cores ought to be uniform WRT # logical processors
debug_assert(logicalPerPackage % CoresPerPackage() == 0);
return logicalPerPackage / CoresPerPackage();
}
static size_t LogicalPerCore()
{
static size_t logicalPerCore = 0;
if(!logicalPerCore)
logicalPerCore = DetectLogicalPerCore();
return logicalPerCore;
}
enum CacheType
{
CT_NONE = 0,
CT_DATA = 1,
CT_INSTRUCTION = 2,
CT_UNIFIED = 3
};
static bool IsL2DataCache(CacheType type, size_t level)
{
if(type != CT_DATA && type != CT_UNIFIED)
return false;
if(level != 2)
return false;
return true;
}
static size_t DetectLogicalPerCache()
{
// note: Intel Appnote 485 says the order in which caches are returned is
// undefined, so we need to loop through all of them.
for(u32 count = 0; ; count++)
{
x86_x64_CpuidRegs regs;
regs.eax = 4;
regs.ecx = count;
x86_x64_cpuid(&regs);
const CacheType type = (CacheType)bits(regs.eax, 0, 4);
// no more caches left
if(type == CT_NONE)
{
debug_assert(0); // we somehow didn't find the L2d
return 1;
}
const size_t level = bits(regs.eax, 5, 7);
if(IsL2DataCache(type, level))
{
const size_t logicalPerCache = bits(regs.eax, 14, 25)+1;
return logicalPerCache;
}
}
}
static size_t LogicalPerCache()
{
static size_t logicalPerCache;
if(!logicalPerCache)
logicalPerCache = DetectLogicalPerCache();
return logicalPerCache;
}
//-----------------------------------------------------------------------------
// the above functions give the maximum number of cores/logical units.
// however, some of them may actually be disabled by the BIOS!
// what we can do is to analyze the APIC IDs. they are allocated sequentially
// for all "processors". treating the IDs as variable-width bit fields
// (according to the number of cores/logical units present) allows
// determining the exact topology as well as number of packages.
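// worked example (hypothetical system): with LogicalPerCore() == 2 and
// CoresPerPackage() == 2, the logical field occupies bit 0, the core field
// bit 1 and the remaining upper bits select the package. APIC ID 5
// (binary 101) therefore denotes logical unit 1 on core 0 of package 1:
//
//	const size_t logicalBits = ceil_log2(2), coreBits = ceil_log2(2);	// 1 bit each
//	const u8 logical = u8(5)                & bit_mask<u8>(logicalBits);	// = 1
//	const u8 core    = u8(5 >> logicalBits) & bit_mask<u8>(coreBits);	// = 0
//	const u8 package = u8(5 >> (logicalBits+coreBits));	// = 1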
// these are set by DetectProcessorTopology.
static size_t numPackages = 0; // i.e. sockets; > 1 => true SMP system
static size_t enabledCoresPerPackage = 0;
static size_t enabledLogicalPerCore = 0; // hyperthreading units
typedef std::vector<u8> Ids;
// add the currently running processor's APIC ID to a list of IDs.
static void StoreApicId(size_t UNUSED(processor), uintptr_t cbData)
{
Ids* const apicIds = (Ids*)cbData;
apicIds->push_back(x86_x64_ApicId());
}
// if successful, apicIds[i] contains the unique ID of OS processor i.
static bool GatherApicIds(Ids& apicIds)
{
// old APIC (see x86_x64_ApicId for details)
if(x86_x64_Generation() < 8)
return false;
// process affinity prevents us from seeing all APIC IDs
if(PopulationCount(os_cpu_ProcessorMask()) != os_cpu_NumProcessors())
return false;
const LibError ret = os_cpu_CallByEachCPU(StoreApicId, (uintptr_t)&apicIds);
debug_assert(ret == INFO::OK);
// ensure we got a unique ID for every processor
{
Ids tmp(apicIds);
Ids::iterator end = tmp.end();
std::sort(tmp.begin(), end);
debug_assert(std::unique(tmp.begin(), end) == end);
debug_assert(std::distance(tmp.begin(), end) == (ptrdiff_t)os_cpu_NumProcessors());
}
return true;
}
typedef std::set<u8> IdSet;
/**
* "field" := a range of bits sufficient to represent <numValues> integers.
* for each id in <apicIds>: extract the value of the field starting at
* <offset> and insert it into <ids>. afterwards, adjust <offset> to the
* next field.
*
* used to gather e.g. all core IDs from all APIC IDs.
**/
static void ExtractFieldIntoSet(const Ids& apicIds, size_t& offset, size_t numValues, IdSet& ids)
{
const size_t numBits = ceil_log2(numValues);
if(numBits == 0)
return;
const u8 mask = bit_mask<u8>(numBits);
for(size_t i = 0; i < apicIds.size(); i++)
{
const u8 apicId = apicIds[i];
const u8 field = u8(apicId >> offset) & mask;
ids.insert(field);
}
offset += numBits;
}
static size_t numCaches = 0; // L2d
static std::vector<size_t> processorsCache;
static std::vector<uintptr_t> cachesProcessorMask;
class CacheManager
{
public:
void Add(u8 id, size_t processor)
{
SharedCache* cache = Find(id);
if(!cache)
{
m_caches.push_back(id);
cache = &m_caches.back();
}
cache->Add(processor);
}
void StoreProcessorMasks(std::vector<uintptr_t>& processorMasks)
{
processorMasks.resize(m_caches.size());
for(size_t i = 0; i < m_caches.size(); i++)
processorMasks[i] = m_caches[i].ProcessorMask();
}
private:
class SharedCache
{
public:
SharedCache(u8 id)
: m_id(id), m_processorMask(0)
{
}
bool Matches(u8 id) const
{
return m_id == id;
}
void Add(size_t processor)
{
m_processorMask |= uintptr_t(1) << processor;
}
uintptr_t ProcessorMask() const
{
return m_processorMask;
}
private:
u8 m_id;
uintptr_t m_processorMask;
};
SharedCache* Find(u8 id)
{
for(size_t i = 0; i < m_caches.size(); i++)
{
if(m_caches[i].Matches(id))
return &m_caches[i];
}
return 0;
}
std::vector<SharedCache> m_caches;
};
static void DetectCacheTopology(const Ids& apicIds)
{
const size_t numBits = ceil_log2(LogicalPerCache());
const u8 cacheIdMask = u8(0xFF << numBits);
CacheManager cacheManager;
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
const u8 apicId = apicIds[processor];
const u8 cacheId = apicId & cacheIdMask;
cacheManager.Add(cacheId, processor);
}
cacheManager.StoreProcessorMasks(cachesProcessorMask);
numCaches = cachesProcessorMask.size();
const size_t invalidCache = ~(size_t)0;
processorsCache.resize(os_cpu_NumProcessors(), invalidCache);
for(size_t cache = 0; cache < numCaches; cache++)
{
const uintptr_t processorMask = cachesProcessorMask[cache];
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
if(IsBitSet(processorMask, processor))
processorsCache[processor] = cache;
}
}
for(size_t processor = 0; processor < os_cpu_NumProcessors(); processor++)
{
debug_assert(processorsCache[processor] != invalidCache);
debug_assert(processorsCache[processor] < numCaches);
}
}
// @return false if unavailable / no information can be returned.
static bool DetectProcessorTopologyViaApicIds()
{
Ids apicIds;
if(!GatherApicIds(apicIds))
return false;
// extract values from all 3 ID bit fields into separate sets
size_t offset = 0;
IdSet logicalIds;
ExtractFieldIntoSet(apicIds, offset, LogicalPerCore(), logicalIds);
IdSet coreIds;
ExtractFieldIntoSet(apicIds, offset, CoresPerPackage(), coreIds);
IdSet packageIds;
ExtractFieldIntoSet(apicIds, offset, 0xFF, packageIds);
numPackages = std::max(packageIds.size(), size_t(1));
enabledCoresPerPackage = std::max(coreIds .size(), size_t(1));
enabledLogicalPerCore = std::max(logicalIds.size(), size_t(1));
// note: cache ID possibly overlaps the other fields. we also want to
// retrieve more information (mappings between processor and cache ID),
// so this needs to be handled separately.
DetectCacheTopology(apicIds);
return true;
}
static void GuessProcessorTopologyViaOsCount()
{
const size_t numProcessors = os_cpu_NumProcessors();
// note: we cannot hope to always return correct results since disabled
// cores/logical units cannot be distinguished from the situation of the
// OS simply not reporting them as "processors". unfortunately this
// function won't always only be called for older (#core = #logical = 1)
// systems because DetectProcessorTopologyViaApicIds may fail due to
// lack of OS support. what we'll do is assume nothing is disabled; this
// is reasonable because we care most about #packages. it's fine to assume
// more cores (without inflating the total #processors) because that
// count only indicates memory barriers etc. ought to be used.
enabledCoresPerPackage = CoresPerPackage();
enabledLogicalPerCore = LogicalPerCore();
const size_t numPackagesTimesLogical = numProcessors / CoresPerPackage();
debug_assert(numPackagesTimesLogical != 0); // otherwise processors didn't include cores, which would be stupid
numPackages = numPackagesTimesLogical / LogicalPerCore();
if(!numPackages) // processors didn't include logical units (reasonable)
numPackages = numPackagesTimesLogical;
}
// determine how many CoresPerPackage and LogicalPerCore are
// actually enabled and also count numPackages.
static void DetectProcessorTopology()
{
// authoritative, but requires OS support and fairly recent CPUs
if(DetectProcessorTopologyViaApicIds())
return; // success, we're done.
GuessProcessorTopologyViaOsCount();
}
size_t cpu_NumPackages()
{
if(!numPackages)
DetectProcessorTopology();
return numPackages;
}
size_t cpu_CoresPerPackage()
{
if(!enabledCoresPerPackage)
DetectProcessorTopology();
return enabledCoresPerPackage;
}
size_t cpu_LogicalPerCore()
{
if(!enabledLogicalPerCore)
DetectProcessorTopology();
return enabledLogicalPerCore;
}
size_t cpu_NumCaches()
{
if(!numCaches)
DetectProcessorTopology();
return numCaches;
}
size_t cpu_CacheFromProcessor(size_t processor)
{
debug_assert(processor < os_cpu_NumProcessors());
DetectProcessorTopology();
return processorsCache.at(processor);
}
uintptr_t cpu_ProcessorMaskFromCache(size_t cache)
{
debug_assert(cache < cpu_NumCaches());
DetectProcessorTopology();
return cachesProcessorMask.at(cache);
}
// note: Windows 2003 GetLogicalProcessorInformation returns incorrect
// information, claiming all cores in an Intel Core2 Quad processor
// share an L2 cache.
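// usage sketch (illustrative; the helper name is hypothetical): bind worker
// <worker> to the processors sharing one L2 cache, so that workers meant to
// cooperate on the same data can be given the same cache index.
static void ExampleBindWorkerToCache(size_t worker)
{
const size_t numCaches = cpu_NumCaches();
if(numCaches == 0)	// cache topology unknown (e.g. APIC IDs unavailable)
return;
const uintptr_t processorMask = cpu_ProcessorMaskFromCache(worker % numCaches);
(void)os_cpu_SetThreadAffinityMask(processorMask);
}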

View File

@ -0,0 +1,54 @@
/**
* =========================================================================
* File : topology.h
* Project : 0 A.D.
* Description : detection of CPU and cache topology
* =========================================================================
*/
// license: GPL; see lib/license.txt
#ifndef INCLUDED_TOPOLOGY
#define INCLUDED_TOPOLOGY
// OSes report hyperthreading units and cores as "processors". we need to
// drill down and find out the exact counts (for thread pool dimensioning
// and cache sharing considerations).
/**
* @return number of *enabled* CPU packages / sockets.
**/
LIB_API size_t cpu_NumPackages();
/**
* @return number of *enabled* CPU cores per package.
* (2 on dual-core systems)
**/
LIB_API size_t cpu_CoresPerPackage();
/**
* @return number of *enabled* hyperthreading units per core.
* (2 on P4 EE)
**/
LIB_API size_t cpu_LogicalPerCore();
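/**
* usage sketch (illustrative): dimension a worker pool with one thread per
* enabled core, deliberately ignoring hyperthreading units.
*
*	const size_t numWorkers = cpu_NumPackages() * cpu_CoresPerPackage();
**/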
//-----------------------------------------------------------------------------
// L2 cache
/**
* @return number of distinct L2 caches
**/
LIB_API size_t cpu_NumCaches();
/**
* @return L2 cache number (zero-based) to which <processor> belongs.
**/
LIB_API size_t cpu_CacheFromProcessor(size_t processor);
/**
* @return bit-mask of all processors sharing <cache>.
**/
LIB_API uintptr_t cpu_ProcessorMaskFromCache(size_t cache);
#endif // #ifndef INCLUDED_TOPOLOGY

View File

@ -0,0 +1,505 @@
/**
* =========================================================================
* File : x86_x64.cpp
* Project : 0 A.D.
* Description : CPU-specific routines common to 32 and 64-bit x86
* =========================================================================
*/
// license: GPL; see lib/license.txt
#include "precompiled.h"
#include "x86_x64.h"
#include <string.h>
#include <stdio.h>
#include <vector>
#include <set>
#include <algorithm>
#include "lib/posix/posix.h" // pthread
#include "lib/bits.h"
#include "lib/timer.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/os_cpu.h"
#if ARCH_IA32
# include "../ia32/ia32_asm.h"
#else
#include "../amd64/amd64_asm.h"
# endif
#if MSC_VERSION
# include <intrin.h>
#elif GCC_VERSION
#else
# error compiler not supported
#endif
// note: unfortunately the MSC __cpuid intrinsic does not allow passing
// additional inputs (e.g. ecx = count), so we need to implement this
// in assembly for both IA-32 and AMD64.
static void cpuid_impl(x86_x64_CpuidRegs* regs)
{
#if ARCH_IA32
ia32_asm_cpuid(regs);
#else
amd64_asm_cpuid(regs);
#endif
}
bool x86_x64_cpuid(x86_x64_CpuidRegs* regs)
{
static u32 maxFunction;
static u32 maxExtendedFunction;
if(!maxFunction)
{
x86_x64_CpuidRegs regs2;
regs2.eax = 0;
cpuid_impl(&regs2);
maxFunction = regs2.eax;
regs2.eax = 0x80000000;
cpuid_impl(&regs2);
maxExtendedFunction = regs2.eax;
}
const u32 function = regs->eax;
if(function > maxExtendedFunction)
return false;
if(function < 0x80000000 && function > maxFunction)
return false;
cpuid_impl(regs);
return true;
}
//-----------------------------------------------------------------------------
// capability bits
static void DetectFeatureFlags(u32 caps[4])
{
x86_x64_CpuidRegs regs;
regs.eax = 1;
if(x86_x64_cpuid(&regs))
{
caps[0] = regs.ecx;
caps[1] = regs.edx;
}
regs.eax = 0x80000001;
if(x86_x64_cpuid(&regs))
{
caps[2] = regs.ecx;
caps[3] = regs.edx;
}
}
bool x86_x64_cap(x86_x64_Cap cap)
{
// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
// keep in sync with enum x86_x64_Cap!
static u32 x86_x64_caps[4];
// (since relevant CPUs will surely advertise at least one standard flag,
// they are zero iff we haven't been initialized yet)
if(!x86_x64_caps[1])
DetectFeatureFlags(x86_x64_caps);
const size_t tbl_idx = cap >> 5;
const size_t bit_idx = cap & 0x1f;
if(tbl_idx > 3)
{
DEBUG_WARN_ERR(ERR::INVALID_PARAM);
return false;
}
return (x86_x64_caps[tbl_idx] & BIT(bit_idx)) != 0;
}
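// worked example: X86_X64_CAP_SSE2 is defined as 32+26, hence
//	tbl_idx = (32+26) >> 5   = 1	// the standard-edx word
//	bit_idx = (32+26) & 0x1f = 26
// i.e. the query tests bit 26 of the edx value returned by CPUID function 1.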
//-----------------------------------------------------------------------------
// CPU identification
static x86_x64_Vendors DetectVendor()
{
x86_x64_CpuidRegs regs;
regs.eax = 0;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
// copy regs to string
// note: 'strange' ebx,edx,ecx reg order is due to ModR/M encoding order.
char vendor_str[13];
u32* vendor_str_u32 = (u32*)vendor_str;
vendor_str_u32[0] = regs.ebx;
vendor_str_u32[1] = regs.edx;
vendor_str_u32[2] = regs.ecx;
vendor_str[12] = '\0'; // 0-terminate
if(!strcmp(vendor_str, "AuthenticAMD"))
return X86_X64_VENDOR_AMD;
else if(!strcmp(vendor_str, "GenuineIntel"))
return X86_X64_VENDOR_INTEL;
else
{
DEBUG_WARN_ERR(ERR::CPU_UNKNOWN_VENDOR);
return X86_X64_VENDOR_UNKNOWN;
}
}
x86_x64_Vendors x86_x64_Vendor()
{
static x86_x64_Vendors vendor = X86_X64_VENDOR_UNKNOWN;
if(vendor == X86_X64_VENDOR_UNKNOWN)
vendor = DetectVendor();
return vendor;
}
static void DetectSignature(size_t* model, size_t* family)
{
x86_x64_CpuidRegs regs;
regs.eax = 1;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
*model = bits(regs.eax, 4, 7);
*family = bits(regs.eax, 8, 11);
}
static size_t DetectGeneration()
{
size_t model, family;
DetectSignature(&model, &family);
switch(x86_x64_Vendor())
{
case X86_X64_VENDOR_AMD:
switch(family)
{
case 5:
if(model < 6)
return 5; // K5
else
return 6; // K6
case 6:
return 7; // K7 (Athlon)
case 0xF:
return 8; // K8 (Opteron)
}
break;
case X86_X64_VENDOR_INTEL:
switch(family)
{
case 5:
return 5; // Pentium
case 6:
if(model <= 0xD)
return 6; // Pentium Pro/II/III/M
else
return 8; // Core2Duo
case 0xF:
if(model <= 6)
return 7; // Pentium 4/D
}
break;
}
debug_assert(0); // unknown CPU generation
return family;
}
size_t x86_x64_Generation()
{
static size_t generation;
if(!generation)
generation = DetectGeneration();
return generation;
}
//-----------------------------------------------------------------------------
// identifier string
/// functor to remove substrings from the CPU identifier string
class StringStripper
{
char* m_string;
size_t m_max_chars;
public:
StringStripper(char* string, size_t max_chars)
: m_string(string), m_max_chars(max_chars)
{
}
// remove all instances of substring from m_string
void operator()(const char* substring)
{
const size_t substring_length = strlen(substring);
for(;;)
{
char* substring_pos = strstr(m_string, substring);
if(!substring_pos)
break;
const size_t substring_ofs = substring_pos - m_string;
const size_t num_chars = m_max_chars - substring_ofs - substring_length;
memmove(substring_pos, substring_pos+substring_length, num_chars);
}
}
};
static void DetectIdentifierString(char* identifierString, size_t maxChars)
{
// get brand string (if available)
char* pos = identifierString;
bool have_brand_string = true;
for(u32 function = 0x80000002; function <= 0x80000004; function++)
{
x86_x64_CpuidRegs regs;
regs.eax = function;
have_brand_string &= x86_x64_cpuid(&regs);
memcpy(pos, &regs, 16);
pos += 16;
}
// fall back to manual detect of CPU type because either:
// - CPU doesn't support brand string (we use a flag to indicate this
// rather than comparing against a default value because it is safer);
// - the brand string is useless, e.g. "Unknown". this happens on
// some older boards whose BIOS reprograms the string for CPUs it
// doesn't recognize.
if(!have_brand_string || strncmp(identifierString, "Unknow", 6) == 0)
{
size_t model, family;
DetectSignature(&model, &family);
switch(x86_x64_Vendor())
{
case X86_X64_VENDOR_AMD:
// everything else is either too old, or should have a brand string.
if(family == 6)
{
if(model == 3 || model == 7)
strcpy_s(identifierString, maxChars, "AMD Duron");
else if(model <= 5)
strcpy_s(identifierString, maxChars, "AMD Athlon");
else
{
if(x86_x64_cap(X86_X64_CAP_AMD_MP))
strcpy_s(identifierString, maxChars, "AMD Athlon MP");
else
strcpy_s(identifierString, maxChars, "AMD Athlon XP");
}
}
break;
case X86_X64_VENDOR_INTEL:
// everything else is either too old, or should have a brand string.
if(family == 6)
{
if(model == 1)
strcpy_s(identifierString, maxChars, "Intel Pentium Pro");
else if(model == 3 || model == 5)
strcpy_s(identifierString, maxChars, "Intel Pentium II");
else if(model == 6)
strcpy_s(identifierString, maxChars, "Intel Celeron");
else
strcpy_s(identifierString, maxChars, "Intel Pentium III");
}
break;
}
}
// identifierString already holds a valid brand string; pretty it up.
else
{
const char* const undesired_strings[] = { "(tm)", "(TM)", "(R)", "CPU " };
std::for_each(undesired_strings, undesired_strings+ARRAY_SIZE(undesired_strings),
StringStripper(identifierString, strlen(identifierString)+1));
// note: Intel brand strings include a frequency, but we can't rely
// on it because the CPU may be overclocked. we'll leave it in the
// string to show measurement accuracy and if SpeedStep is active.
}
}
const char* cpu_IdentifierString()
{
// 3 calls x 4 registers x 4 bytes = 48
static char identifierString[48+1] = {'\0'};
if(identifierString[0] == '\0')
DetectIdentifierString(identifierString, ARRAY_SIZE(identifierString));
return identifierString;
}
//-----------------------------------------------------------------------------
// CPU frequency
// set scheduling priority and restore when going out of scope.
class ScopedSetPriority
{
int m_old_policy;
sched_param m_old_param;
public:
ScopedSetPriority(int new_priority)
{
// get current scheduling policy and priority
pthread_getschedparam(pthread_self(), &m_old_policy, &m_old_param);
// set new priority
sched_param new_param = {0};
new_param.sched_priority = new_priority;
pthread_setschedparam(pthread_self(), SCHED_FIFO, &new_param);
}
~ScopedSetPriority()
{
// restore previous policy and priority.
pthread_setschedparam(pthread_self(), m_old_policy, &m_old_param);
}
};
// note: this function uses timer.cpp!timer_Time, which is implemented via
// whrt.cpp on Windows, which again calls x86_x64_Init. be careful that
// this function isn't called from there as well, else WHRT will be used
// before its init completes.
double cpu_ClockFrequency()
{
// if the TSC isn't available, there's really no good way to count the
// actual CPU clocks per known time interval, so bail.
// note: loop iterations ("bogomips") are not a reliable measure due
// to differing IPC and compiler optimizations.
if(!x86_x64_cap(X86_X64_CAP_TSC))
return -1.0; // impossible value
// increase priority to reduce interference while measuring.
const int priority = sched_get_priority_max(SCHED_FIFO)-1;
ScopedSetPriority ssp(priority);
// note: no need to "warm up" cpuid - it will already have been
// called several times by the time this code is reached.
// (background: it's used in x86_x64_rdtsc() to serialize instruction flow;
// the first call is documented to be slower on Intel CPUs)
int num_samples = 16;
// if the clock is low-res, take fewer samples so this doesn't take too long.
// balance measuring time (~ 10 ms) against accuracy (< 0.1% error -
// ok for using the TSC as a time reference)
if(timer_Resolution() >= 1e-3)
num_samples = 8;
std::vector<double> samples(num_samples);
for(int i = 0; i < num_samples; i++)
{
double dt;
i64 dc; // i64 because VC6 can't convert u64 -> double,
// and we don't need all 64 bits.
// count # of clocks in max{1 tick, 1 ms}:
// .. wait for start of tick.
const double t0 = timer_Time();
u64 c1; double t1;
do
{
// note: timer_Time effectively has a long delay (up to 5 us)
// before returning the time. we call it before x86_x64_rdtsc to
// minimize the delay between actually sampling time / TSC,
// thus decreasing the chance for interference.
// (if unavoidable background activity, e.g. interrupts,
// delays the second reading, inaccuracy is introduced).
t1 = timer_Time();
c1 = x86_x64_rdtsc();
}
while(t1 == t0);
// .. wait until start of next tick and at least 1 ms elapsed.
do
{
const double t2 = timer_Time();
const u64 c2 = x86_x64_rdtsc();
dc = (i64)(c2 - c1);
dt = t2 - t1;
}
while(dt < 1e-3);
// .. freq = (delta_clocks) / (delta_seconds);
// x86_x64_rdtsc/timer overhead is negligible.
const double freq = dc / dt;
samples[i] = freq;
}
std::sort(samples.begin(), samples.end());
// median filter (remove upper and lower 25% and average the rest).
// note: don't just take the lowest value! it could conceivably be
// too low, if background processing delays reading c1 (see above).
double sum = 0.0;
const int lo = num_samples/4, hi = 3*num_samples/4;
for(int i = lo; i < hi; i++)
sum += samples[i];
const double clock_frequency = sum / (hi-lo);
return clock_frequency;
}
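// usage sketch (illustrative): convert a TSC delta into seconds. this is only
// meaningful if the measurement above succeeded (result > 0) and the TSC is
// deemed safe on this system.
//
//	const double clockFrequency = cpu_ClockFrequency();
//	if(clockFrequency > 0.0)
//	{
//		const u64 c0 = x86_x64_rdtsc();
//		// ... code being timed ...
//		const u64 c1 = x86_x64_rdtsc();
//		const double elapsedSeconds = (double)(i64)(c1 - c0) / clockFrequency;
//	}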
//-----------------------------------------------------------------------------
// misc stateless functions
u8 x86_x64_ApicId()
{
x86_x64_CpuidRegs regs;
regs.eax = 1;
if(!x86_x64_cpuid(&regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const u8 apicId = (u8)bits(regs.ebx, 24, 31);
return apicId;
}
u64 x86_x64_rdtsc()
{
#if MSC_VERSION
return (u64)__rdtsc();
#elif GCC_VERSION
// GCC supports "portable" assembly for both x86 and x64
volatile u32 lo, hi;
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
return u64_from_u32(hi, lo);
#endif
}
void x86_x64_DebugBreak()
{
#if MSC_VERSION
__debugbreak();
#elif GCC_VERSION
// note: this probably isn't necessary, since unix_debug_break
// (SIGTRAP) is most probably available if GCC_VERSION.
// we include it for completeness, though.
__asm__ __volatile__ ("int $3");
#endif
}
// enforce strong memory ordering.
void cpu_MemoryFence()
{
if(x86_x64_cap(X86_X64_CAP_SSE2))
_mm_mfence();
}
void cpu_Serialize()
{
x86_x64_CpuidRegs regs;
regs.eax = 1;
x86_x64_cpuid(&regs); // CPUID serializes execution.
}

View File

@ -0,0 +1,125 @@
/**
* =========================================================================
* File : x86_x64.h
* Project : 0 A.D.
* Description : CPU-specific routines common to 32 and 64-bit x86
* =========================================================================
*/
// license: GPL; see lib/license.txt
#ifndef INCLUDED_X86_X64
#define INCLUDED_X86_X64
#if !ARCH_IA32 && !ARCH_AMD64
#error "including x86_x64.h without ARCH_IA32=1 or ARCH_AMD64=1"
#endif
/**
* registers used/returned by x86_x64_cpuid
**/
struct x86_x64_CpuidRegs
{
u32 eax;
u32 ebx;
u32 ecx;
u32 edx;
};
/**
* invoke CPUID instruction.
* @param regs input/output registers.
* regs->eax must be set to the desired function.
* some functions (e.g. 4) require regs->ecx to be set as well.
* rationale: this interface (input/output structure vs. function parameters)
* avoids unnecessary copying/initialization if some inputs aren't needed
* and allows graceful expansion to functions that require further inputs.
* @return true on success or false if the sub-function isn't supported.
**/
extern bool x86_x64_cpuid(x86_x64_CpuidRegs* regs);
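/**
* usage sketch (illustrative): query the deterministic cache parameters
* (function 4) of the second cache reported. ecx must be set because this
* function takes a sub-leaf index.
*
*	x86_x64_CpuidRegs regs;
*	regs.eax = 4;
*	regs.ecx = 1;	// cache index
*	if(x86_x64_cpuid(&regs))
*	{
*		// bits 0..4 of eax give the cache type (0 = no more caches)
*	}
**/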
/**
* CPU vendor.
* (this is exposed because some CPUID functions are vendor-specific.)
* (an enum is easier to compare than the original string values.)
**/
enum x86_x64_Vendors
{
X86_X64_VENDOR_UNKNOWN,
X86_X64_VENDOR_INTEL,
X86_X64_VENDOR_AMD,
};
LIB_API x86_x64_Vendors x86_x64_Vendor();
/**
* @return the colloquial processor generation
* (5 = Pentium, 6 = Pentium Pro/II/III / K6, 7 = Pentium4 / Athlon, 8 = Core / Opteron)
**/
LIB_API size_t x86_x64_Generation();
/**
* bit indices of CPU capability flags (128 bits).
* values are defined by IA-32 CPUID feature flags - do not change!
**/
enum x86_x64_Cap
{
// standard (ecx) - currently only defined by Intel
X86_X64_CAP_SSE3 = 0+0, // Streaming SIMD Extensions 3
X86_X64_CAP_EST = 0+7, // Enhanced Speedstep Technology
// standard (edx)
X86_X64_CAP_FPU = 32+0, // Floating Point Unit
X86_X64_CAP_TSC = 32+4, // TimeStamp Counter
X86_X64_CAP_CMOV = 32+15, // Conditional MOVe
X86_X64_CAP_TM_SCC = 32+22, // Thermal Monitoring and Software Controlled Clock
X86_X64_CAP_MMX = 32+23, // MultiMedia eXtensions
X86_X64_CAP_SSE = 32+25, // Streaming SIMD Extensions
X86_X64_CAP_SSE2 = 32+26, // Streaming SIMD Extensions 2
X86_X64_CAP_HT = 32+28, // HyperThreading
// extended (ecx)
X86_X64_CAP_AMD_CMP_LEGACY = 64+1, // N-core and X86_X64_CAP_HT is falsely set
// extended (edx)
X86_X64_CAP_AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
X86_X64_CAP_AMD_MMX_EXT = 96+22,
X86_X64_CAP_AMD_3DNOW_PRO = 96+30,
X86_X64_CAP_AMD_3DNOW = 96+31
};
/**
* @return whether the CPU supports the indicated x86_x64_Cap / feature flag.
**/
LIB_API bool x86_x64_cap(x86_x64_Cap cap);
//-----------------------------------------------------------------------------
// stateless
/**
* @return APIC ID of the currently executing processor.
*
* the implementation uses CPUID.1 and only works on >= 8th generation CPUs;
* (P4/Athlon XP); otherwise it returns 0. the alternative of accessing the
* APIC mmio registers is not feasible - mahaf_MapPhysicalMemory only works
* reliably on WinXP. also, the OS already has the APIC registers mapped and
* in constant use, and we don't want to interfere.
**/
LIB_API u8 x86_x64_ApicId();
/**
* @return the current value of the TimeStampCounter (a counter of
* CPU cycles since power-on, which is useful for high-resolution timing
* but potentially differs between multiple CPUs)
**/
LIB_API u64 x86_x64_rdtsc();
/**
* trigger a breakpoint inside this function when it is called.
**/
LIB_API void x86_x64_DebugBreak(void);
#endif // #ifndef INCLUDED_X86_X64

View File

@ -186,7 +186,7 @@ class TestMultithread : public CxxTest::TestSuite
break;
case TA_SLEEP:
usleep(sleep_duration_ms*1000);
usleep(useconds_t(sleep_duration_ms*1000));
break;
default:

View File

@ -36,7 +36,7 @@ public:
if(x == 1) ones++;
if(x == 2) twos++;
}
TS_ASSERT_EQUALS(ones+twos, 100);
TS_ASSERT_EQUALS(ones+twos, size_t(100));
TS_ASSERT(ones > 10 && twos > 10);
}
};

View File

@ -25,8 +25,8 @@
# include <unistd.h>
#endif
#include "lib/config2.h" // CONFIG2_TIMER_ALLOW_RDTSC
#if ARCH_IA32 && CONFIG2_TIMER_ALLOW_RDTSC
# include "lib/sysdep/ia32/ia32.h" // ia32_rdtsc
#if (ARCH_IA32 || ARCH_AMD64) && CONFIG2_TIMER_ALLOW_RDTSC
# include "lib/sysdep/x86_x64/x86_x64.h" // x86_x64_rdtsc
#endif
#if OS_UNIX || OS_WIN
@ -177,7 +177,7 @@ void TimerUnit::SetToZero()
void TimerUnit::SetFromTimer()
{
m_ticks = ia32_rdtsc();
m_ticks = x86_x64_rdtsc();
}
void TimerUnit::AddDifference(TimerUnit t0, TimerUnit t1)

View File

@ -8,7 +8,8 @@
#include "lib/allocators/shared_ptr.h"
#include "lib/sysdep/gfx.h"
#include "lib/sysdep/snd.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/os_cpu.h"
#include "lib/sysdep/x86_x64/topology.h"
#include "lib/tex/tex.h"
#include "lib/file/io/io_align.h" // BLOCK_SIZE
@ -87,7 +88,7 @@ void WriteSystemInfo()
fprintf(f, "\n");
// memory
fprintf(f, "Memory : %lu MiB; %lu MiB free\n", cpu_MemorySize(CPU_MEM_TOTAL)/MiB, cpu_MemorySize(CPU_MEM_AVAILABLE)/MiB);
fprintf(f, "Memory : %lu MiB; %lu MiB free\n", os_cpu_MemorySize()/MiB, os_cpu_MemoryAvailable()/MiB);
// graphics
fprintf(f, "Graphics Card : %s\n", gfx_card);