# major refactoring of system-dependent code (simplifies build system)

cpu.cpp: avoided the need for wrapper functions by calling the
OS-specific function directly (declared in central header, implemented
in the platform's cpp file)

avoid the need for init in cpu and ia32 via if(!init) Init() pattern.

optimized memcpy now requires SSE support

remove error-prone CAS macro; replace with cpu_CAS
config: no longer require inline asm for float->int conversions
lib_error: remove special-case in CHECK_ERR for windows (no longer
needed)

This was SVN commit r5365.
This commit is contained in:
janwas 2007-09-23 15:36:29 +00:00
parent 56bd5b59b4
commit 7a5655edde
40 changed files with 454 additions and 871 deletions

View File

@ -13,7 +13,7 @@
#include "lib/posix/posix_mman.h" // PROT_* constants for da_set_prot
#include "lib/posix/posix.h" // sysconf
#include "lib/sysdep/cpu.h" // CAS
#include "lib/sysdep/cpu.h" // cpu_CAS
#include "byte_order.h"
#include "bits.h"
@ -577,7 +577,7 @@ void* single_calloc(void* storage, volatile uintptr_t* in_use_flag, size_t size)
void* p;
// successfully reserved the single instance
if(CAS(in_use_flag, 0, 1))
if(cpu_CAS(in_use_flag, 0, 1))
p = storage;
// already in use (rare) - allocate from heap
else
@ -602,7 +602,7 @@ void single_free(void* storage, volatile uintptr_t* in_use_flag, void* p)
if(p == storage)
{
if(CAS(in_use_flag, 1, 0))
if(cpu_CAS(in_use_flag, 1, 0))
{
// ok, flag has been reset to 0
}

View File

@ -14,7 +14,7 @@
#include <map>
#include "lib/posix/posix_mman.h" // PROT_*
#include "lib/sysdep/cpu.h" // CAS
#include "lib/sysdep/cpu.h" // cpu_CAS
//
@ -591,7 +591,7 @@ fail:
void shutdown()
{
if(!CAS(&initialized, 1, 2))
if(!cpu_CAS(&initialized, 1, 2))
return; // never initialized or already shut down - abort
unlock();
cached_ptr->~T(); // call dtor (since we used placement new)
@ -605,7 +605,7 @@ public:
// this could theoretically be done in the ctor, but we try to
// minimize non-trivial code at NLSO ctor time
// (avoids init order problems).
if(CAS(&initialized, 0, 1))
if(cpu_CAS(&initialized, 0, 1))
init();
debug_assert(initialized != 2 && "OverrunProtector: used after dtor called:");
unlock();

View File

@ -421,9 +421,9 @@
#endif
// should we use our float->int code? it requires IA32 and inline asm;
// new GCC with -ffast-math and VC8 can use SSE, so skip it there.
#if CPU_IA32 && HAVE_MS_ASM && MSC_VERSION && MSC_VERSION < 1400
// should we use our float->int code? newer GCC versions with -ffast-math and
// VC8 can use SSE, so skip it there.
#if CPU_IA32 && MSC_VERSION && MSC_VERSION < 1400
# define USE_IA32_FLOAT_TO_INT 1
#else
# define USE_IA32_FLOAT_TO_INT 0

View File

@ -20,7 +20,7 @@
#include "allocators.h"
#include "fnv_hash.h"
#include "lib/posix/posix_pthread.h"
#include "lib/sysdep/cpu.h" // CAS
#include "lib/sysdep/cpu.h" // cpu_CAS
#include "lib/sysdep/sysdep.h"
// some functions here are called from within mmgr; disable its hooks
// so that our allocations don't cause infinite recursion.
@ -224,7 +224,7 @@ LibError debug_write_crashlog(const wchar_t* text)
{
// avoid potential infinite loop if an error occurs here.
static uintptr_t in_progress;
if(!CAS(&in_progress, 0, 1))
if(!cpu_CAS(&in_progress, 0, 1))
return ERR::REENTERED; // NOWARN
// note: we go through some gyrations here (strcpy+strcat) to avoid
@ -676,13 +676,13 @@ ErrorReaction debug_display_error(const wchar_t* description,
// strobe indicating expected_err is valid and the next error should be
// compared against that / skipped if equal to it.
// set/reset via CAS for thread-safety (hence uintptr_t).
// set/reset via cpu_CAS for thread-safety (hence uintptr_t).
static uintptr_t expected_err_valid;
static LibError expected_err;
void debug_skip_next_err(LibError err)
{
if(CAS(&expected_err_valid, 0, 1))
if(cpu_CAS(&expected_err_valid, 0, 1))
expected_err = err;
else
debug_warn("internal error: concurrent attempt to skip assert/error");
@ -693,8 +693,8 @@ static bool should_skip_this_error(LibError err)
{
// (compare before resetting strobe - expected_err may change afterwards)
bool was_expected_err = (expected_err == err);
// (use CAS to ensure only one error is skipped)
if(CAS(&expected_err_valid, 1, 0))
// (use cpu_CAS to ensure only one error is skipped)
if(cpu_CAS(&expected_err_valid, 1, 0))
{
if(!was_expected_err)
debug_warn("anticipated error was not raised");

View File

@ -30,7 +30,7 @@ struct Anchor
uint tag : 10;
uint state : 2;
// convert to uintptr_t for CAS
// convert to uintptr_t for cpu_CAS
operator uintptr_t() const
{
return *(uintptr_t*)this;
@ -70,7 +70,7 @@ struct Active
{
}
// convert to uintptr_t for CAS
// convert to uintptr_t for cpu_CAS
operator uintptr_t() const
{
return *(uintptr_t*)this;
@ -154,7 +154,7 @@ static Descriptor* DescAlloc()
if(desc)
{
Descriptor* next = desc->next;
if(CAS(&DescAvail, desc, next))
if(cpu_CAS(&DescAvail, desc, next))
break;
}
else
@ -162,7 +162,7 @@ static Descriptor* DescAlloc()
desc = (Descriptor*)AllocNewSB(DESCSBSIZE);
// organize descriptors in a linked list
cpu_MemoryFence();
if(CAS(&DescAvail, 0, desc->next))
if(cpu_CAS(&DescAvail, 0, desc->next))
break;
FreeSB((u8*)desc);
}
@ -179,7 +179,7 @@ static void DescRetire(Descriptor* desc)
desc->next = old_head;
cpu_MemoryFence();
}
while(!CAS(&DescAvail, old_head, desc));
while(!cpu_CAS(&DescAvail, old_head, desc));
}
static Descriptor* ListGetPartial(SizeClass* sc)
@ -210,7 +210,7 @@ static Descriptor* HeapGetPartial(ProcHeap* heap)
if(!desc)
return ListGetPartial(heap->sc);
}
while(!CAS(&heap->partial, desc, 0));
while(!cpu_CAS(&heap->partial, desc, 0));
return desc;
}
@ -220,7 +220,7 @@ static void HeapPutPartial(Descriptor* desc)
Descriptor* prev;
do
prev = desc->heap->partial;
while(!CAS(&desc->heap->partial, prev, desc));
while(!cpu_CAS(&desc->heap->partial, prev, desc));
if(prev)
ListPutPartial(prev);
}
@ -230,7 +230,7 @@ static void UpdateActive(ProcHeap* heap, Descriptor* desc, uint more_credits)
{
Active new_active = desc;
new_active.credits = more_credits-1;
if(CAS(&heap->active, 0, new_active))
if(cpu_CAS(&heap->active, 0, new_active))
return;
// someone installed another active sb
@ -242,7 +242,7 @@ static void UpdateActive(ProcHeap* heap, Descriptor* desc, uint more_credits)
new_anchor.count += more_credits;
new_anchor.state = PARTIAL;
}
while(!CAS(&desc->anchor, old_anchor, new_anchor));
while(!cpu_CAS(&desc->anchor, old_anchor, new_anchor));
HeapPutPartial(desc);
}
@ -250,7 +250,7 @@ static void UpdateActive(ProcHeap* heap, Descriptor* desc, uint more_credits)
static void RemoveEmptyDesc(ProcHeap* heap, Descriptor* desc)
{
if(CAS(&heap->partial, desc, 0))
if(cpu_CAS(&heap->partial, desc, 0))
DescRetire(desc);
else
ListRemoveEmptyDesc(heap->sc);
@ -274,7 +274,7 @@ static void* MallocFromActive(ProcHeap* heap)
else
new_active.credits--;
}
while(!CAS(&heap->active, old_active, new_active));
while(!cpu_CAS(&heap->active, old_active, new_active));
u8* p;
@ -300,7 +300,7 @@ static void* MallocFromActive(ProcHeap* heap)
}
}
}
while(!CAS(&desc->anchor, old_anchor, new_anchor));
while(!cpu_CAS(&desc->anchor, old_anchor, new_anchor));
if(old_active.credits == 0 && old_anchor.count > 0)
UpdateActive(heap, desc, more_credits);
@ -335,7 +335,7 @@ retry:
new_anchor.count -= more_credits+1;
new_anchor.state = (more_credits > 0)? ACTIVE : FULL;
}
while(!CAS(&desc->anchor, old_anchor, new_anchor));
while(!cpu_CAS(&desc->anchor, old_anchor, new_anchor));
u8* p;
@ -347,7 +347,7 @@ retry:
new_anchor.avail = *(uint*)p;
new_anchor.tag++;
}
while(!CAS(&desc->anchor, old_anchor, new_anchor));
while(!cpu_CAS(&desc->anchor, old_anchor, new_anchor));
if(more_credits > 0)
UpdateActive(heap, desc, more_credits);
@ -373,7 +373,7 @@ static void* MallocFromNewSB(ProcHeap* heap)
desc->anchor.count = (desc->maxcount-1)-(new_active.credits+1);
desc->anchor.state = ACTIVE;
cpu_MemoryFence();
if(!CAS(&heap->active, 0, new_active))
if(!cpu_CAS(&heap->active, 0, new_active))
{
FreeSB(desc->sb);
return 0;
@ -453,7 +453,7 @@ void lf_free(void* p_)
new_anchor.count++;
cpu_MemoryFence();
}
while(!CAS(&desc->anchor, old_anchor, new_anchor));
while(!cpu_CAS(&desc->anchor, old_anchor, new_anchor));
if(new_anchor.state == EMPTY)
{
FreeSB(sb);

View File

@ -247,18 +247,6 @@ extern void LibError_set_errno(LibError err);
// if expression evaluates to a negative error code, warn user and
// return the number.
#if OS_WIN
#define CHECK_ERR(expression)\
STMT(\
i64 err64__ = (i64)(expression);\
if(err64__ < 0)\
{\
LibError err__ = (LibError)(err64__ & UINT_MAX);\
DEBUG_WARN_ERR(err__);\
return err__;\
}\
)
#else
#define CHECK_ERR(expression)\
STMT(\
i64 err64__ = (i64)(expression);\
@ -269,7 +257,6 @@ STMT(\
return (LibError)(err__ & UINT_MAX);\
}\
)
#endif
// just pass on errors without any kind of annoying warning
// (useful for functions that can legitimately fail, e.g. vfs_exists).

View File

@ -107,7 +107,7 @@ struct TLS
TLS* next;
void* hp[NUM_HPS];
uintptr_t active; // used as bool, but set by CAS
uintptr_t active; // used as bool, but set by cpu_CAS
Node* retired_nodes[MAX_RETIRED];
size_t num_retired_nodes;
@ -128,7 +128,7 @@ static void tls_retire(void* tls_)
tls->hp[i] = 0;
// successfully marked as unused (must only decrement once)
if(CAS(&tls->active, 1, 0))
if(cpu_CAS(&tls->active, 1, 0))
{
cpu_AtomicAdd(&active_threads, -1);
debug_assert(active_threads >= 0);
@ -168,7 +168,7 @@ static TLS* tls_alloc()
// try to reuse a retired TLS slot
for(tls = tls_list; tls; tls = tls->next)
// .. succeeded in reactivating one.
if(CAS(&tls->active, 0, 1))
if(cpu_CAS(&tls->active, 0, 1))
goto have_tls;
// no unused slots available - allocate another
@ -191,7 +191,7 @@ static TLS* tls_alloc()
old_tls_list = tls_list;
tls->next = old_tls_list;
}
while(!CAS(&tls_list, old_tls_list, tls));
while(!cpu_CAS(&tls_list, old_tls_list, tls));
}
@ -457,7 +457,7 @@ retry:
if(is_marked_as_deleted(pos->next))
{
Node* next = without_mark(pos->next);
if(!CAS(pos->pprev, pos->cur, next))
if(!cpu_CAS(pos->pprev, pos->cur, next))
goto retry;
smr_retire_node(pos->cur);
@ -523,7 +523,7 @@ retry:
// already in list - return it and leave <was_inserted> 'false'
if(list_lookup(list, key, pos))
{
// free in case we allocated below, but CAS failed;
// free in case we allocated below, but cpu_CAS failed;
// no-op if node == 0, i.e. it wasn't allocated.
node_free(node);
@ -547,7 +547,7 @@ retry:
// - *pprev was removed (i.e. it's 'marked')
// - cur was retired (i.e. no longer reachable from *phead)
// - a new node was inserted immediately before cur
if(!CAS(pos->pprev, pos->cur, node))
if(!cpu_CAS(pos->pprev, pos->cur, node))
goto retry;
// else: successfully inserted; linearization point
if(was_inserted)
@ -577,11 +577,11 @@ retry:
// - next was removed
// - cur was retired (i.e. no longer reachable from *phead)
// - a new node was inserted immediately after cur
if(!CAS(&pos->cur->next, pos->next, with_mark(pos->next)))
if(!cpu_CAS(&pos->cur->next, pos->next, with_mark(pos->next)))
goto retry;
// remove from list; if successful, this is the
// linearization point and *pprev isn't marked.
if(CAS(pos->pprev, pos->cur, pos->next))
if(cpu_CAS(pos->pprev, pos->cur, pos->next))
smr_retire_node(pos->cur);
// failed: another thread removed cur after it was marked above.
// call list_lookup to ensure # non-released nodes < # threads.

View File

@ -27,13 +27,13 @@ static Pool trace_pool;
// call before using trace_pool. no-op if called more than once.
static inline void trace_init()
{
if(CAS(&trace_initialized, 0, 1))
if(cpu_CAS(&trace_initialized, 0, 1))
(void)pool_create(&trace_pool, 4*MiB, sizeof(TraceEntry));
}
void trace_shutdown()
{
if(CAS(&trace_initialized, 1, 0))
if(cpu_CAS(&trace_initialized, 1, 0))
(void)pool_destroy(&trace_pool);
}

View File

@ -11,288 +11,7 @@
#include "precompiled.h"
#include "cpu.h"
#include "lib/bits.h"
#include "lib/module_init.h"
#include "lib/posix/posix.h"
#if CPU_IA32
# include "lib/sysdep/ia32/ia32.h"
# include "lib/sysdep/ia32/ia32_memcpy.h"
#endif
#if OS_MACOSX
# include "lib/sysdep/unix/ocpu.h"
#elif OS_LINUX
# include "lib/sysdep/unix/lcpu.h"
#elif OS_UNIX
# include "lib/sysdep/unix/ucpu.h"
#elif OS_WIN
# include "lib/sysdep/win/wcpu.h"
#endif
ERROR_ASSOCIATE(ERR::CPU_FEATURE_MISSING, "This CPU doesn't support a required feature", -1);
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_OPCODE, "Disassembly failed", -1);
ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_VENDOR, "CPU vendor unknown", -1);
ERROR_ASSOCIATE(ERR::CPU_RESTRICTED_AFFINITY, "Cannot set desired CPU affinity", -1);
//-----------------------------------------------------------------------------
#pragma region Accessor functions
// insulate caller from the system-specific modules and cache results.
// note: the providers sometimes need to store the results anyway, so we
// don't need to do caching in those cases.
// these are set once during cpu_Init since they're usually all used and
// we thus avoid needing if(already_called) return old_result.
// initially set to 'impossible' values to catch uses before cpu_Init.
static double clockFrequency = -1.0;
double cpu_ClockFrequency()
{
debug_assert(clockFrequency > 0.0);
return clockFrequency;
}
static void DetectClockFrequency()
{
#if OS_WIN
clockFrequency = wcpu_ClockFrequency();
#endif
// success; we stick with this value because it either doesn't matter
// (WHRT isn't using the TSC), or cannot be determined more accurately
// (ia32 will use WHRT's TSC to measure its own frequency). bonus: the
// OS-specific functions are much faster than ia32's measurement loop.
if(clockFrequency > 0.0)
return;
#if CPU_IA32
clockFrequency = ia32_ClockFrequency(); // authoritative, precise
#endif
}
static size_t memoryTotalMib = 1;
size_t cpu_MemoryTotalMiB()
{
debug_assert(memoryTotalMib > 1);
return memoryTotalMib;
}
static void DetectMemory()
{
size_t memoryTotal = cpu_MemorySize(CPU_MEM_TOTAL);
// account for inaccurate reporting by rounding up (see wposix sysconf)
const size_t memoryTotalPow2 = (size_t)round_up_to_pow2((uint)memoryTotal);
// .. difference too great, just round up to 1 MiB
if(memoryTotalPow2 - memoryTotal > 3*MiB)
memoryTotal = round_up(memoryTotal, 1*MiB);
// .. difference acceptable, use next power of two
else
memoryTotal = memoryTotalPow2;
memoryTotalMib = memoryTotal / MiB;
}
const char* cpu_IdentifierString()
{
#if CPU_IA32
return ia32_IdentifierString(); // cached
#endif
}
uint cpu_NumPackages()
{
#if CPU_IA32
return ia32_NumPackages(); // cached
#endif
}
uint cpu_CoresPerPackage()
{
#if CPU_IA32
return ia32_CoresPerPackage(); // cached
#endif
}
uint cpu_LogicalPerCore()
{
#if CPU_IA32
return ia32_LogicalPerCore(); // cached
#endif
}
#pragma endregion
//-----------------------------------------------------------------------------
#if CPU_IA32
static void InitAndConfigureIA32()
{
ia32_Init(); // must come before any use of ia32*
ia32_memcpy_init();
// no longer set 24 bit (float) precision by default: for
// very long game uptimes (> 1 day; e.g. dedicated server),
// we need full precision when calculating the time.
// if there's a spot where we want to speed up divides|sqrts,
// we can temporarily change precision there.
//ia32_asm_control87(IA32_PC_24, IA32_MCW_PC);
// to help catch bugs, enable as many floating-point exceptions as
// possible. unfortunately SpiderMonkey triggers all of them.
// note: passing a flag *disables* that exception.
ia32_asm_control87(IA32_EM_ZERODIVIDE|IA32_EM_INVALID|IA32_EM_DENORMAL|IA32_EM_OVERFLOW|IA32_EM_UNDERFLOW|IA32_EM_INEXACT, IA32_MCW_EM);
// no longer round toward zero (truncate). changing this setting
// resulted in much faster float->int casts, because the compiler
// could be told (via /QIfist) to use FISTP while still truncating
// the result as required by ANSI C. however, FPU calculation
// results were changed significantly, so it had to be disabled.
//ia32_asm_control87(IA32_RC_CHOP, IA32_MCW_RC);
}
#endif
//-----------------------------------------------------------------------------
static ModuleInitState initState;
void cpu_Init()
{
if(!ModuleShouldInitialize(&initState))
return;
#if CPU_IA32
InitAndConfigureIA32();
#endif
DetectMemory();
DetectClockFrequency();
}
void cpu_Shutdown()
{
if(!ModuleShouldShutdown(&initState))
return;
// currently nothing to do
}
//-----------------------------------------------------------------------------
// stateless routines
bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value)
{
#if CPU_IA32
return ia32_asm_CAS(location, expected, new_value);
#endif
}
void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
{
#if CPU_IA32
return ia32_asm_AtomicAdd(location, increment);
#endif
}
void cpu_MemoryFence()
{
#if CPU_IA32
return ia32_MemoryFence();
#endif
}
void cpu_Serialize()
{
#if CPU_IA32
return ia32_Serialize();
#endif
}
void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t nbytes)
{
#if CPU_IA32
return ia32_memcpy(dst, src, nbytes);
#else
return memcpy(dst, src, nbytes);
#endif
}
int cpu_OsNumProcessors()
{
#if OS_WIN
return wcpu_NumProcessors();
#elif OS_UNIX
return ucpu_NumPackages();
#else
#error "port"
#endif
}
LibError cpu_CallByEachCPU(CpuCallback cb, void* param)
{
#if OS_WIN
return wcpu_CallByEachCPU(cb, param);
#elif OS_LINUX
return lcpu_CallByEachCPU(cb, param);
#endif
}
i32 cpu_i32FromFloat(float f)
{
#if USE_IA32_FLOAT_TO_INT
return ia32_asm_i32FromFloat(f);
#else
return (i32)f;
#endif
}
i32 cpu_i32FromDouble(double d)
{
#if USE_IA32_FLOAT_TO_INT
return ia32_asm_i32FromDouble(d);
#else
return (i32)d;
#endif
}
i64 cpu_i64FromDouble(double d)
{
#if USE_IA32_FLOAT_TO_INT
return ia32_asm_i64FromDouble(d);
#else
return (i64)d;
#endif
}
size_t cpu_PageSize()
{
#if OS_WIN
return wcpu_PageSize();
#else
return ucpu_PageSize();
#endif
}
size_t cpu_MemorySize(CpuMemoryIndicators mem_type)
{
#if OS_LINUX
return lcpu_MemorySize(mem_type);
#elif OS_MACOSX
return ocpu_MemorySize(mem_type);
#elif OS_WIN
return wcpu_MemorySize(mem_type);
#endif
}

View File

@ -19,36 +19,91 @@ namespace ERR
const LibError CPU_RESTRICTED_AFFINITY = -130003;
}
// (some of these functions may be implemented in external asm files)
#ifdef __cplusplus
extern "C" {
#endif
// must be called before any of the below accessors.
extern void cpu_Init();
extern void cpu_Shutdown();
//-----------------------------------------------------------------------------
// CPU detection
/**
* @return string identifying the CPU (usually a cleaned-up version of the
* brand string)
**/
extern const char* cpu_IdentifierString();
/**
* @return a rough estimate of the CPU clock frequency.
*
* note: the accuracy of this value is not important. while it is used by
* the TSC timing backend, thermal drift is an issue that requires
* continual recalibration anyway, which makes the initial accuracy moot.
* querying frequency via OS is also much faster than ia32's measurement loop.
**/
extern double cpu_ClockFrequency();
extern uint cpu_NumPackages(); // i.e. sockets
/**
* @return the number of what the OS deems "processors" or -1 on failure.
*
* this is used by ia32 when it cannot determine the number via APIC IDs.
* in other situations, the cpu_NumPackages function is preferable since
* it is more specific.
*
* note: this function is necessary because POSIX sysconf _SC_NPROCESSORS_CONF
* is not supported on MacOSX, else we would use that.
**/
extern uint cpu_NumProcessors();
/**
* @return number of *enabled* CPU packages / sockets.
**/
extern uint cpu_NumPackages();
/**
* @return number of *enabled* CPU cores per package.
* (2 on dual-core systems)
**/
extern uint cpu_CoresPerPackage();
/**
* @return number of *enabled* hyperthreading units per core.
* (2 on P4 EE)
**/
extern uint cpu_LogicalPerCore();
// faster than cpu_MemorySize (caches total size determined during init),
// returns #Mebibytes (cleaned up to account e.g. for nonpaged pool)
extern size_t cpu_MemoryTotalMiB();
/**
* @return the size [bytes] of a MMU page.
* (4096 on most IA-32 systems)
**/
extern size_t cpu_PageSize();
enum CpuMemoryIndicators
{
CPU_MEM_TOTAL,
CPU_MEM_AVAILABLE
};
/**
* @return the amount [bytes] of available or total physical memory.
**/
extern size_t cpu_MemorySize(CpuMemoryIndicators mem_type);
//
// misc (stateless)
//
//-----------------------------------------------------------------------------
// lock-free support routines
// atomic "compare and swap". compare the machine word at <location> against
// <expected>; if not equal, return false; otherwise, overwrite it with
// <new_value> and return true.
extern bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
// this is often used for pointers, so the macro coerces parameters to
// uintptr_t. invalid usage unfortunately also goes through without warnings.
// to catch cases where the caller has passed <expected> as <location> or
// similar mishaps, the implementation verifies <location> is a valid pointer.
#define CAS(l,o,n) cpu_CAS((volatile uintptr_t*)l, (uintptr_t)o, (uintptr_t)n)
/**
* atomic "compare and swap".
*
* @param location address of the word to compare and possibly overwrite
* @param expected its expected value
* @param newValue the value with which to replace it
* @return false if the target word doesn't match the expected value,
* otherwise true (also overwriting the contents of location)
**/
extern bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t newValue);
/**
* add a signed value to a variable without the possibility of interference
@ -56,73 +111,70 @@ extern bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t
**/
extern void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
/**
* enforce strict instruction ordering in the CPU pipeline.
**/
extern void cpu_Serialize();
// enforce strong memory ordering.
/**
* enforce strong memory ordering.
**/
extern void cpu_MemoryFence();
extern size_t cpu_PageSize();
enum CpuMemoryIndicators
{
CPU_MEM_TOTAL, CPU_MEM_AVAILABLE
};
extern size_t cpu_MemorySize(CpuMemoryIndicators mem_type);
// drop-in replacement for libc memcpy(). only requires CPU support for
// MMX (by now universal). highly optimized for Athlon and Pentium III
// microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
// for details, see accompanying article.
extern void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size);
//-----------------------------------------------------------------------------
// misc
/**
* @return the number of what the OS deems "processors" or -1 on failure.
*
* this is used by ia32 when it cannot determine the number via APIC IDs.
* in other situations, the cpu_NumPackages et al. functions are preferable
* since they are more specific.
*
* note: this function is necessary because POSIX sysconf _SC_NPROCESSORS_CONF
* is not supported on MacOSX, else we would use that.
* drop-in replacement for libc memcpy(). highly optimized for Athlon and
* Pentium III microarchitectures; significantly outperforms VC7.1 memcpy and
* memcpy_amd. for details, see accompanying article.
**/
extern int cpu_OsNumProcessors();
// execute the specified function once on each CPU.
// this includes logical HT units and proceeds serially (function
// is never re-entered) in order of increasing OS CPU ID.
// note: implemented by switching thread affinity masks and forcing
// a reschedule, which is apparently not possible with POSIX.
//
// may fail if e.g. OS is preventing us from running on some CPUs.
// called from ia32.cpp get_cpu_count.
extern void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size);
/**
* execute the specified function once on each CPU.
* this includes logical HT units and proceeds serially (function
* is never re-entered) in order of increasing OS CPU ID.
* note: implemented by switching thread affinity masks and forcing
* a reschedule, which is apparently not possible with POSIX.
*
* may fail if e.g. OS is preventing us from running on some CPUs.
**/
typedef void (*CpuCallback)(void* param);
extern LibError cpu_CallByEachCPU(CpuCallback cb, void* param);
/**
* set the FPU control word to "desirable" values (see implementation)
**/
extern void cpu_ConfigureFloatingPoint();
// convert float to int much faster than _ftol2, which would normally be
// used by (int) casts.
#if !USE_IA32_FLOAT_TO_INT
#define cpu_i32FromFloat(f) ((i32)f)
#define cpu_i32FromDouble(d) ((i32)d)
#define cpu_i64FromDouble(d) ((i64)d)
#else
extern i32 cpu_i32FromFloat(float f);
extern i32 cpu_i32FromDouble(double d);
extern i64 cpu_i64FromDouble(double d);
// Win32 CONTEXT field abstraction
// (there's no harm also defining this for other platforms)
#if CPU_AMD64
# define PC_ Rip
# define FP_ Rbp
# define SP_ Rsp
#elif CPU_IA32
# define PC_ Eip
# define FP_ Ebp
# define SP_ Esp
#endif
#ifdef __cplusplus
}
#endif
/**
* specialization of cpu_CAS for pointer types. this avoids error-prone
* casting in user code.
**/
template<typename T>
extern bool cpu_CAS(volatile T* location, T expected, T new_value)
{
return cpu_CAS((volatile uintptr_t*)location, (uintptr_t)expected, (uintptr_t)new_value);
}
#endif // #ifndef INCLUDED_CPU

View File

@ -20,7 +20,6 @@
#include "lib/posix/posix.h" // pthread
#include "lib/bits.h"
#include "lib/timer.h"
#include "lib/module_init.h"
#include "lib/sysdep/cpu.h"
#if !HAVE_MS_ASM && !HAVE_GNU_ASM
@ -30,28 +29,32 @@
//-----------------------------------------------------------------------------
// capability bits
// set by ia32_cap_init, referenced by ia32_cap
// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
// keep in sync with enum CpuCap!
static u32 ia32_caps[4];
static void ia32_cap_init()
static void DetectFeatureFlags(u32 caps[4])
{
u32 regs[4];
if(ia32_asm_cpuid(1, regs))
{
ia32_caps[0] = regs[ECX];
ia32_caps[1] = regs[EDX];
caps[0] = regs[ECX];
caps[1] = regs[EDX];
}
if(ia32_asm_cpuid(0x80000001, regs))
{
ia32_caps[2] = regs[ECX];
ia32_caps[3] = regs[EDX];
caps[2] = regs[ECX];
caps[3] = regs[EDX];
}
}
bool ia32_cap(IA32Cap cap)
{
// treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx
// keep in sync with enum CpuCap!
static u32 ia32_caps[4];
// (since relevant CPUs will surely advertise at least one standard flag,
// they are zero iff we haven't been initialized yet)
if(!ia32_caps[1])
DetectFeatureFlags(ia32_caps);
const uint tbl_idx = cap >> 5;
const uint bit_idx = cap & 0x1f;
if(tbl_idx > 3)
@ -66,18 +69,11 @@ bool ia32_cap(IA32Cap cap)
//-----------------------------------------------------------------------------
// CPU identification
static Ia32Vendor vendor;
Ia32Vendor ia32_Vendor()
{
return vendor;
}
static void DetectVendor()
static Ia32Vendor DetectVendor()
{
u32 regs[4];
if(!ia32_asm_cpuid(0, regs))
return;
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
// copy regs to string
// note: 'strange' ebx,edx,ecx reg order is due to ModR/M encoding order.
@ -89,57 +85,65 @@ static void DetectVendor()
vendor_str[12] = '\0'; // 0-terminate
if(!strcmp(vendor_str, "AuthenticAMD"))
vendor = IA32_VENDOR_AMD;
return IA32_VENDOR_AMD;
else if(!strcmp(vendor_str, "GenuineIntel"))
vendor = IA32_VENDOR_INTEL;
return IA32_VENDOR_INTEL;
else
DEBUG_WARN_ERR(ERR::CPU_UNKNOWN_VENDOR);
}
static uint model, family;
static uint generation;
uint ia32_Generation()
{
return generation;
DEBUG_WARN_ERR(ERR::CPU_UNKNOWN_VENDOR);
return IA32_VENDOR_UNKNOWN;
}
}
static void DetectSignature()
Ia32Vendor ia32_Vendor()
{
static Ia32Vendor vendor = IA32_VENDOR_UNKNOWN;
if(vendor == IA32_VENDOR_UNKNOWN)
vendor = DetectVendor();
return vendor;
}
static void DetectSignature(uint* model, uint* family)
{
u32 regs[4];
if(!ia32_asm_cpuid(1, regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
model = bits(regs[EAX], 4, 7);
family = bits(regs[EAX], 8, 11);
*model = bits(regs[EAX], 4, 7);
*family = bits(regs[EAX], 8, 11);
}
static uint DetectGeneration()
{
uint model, family;
DetectSignature(&model, &family);
switch(family)
{
case 5:
case 6:
case 7:
generation = family;
break;
return family;
case 0xF:
generation = 8;
break;
return 8;
default:
debug_assert(0);
return 0;
}
}
uint ia32_Generation()
{
static uint generation;
if(!generation)
generation = DetectGeneration();
return generation;
}
//-----------------------------------------------------------------------------
// identifier string
// 3 calls x 4 registers x 4 bytes = 48
static char identifierString[48+1] = {'\0'};
const char* ia32_IdentifierString()
{
return identifierString;
}
/// functor to remove substrings from the CPU identifier string
class StringStripper
{
@ -168,7 +172,7 @@ public:
}
};
static void DetectIdentifierString()
static void DetectIdentifierString(char* identifierString)
{
// get brand string (if available)
// note: ia32_asm_cpuid writes 4 u32s directly to identifierString -
@ -191,26 +195,30 @@ static void DetectIdentifierString()
// doesn't recognize.
if(!have_brand_string || strncmp(identifierString, "Unknow", 6) == 0)
{
if(vendor == IA32_VENDOR_AMD)
uint model, family;
DetectSignature(&model, &family);
switch(ia32_Vendor())
{
case IA32_VENDOR_AMD:
// everything else is either too old, or should have a brand string.
if(family == 6)
{
if(model == 3 || model == 7)
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Duron");
SAFE_STRCPY(identifierString, "AMD Duron");
else if(model <= 5)
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Athlon");
SAFE_STRCPY(identifierString, "AMD Athlon");
else
{
if(ia32_cap(IA32_CAP_AMD_MP))
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Athlon MP");
SAFE_STRCPY(identifierString, "AMD Athlon MP");
else
SAFE_STRCPY(identifierString, "IA32_VENDOR_AMD Athlon XP");
SAFE_STRCPY(identifierString, "AMD Athlon XP");
}
}
}
else if(vendor == IA32_VENDOR_INTEL)
{
break;
case IA32_VENDOR_INTEL:
// everything else is either too old, or should have a brand string.
if(family == 6)
{
@ -223,6 +231,7 @@ static void DetectIdentifierString()
else
SAFE_STRCPY(identifierString, "Intel Pentium III");
}
break;
}
}
// identifierString already holds a valid brand string; pretty it up.
@ -238,6 +247,15 @@ static void DetectIdentifierString()
}
}
const char* cpu_IdentifierString()
{
// 3 calls x 4 registers x 4 bytes = 48
static char identifierString[48+1] = {'\0'};
if(identifierString[0] == '\0')
DetectIdentifierString(identifierString);
return identifierString;
}
//-----------------------------------------------------------------------------
// CPU frequency
@ -352,7 +370,6 @@ double ia32_ClockFrequency()
//-----------------------------------------------------------------------------
// processor topology
//-----------------------------------------------------------------------------
uint ia32_ApicId()
{
@ -370,29 +387,34 @@ uint ia32_ApicId()
// note: Intel Appnote 485 (CPUID) assures uniformity of coresPerPackage and
// logicalPerCore.
static uint coresPerPackage = 0;
static uint logicalPerCore = 0;
static void DetectCoresPerPackage()
static uint DetectCoresPerPackage()
{
u32 regs[4];
coresPerPackage = 1; // single-core unless..
switch(vendor)
switch(ia32_Vendor())
{
case IA32_VENDOR_INTEL:
if(ia32_asm_cpuid(4, regs))
coresPerPackage = bits(regs[EAX], 26, 31)+1;
return bits(regs[EAX], 26, 31)+1;
break;
case IA32_VENDOR_AMD:
if(ia32_asm_cpuid(0x80000008, regs))
coresPerPackage = bits(regs[ECX], 0, 7)+1;
return bits(regs[ECX], 0, 7)+1;
break;
}
return 1; // else: the CPU is single-core.
}
static uint CoresPerPackage()
{
static uint coresPerPackage = 0;
if(!coresPerPackage)
coresPerPackage = DetectCoresPerPackage();
return coresPerPackage;
}
static bool IsHyperthreadingCapable()
{
// definitely not
@ -401,30 +423,37 @@ static bool IsHyperthreadingCapable()
// AMD N-core systems falsely set the HT bit for compatibility reasons
// (don't bother resetting it, might confuse callers)
if(vendor == IA32_VENDOR_AMD && ia32_cap(IA32_CAP_AMD_CMP_LEGACY))
if(ia32_Vendor() == IA32_VENDOR_AMD && ia32_cap(IA32_CAP_AMD_CMP_LEGACY))
return false;
return true;
}
static void DetectLogicalPerCore()
static uint DetectLogicalPerCore()
{
u32 regs[4];
if(!IsHyperthreadingCapable())
{
logicalPerCore = 1;
return;
}
return 1;
u32 regs[4];
if(!ia32_asm_cpuid(1, regs))
DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING);
const uint logicalPerPackage = bits(regs[EBX], 16, 23);
// cores ought to be uniform WRT # logical processors
debug_assert(logicalPerPackage % coresPerPackage == 0);
logicalPerCore = logicalPerPackage / coresPerPackage;
debug_assert(logicalPerPackage % CoresPerPackage() == 0);
return logicalPerPackage / CoresPerPackage();
}
static uint LogicalPerCore()
{
static uint logicalPerCore = 0;
if(!logicalPerCore)
logicalPerCore = DetectLogicalPerCore();
return logicalPerCore;
}
// the above two functions give the maximum number of cores/logical units.
// however, some of them may actually be disabled by the BIOS!
// what we can do is to analyze the APIC IDs. they are allocated sequentially
@ -432,7 +461,7 @@ static void DetectLogicalPerCore()
// (according to the number of cores/logical units present) allows
// determining the exact topology as well as number of packages.
// these are set by DetectProcessorTopology, called from ia32_Init.
// these are set by DetectProcessorTopology.
static uint numPackages = 0; // i.e. sockets; > 1 => true SMP system
static uint enabledCoresPerPackage = 0;
static uint enabledLogicalPerCore = 0; // hyperthreading units
@ -475,7 +504,7 @@ static void ExtractFieldsIntoSet(const Ids& apicIds, uint& bit_pos, uint num_val
static bool DetectProcessorTopologyViaApicIds()
{
// old APIC (see ia32_ApicId for details)
if(generation < 8)
if(ia32_Generation() < 8)
return false;
// get the set of all APIC IDs
@ -490,9 +519,9 @@ static bool DetectProcessorTopologyViaApicIds()
// extract values from all 3 ID bitfields into separate sets
uint bit_pos = 0;
IdSet logicalIds;
ExtractFieldsIntoSet(apicIds, bit_pos, logicalPerCore, logicalIds);
ExtractFieldsIntoSet(apicIds, bit_pos, LogicalPerCore(), logicalIds);
IdSet coreIds;
ExtractFieldsIntoSet(apicIds, bit_pos, coresPerPackage, coreIds);
ExtractFieldsIntoSet(apicIds, bit_pos, CoresPerPackage(), coreIds);
IdSet packageIds;
ExtractFieldsIntoSet(apicIds, bit_pos, 0xFF, packageIds);
@ -512,7 +541,7 @@ static bool DetectProcessorTopologyViaApicIds()
static void GuessProcessorTopologyViaOsCount()
{
const int numProcessors = cpu_OsNumProcessors();
const int numProcessors = cpu_NumProcessors();
// note: we cannot hope to always return correct results since disabled
// cores/logical units cannot be distinguished from the situation of the
@ -523,13 +552,13 @@ static void GuessProcessorTopologyViaOsCount()
// is reasonable because we care most about #packages. it's fine to assume
// more cores (without inflating the total #processors) because that
// count only indicates memory barriers etc. ought to be used.
enabledCoresPerPackage = coresPerPackage;
enabledLogicalPerCore = logicalPerCore;
enabledCoresPerPackage = CoresPerPackage();
enabledLogicalPerCore = LogicalPerCore();
const long numPackagesTimesLogical = numProcessors / coresPerPackage;
const long numPackagesTimesLogical = numProcessors / CoresPerPackage();
debug_assert(numPackagesTimesLogical != 0); // otherwise processors didn't include cores, which would be stupid
numPackages = numPackagesTimesLogical / logicalPerCore;
numPackages = numPackagesTimesLogical / LogicalPerCore();
if(!numPackages) // processors didn't include logical units (reasonable)
numPackages = numPackagesTimesLogical;
}
@ -547,27 +576,24 @@ static void DetectProcessorTopology()
}
uint ia32_NumPackages()
uint cpu_NumPackages()
{
#ifndef NDEBUG
debug_assert(numPackages != 0);
#endif
if(!numPackages)
DetectProcessorTopology();
return (uint)numPackages;
}
uint ia32_CoresPerPackage()
uint cpu_CoresPerPackage()
{
#ifndef NDEBUG
debug_assert(enabledCoresPerPackage != 0);
#endif
if(!enabledCoresPerPackage)
DetectProcessorTopology();
return (uint)enabledCoresPerPackage;
}
uint ia32_LogicalPerCore()
uint cpu_LogicalPerCore()
{
#ifndef NDEBUG
debug_assert(enabledLogicalPerCore != 0);
#endif
if(!enabledLogicalPerCore)
DetectProcessorTopology();
return (uint)enabledLogicalPerCore;
}
@ -618,7 +644,7 @@ void ia32_DebugBreak()
// enforce strong memory ordering.
void ia32_MemoryFence()
void cpu_MemoryFence()
{
// Pentium IV
if(ia32_cap(IA32_CAP_SSE2))
@ -629,15 +655,6 @@ void ia32_MemoryFence()
#endif
}
void ia32_Serialize()
{
#if HAVE_MS_ASM
__asm cpuid
#elif HAVE_GNU_ASM
__asm__ __volatile__ ("cpuid");
#endif
}
// checks if there is an IA-32 CALL instruction right before ret_addr.
// returns INFO::OK if so and ERR::FAIL if not.
@ -702,33 +719,24 @@ LibError ia32_GetCallTarget(void* ret_addr, void** target)
}
//-----------------------------------------------------------------------------
static ModuleInitState initState;
void ia32_Init()
void cpu_ConfigureFloatingPoint()
{
if(!ModuleShouldInitialize(&initState))
return;
// no longer set 24 bit (float) precision by default: for
// very long game uptimes (> 1 day; e.g. dedicated server),
// we need full precision when calculating the time.
// if there's a spot where we want to speed up divides|sqrts,
// we can temporarily change precision there.
//ia32_asm_control87(IA32_PC_24, IA32_MCW_PC);
ia32_asm_cpuid_init();
// to help catch bugs, enable as many floating-point exceptions as
// possible. unfortunately SpiderMonkey triggers all of them.
// note: passing a flag *disables* that exception.
ia32_asm_control87(IA32_EM_ZERODIVIDE|IA32_EM_INVALID|IA32_EM_DENORMAL|IA32_EM_OVERFLOW|IA32_EM_UNDERFLOW|IA32_EM_INEXACT, IA32_MCW_EM);
ia32_cap_init();
DetectVendor();
DetectSignature();
DetectIdentifierString();
DetectCoresPerPackage();
DetectLogicalPerCore();
DetectProcessorTopology();
}
void ia32_Shutdown()
{
if(!ModuleShouldShutdown(&initState))
return;
// nothing to do
// no longer round toward zero (truncate). changing this setting
// resulted in much faster float->int casts, because the compiler
// could be told (via /QIfist) to use FISTP while still truncating
// the result as required by ANSI C. however, FPU calculation
// results were changed significantly, so it had to be disabled.
//ia32_asm_control87(IA32_RC_CHOP, IA32_MCW_RC);
}

View File

@ -16,13 +16,7 @@
#endif
#include "ia32_asm.h"
#include "ia32_memcpy.h"
/**
* must be called before any of the following functions.
**/
extern void ia32_Init();
extern void ia32_Shutdown();
/**
* CPU vendor.
@ -82,38 +76,6 @@ enum IA32Cap
extern bool ia32_cap(IA32Cap cap);
// CPU detection
/**
* @return string identifying the CPU (usually a cleaned-up version of the
* brand string)
**/
extern const char* ia32_IdentifierString();
/**
* @return the cached result of a precise measurement of the
* CPU frequency.
**/
extern double ia32_ClockFrequency();
/**
* @return number of *enabled* CPU packages / sockets.
**/
extern uint ia32_NumPackages();
/**
* @return number of *enabled* CPU cores per package.
* (2 on dual-core systems)
**/
extern uint ia32_CoresPerPackage();
/**
* @return number of *enabled* hyperthreading units per core.
* (2 on P4 EE)
**/
extern uint ia32_LogicalPerCore();
//-----------------------------------------------------------------------------
// stateless
@ -163,14 +125,6 @@ extern u64 ia32_rdtsc(); // only for CppDoc's benefit
extern void ia32_DebugBreak(void);
// implementations of the cpu.h interface
/// see cpu_MemoryFence
extern void ia32_MemoryFence();
// see cpu_Serialize
extern void ia32_Serialize();
/// fpclassify return values
#define IA32_FP_NAN 0x0100

View File

@ -19,51 +19,29 @@
[section .data]
; these are actually max_func+1, i.e. the first invalid value.
; the idea here is to avoid a separate cpuid_available flag;
; using signed values doesn't work because ext_funcs are >= 0x80000000.
max_func dd 0
max_ext_func dd 0
__SECT__
; extern "C" void __cdecl ia32_asm_cpuid_init()
global sym(ia32_asm_cpuid_init)
sym(ia32_asm_cpuid_init):
push ebx
; check if CPUID is supported
pushfd
or byte [esp+2], 32
popfd
pushfd
pop eax
xor edx, edx
shr eax, 22 ; bit 21 toggled?
jnc .no_cpuid
; determine max supported CPUID function
xor eax, eax
cpuid
inc eax ; (see max_func decl)
mov [max_func], eax
mov eax, 0x80000000
cpuid
inc eax ; (see max_func decl)
mov [max_ext_func], eax
.no_cpuid:
pop ebx
ret
; extern "C" bool __cdecl ia32_asm_cpuid(u32 func, u32* regs)
global sym(ia32_asm_cpuid)
sym(ia32_asm_cpuid):
push ebx
push edi
cmp dword [max_func], 0
ja .already_initialized
; determine max supported CPUID function
xor eax, eax
cpuid
mov [max_func], eax
mov eax, 0x80000000
cpuid
mov [max_ext_func], eax
.already_initialized:
mov edx, [esp+8+4+0] ; func
mov edi, [esp+8+4+4] ; -> regs
@ -75,7 +53,7 @@ sym(ia32_asm_cpuid):
mov ebx, [max_func]
.is_ext_func:
cmp edx, ebx
jae .ret ; (see max_func decl)
ja .ret
; issue CPUID and store result registers in array
mov eax, edx
@ -102,9 +80,9 @@ sym(ia32_asm_cpuid):
; lock-free support routines
;-------------------------------------------------------------------------------
; extern "C" void __cdecl ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
global sym(ia32_asm_AtomicAdd)
sym(ia32_asm_AtomicAdd):
; extern "C" void __cdecl cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
global sym(cpu_AtomicAdd)
sym(cpu_AtomicAdd):
mov edx, [esp+4] ; location
mov eax, [esp+8] ; increment
db 0xf0 ; LOCK prefix
@ -113,10 +91,6 @@ db 0xf0 ; LOCK prefix
; notes:
; - this is called via CAS macro, which silently casts its inputs for
; convenience. mixing up the <expected> and <location> parameters would
; go unnoticed; we therefore perform a basic sanity check on <location> and
; raise a warning if it is invalid.
; - a 486 or later processor is required since we use CMPXCHG.
; there's no feature flag we can check, and the ia32 code doesn't
; bother detecting anything < Pentium, so this'll crash and burn if
@ -125,13 +99,11 @@ db 0xf0 ; LOCK prefix
; - nor do we bother skipping the LOCK prefix on single-processor systems.
; the branch may be well-predicted, but difference in performance still
; isn't expected to be enough to justify the effort.
; extern "C" ; extern "C" bool __cdecl ia32_asm_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
global sym(ia32_asm_CAS)
sym(ia32_asm_CAS):
; extern "C" bool __cdecl cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
global sym(cpu_CAS)
sym(cpu_CAS):
mov edx, [esp+4] ; location
mov eax, [esp+8] ; expected
cmp edx, 0x10000 ; valid location?
jb .invalid_location ; no - raise warning
mov ecx, [esp+12] ; new_value
db 0xf0 ; LOCK prefix
cmpxchg [edx], ecx
@ -139,12 +111,12 @@ db 0xf0 ; LOCK prefix
movzx eax, al
ret
; NOTE: nasm 0.98.39 doesn't support generating debug info for win32
; output format. that means this code may be misattributed to other
; functions, which makes tracking it down very difficult.
; we therefore raise an "Invalid Opcode" exception, which is rather distinct.
.invalid_location:
ud2
; extern "C" bool __cdecl cpu_Serialize();
global sym(cpu_Serialize)
sym(cpu_Serialize):
cpuid
ret
;-------------------------------------------------------------------------------
@ -203,7 +175,7 @@ sym(ia32_asm_fpclassifyf):
ret
; extern "C" float __cdecl ia32_asm_rintf(float)
; extern "C" float __cdecl cpu_rintf(float)
global sym(ia32_asm_rintf)
sym(ia32_asm_rintf):
fld dword [esp+4]
@ -241,9 +213,9 @@ sym(ia32_asm_fmaxf):
ret
; extern "C" i32 __cdecl ia32_asm_i32FromFloat(float f)
global sym(ia32_asm_i32FromFloat)
sym(ia32_asm_i32FromFloat):
; extern "C" i32 __cdecl cpu_i32FromFloat(float f)
global sym(cpu_i32FromFloat)
sym(cpu_i32FromFloat):
push eax
fld dword [esp+8]
fsub dword [round_bias]
@ -251,9 +223,9 @@ sym(ia32_asm_i32FromFloat):
pop eax
ret
; extern "C" i32 __cdecl ia32_asm_i32FromDouble(double d)
global sym(ia32_asm_i32FromDouble)
sym(ia32_asm_i32FromDouble):
; extern "C" i32 __cdecl cpu_i32FromDouble(double d)
global sym(cpu_i32FromDouble)
sym(cpu_i32FromDouble):
push eax
fld qword [esp+8]
fsub dword [round_bias]
@ -261,9 +233,9 @@ sym(ia32_asm_i32FromDouble):
pop eax
ret
; extern "C" i64 __cdecl ia32_asm_i64FromDouble(double d)
global sym(ia32_asm_i64FromDouble)
sym(ia32_asm_i64FromDouble):
; extern "C" i64 __cdecl cpu_i64FromDouble(double d)
global sym(cpu_i64FromDouble)
sym(cpu_i64FromDouble):
push edx
push eax
fld qword [esp+12]

View File

@ -15,12 +15,6 @@
extern "C" {
#endif
/**
* prepare ia32_asm_cpuid for use (detects which CPUID functions are
* available). called by ia32_Init.
**/
extern void ia32_asm_cpuid_init();
/**
* order in which ia32_asm_cpuid stores register values
**/
@ -60,21 +54,7 @@ extern u64 ia32_asm_rdtsc_edx_eax(void);
extern void ia32_asm_GetCurrentContext(void* pcontext);
// implementations of the cpu.h interface
/// see cpu_AtomicAdd
extern void ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
/// see cpu_CAS
extern bool ia32_asm_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
/// see cpu_i32FromFloat
extern i32 ia32_asm_i32FromFloat(float f);
extern i32 ia32_asm_i32FromDouble(double d);
extern i64 ia32_asm_i64FromDouble(double d);
// backends for POSIX/SUS functions
// implementations of POSIX/SUS functions
/// see fpclassify
extern uint ia32_asm_fpclassifyd(double d);

View File

@ -13,6 +13,13 @@
; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
; for details, see accompanying article.
; this mask is applied to the transfer size and is intended to prevent
; use of the MOVNTQ technique on CPUs lacking SSE. however, to avoid
; the trouble of checking for CPU support (doing so at runtime would
; add unnecessary overhead, and requiring an Init call first is risky),
; we will allow all codepaths and thus assume Pentium 3 / Athlon or above.
ia32_memcpy_size_mask equ 0xFFFFFFFF
; if transfer size is at least this much,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD equ 64*1024
@ -289,11 +296,11 @@ align 16
;------------------------------------------------------------------------------
; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
; drop-in replacement for libc memcpy() (returns dst)
global sym(ia32_memcpy)
; void* __declspec(naked) cpu_memcpy(void* dst, const void* src, size_t nbytes)
global sym(cpu_memcpy)
align 64
sym(ia32_memcpy):
sym(cpu_memcpy):
push edi
push esi
@ -329,8 +336,7 @@ choose_larger_method:
; that use SSE are jumped to if size is greater than a threshold.
; we simply set the requested transfer size to 0 if the CPU doesn't
; support SSE so that those are never reached (done by masking with this).
extern sym(ia32_memcpy_size_mask)
mov eax, [sym(ia32_memcpy_size_mask)]
mov eax, ia32_memcpy_size_mask
and ecx, byte ~IC_TINY_MAX
jz ic_tiny ; < 64 bytes left (due to IC_ALIGN)
add esi, ecx

View File

@ -1,26 +0,0 @@
/**
* =========================================================================
* File : ia32_memcpy.h
* Project : 0 A.D.
* Description : interface to highly optimized memcpy (written in asm)
* =========================================================================
*/
// license: GPL; see lib/license.txt
#ifndef INCLUDED_IA32_MEMCPY
#define INCLUDED_IA32_MEMCPY
#ifdef __cplusplus
extern "C" {
#endif
extern void ia32_memcpy_init();
extern void* ia32_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t nbytes);
#ifdef __cplusplus
}
#endif
#endif // #ifndef INCLUDED_IA32_MEMCPY

View File

@ -1,32 +0,0 @@
/**
* =========================================================================
* File : ia32_memcpy_init.cpp
* Project : 0 A.D.
* Description : initialization for ia32_memcpy (detect CPU caps)
* =========================================================================
*/
// license: GPL; see lib/license.txt
#include "precompiled.h"
#include "ia32.h"
#include "ia32_memcpy.h"
// set by ia32_memcpy_init, referenced by ia32_memcpy (asm)
// default to "all codepaths supported"
EXTERN_C u32 ia32_memcpy_size_mask = ~0u;
void ia32_memcpy_init()
{
// set the mask that is applied to transfer size before
// choosing copy technique. this is the mechanism for disabling
// codepaths that aren't supported on all CPUs; see article for details.
// .. check for PREFETCHNTA and MOVNTQ support. these are part of the SSE
// instruction set, but also supported on older Athlons as part of
// the extended AMD MMX set.
if(!ia32_cap(IA32_CAP_SSE) && !ia32_cap(IA32_CAP_AMD_MMX_EXT))
ia32_memcpy_size_mask = 0u;
}
// ia32_memcpy() is defined in ia32_memcpy_asm.asm

View File

@ -7,6 +7,16 @@ int ucpu_IsThrottlingPossible()
return -1; // don't know
}
uint cpu_NumProcessors()
{
long res = sysconf(_SC_NPROCESSORS_CONF);
if (res == -1)
return 0;
else
return (uint)res;
}
int ucpu_NumPackages()
{
long res = sysconf(_SC_NPROCESSORS_CONF);

View File

@ -20,8 +20,6 @@ public:
void test_ia32_cap()
{
ia32_Init();
// make sure the really common/basic caps end up reported as true
TS_ASSERT(ia32_cap(IA32_CAP_FPU));
TS_ASSERT(ia32_cap(IA32_CAP_TSC));

View File

@ -40,10 +40,10 @@ LibError sys_clipboard_set(const wchar_t* text)
CloseClipboard();
// note: SetClipboardData says hMem must not be freed until after
// CloseClipboard. however, GlobalFree still fails after the successful
// completion of both. to avoid memory leaks when one of the calls fails,
// we'll leave it in and just ignore the return value.
// note: MSDN's SetClipboardData documentation says hMem must not be
// freed until after CloseClipboard. however, GlobalFree still fails
// after the successful completion of both. we'll leave it in to avoid
// memory leaks, but ignore its return value.
(void)GlobalFree(hMem);
return ret;

View File

@ -2,86 +2,66 @@
* =========================================================================
* File : wcpu.cpp
* Project : 0 A.D.
* Description : Windows backend for CPU related code
* Description : Windows implementation of sysdep/cpu
* =========================================================================
*/
// license: GPL; see lib/license.txt
#include "precompiled.h"
#include "wcpu.h"
#include "../cpu.h"
#include "lib/posix/posix_pthread.h"
#include "lib/posix/posix_time.h"
#include "win.h"
#include "wutil.h"
#include "winit.h"
WINIT_REGISTER_EARLY_INIT(wcpu_Init); // wcpu -> whrt
#include "lib/bits.h"
//-----------------------------------------------------------------------------
// NumProcessors
static uint numProcessors = 0;
/// get number of CPUs (can't fail)
uint wcpu_NumProcessors()
uint cpu_NumProcessors()
{
debug_assert(numProcessors != 0);
SYSTEM_INFO si;
GetSystemInfo(&si); // can't fail
const uint numProcessors = (uint)si.dwNumberOfProcessors;
return numProcessors;
}
static void DetectNumProcessors()
static LibError ReadFrequencyFromRegistry(DWORD* freqMhz)
{
SYSTEM_INFO si;
GetSystemInfo(&si); // can't fail
numProcessors = (uint)si.dwNumberOfProcessors;
HKEY hKey;
if(RegOpenKeyEx(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0, KEY_QUERY_VALUE, &hKey) != ERROR_SUCCESS)
return ERR::NO_SYS;
DWORD size = sizeof(*freqMhz);
LONG ret = RegQueryValueEx(hKey, "~MHz", 0, 0, (LPBYTE)freqMhz, &size);
RegCloseKey(hKey);
if(ret != ERROR_SUCCESS)
WARN_RETURN(ERR::FAIL);
return INFO::OK;
}
//-----------------------------------------------------------------------------
// ClockFrequency
static double clockFrequency = -1.0;
double wcpu_ClockFrequency()
double cpu_ClockFrequency()
{
debug_assert(clockFrequency > 0.0);
DWORD freqMhz;
if(ReadFrequencyFromRegistry(&freqMhz) < 0)
return -1.0;
const double clockFrequency = freqMhz * 1e6;
return clockFrequency;
}
static void DetectClockFrequency()
{
// read from registry
HKEY hKey;
if(RegOpenKeyEx(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", 0, KEY_QUERY_VALUE, &hKey) == ERROR_SUCCESS)
{
DWORD freqMhz;
DWORD size = sizeof(freqMhz);
if(RegQueryValueEx(hKey, "~MHz", 0, 0, (LPBYTE)&freqMhz, &size) == STATUS_SUCCESS)
clockFrequency = freqMhz * 1e6;
else
debug_assert(0);
RegCloseKey(hKey);
}
else
debug_assert(0);
}
//-----------------------------------------------------------------------------
// MemorySize
size_t wcpu_PageSize()
size_t cpu_PageSize()
{
SYSTEM_INFO si;
GetSystemInfo(&si); // can't fail
return (size_t)si.dwPageSize;
const size_t pageSize = (size_t)si.dwPageSize;
return pageSize;
}
size_t wcpu_MemorySize(CpuMemoryIndicators mem_type)
size_t cpu_MemorySize(CpuMemoryIndicators mem_type)
{
// note: we no longer bother dynamically importing GlobalMemoryStatusEx -
// it's available on Win2k and above. this function safely handles
@ -90,17 +70,23 @@ size_t wcpu_MemorySize(CpuMemoryIndicators mem_type)
BOOL ok = GlobalMemoryStatusEx(&mse);
WARN_IF_FALSE(ok);
if(mem_type == CPU_MEM_TOTAL)
{
size_t memoryTotal = (size_t)mse.ullTotalPhys;
// Richter, "Programming Applications for Windows": the reported
// value doesn't include non-paged pool reserved during boot;
// it's not considered available to kernel. (size is 528 KiB on
// a 512 MiB WinXP/Win2k machine)
// something similar may happen on other OSes, so it is fixed
// by cpu.cpp instead of here.
if(mem_type == CPU_MEM_TOTAL)
return (size_t)mse.ullTotalPhys;
// it's not considered available to the kernel. (the amount is
// 528 KiB on a 512 MiB WinXP/Win2k machine). we'll round up
// to the nearest megabyte to fix this.
memoryTotal = round_up(memoryTotal, 1*MiB);
return memoryTotal;
}
else
return (size_t)mse.ullAvailPhys;
{
const size_t memoryAvailable = (size_t)mse.ullAvailPhys;
return memoryAvailable;
}
}
@ -114,7 +100,7 @@ size_t wcpu_MemorySize(CpuMemoryIndicators mem_type)
//
// may fail if e.g. OS is preventing us from running on some CPUs.
// called from ia32.cpp get_cpu_count.
LibError wcpu_CallByEachCPU(CpuCallback cb, void* param)
LibError cpu_CallByEachCPU(CpuCallback cb, void* param)
{
const HANDLE hProcess = GetCurrentProcess();
DWORD process_affinity, system_affinity;
@ -148,13 +134,3 @@ LibError wcpu_CallByEachCPU(CpuCallback cb, void* param)
return INFO::OK;
}
//-----------------------------------------------------------------------------
static LibError wcpu_Init()
{
DetectNumProcessors();
DetectClockFrequency();
return INFO::OK;
}

View File

@ -1,24 +0,0 @@
/**
* =========================================================================
* File : wcpu.h
* Project : 0 A.D.
* Description : Windows backend for CPU related code
* =========================================================================
*/
// license: GPL; see lib/license.txt
#ifndef INCLUDED_WCPU
#define INCLUDED_WCPU
#include "lib/sysdep/cpu.h"
extern uint wcpu_NumProcessors();
extern double wcpu_ClockFrequency();
extern LibError wcpu_CallByEachCPU(CpuCallback cb, void* param);
extern size_t wcpu_PageSize();
extern size_t wcpu_MemorySize(CpuMemoryIndicators mem_type);
#endif // #ifndef INCLUDED_WCPU

View File

@ -78,7 +78,7 @@ static LibError sym_init()
// bail if already initialized (there's nothing to do).
// don't use pthread_once because we need to return success/error code.
static uintptr_t already_initialized = 0;
if(!CAS(&already_initialized, 0, 1))
if(!cpu_CAS(&already_initialized, 0, 1))
return INFO::OK;
hProcess = GetCurrentProcess();
@ -374,12 +374,18 @@ static LibError walk_stack(StackFrameCallback cb, void* user_arg = 0, uint skip
STACKFRAME64 sf;
memset(&sf, 0, sizeof(sf));
sf.AddrPC.Offset = pcontext->PC_;
sf.AddrPC.Mode = AddrModeFlat;
sf.AddrFrame.Offset = pcontext->FP_;
sf.AddrFrame.Mode = AddrModeFlat;
sf.AddrStack.Offset = pcontext->SP_;
sf.AddrStack.Mode = AddrModeFlat;
#if CPU_AMD64
sf.AddrPC.Offset = pcontext->Rip;
sf.AddrFrame.Offset = pcontext->Rbp;
sf.AddrStack.Offset = pcontext->Rsp;
#else
sf.AddrPC.Offset = pcontext->Eip;
sf.AddrFrame.Offset = pcontext->Ebp;
sf.AddrStack.Offset = pcontext->Esp;
#endif
// for each stack frame found:
LibError ret = ERR::SYM_NO_STACK_FRAMES_FOUND;
@ -1855,7 +1861,7 @@ static LibError dump_frame_cb(const STACKFRAME64* sf, void* UNUSED(user_arg))
LibError debug_dump_stack(wchar_t* buf, size_t max_chars, uint skip, void* pcontext)
{
static uintptr_t already_in_progress;
if(!CAS(&already_in_progress, 0, 1))
if(!cpu_CAS(&already_in_progress, 0, 1))
return ERR::REENTERED; // NOWARN
lock();

View File

@ -84,7 +84,7 @@ ICounter* CreateCounter(uint id)
// - using static_calloc isn't possible because we don't know the
// size until after the alloc / placement new.
if(!CAS(&isCounterAllocated, 0, 1))
if(!cpu_CAS(&isCounterAllocated, 0, 1))
debug_warn("static counter memory is already in use!");
static const size_t memSize = 200;

View File

@ -46,7 +46,7 @@ public:
/**
* initial measurement of the tick rate. not necessarily correct
* (e.g. when using TSC: wcpu_ClockFrequency isn't exact).
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
**/
virtual double NominalFrequency() const = 0;

View File

@ -113,7 +113,7 @@ uint CounterHPET::CounterBits() const
/**
* initial measurement of the tick rate. not necessarily correct
* (e.g. when using TSC: wcpu_ClockFrequency isn't exact).
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
**/
double CounterHPET::NominalFrequency() const
{

View File

@ -42,7 +42,7 @@ public:
/**
* initial measurement of the tick rate. not necessarily correct
* (e.g. when using TSC: wcpu_ClockFrequency isn't exact).
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
**/
virtual double NominalFrequency() const;

View File

@ -84,7 +84,7 @@ uint CounterPMT::CounterBits() const
/**
* initial measurement of the tick rate. not necessarily correct
* (e.g. when using TSC: wcpu_ClockFrequency isn't exact).
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
**/
double CounterPMT::NominalFrequency() const
{

View File

@ -43,7 +43,7 @@ public:
/**
* initial measurement of the tick rate. not necessarily correct
* (e.g. when using TSC: wcpu_ClockFrequency isn't exact).
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
**/
virtual double NominalFrequency() const;

View File

@ -11,8 +11,8 @@
#include "precompiled.h"
#include "qpc.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/win/win.h"
#include "lib/sysdep/win/wcpu.h"
#include "lib/sysdep/win/wutil.h" // wutil_argv
#include "pit.h" // PIT_FREQ
#include "pmt.h" // PMT_FREQ
@ -65,10 +65,10 @@ bool CounterQPC::IsSafe() const
// used on MP HAL systems and can be detected by comparing QPF with the
// CPU clock. we consider it unsafe unless the user promises (via
// command line) that it's patched and thus reliable on their system.
bool usesTsc = IsSimilarMagnitude(m_frequency, wcpu_ClockFrequency());
bool usesTsc = IsSimilarMagnitude(m_frequency, cpu_ClockFrequency());
// unconfirmed reports indicate QPC sometimes uses 1/3 of the
// CPU clock frequency, so check that as well.
usesTsc |= IsSimilarMagnitude(m_frequency, wcpu_ClockFrequency()/3);
usesTsc |= IsSimilarMagnitude(m_frequency, cpu_ClockFrequency()/3);
if(usesTsc)
{
const bool isTscSafe = wutil_HasCommandLineArgument("-wQpcTscSafe");
@ -108,7 +108,7 @@ uint CounterQPC::CounterBits() const
/**
* initial measurement of the tick rate. not necessarily correct
* (e.g. when using TSC: wcpu_ClockFrequency isn't exact).
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
**/
double CounterQPC::NominalFrequency() const
{

View File

@ -41,7 +41,7 @@ public:
/**
* initial measurement of the tick rate. not necessarily correct
* (e.g. when using TSC: wcpu_ClockFrequency isn't exact).
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
**/
virtual double NominalFrequency() const;

View File

@ -69,7 +69,7 @@ uint CounterTGT::CounterBits() const
/**
* initial measurement of the tick rate. not necessarily correct
* (e.g. when using TSC: wcpu_ClockFrequency isn't exact).
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
**/
double CounterTGT::NominalFrequency() const
{

View File

@ -36,7 +36,7 @@ public:
/**
* initial measurement of the tick rate. not necessarily correct
* (e.g. when using TSC: wcpu_ClockFrequency isn't exact).
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
**/
virtual double NominalFrequency() const;

View File

@ -11,8 +11,8 @@
#include "precompiled.h"
#include "tsc.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/win/win.h"
#include "lib/sysdep/win/wcpu.h"
#include "lib/sysdep/ia32/ia32.h" // ia32_rdtsc
#include "lib/bits.h"
@ -55,8 +55,6 @@ static bool IsThrottlingPossible()
LibError CounterTSC::Activate()
{
ia32_Init();
if(!ia32_cap(IA32_CAP_TSC))
return ERR::NO_SYS; // NOWARN (CPU doesn't support RDTSC)
@ -65,7 +63,6 @@ LibError CounterTSC::Activate()
void CounterTSC::Shutdown()
{
ia32_Shutdown();
}
bool CounterTSC::IsSafe() const
@ -92,7 +89,7 @@ bool CounterTSC::IsSafe() const
// per-core counter state and the abovementioned race condition.
// however, we won't bother, since such platforms aren't yet widespread
// and would surely support the nice and safe HPET, anyway)
if(ia32_NumPackages() != 1 || ia32_CoresPerPackage() != 1)
if(cpu_NumPackages() != 1 || cpu_CoresPerPackage() != 1)
return false;
// recent CPU:
@ -147,9 +144,9 @@ uint CounterTSC::CounterBits() const
/**
* initial measurement of the tick rate. not necessarily correct
* (e.g. when using TSC: wcpu_ClockFrequency isn't exact).
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
**/
double CounterTSC::NominalFrequency() const
{
return wcpu_ClockFrequency();
return cpu_ClockFrequency();
}

View File

@ -36,7 +36,7 @@ public:
/**
* initial measurement of the tick rate. not necessarily correct
* (e.g. when using TSC: wcpu_ClockFrequency isn't exact).
* (e.g. when using TSC: cpu_ClockFrequency isn't exact).
**/
virtual double NominalFrequency() const;
};

View File

@ -13,9 +13,9 @@
#include <process.h> // _beginthreadex
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/win/win.h"
#include "lib/sysdep/win/winit.h"
#include "lib/sysdep/win/wcpu.h"
#include "lib/sysdep/acpi.h"
#include "lib/adts.h"
#include "lib/bits.h"

View File

@ -14,7 +14,7 @@
#include <new>
#include <process.h>
#include "lib/sysdep/cpu.h" // CAS
#include "lib/sysdep/cpu.h" // cpu_CAS
#include "wposix_internal.h"
#include "wtime.h" // timespec
@ -44,7 +44,7 @@ pthread_t pthread_self(void)
int pthread_once(pthread_once_t* once, void (*init_routine)(void))
{
if(CAS(once, 0, 1))
if(cpu_CAS(once, 0, 1))
init_routine();
return 0;
}
@ -121,7 +121,7 @@ int pthread_key_create(pthread_key_t* key, void (*dtor)(void*))
uint i;
for(i = 0; i < MAX_DTORS; i++)
{
if(CAS(&dtors[i].dtor, 0, dtor))
if(cpu_CAS((volatile uintptr_t*)&dtors[i].dtor, 0, (uintptr_t)dtor))
goto have_slot;
}

View File

@ -895,9 +895,9 @@ void EarlyInit()
lockfree_Init();
timer_Init();
cpu_ConfigureFloatingPoint();
cpu_Init(); // must come after timer_Init
timer_Init();
// Initialise the low-quality rand function
srand(time(NULL));

View File

@ -87,7 +87,7 @@ void WriteSystemInfo()
fprintf(f, "\n");
// memory
fprintf(f, "Memory : %lu MiB; %lu MiB free\n", cpu_MemoryTotalMiB(), cpu_MemorySize(CPU_MEM_AVAILABLE)/MiB);
fprintf(f, "Memory : %lu MiB; %lu MiB free\n", cpu_MemorySize(CPU_MEM_TOTAL)/MiB, cpu_MemorySize(CPU_MEM_AVAILABLE)/MiB);
// graphics
fprintf(f, "Graphics Card : %s\n", gfx_card);