use intrinsics for cpu_AtomicAdd and cpu_CAS, get rid of the
corresponding assembly implementations. (wrapper functions are necessary
because mere declarations aren't enough to export the functions from a
DLL)
also remove the no-longer-needed fminf/fmaxf

This was SVN commit r8521.
janwas 2010-11-02 13:38:56 +00:00
parent 540925aa32
commit 383cf7b220
11 changed files with 102 additions and 203 deletions
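Before the diffs, a minimal usage sketch of the two primitives this change reimplements (hypothetical example, not part of the commit; the semantics are unchanged: cpu_AtomicAdd returns the value held before the addition, cpu_CAS reports whether the swap took place):

#include "lib/sysdep/cpu.h"

static volatile intptr_t counter = 0;

void Increment()
{
	// atomically add 1; the return value is the counter's previous value
	const intptr_t previous = cpu_AtomicAdd(&counter, 1);
	(void)previous;
}

bool ClaimIfUnclaimed(intptr_t owner)
{
	// succeeds only if no other thread changed counter from 0 first
	return cpu_CAS(&counter, 0, owner);
}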

View File

@@ -117,7 +117,7 @@ template<class Entries> float ll_calc_min_credit_density(const Entries& entries)
for(typename Entries::const_iterator it = entries.begin(); it != entries.end(); ++it)
{
const float credit_density = Entries::entry_from_it(it).credit_density();
min_credit_density = fminf(min_credit_density, credit_density);
min_credit_density = std::min(min_credit_density, credit_density);
}
return min_credit_density;
}

View File

@@ -49,25 +49,6 @@ double rint(double d)
}
float fminf(float a, float b)
{
#if ARCH_IA32
return ia32_asm_fminf(a, b);
#else
return (a < b)? a : b;
#endif
}
float fmaxf(float a, float b)
{
#if ARCH_IA32
return ia32_asm_fmaxf(a, b);
#else
return (a > b)? a : b;
#endif
}
size_t fpclassifyd(double d)
{
#if ARCH_IA32

View File

@@ -115,7 +115,7 @@ extern wchar_t* wcsdup(const wchar_t* str);
extern int wcscasecmp(const wchar_t* s1, const wchar_t* s2);
#endif
// rint*, fminf, fpclassify (too few/diverse to make separate HAVE_ for each)
// rint*, fpclassify (too few/diverse to make separate HAVE_ for each)
#if HAVE_C99 || ICC_VERSION || GCC_VERSION
# define HAVE_C99_MATH 1
#else
@@ -127,9 +127,6 @@ extern int wcscasecmp(const wchar_t* s1, const wchar_t* s2);
// current rounding mode.
extern float rintf(float f);
extern double rint(double d);
// return minimum/maximum of two floats.
extern float fminf(float a, float b);
extern float fmaxf(float a, float b);
extern size_t fpclassifyf(float f);
extern size_t fpclassifyd(double d);

View File

@@ -24,14 +24,59 @@
#if ARCH_AMD64
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/arch/amd64/amd64.h"
#include "lib/sysdep/cpu.h"
void cpu_ConfigureFloatingPoint()
{
// 64-bit CPU:s apparently use SSE2 for all floating-point operations, so I
// *guess* we don't need to do anything...
// 64-bit CPUs use SSE2 for all floating-point operations, so we
// don't need to change the FPU control word.
}
#if MSC_VERSION
// VC 2008 and ICC 12 differ in their declaration of _Interlocked*
#if ICC_VERSION
typedef __int64* P64;
#else
typedef volatile __int64* P64;
#endif
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
{
const intptr_t initial = _InterlockedCompareExchange64((P64)location, newValue, expected);
return initial == expected;
}
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
{
const i64 initial = _InterlockedCompareExchange64((P64)location, newValue, expected);
return initial == expected;
}
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
{
return _InterlockedExchangeAdd64((P64)location, increment);
}
#elif GCC_VERSION
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
{
return __sync_fetch_and_add(location, increment);
}
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
{
return __sync_bool_compare_and_swap(location, expected, newValue);
}
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
{
return __sync_bool_compare_and_swap(location, expected, newValue);
}
#endif
#endif // ARCH_AMD64
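A pattern these primitives enable (hypothetical sketch, not part of the commit): a 64-bit atomic add built from a cpu_CAS64 retry loop. Like cpu_AtomicAdd, it returns the value observed before the addition; a stale or torn read of *location is harmless because the CAS only succeeds when the value still matches:

static i64 AtomicAdd64(volatile i64* location, i64 increment)
{
	for(;;)
	{
		const i64 expected = *location;	// snapshot the current value
		// retry until no other thread modified *location in between
		if(cpu_CAS64(location, expected, expected + increment))
			return expected;
	}
}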

View File

@@ -26,9 +26,10 @@
BITS 64
; extern "C" void __cdecl amd64_asm_cpuid(Ia32CpuidRegs* reg);
; extern "C" void CALL_CONV amd64_asm_cpuid(x86_x64_CpuidRegs* reg);
; reference: http://softwarecommunity.intel.com/articles/eng/2669.htm
global sym(amd64_asm_cpuid)
ALIGN 8
sym(amd64_asm_cpuid):
push rbx ; rbx is the only caller-save register we clobber
@@ -45,24 +46,3 @@ sym(amd64_asm_cpuid):
ret
ALIGN 8
; extern "C" intptr_t cpu_AtomicAdd(intptr_t* location, intptr_t increment);
global sym(cpu_AtomicAdd)
sym(cpu_AtomicAdd):
lock xadd [arg0], arg1
mov rax, arg1
ret
; extern "C" bool amd64_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue);
; extern "C" bool amd64_CAS64(volatile i64* location, i64 expected, i64 newValue);
global sym(cpu_CAS)
global sym(cpu_CAS64)
sym(cpu_CAS):
sym(cpu_CAS64):
mov rax, arg1 ; expected -> rax
lock cmpxchg [arg0], arg2
sete al
movzx rax, al
ret

View File

@@ -34,8 +34,6 @@ extern "C" {
struct x86_x64_CpuidRegs;
extern void CALL_CONV amd64_asm_cpuid(x86_x64_CpuidRegs* reg);
// also implements cpu_AtomicAdd, cpu_CAS and cpu_CAS64 from "sysdep/cpu.h"
#ifdef __cplusplus
}
#endif

View File

@@ -26,9 +26,8 @@
#include "precompiled.h"
#include "lib/sysdep/arch/ia32/ia32.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/arch/ia32/ia32.h"
#include "lib/sysdep/arch/ia32/ia32_asm.h"
@@ -145,3 +144,51 @@ void cpu_ConfigureFloatingPoint()
// results were changed significantly, so it had to be disabled.
//ia32_asm_control87(IA32_RC_CHOP, IA32_MCW_RC);
}
#if MSC_VERSION
// VC 2008 and ICC 12 differ in their declaration of _Interlocked*
#if ICC_VERSION
typedef long* P32;
typedef __int64* P64;
#else
typedef volatile long* P32;
typedef volatile __int64* P64;
#endif
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
{
const intptr_t initial = _InterlockedCompareExchange((P32)location, newValue, expected);
return initial == expected;
}
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
{
const i64 initial = _InterlockedCompareExchange64((P64)location, newValue, expected);
return initial == expected;
}
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
{
return _InterlockedExchangeAdd((P32)location, increment);
}
#elif GCC_VERSION
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
{
return __sync_fetch_and_add(location, increment);
}
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
{
return __sync_bool_compare_and_swap(location, expected, newValue);
}
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
{
return __sync_bool_compare_and_swap(location, expected, newValue);
}
#endif
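The casts to P32 above assume intptr_t and long have the same width on IA-32; a defensive compile-time check (hypothetical, not part of the commit) could make that explicit:

// hypothetical compile-time assertion (pre-C++11 idiom):
// declares a negative-size array, i.e. fails to compile, if the sizes differ
typedef int intptr_matches_long[sizeof(intptr_t) == sizeof(long)? 1 : -1];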

View File

@@ -56,74 +56,10 @@ sym(ia32_asm_cpuid):
ret
;-------------------------------------------------------------------------------
; lock-free support routines
;-------------------------------------------------------------------------------
; extern "C" intptr_t __cdecl cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
global sym(cpu_AtomicAdd)
sym(cpu_AtomicAdd):
mov edx, [esp+4] ; location
mov eax, [esp+8] ; increment
db 0xf0 ; LOCK prefix
xadd [edx], eax
ret
; notes:
; - a 486 or later processor is required since we use CMPXCHG.
; there's no feature flag we can check, and the ia32 code doesn't
; bother detecting anything < Pentium, so this'll crash and burn if
; run on 386. we could fall back to simple MOVs there (since 386 CPUs
; aren't MP-capable), but it's not worth the trouble.
; - nor do we bother skipping the LOCK prefix on single-processor systems.
; the branch may be well-predicted, but difference in performance still
; isn't expected to be enough to justify the effort.
; extern "C" bool __cdecl cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value);
global sym(cpu_CAS)
sym(cpu_CAS):
mov edx, [esp+4] ; location
mov eax, [esp+8] ; expected
mov ecx, [esp+12] ; new_value
db 0xf0 ; LOCK prefix
cmpxchg [edx], ecx
sete al
movzx eax, al
ret
; extern bool CALL_CONV cpu_CAS64(volatile i64* location, i64 expected, i64 new_value);
global sym(cpu_CAS64)
sym(cpu_CAS64):
push ebx
push esi
mov esi, [esp+8+4] ; location
mov eax, [esp+8+8]
mov edx, [esp+8+12] ; edx:eax = expected
mov ebx, [esp+8+16]
mov ecx, [esp+8+20] ; ecx:ebx = new_value
db 0xf0 ; LOCK prefix
cmpxchg8b [esi]
sete al
movzx eax, al
pop esi
pop ebx
ret
;-------------------------------------------------------------------------------
; FPU
;-------------------------------------------------------------------------------
[section .data]
; to conform with the fallback implementation (a C cast), we need to
; end up with truncate/"chop" rounding. subtracting does the trick,
; assuming RC is the IA-32 default round-to-nearest mode.
round_bias dd 0.4999999
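; worked example (added for illustration): f = 2.7 gives
; 2.7 - 0.4999999 = 2.2000001, which round-to-nearest turns into 2,
; matching the C cast (i32)2.7. a bias of exactly 0.5 would misround
; whole numbers, e.g. 3.0 - 0.5 = 2.5 rounds-to-even to 2.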
__SECT__
; extern "C" u32 __cdecl ia32_asm_control87(u32 new_cw, u32 mask);
global sym(ia32_asm_control87)
sym(ia32_asm_control87):
@@ -182,62 +118,6 @@ sym(ia32_asm_rint):
ret
; extern "C" float __cdecl ia32_asm_fminf(float, float);
global sym(ia32_asm_fminf)
sym(ia32_asm_fminf):
fld dword [esp+4]
fld dword [esp+8]
fcomi st0, st1
fcmovnb st0, st1
fxch
fstp st0
ret
; extern "C" float __cdecl ia32_asm_fmaxf(float, float);
global sym(ia32_asm_fmaxf)
sym(ia32_asm_fmaxf):
fld dword [esp+4]
fld dword [esp+8]
fcomi st0, st1
fcmovb st0, st1
fxch
fstp st0
ret
; extern "C" i32 __cdecl ia32_asm_i32FromFloat(float f);
global sym(ia32_asm_i32FromFloat)
sym(ia32_asm_i32FromFloat):
push eax
fld dword [esp+8]
fsub dword [round_bias]
fistp dword [esp]
pop eax
ret
; extern "C" i32 __cdecl ia32_asm_i32FromDouble(double d);
global sym(ia32_asm_i32FromDouble)
sym(ia32_asm_i32FromDouble):
push eax
fld qword [esp+8]
fsub dword [round_bias]
fistp dword [esp]
pop eax
ret
; extern "C" i64 __cdecl ia32_asm_i64FromDouble(double d);
global sym(ia32_asm_i64FromDouble)
sym(ia32_asm_i64FromDouble):
push edx
push eax
fld qword [esp+12]
fsub dword [round_bias]
fistp qword [esp]
pop eax
pop edx
ret
;-------------------------------------------------------------------------------
; misc
;-------------------------------------------------------------------------------

View File

@@ -34,8 +34,6 @@ extern "C" {
struct x86_x64_CpuidRegs;
extern void CALL_CONV ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
// also implements cpu_AtomicAdd, cpu_CAS and cpu_CAS64 from "sysdep/cpu.h"
/// control87
// FPU control word
// .. Precision Control:
@@ -75,14 +73,6 @@ extern size_t CALL_CONV ia32_asm_fpclassifyf(float f);
extern float CALL_CONV ia32_asm_rintf(float);
extern double CALL_CONV ia32_asm_rint(double);
/// POSIX fminf
extern float CALL_CONV ia32_asm_fminf(float, float);
extern float CALL_CONV ia32_asm_fmaxf(float, float);
extern i32 CALL_CONV ia32_asm_i32FromFloat(float f);
extern i32 CALL_CONV ia32_asm_i32FromDouble(double d);
extern i64 CALL_CONV ia32_asm_i64FromDouble(double d);
/**
* write the current execution state (e.g. all register values) into
* (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).

View File

@@ -52,8 +52,6 @@ LIB_API const char* cpu_IdentifierString();
//-----------------------------------------------------------------------------
// lock-free support routines
extern "C" { // (assembly-language implementations)
/**
* add a signed value to a variable without the possibility of interference
* from other threads/CPUs.
@@ -74,8 +72,6 @@ LIB_API intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
LIB_API bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue);
LIB_API bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue);
} // extern "C"
/**
* specialization of cpu_CAS for pointer types. this avoids error-prone
* casting in user code.
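A minimal sketch of the pointer-type wrapper that comment describes (signature assumed for illustration; the real definition lies outside this hunk):

template<typename T>
bool cpu_CAS(volatile T* location, T expected, T newValue)
{
	// reuse the intptr_t implementation; T is a pointer type,
	// so the casts are value-preserving
	return cpu_CAS((volatile intptr_t*)location, (intptr_t)expected, (intptr_t)newValue);
}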

View File

@@ -28,7 +28,7 @@
#include "lib/utf8.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/sysdep.h"
#include "lib/posix/posix.h" // fminf etc.
#include "lib/posix/posix.h" // rintf etc.
#if OS_LINUX
# include "mocks/dlfcn.h"
@@ -53,21 +53,6 @@ public:
TS_ASSERT_EQUALS(rint(5.6), 6.0);
}
void test_min_max()
{
TS_ASSERT_EQUALS(fminf(0.0f, 10000.0f), 0.0f);
TS_ASSERT_EQUALS(fminf(100.0f, 10000.0f), 100.0f);
TS_ASSERT_EQUALS(fminf(-1.0f, 2.0f), -1.0f);
TS_ASSERT_EQUALS(fminf(-2.0f, 1.0f), -2.0f);
TS_ASSERT_EQUALS(fminf(0.001f, 0.00001f), 0.00001f);
TS_ASSERT_EQUALS(fmaxf(0.0f, 10000.0f), 10000.0f);
TS_ASSERT_EQUALS(fmaxf(100.0f, 10000.0f), 10000.0f);
TS_ASSERT_EQUALS(fmaxf(-1.0f, 2.0f), 2.0f);
TS_ASSERT_EQUALS(fmaxf(-2.0f, 1.0f), 1.0f);
TS_ASSERT_EQUALS(fmaxf(0.001f, 0.00001f), 0.001f);
}
void test_random()
{
u64 a = 0, b = 0;