cleanup:
Use intrinsics for cpu_AtomicAdd and cpu_CAS and get rid of the corresponding assembly implementations. (Wrapper functions are necessary at work because mere declarations aren't enough to export the functions from the DLL.) Also remove the no-longer-needed fminf. This was SVN commit r8521.
This commit is contained in:
parent
540925aa32
commit
383cf7b220
@ -117,7 +117,7 @@ template<class Entries> float ll_calc_min_credit_density(const Entries& entries)
|
||||
for(typename Entries::const_iterator it = entries.begin(); it != entries.end(); ++it)
|
||||
{
|
||||
const float credit_density = Entries::entry_from_it(it).credit_density();
|
||||
min_credit_density = fminf(min_credit_density, credit_density);
|
||||
min_credit_density = std::min(min_credit_density, credit_density);
|
||||
}
|
||||
return min_credit_density;
|
||||
}
|
||||
|
@ -49,25 +49,6 @@ double rint(double d)
|
||||
}
|
||||
|
||||
|
||||
// Return the smaller of a and b.
// On ARCH_IA32 builds the work is delegated to the assembly routine
// ia32_asm_fminf; all other builds use a plain comparison.
float fminf(float a, float b)
|
||||
{
|
||||
#if ARCH_IA32
|
||||
return ia32_asm_fminf(a, b);
|
||||
#else
|
||||
// yields b whenever (a < b) is false, i.e. also when the values are equal.
return (a < b)? a : b;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Return the larger of a and b.
// On ARCH_IA32 builds the work is delegated to the assembly routine
// ia32_asm_fmaxf; all other builds use a plain comparison.
float fmaxf(float a, float b)
|
||||
{
|
||||
#if ARCH_IA32
|
||||
return ia32_asm_fmaxf(a, b);
|
||||
#else
|
||||
// yields b whenever (a > b) is false, i.e. also when the values are equal.
return (a > b)? a : b;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
size_t fpclassifyd(double d)
|
||||
{
|
||||
#if ARCH_IA32
|
||||
|
@ -115,7 +115,7 @@ extern wchar_t* wcsdup(const wchar_t* str);
|
||||
extern int wcscasecmp(const wchar_t* s1, const wchar_t* s2);
|
||||
#endif
|
||||
|
||||
// rint*, fminf, fpclassify (too few/diverse to make separate HAVE_ for each)
|
||||
// rint*, fpclassify (too few/diverse to make separate HAVE_ for each)
|
||||
#if HAVE_C99 || ICC_VERSION || GCC_VERSION
|
||||
# define HAVE_C99_MATH 1
|
||||
#else
|
||||
@ -127,9 +127,6 @@ extern int wcscasecmp(const wchar_t* s1, const wchar_t* s2);
|
||||
// current rounding mode.
|
||||
extern float rintf(float f);
|
||||
extern double rint(double d);
|
||||
// return minimum/maximum of two floats.
|
||||
extern float fminf(float a, float b);
|
||||
extern float fmaxf(float a, float b);
|
||||
|
||||
extern size_t fpclassifyf(float f);
|
||||
extern size_t fpclassifyd(double d);
|
||||
|
@ -24,14 +24,59 @@
|
||||
|
||||
#if ARCH_AMD64
|
||||
|
||||
#include "lib/sysdep/cpu.h"
|
||||
#include "lib/sysdep/arch/amd64/amd64.h"
|
||||
|
||||
#include "lib/sysdep/cpu.h"
|
||||
|
||||
// AMD64 variant of the floating-point setup hook. The body is intentionally
// empty; the comments inside give the rationale.
void cpu_ConfigureFloatingPoint()
|
||||
{
|
||||
// 64-bit CPU:s apparently use SSE2 for all floating-point operations, so I
|
||||
// *guess* we don't need to do anything...
|
||||
// 64-bit CPUs use SSE2 for all floating-point operations, so we
|
||||
// don't need to change the FPU control word.
|
||||
}
|
||||
|
||||
#if MSC_VERSION
|
||||
|
||||
// VC 2008 and ICC 12 differ in their declaration of _Interlocked*
// P64 is the pointer type the wrappers below cast their `location` argument
// to before handing it to the 64-bit _Interlocked* intrinsics.
// NOTE(review): presumably ICC declares the pointer parameter without
// `volatile`, hence the two variants - confirm against the compiler headers.
|
||||
#if ICC_VERSION
|
||||
typedef __int64* P64;
|
||||
#else
|
||||
typedef volatile __int64* P64;
|
||||
#endif
|
||||
|
||||
// Compare-and-swap on a pointer-sized integer (AMD64, MSVC/ICC build).
// Forwards to the _InterlockedCompareExchange64 intrinsic with newValue as
// the exchange value and expected as the comparand.
// Returns true iff the value reported back by the intrinsic equals expected.
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
|
||||
{
|
||||
// the (P64) cast adapts the pointer to the intrinsic's declared parameter type.
const intptr_t initial = _InterlockedCompareExchange64((P64)location, newValue, expected);
|
||||
return initial == expected;
|
||||
}
|
||||
|
||||
// Compare-and-swap on a 64-bit integer (AMD64, MSVC/ICC build).
// Same implementation as cpu_CAS in this configuration: both call
// _InterlockedCompareExchange64.
// Returns true iff the value reported back by the intrinsic equals expected.
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
|
||||
{
|
||||
// the (P64) cast adapts the pointer to the intrinsic's declared parameter type.
const i64 initial = _InterlockedCompareExchange64((P64)location, newValue, expected);
|
||||
return initial == expected;
|
||||
}
|
||||
|
||||
// Add increment to *location via the _InterlockedExchangeAdd64 intrinsic
// (AMD64, MSVC/ICC build) and return the intrinsic's result unchanged.
// NOTE(review): per the MSVC intrinsic documentation that result is the
// value *location held before the addition - confirm callers expect that.
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
|
||||
{
|
||||
return _InterlockedExchangeAdd64((P64)location, increment);
|
||||
}
|
||||
|
||||
#elif GCC_VERSION
|
||||
|
||||
// Add increment to *location via GCC's __sync_fetch_and_add builtin
// (AMD64, GCC build) and return the builtin's result unchanged.
// NOTE(review): per the GCC documentation that result is the value
// *location held before the addition.
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
|
||||
{
|
||||
return __sync_fetch_and_add(location, increment);
|
||||
}
|
||||
|
||||
// Compare-and-swap on a pointer-sized integer (AMD64, GCC build).
// Thin wrapper that returns the result of GCC's
// __sync_bool_compare_and_swap(location, expected, newValue) unchanged.
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
|
||||
{
|
||||
return __sync_bool_compare_and_swap(location, expected, newValue);
|
||||
}
|
||||
|
||||
// Compare-and-swap on a 64-bit integer (AMD64, GCC build).
// Thin wrapper that returns the result of GCC's
// __sync_bool_compare_and_swap(location, expected, newValue) unchanged.
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
|
||||
{
|
||||
return __sync_bool_compare_and_swap(location, expected, newValue);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif // ARCH_AMD64
|
||||
|
@ -26,9 +26,10 @@
|
||||
|
||||
BITS 64
|
||||
|
||||
; extern "C" void __cdecl amd64_asm_cpuid(Ia32CpuidRegs* reg);
|
||||
; extern "C" void CALL_CONV amd64_asm_cpuid(x86_x64_CpuidRegs* reg);
|
||||
; reference: http://softwarecommunity.intel.com/articles/eng/2669.htm
|
||||
global sym(amd64_asm_cpuid)
|
||||
ALIGN 8
|
||||
sym(amd64_asm_cpuid):
|
||||
push rbx ; rbx is the only caller-save register we clobber
|
||||
|
||||
@ -45,24 +46,3 @@ sym(amd64_asm_cpuid):
|
||||
|
||||
ret
|
||||
ALIGN 8
|
||||
|
||||
|
||||
; extern "C" intptr_t cpu_AtomicAdd(intptr_t* location, intptr_t increment);
; adds arg1 to the memory at [arg0] with a LOCK-prefixed XADD and returns
; (in rax) whatever XADD leaves in arg1.
|
||||
global sym(cpu_AtomicAdd)
|
||||
sym(cpu_AtomicAdd):
|
||||
lock xadd [arg0], arg1
|
||||
; XADD stores the destination's previous contents in its source operand
; (per the x86 instruction reference), so arg1 now holds the old value.
mov rax, arg1
|
||||
ret
|
||||
|
||||
|
||||
; extern "C" bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue);
|
||||
; extern "C" bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue);
; both symbols share one body: the two labels below mark the same address.
|
||||
global sym(cpu_CAS)
|
||||
global sym(cpu_CAS64)
|
||||
sym(cpu_CAS):
|
||||
sym(cpu_CAS64):
|
||||
mov rax, arg1 ; expected -> rax
|
||||
lock cmpxchg [arg0], arg2
|
||||
; ZF reflects the outcome of CMPXCHG's comparison; turn it into a 0/1 result.
sete al
|
||||
movzx rax, al
|
||||
ret
|
||||
|
@ -34,8 +34,6 @@ extern "C" {
|
||||
struct x86_x64_CpuidRegs;
|
||||
extern void CALL_CONV amd64_asm_cpuid(x86_x64_CpuidRegs* reg);
|
||||
|
||||
// also implements cpu_AtomicAdd, cpu_CAS and cpu_CAS64 from "sysdep/cpu.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -26,9 +26,8 @@
|
||||
|
||||
#include "precompiled.h"
|
||||
|
||||
#include "lib/sysdep/arch/ia32/ia32.h"
|
||||
|
||||
#include "lib/sysdep/cpu.h"
|
||||
#include "lib/sysdep/arch/ia32/ia32.h"
|
||||
#include "lib/sysdep/arch/ia32/ia32_asm.h"
|
||||
|
||||
|
||||
@ -145,3 +144,51 @@ void cpu_ConfigureFloatingPoint()
|
||||
// results were changed significantly, so it had to be disabled.
|
||||
//ia32_asm_control87(IA32_RC_CHOP, IA32_MCW_RC);
|
||||
}
|
||||
|
||||
|
||||
#if MSC_VERSION
|
||||
|
||||
// VC 2008 and ICC 12 differ in their declaration of _Interlocked*
// P32 / P64 are the pointer types the wrappers below cast their `location`
// argument to before calling the 32-bit / 64-bit _Interlocked* intrinsics.
// NOTE(review): presumably ICC declares the pointer parameters without
// `volatile`, hence the two variants - confirm against the compiler headers.
|
||||
#if ICC_VERSION
|
||||
typedef long* P32;
|
||||
typedef __int64* P64;
|
||||
#else
|
||||
typedef volatile long* P32;
|
||||
typedef volatile __int64* P64;
|
||||
#endif
|
||||
|
||||
// Compare-and-swap on a pointer-sized integer (IA-32, MSVC/ICC build).
// Forwards to the 32-bit _InterlockedCompareExchange intrinsic with newValue
// as the exchange value and expected as the comparand.
// Returns true iff the value reported back by the intrinsic equals expected.
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
|
||||
{
|
||||
// the (P32) cast adapts the pointer to the intrinsic's declared parameter type.
const intptr_t initial = _InterlockedCompareExchange((P32)location, newValue, expected);
|
||||
return initial == expected;
|
||||
}
|
||||
|
||||
// Compare-and-swap on a 64-bit integer (IA-32, MSVC/ICC build).
// Unlike cpu_CAS in this configuration, this uses the 64-bit intrinsic
// _InterlockedCompareExchange64.
// Returns true iff the value reported back by the intrinsic equals expected.
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
|
||||
{
|
||||
// the (P64) cast adapts the pointer to the intrinsic's declared parameter type.
const i64 initial = _InterlockedCompareExchange64((P64)location, newValue, expected);
|
||||
return initial == expected;
|
||||
}
|
||||
|
||||
// Add increment to *location via the 32-bit _InterlockedExchangeAdd
// intrinsic (IA-32, MSVC/ICC build) and return its result unchanged.
// NOTE(review): per the MSVC intrinsic documentation that result is the
// value *location held before the addition - confirm callers expect that.
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
|
||||
{
|
||||
return _InterlockedExchangeAdd((P32)location, increment);
|
||||
}
|
||||
|
||||
#elif GCC_VERSION
|
||||
|
||||
// Add increment to *location via GCC's __sync_fetch_and_add builtin
// (IA-32, GCC build) and return the builtin's result unchanged.
// NOTE(review): per the GCC documentation that result is the value
// *location held before the addition.
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
|
||||
{
|
||||
return __sync_fetch_and_add(location, increment);
|
||||
}
|
||||
|
||||
// Compare-and-swap on a pointer-sized integer (IA-32, GCC build).
// Thin wrapper that returns the result of GCC's
// __sync_bool_compare_and_swap(location, expected, newValue) unchanged.
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
|
||||
{
|
||||
return __sync_bool_compare_and_swap(location, expected, newValue);
|
||||
}
|
||||
|
||||
// Compare-and-swap on a 64-bit integer (IA-32, GCC build).
// Thin wrapper that returns the result of GCC's
// __sync_bool_compare_and_swap(location, expected, newValue) unchanged.
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
|
||||
{
|
||||
return __sync_bool_compare_and_swap(location, expected, newValue);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -56,74 +56,10 @@ sym(ia32_asm_cpuid):
|
||||
ret
|
||||
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
; lock-free support routines
|
||||
;-------------------------------------------------------------------------------
|
||||
|
||||
; extern "C" intptr_t __cdecl cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
; loads both stack arguments, performs a LOCK-prefixed XADD on [location]
; and returns whatever XADD leaves in eax.
|
||||
global sym(cpu_AtomicAdd)
|
||||
sym(cpu_AtomicAdd):
|
||||
mov edx, [esp+4] ; location
|
||||
mov eax, [esp+8] ; increment
|
||||
; the prefix byte is emitted by hand; it applies to the XADD that follows.
db 0xf0 ; LOCK prefix
|
||||
xadd [edx], eax
|
||||
ret
|
||||
|
||||
|
||||
; notes:
|
||||
; - a 486 or later processor is required since we use CMPXCHG.
|
||||
; there's no feature flag we can check, and the ia32 code doesn't
|
||||
; bother detecting anything < Pentium, so this'll crash and burn if
|
||||
; run on 386. we could fall back to simple MOVs there (since 386 CPUs
|
||||
; aren't MP-capable), but it's not worth the trouble.
|
||||
; - nor do we bother skipping the LOCK prefix on single-processor systems.
|
||||
; the branch may be well-predicted, but difference in performance still
|
||||
; isn't expected to be enough to justify the effort.
|
||||
; extern "C" bool __cdecl cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value);
; returns 1 in eax if CMPXCHG set ZF, else 0.
|
||||
global sym(cpu_CAS)
|
||||
sym(cpu_CAS):
|
||||
mov edx, [esp+4] ; location
|
||||
mov eax, [esp+8] ; expected
|
||||
mov ecx, [esp+12] ; new_value
|
||||
db 0xf0 ; LOCK prefix
|
||||
cmpxchg [edx], ecx
|
||||
; convert the ZF outcome of the comparison into a 0/1 return value.
sete al
|
||||
movzx eax, al
|
||||
ret
|
||||
|
||||
|
||||
; extern bool CALL_CONV cpu_CAS64(volatile i64* location, i64 expected, i64 new_value);
; 64-bit compare-and-swap via LOCK CMPXCHG8B; returns 1 in eax if the
; instruction set ZF, else 0.
|
||||
global sym(cpu_CAS64)
|
||||
sym(cpu_CAS64):
|
||||
; ebx and esi are saved and restored around the body; the two pushes are
; why every argument offset below carries an extra +8.
push ebx
|
||||
push esi
|
||||
mov esi, [esp+8+4] ; location
|
||||
mov eax, [esp+8+8]
|
||||
mov edx, [esp+8+12] ; edx:eax = expected
|
||||
mov ebx, [esp+8+16]
|
||||
mov ecx, [esp+8+20] ; ecx:ebx = new_value
|
||||
db 0xf0 ; LOCK prefix
|
||||
cmpxchg8b [esi]
|
||||
; convert the ZF outcome of the comparison into a 0/1 return value.
sete al
|
||||
movzx eax, al
|
||||
pop esi
|
||||
pop ebx
|
||||
ret
|
||||
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
; FPU
|
||||
;-------------------------------------------------------------------------------
|
||||
|
||||
[section .data]
|
||||
|
||||
; to conform with the fallback implementation (a C cast), we need to
|
||||
; end up with truncate/"chop" rounding. subtracting does the trick,
|
||||
; assuming RC is the IA-32 default round-to-nearest mode.
|
||||
round_bias dd 0.4999999
|
||||
|
||||
__SECT__
|
||||
|
||||
; extern "C" u32 __cdecl ia32_asm_control87(u32 new_cw, u32 mask);
|
||||
global sym(ia32_asm_control87)
|
||||
sym(ia32_asm_control87):
|
||||
@ -182,62 +118,6 @@ sym(ia32_asm_rint):
|
||||
ret
|
||||
|
||||
|
||||
; extern "C" float __cdecl ia32_asm_fminf(float, float);
; selects one of the two arguments on the x87 stack and returns it in st0.
|
||||
global sym(ia32_asm_fminf)
|
||||
sym(ia32_asm_fminf):
|
||||
fld dword [esp+4]
|
||||
; after the second load: st0 = second argument, st1 = first argument.
fld dword [esp+8]
|
||||
fcomi st0, st1
|
||||
; if st0 is not below st1, overwrite st0 with st1.
fcmovnb st0, st1
|
||||
; swap, then pop the leftover value so only the result remains in st0.
fxch
|
||||
fstp st0
|
||||
ret
|
||||
|
||||
; extern "C" float __cdecl ia32_asm_fmaxf(float, float);
; selects one of the two arguments on the x87 stack and returns it in st0.
|
||||
global sym(ia32_asm_fmaxf)
|
||||
sym(ia32_asm_fmaxf):
|
||||
fld dword [esp+4]
|
||||
; after the second load: st0 = second argument, st1 = first argument.
fld dword [esp+8]
|
||||
fcomi st0, st1
|
||||
; if st0 is below st1, overwrite st0 with st1.
fcmovb st0, st1
|
||||
; swap, then pop the leftover value so only the result remains in st0.
fxch
|
||||
fstp st0
|
||||
ret
|
||||
|
||||
|
||||
; extern "C" i32 __cdecl ia32_asm_i32FromFloat(float f);
; converts f to a 32-bit integer: subtracts round_bias, stores the result
; with FISTP and returns it in eax.
|
||||
global sym(ia32_asm_i32FromFloat)
|
||||
sym(ia32_asm_i32FromFloat):
|
||||
; the push only reserves a 4-byte stack slot for the FISTP result; it also
; shifts the argument from [esp+4] to [esp+8].
push eax
|
||||
fld dword [esp+8]
|
||||
fsub dword [round_bias]
|
||||
fistp dword [esp]
|
||||
; fetch the stored integer into the return register and release the slot.
pop eax
|
||||
ret
|
||||
|
||||
; extern "C" i32 __cdecl ia32_asm_i32FromDouble(double d);
; converts d to a 32-bit integer: subtracts round_bias, stores the result
; with FISTP and returns it in eax.
|
||||
global sym(ia32_asm_i32FromDouble)
|
||||
sym(ia32_asm_i32FromDouble):
|
||||
; the push only reserves a 4-byte stack slot for the FISTP result; it also
; shifts the argument from [esp+4] to [esp+8].
push eax
|
||||
fld qword [esp+8]
|
||||
fsub dword [round_bias]
|
||||
fistp dword [esp]
|
||||
; fetch the stored integer into the return register and release the slot.
pop eax
|
||||
ret
|
||||
|
||||
; extern "C" i64 __cdecl ia32_asm_i64FromDouble(double d);
; converts d to a 64-bit integer: subtracts round_bias, stores the result
; with FISTP and returns it in edx:eax.
|
||||
global sym(ia32_asm_i64FromDouble)
|
||||
sym(ia32_asm_i64FromDouble):
|
||||
; the two pushes only reserve an 8-byte stack slot for the FISTP result;
; they also shift the argument from [esp+4] to [esp+12].
push edx
|
||||
push eax
|
||||
fld qword [esp+12]
|
||||
fsub dword [round_bias]
|
||||
fistp qword [esp]
|
||||
; low dword -> eax, high dword -> edx.
pop eax
|
||||
pop edx
|
||||
ret
|
||||
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
; misc
|
||||
;-------------------------------------------------------------------------------
|
||||
|
@ -34,8 +34,6 @@ extern "C" {
|
||||
struct x86_x64_CpuidRegs;
|
||||
extern void CALL_CONV ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
|
||||
|
||||
// also implements cpu_AtomicAdd, cpu_CAS and cpu_CAS64 from "sysdep/cpu.h"
|
||||
|
||||
/// control87
|
||||
// FPU control word
|
||||
// .. Precision Control:
|
||||
@ -75,14 +73,6 @@ extern size_t CALL_CONV ia32_asm_fpclassifyf(float f);
|
||||
extern float CALL_CONV ia32_asm_rintf(float);
|
||||
extern double CALL_CONV ia32_asm_rint(double);
|
||||
|
||||
/// POSIX fminf
|
||||
extern float CALL_CONV ia32_asm_fminf(float, float);
|
||||
extern float CALL_CONV ia32_asm_fmaxf(float, float);
|
||||
|
||||
extern i32 CALL_CONV ia32_asm_i32FromFloat(float f);
|
||||
extern i32 CALL_CONV ia32_asm_i32FromDouble(double d);
|
||||
extern i64 CALL_CONV ia32_asm_i64FromDouble(double d);
|
||||
|
||||
/**
|
||||
* write the current execution state (e.g. all register values) into
|
||||
* (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
|
||||
|
@ -52,8 +52,6 @@ LIB_API const char* cpu_IdentifierString();
|
||||
//-----------------------------------------------------------------------------
|
||||
// lock-free support routines
|
||||
|
||||
extern "C" { // (assembly-language implementations)
|
||||
|
||||
/**
|
||||
* add a signed value to a variable without the possibility of interference
|
||||
* from other threads/CPUs.
|
||||
@ -74,8 +72,6 @@ LIB_API intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
|
||||
LIB_API bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue);
|
||||
LIB_API bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue);
|
||||
|
||||
} // extern "C"
|
||||
|
||||
/**
|
||||
* specialization of cpu_CAS for pointer types. this avoids error-prone
|
||||
* casting in user code.
|
||||
|
@ -28,7 +28,7 @@
|
||||
#include "lib/utf8.h"
|
||||
#include "lib/sysdep/cpu.h"
|
||||
#include "lib/sysdep/sysdep.h"
|
||||
#include "lib/posix/posix.h" // fminf etc.
|
||||
#include "lib/posix/posix.h" // rintf etc.
|
||||
|
||||
#if OS_LINUX
|
||||
# include "mocks/dlfcn.h"
|
||||
@ -53,21 +53,6 @@ public:
|
||||
TS_ASSERT_EQUALS(rint(5.6), 6.0);
|
||||
}
|
||||
|
||||
// Checks fminf and fmaxf against hand-picked argument pairs: zero vs. a
// large value, two positive values, mixed-sign values and small fractions.
void test_min_max()
|
||||
{
|
||||
// fminf must return the smaller argument of each pair.
TS_ASSERT_EQUALS(fminf(0.0f, 10000.0f), 0.0f);
|
||||
TS_ASSERT_EQUALS(fminf(100.0f, 10000.0f), 100.0f);
|
||||
TS_ASSERT_EQUALS(fminf(-1.0f, 2.0f), -1.0f);
|
||||
TS_ASSERT_EQUALS(fminf(-2.0f, 1.0f), -2.0f);
|
||||
TS_ASSERT_EQUALS(fminf(0.001f, 0.00001f), 0.00001f);
|
||||
|
||||
// fmaxf must return the larger argument of the same pairs.
TS_ASSERT_EQUALS(fmaxf(0.0f, 10000.0f), 10000.0f);
|
||||
TS_ASSERT_EQUALS(fmaxf(100.0f, 10000.0f), 10000.0f);
|
||||
TS_ASSERT_EQUALS(fmaxf(-1.0f, 2.0f), 2.0f);
|
||||
TS_ASSERT_EQUALS(fmaxf(-2.0f, 1.0f), 1.0f);
|
||||
TS_ASSERT_EQUALS(fmaxf(0.001f, 0.00001f), 0.001f);
|
||||
}
|
||||
|
||||
void test_random()
|
||||
{
|
||||
u64 a = 0, b = 0;
|
||||
|
Loading…
Reference in New Issue
Block a user