use intrinsics for cpu_AtomicAdd and cpu_CAS, get rid of the
corresponding assembly implementations. (wrapper functions are necessary
because mere declarations aren't enough to export the functions from a
DLL)
also remove the no-longer-needed fminf/fmaxf

This was SVN commit r8521.
janwas 2010-11-02 13:38:56 +00:00
parent 540925aa32
commit 383cf7b220
11 changed files with 102 additions and 203 deletions
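Before the diffs, a minimal usage sketch of the two primitives this change reimplements (hypothetical example, not part of the commit; the semantics are unchanged: cpu_AtomicAdd returns the value held before the addition, cpu_CAS reports whether the swap took place):

#include "lib/sysdep/cpu.h"

static volatile intptr_t counter = 0;

void Increment()
{
	// atomically add 1; the return value is the counter's previous value
	const intptr_t previous = cpu_AtomicAdd(&counter, 1);
	(void)previous;
}

bool ClaimIfUnclaimed(intptr_t owner)
{
	// succeeds only if no other thread changed counter from 0 first
	return cpu_CAS(&counter, 0, owner);
}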

View File

@@ -117,7 +117,7 @@ template<class Entries> float ll_calc_min_credit_density(const Entries& entries)
for(typename Entries::const_iterator it = entries.begin(); it != entries.end(); ++it)
{
const float credit_density = Entries::entry_from_it(it).credit_density();
min_credit_density = fminf(min_credit_density, credit_density);
min_credit_density = std::min(min_credit_density, credit_density);
}
return min_credit_density;
}

View File

@@ -49,25 +49,6 @@ double rint(double d)
}
float fminf(float a, float b)
{
#if ARCH_IA32
return ia32_asm_fminf(a, b);
#else
return (a < b)? a : b;
#endif
}
float fmaxf(float a, float b)
{
#if ARCH_IA32
return ia32_asm_fmaxf(a, b);
#else
return (a > b)? a : b;
#endif
}
size_t fpclassifyd(double d)
{
#if ARCH_IA32

View File

@@ -115,7 +115,7 @@ extern wchar_t* wcsdup(const wchar_t* str);
extern int wcscasecmp(const wchar_t* s1, const wchar_t* s2);
#endif
// rint*, fminf, fpclassify (too few/diverse to make separate HAVE_ for each)
// rint*, fpclassify (too few/diverse to make separate HAVE_ for each)
#if HAVE_C99 || ICC_VERSION || GCC_VERSION
# define HAVE_C99_MATH 1
#else
@@ -127,9 +127,6 @@ extern int wcscasecmp(const wchar_t* s1, const wchar_t* s2);
// current rounding mode.
extern float rintf(float f);
extern double rint(double d);
// return minimum/maximum of two floats.
extern float fminf(float a, float b);
extern float fmaxf(float a, float b);
extern size_t fpclassifyf(float f);
extern size_t fpclassifyd(double d);

View File

@@ -24,14 +24,59 @@
#if ARCH_AMD64
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/arch/amd64/amd64.h"
#include "lib/sysdep/cpu.h"
void cpu_ConfigureFloatingPoint()
{
// 64-bit CPU:s apparently use SSE2 for all floating-point operations, so I
// *guess* we don't need to do anything...
// 64-bit CPUs use SSE2 for all floating-point operations, so we
// don't need to change the FPU control word.
}
#if MSC_VERSION
// VC 2008 and ICC 12 differ in their declaration of _Interlocked*
#if ICC_VERSION
typedef __int64* P64;
#else
typedef volatile __int64* P64;
#endif
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
{
const intptr_t initial = _InterlockedCompareExchange64((P64)location, newValue, expected);
return initial == expected;
}
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
{
const i64 initial = _InterlockedCompareExchange64((P64)location, newValue, expected);
return initial == expected;
}
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
{
return _InterlockedExchangeAdd64((P64)location, increment);
}
#elif GCC_VERSION
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
{
return __sync_fetch_and_add(location, increment);
}
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
{
return __sync_bool_compare_and_swap(location, expected, newValue);
}
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
{
return __sync_bool_compare_and_swap(location, expected, newValue);
}
#endif
#endif // ARCH_AMD64
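A pattern these primitives enable (hypothetical sketch, not part of the commit): a 64-bit atomic add built from a cpu_CAS64 retry loop. Like cpu_AtomicAdd, it returns the value observed before the addition; a stale or torn read of *location is harmless because the CAS only succeeds when the value still matches:

static i64 AtomicAdd64(volatile i64* location, i64 increment)
{
	for(;;)
	{
		const i64 expected = *location;	// snapshot the current value
		// retry until no other thread modified *location in between
		if(cpu_CAS64(location, expected, expected + increment))
			return expected;
	}
}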

View File

@@ -26,9 +26,10 @@
BITS 64
; extern "C" void __cdecl amd64_asm_cpuid(Ia32CpuidRegs* reg);
; extern "C" void CALL_CONV amd64_asm_cpuid(x86_x64_CpuidRegs* reg);
; reference: http://softwarecommunity.intel.com/articles/eng/2669.htm
global sym(amd64_asm_cpuid)
ALIGN 8
sym(amd64_asm_cpuid):
push rbx ; rbx is the only caller-save register we clobber
@@ -45,24 +46,3 @@ sym(amd64_asm_cpuid):
ret
ALIGN 8
; extern "C" intptr_t cpu_AtomicAdd(intptr_t* location, intptr_t increment);
global sym(cpu_AtomicAdd)
sym(cpu_AtomicAdd):
lock xadd [arg0], arg1
mov rax, arg1
ret
; extern "C" bool amd64_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue);
; extern "C" bool amd64_CAS64(volatile i64* location, i64 expected, i64 newValue);
global sym(cpu_CAS)
global sym(cpu_CAS64)
sym(cpu_CAS):
sym(cpu_CAS64):
mov rax, arg1 ; expected -> rax
lock cmpxchg [arg0], arg2
sete al
movzx rax, al
ret

View File

@@ -34,8 +34,6 @@ extern "C" {
struct x86_x64_CpuidRegs;
extern void CALL_CONV amd64_asm_cpuid(x86_x64_CpuidRegs* reg);
// also implements cpu_AtomicAdd, cpu_CAS and cpu_CAS64 from "sysdep/cpu.h"
#ifdef __cplusplus
}
#endif

View File

@@ -26,9 +26,8 @@
#include "precompiled.h"
#include "lib/sysdep/arch/ia32/ia32.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/arch/ia32/ia32.h"
#include "lib/sysdep/arch/ia32/ia32_asm.h"
@@ -145,3 +144,51 @@ void cpu_ConfigureFloatingPoint()
// results were changed significantly, so it had to be disabled.
//ia32_asm_control87(IA32_RC_CHOP, IA32_MCW_RC);
}
#if MSC_VERSION
// VC 2008 and ICC 12 differ in their declaration of _Interlocked*
#if ICC_VERSION
typedef long* P32;
typedef __int64* P64;
#else
typedef volatile long* P32;
typedef volatile __int64* P64;
#endif
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
{
const intptr_t initial = _InterlockedCompareExchange((P32)location, newValue, expected);
return initial == expected;
}
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
{
const i64 initial = _InterlockedCompareExchange64((P64)location, newValue, expected);
return initial == expected;
}
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
{
return _InterlockedExchangeAdd((P32)location, increment);
}
#elif GCC_VERSION
intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
{
return __sync_fetch_and_add(location, increment);
}
bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue)
{
return __sync_bool_compare_and_swap(location, expected, newValue);
}
bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue)
{
return __sync_bool_compare_and_swap(location, expected, newValue);
}
#endif
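The casts to P32 above assume intptr_t and long have the same width on IA-32; a defensive compile-time check (hypothetical, not part of the commit) could make that explicit:

// hypothetical compile-time assertion (pre-C++11 idiom):
// declares a negative-size array, i.e. fails to compile, if the sizes differ
typedef int intptr_matches_long[sizeof(intptr_t) == sizeof(long)? 1 : -1];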

View File

@@ -56,74 +56,10 @@ sym(ia32_asm_cpuid):
ret
;-------------------------------------------------------------------------------
; lock-free support routines
;-------------------------------------------------------------------------------
; extern "C" intptr_t __cdecl cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
global sym(cpu_AtomicAdd)
sym(cpu_AtomicAdd):
mov edx, [esp+4] ; location
mov eax, [esp+8] ; increment
db 0xf0 ; LOCK prefix
xadd [edx], eax
ret
; notes:
; - a 486 or later processor is required since we use CMPXCHG.
; there's no feature flag we can check, and the ia32 code doesn't
; bother detecting anything < Pentium, so this'll crash and burn if
; run on 386. we could fall back to simple MOVs there (since 386 CPUs
; aren't MP-capable), but it's not worth the trouble.
; - nor do we bother skipping the LOCK prefix on single-processor systems.
; the branch may be well-predicted, but difference in performance still
; isn't expected to be enough to justify the effort.
; extern "C" bool __cdecl cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t new_value);
global sym(cpu_CAS)
sym(cpu_CAS):
mov edx, [esp+4] ; location
mov eax, [esp+8] ; expected
mov ecx, [esp+12] ; new_value
db 0xf0 ; LOCK prefix
cmpxchg [edx], ecx
sete al
movzx eax, al
ret
; extern bool CALL_CONV cpu_CAS64(volatile i64* location, i64 expected, i64 new_value);
global sym(cpu_CAS64)
sym(cpu_CAS64):
push ebx
push esi
mov esi, [esp+8+4] ; location
mov eax, [esp+8+8]
mov edx, [esp+8+12] ; edx:eax = expected
mov ebx, [esp+8+16]
mov ecx, [esp+8+20] ; ecx:ebx = new_value
db 0xf0 ; LOCK prefix
cmpxchg8b [esi]
sete al
movzx eax, al
pop esi
pop ebx
ret
;-------------------------------------------------------------------------------
; FPU
;-------------------------------------------------------------------------------
[section .data]
; to conform with the fallback implementation (a C cast), we need to
; end up with truncate/"chop" rounding. subtracting does the trick,
; assuming RC is the IA-32 default round-to-nearest mode.
round_bias dd 0.4999999
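; worked example (added for illustration): f = 2.7 gives
; 2.7 - 0.4999999 = 2.2000001, which round-to-nearest turns into 2,
; matching the C cast (i32)2.7. a bias of exactly 0.5 would misround
; whole numbers, e.g. 3.0 - 0.5 = 2.5 rounds-to-even to 2.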
__SECT__
; extern "C" u32 __cdecl ia32_asm_control87(u32 new_cw, u32 mask);
global sym(ia32_asm_control87)
sym(ia32_asm_control87):
@@ -182,62 +118,6 @@ sym(ia32_asm_rint):
ret
; extern "C" float __cdecl ia32_asm_fminf(float, float);
global sym(ia32_asm_fminf)
sym(ia32_asm_fminf):
fld dword [esp+4]
fld dword [esp+8]
fcomi st0, st1
fcmovnb st0, st1
fxch
fstp st0
ret
; extern "C" float __cdecl ia32_asm_fmaxf(float, float);
global sym(ia32_asm_fmaxf)
sym(ia32_asm_fmaxf):
fld dword [esp+4]
fld dword [esp+8]
fcomi st0, st1
fcmovb st0, st1
fxch
fstp st0
ret
; extern "C" i32 __cdecl ia32_asm_i32FromFloat(float f);
global sym(ia32_asm_i32FromFloat)
sym(ia32_asm_i32FromFloat):
push eax
fld dword [esp+8]
fsub dword [round_bias]
fistp dword [esp]
pop eax
ret
; extern "C" i32 __cdecl ia32_asm_i32FromDouble(double d);
global sym(ia32_asm_i32FromDouble)
sym(ia32_asm_i32FromDouble):
push eax
fld qword [esp+8]
fsub dword [round_bias]
fistp dword [esp]
pop eax
ret
; extern "C" i64 __cdecl ia32_asm_i64FromDouble(double d);
global sym(ia32_asm_i64FromDouble)
sym(ia32_asm_i64FromDouble):
push edx
push eax
fld qword [esp+12]
fsub dword [round_bias]
fistp qword [esp]
pop eax
pop edx
ret
;-------------------------------------------------------------------------------
; misc
;-------------------------------------------------------------------------------

View File

@@ -34,8 +34,6 @@ extern "C" {
struct x86_x64_CpuidRegs;
extern void CALL_CONV ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
// also implements cpu_AtomicAdd, cpu_CAS and cpu_CAS64 from "sysdep/cpu.h"
/// control87
// FPU control word
// .. Precision Control:
@@ -75,14 +73,6 @@ extern size_t CALL_CONV ia32_asm_fpclassifyf(float f);
extern float CALL_CONV ia32_asm_rintf(float);
extern double CALL_CONV ia32_asm_rint(double);
/// POSIX fminf
extern float CALL_CONV ia32_asm_fminf(float, float);
extern float CALL_CONV ia32_asm_fmaxf(float, float);
extern i32 CALL_CONV ia32_asm_i32FromFloat(float f);
extern i32 CALL_CONV ia32_asm_i32FromDouble(double d);
extern i64 CALL_CONV ia32_asm_i64FromDouble(double d);
/**
* write the current execution state (e.g. all register values) into
* (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).

View File

@@ -52,8 +52,6 @@ LIB_API const char* cpu_IdentifierString();
//-----------------------------------------------------------------------------
// lock-free support routines
extern "C" { // (assembly-language implementations)
/**
* add a signed value to a variable without the possibility of interference
* from other threads/CPUs.
@@ -74,8 +72,6 @@ LIB_API intptr_t cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
LIB_API bool cpu_CAS(volatile intptr_t* location, intptr_t expected, intptr_t newValue);
LIB_API bool cpu_CAS64(volatile i64* location, i64 expected, i64 newValue);
} // extern "C"
/**
* specialization of cpu_CAS for pointer types. this avoids error-prone
* casting in user code.
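A minimal sketch of the pointer-type wrapper that comment describes (signature assumed for illustration; the real definition lies outside this hunk):

template<typename T>
bool cpu_CAS(volatile T* location, T expected, T newValue)
{
	// reuse the intptr_t implementation; T is a pointer type,
	// so the casts are value-preserving
	return cpu_CAS((volatile intptr_t*)location, (intptr_t)expected, (intptr_t)newValue);
}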

View File

@@ -28,7 +28,7 @@
#include "lib/utf8.h"
#include "lib/sysdep/cpu.h"
#include "lib/sysdep/sysdep.h"
#include "lib/posix/posix.h" // fminf etc.
#include "lib/posix/posix.h" // rintf etc.
#if OS_LINUX
# include "mocks/dlfcn.h"
@@ -53,21 +53,6 @@ public:
TS_ASSERT_EQUALS(rint(5.6), 6.0);
}
void test_min_max()
{
TS_ASSERT_EQUALS(fminf(0.0f, 10000.0f), 0.0f);
TS_ASSERT_EQUALS(fminf(100.0f, 10000.0f), 100.0f);
TS_ASSERT_EQUALS(fminf(-1.0f, 2.0f), -1.0f);
TS_ASSERT_EQUALS(fminf(-2.0f, 1.0f), -2.0f);
TS_ASSERT_EQUALS(fminf(0.001f, 0.00001f), 0.00001f);
TS_ASSERT_EQUALS(fmaxf(0.0f, 10000.0f), 10000.0f);
TS_ASSERT_EQUALS(fmaxf(100.0f, 10000.0f), 10000.0f);
TS_ASSERT_EQUALS(fmaxf(-1.0f, 2.0f), 2.0f);
TS_ASSERT_EQUALS(fmaxf(-2.0f, 1.0f), 1.0f);
TS_ASSERT_EQUALS(fmaxf(0.001f, 0.00001f), 0.001f);
}
void test_random()
{
u64 a = 0, b = 0;