
config: additional macro is checked

lib: add bits() function (actually moved it from zip.cpp)
ia32: start moving code out to an external asm file (will be assembled by
NASM in a pre-build step; this avoids needing MSC- and GCC-specific inline
asm). Move memcpy test code out to a separate file.

This was SVN commit r2706.
janwas 2005-09-13 04:00:41 +00:00
parent 0b72d0f86c
commit 808a0f7cbe
7 changed files with 302 additions and 990 deletions


@@ -109,7 +109,7 @@
 # define OS_WIN 0
 #endif
 // .. Linux
-#if defined(linux) || defined(__linux)
+#if defined(linux) || defined(__linux) || defined(__linux__)
 # define OS_LINUX 1
 #else
 # define OS_LINUX 0
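
A note on this one-line change: in strict ISO modes (e.g. gcc -ansi or -std=c99) the non-reserved spelling "linux" is not predefined, and some compilers provide only the reserved "__linux__" form, so checking all three spellings keeps OS_LINUX reliable. A quick probe, assuming a Linux-targeting compiler:

// probe which Linux macros this compiler predefines; build once in the
// default mode and once in a strict mode (e.g. g++ -ansi -pedantic) to
// see the non-reserved spelling disappear.
#include <cstdio>

int main()
{
#ifdef linux
	puts("linux defined");
#endif
#ifdef __linux
	puts("__linux defined");
#endif
#ifdef __linux__
	puts("__linux__ defined");
#endif
	return 0;
}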


@@ -212,6 +212,17 @@ int ilog2(const float x)
 }
+
+uint bits(uint num, uint lo_idx, uint hi_idx)
+{
+	uint result = num;
+	result >>= lo_idx;
+	const uint count = (hi_idx - lo_idx)+1;	// # bits to return
+	result &= (1u << count)-1;
+	return result;
+}
+
 // multiple must be a power of two.
 uintptr_t round_up(const uintptr_t n, const uintptr_t multiple)
 {
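
The new bits() returns bits lo_idx..hi_idx of num, shifted down so the field starts at bit 0. One limitation implied by the body: for the full 32-bit field (lo_idx = 0, hi_idx = 31), count is 32 and the shift 1u << 32 is undefined behavior, so it is only safe for sub-word fields. A small self-checking example, with bits() copied from the hunk above:

#include <cassert>
typedef unsigned int uint;

// copy of bits() from lib.cpp above, for standalone testing
static uint bits(uint num, uint lo_idx, uint hi_idx)
{
	uint result = num;
	result >>= lo_idx;
	const uint count = (hi_idx - lo_idx)+1;	// # bits to return
	result &= (1u << count)-1;
	return result;
}

int main()
{
	assert(bits(0xABCDu,  0,  3) == 0xDu );	// lowest nibble
	assert(bits(0xABCDu,  8, 15) == 0xABu);	// second byte
	assert(bits(0xABCDu, 12, 15) == 0xAu );	// highest nibble of the u16
	return 0;
}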


@@ -347,7 +347,7 @@ const size_t GiB = 1ul << 30;
 #define BIT(n) (1ul << (n))
+extern uint bits(uint x, uint from, uint to);


@@ -187,17 +187,6 @@ found_ecdr:
 //
 ///////////////////////////////////////////////////////////////////////////////
-
-static uint bits(uint num, uint lo_idx, uint hi_idx)
-{
-	uint result = num;
-	result >>= lo_idx;
-	const uint count = (hi_idx - lo_idx)+1;	// number of bits to return
-	result &= (1u << count)-1;
-	return result;
-}
-
 static time_t convert_dos_date(u16 fatdate, u16 fattime)
 {
 	struct tm t;	// struct tm format:
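
convert_dos_date, visible at the end of this hunk, is the original caller of bits(): it unpacks the packed FAT/DOS timestamp. Its body is not shown in the diff, so the following is only a sketch of that unpacking under the standard FAT field layout (date: day in bits 0..4, month in 5..8, years-since-1980 in 9..15; time: seconds/2 in 0..4, minutes in 5..10, hours in 11..15):

#include <ctime>
typedef unsigned int uint;
typedef unsigned short u16;

static uint bits(uint num, uint lo_idx, uint hi_idx)	// as in lib.cpp
{
	num >>= lo_idx;
	return num & ((1u << (hi_idx - lo_idx + 1)) - 1);
}

// hypothetical reconstruction; the real body lives in zip.cpp
static time_t convert_dos_date_sketch(u16 fatdate, u16 fattime)
{
	struct tm t = {};
	t.tm_sec   = (int)bits(fattime,  0,  4) * 2;	// stored with 2 s granularity
	t.tm_min   = (int)bits(fattime,  5, 10);
	t.tm_hour  = (int)bits(fattime, 11, 15);
	t.tm_mday  = (int)bits(fatdate,  0,  4);
	t.tm_mon   = (int)bits(fatdate,  5,  8) - 1;	// tm_mon is 0-based
	t.tm_year  = (int)bits(fatdate,  9, 15) + 80;	// tm_year counts from 1900
	t.tm_isdst = -1;	// let mktime determine DST
	return mktime(&t);
}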

source/lib/sysdep/ia32.asm Normal file

@@ -0,0 +1,268 @@
CACHEBLOCK equ 128			; 64-byte blocks per prefetch chunk (8 KiB)
BP_MIN_THRESHOLD_64 equ 192*1024	; min 64B-block count for block prefetch
MOVNTQ_MIN_THRESHOLD_64 equ 64*1024	; min 64B-block count for movntq streaming
; copy the tail: jump into the unrolled movsd/movsb runs so that exactly
; (ebx & 63) remaining bytes are copied without a loop.
%macro MC_UNROLLED_MOVSD 0
and ebx, 63
mov edx, ebx
shr edx, 2 ; dword count
neg edx
add edx, %%movsd_table_end
jmp edx
align 8
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
%%movsd_table_end:
mov eax, ebx
and eax, 3
neg eax
add eax, %%movsb_table_end
jmp eax
movsb
movsb
movsb
%%movsb_table_end:
%endm
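
What MC_UNROLLED_MOVSD does, in C terms: movsd and movsb are one byte of code each, so jumping (count) bytes before the table end executes exactly that many copies. A loop-based equivalent (names are mine):

#include <cstddef>
#include <cstdint>
#include <cstring>

// equivalent of MC_UNROLLED_MOVSD: copy the final (nbytes & 63) bytes,
// one dword at a time, then the 0..3 leftover single bytes.
static void copy_tail(uint8_t* dst, const uint8_t* src, size_t nbytes)
{
	const size_t tail = nbytes & 63;
	for(size_t i = 0; i < tail/4; i++)	// one movsd each
	{
		memcpy(dst, src, 4);
		dst += 4; src += 4;
	}
	for(size_t i = 0; i < (tail & 3); i++)	// one movsb each
		*dst++ = *src++;
}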
; align edi to an 8-byte boundary: copy up to 7 bytes (clamped to the
; total count in ecx) via a computed jump into a movsb run.
%macro MC_ALIGN 0
mov eax, 8
sub eax, edi
and eax, 7
cmp eax, ecx
cmova eax, ecx
sub ecx, eax
neg eax
add eax, %%align_table_end
jmp eax
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
%%align_table_end:
%endm
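
MC_ALIGN in C terms: (8 - dst) & 7 is the distance to the next 8-byte boundary, and the cmova clamps it to the total count so short copies stay in bounds. A sketch (hypothetical helper name):

#include <cstddef>
#include <cstdint>

// equivalent of MC_ALIGN: copy just enough bytes, clamped to the total
// count, to make dst 8-byte aligned; returns the remaining byte count
// (what ecx holds afterwards).
static size_t align_dst8(uint8_t*& dst, const uint8_t*& src, size_t nbytes)
{
	size_t misalign = (8 - (uintptr_t)dst) & 7;
	if(misalign > nbytes)
		misalign = nbytes;	// the cmova clamp
	for(size_t i = 0; i < misalign; i++)
		*dst++ = *src++;
	return nbytes - misalign;
}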
; copy ecx 64-byte blocks through the cache with paired MMX movq
; loads/stores, prefetching (non-temporally) well ahead of the reads.
%macro MC_MOVQ 0
align 16
%%1:
prefetchnta [esi + (200*64/34+192)]
movq mm0, [esi+0]
movq mm1, [esi+8]
movq [edi+0], mm0
movq [edi+8], mm1
movq mm2, [esi+16]
movq mm3, [esi+24]
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64
add edi, 64
dec ecx
jnz %%1
%endm
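
The same inner loop, sketched with MMX/SSE intrinsics for readers who prefer C (32-bit x86 builds only; _mm_prefetch with _MM_HINT_NTA corresponds to prefetchnta, _mm_empty to the emms executed after the loop):

#include <mmintrin.h>
#include <xmmintrin.h>
#include <cstddef>

// copy `blocks` 64-byte blocks through the cache, 8 bytes per movq
static void copy_blocks_movq(char* dst, const char* src, size_t blocks)
{
	for(; blocks != 0; blocks--)
	{
		_mm_prefetch(src + (200*64/34+192), _MM_HINT_NTA);
		for(int ofs = 0; ofs < 64; ofs += 8)
			*(__m64*)(dst+ofs) = *(const __m64*)(src+ofs);
		src += 64; dst += 64;
	}
	_mm_empty();	// emms: re-enable x87 FPU code
}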
; we have >= 8 KiB: copy 8 KiB chunks until fewer than 8 KiB remain.
%macro MC_BP_MOVNTQ 0
%%prefetch_and_copy_chunk:
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
; touch each cache line in reverse order (prevents HW prefetch)
%%prefetch_chunk:
mov edx, [esi-64]
mov edx, [esi-128]
sub esi, 128
dec eax
jnz %%prefetch_chunk
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
%%copy_block:
movq mm0, [esi+ 0]
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64
movntq [edi+ 0], mm0
movntq [edi+ 8], mm1
movntq [edi+16], mm2
movntq [edi+24], mm3
movntq [edi+32], mm4
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64
dec eax
jnz %%copy_block
sub ecx, CACHEBLOCK ; update the 64-byte block count
cmp ecx, CACHEBLOCK
jl %%prefetch_and_copy_chunk
%endm
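
The idea behind MC_BP_MOVNTQ (block prefetch): reading one word per cache line of an 8 KiB chunk in reverse order keeps the hardware prefetcher out of the way while pulling the whole chunk into cache; the chunk is then written with non-temporal movntq stores so the (likely never re-read) destination does not evict useful cache lines. A sketch with intrinsics:

#include <mmintrin.h>
#include <xmmintrin.h>
#include <cstddef>
#include <cstdint>

// `blocks` is the 64-byte block count; the < 128-block remainder is
// returned for the caller to finish (as the asm does via _movntq/tiny).
static size_t copy_chunks_bp(char* dst, const char* src, size_t blocks)
{
	const size_t CHUNK = 128;	// CACHEBLOCK: 128 blocks = 8 KiB
	while(blocks >= CHUNK)
	{
		volatile uint32_t sink;
		for(ptrdiff_t ofs = CHUNK*64 - 64; ofs >= 0; ofs -= 64)
			sink = *(const uint32_t*)(src + ofs);	// touch one cache line
		(void)sink;
		for(size_t ofs = 0; ofs < CHUNK*64; ofs += 8)
			_mm_stream_pi((__m64*)(dst+ofs), *(const __m64*)(src+ofs));
		src += CHUNK*64; dst += CHUNK*64; blocks -= CHUNK;
	}
	_mm_sfence();	// order the non-temporal stores
	_mm_empty();
	return blocks;	// whole blocks still to copy
}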
; we have >= 64 64-byte blocks: stream them with non-temporal movntq.
%macro MC_MOVNTQ 0
align 16
%%1:
prefetchnta [esi + (200*64/34+192)]
movq mm0,[esi+0]
add edi,64
movq mm1,[esi+8]
add esi,64
movq mm2,[esi-48]
movntq [edi-64], mm0
movq mm0,[esi-40]
movntq [edi-56], mm1
movq mm1,[esi-32]
movntq [edi-48], mm2
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
movntq [edi-8], mm1
jnz %%1
%endm
; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
global _ia32_memcpy
_ia32_memcpy:
mov ecx, [esp+4+8] ; nbytes
mov esi, [esp+4+4] ; src
mov edi, [esp+4+0] ; dst
MC_ALIGN
mov ebx, ecx
shr ecx, 6 ; # blocks
mov eax, _bp
cmp ecx, BP_MIN_THRESHOLD_64
mov edx, _movntq
cmovb eax, edx
cmp ecx, MOVNTQ_MIN_THRESHOLD_64
mov edx, _mmx
cmovb eax, edx
cmp ecx, 64
jbe tiny
jmp eax
tiny:
MC_UNROLLED_MOVSD
ret
_mmx:
MC_MOVQ
emms
jmp tiny
_bp:
MC_BP_MOVNTQ
sfence
emms
cmp ecx, 0	; any whole 64-byte blocks left?
jz tiny		; if not, just copy the tail (protects the routine below from ecx = 0)
_movntq:
MC_MOVNTQ
sfence
emms
jmp tiny
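
The dispatch at the top of ia32_memcpy, restated in C: after shr ecx, 6, ecx holds the 64-byte block count, and the cmp/cmov chain selects a routine per size class. A sketch with hypothetical stand-in names for the four paths:

#include <cstddef>

// thresholds as defined at the top of the file; per the cmp instructions,
// they are compared against the 64-byte block count in ecx.
const size_t MOVNTQ_MIN_THRESHOLD_64 = 64*1024;
const size_t BP_MIN_THRESHOLD_64 = 192*1024;

// stand-ins for the asm paths
static void copy_tiny(void*, const void*, size_t) {}		// movsd/movsb only
static void copy_mmx(void*, const void*, size_t) {}		// MC_MOVQ
static void copy_movntq(void*, const void*, size_t) {}		// MC_MOVNTQ
static void copy_bp_movntq(void*, const void*, size_t) {}	// MC_BP_MOVNTQ

// mirrors the cmp/cmov chain that selects eax before `jmp eax`
static void ia32_memcpy_sketch(void* dst, const void* src, size_t nbytes)
{
	const size_t blocks = nbytes / 64;	// shr ecx, 6
	if(blocks <= 64)
		copy_tiny(dst, src, nbytes);
	else if(blocks < MOVNTQ_MIN_THRESHOLD_64)
		copy_mmx(dst, src, nbytes);
	else if(blocks < BP_MIN_THRESHOLD_64)
		copy_movntq(dst, src, nbytes);
	else
		copy_bp_movntq(dst, src, nbytes);
}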
; extern "C" int __cdecl get_cur_processor_id();
global _get_cur_processor_id
_get_cur_processor_id:
push ebx
push 1
pop eax
cpuid
shr ebx, 24
mov eax, ebx	; return the initial APIC ID (was in ebx[31:24])
pop ebx
ret
; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
global _ia32_control87
_ia32_control87:
push eax
fnstcw [esp]
pop eax ; old_cw
mov ecx, [esp+4] ; new_cw
mov edx, [esp+8] ; mask
and ecx, edx ; new_cw & mask
not edx ; ~mask
and eax, edx ; old_cw & ~mask
or eax, ecx ; (old_cw & ~mask) | (new_cw & mask)
push eax	; load the merged control word
fldcw [esp]
pop eax
xor eax, eax	; return value
ret

File diff suppressed because it is too large.


@@ -24,6 +24,12 @@
 #include "lib/types.h"
+
+// some of these are implemented in asm, so make sure name mangling is
+// disabled.
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 extern double _ceil(double);
@@ -38,7 +44,7 @@ extern u64 rdtsc(void);
 #endif
 #define _control87 ia32_control87
-extern uint ia32_control87(uint new_cw, uint mask);
+extern uint ia32_control87(uint new_cw, uint mask);	// asm
 extern void ia32_debug_break(void);
@@ -78,4 +84,11 @@ extern bool ia32_cap(CpuCap cap);
 extern void ia32_get_cpu_info(void);
+
+// internal use only
+extern int get_cur_processor_id();
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif	// #ifndef IA32_H
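
Why the extern "C" wrapper added above matters: C++ mangles ordinary declarations, so without it a C++ caller would reference a mangled symbol that ia32.asm never defines, and linking would fail. A minimal caller, assuming the assembled ia32.asm object is linked in:

typedef unsigned int uint;

#ifdef __cplusplus
extern "C" {
#endif
extern uint ia32_control87(uint new_cw, uint mask);	// defined in ia32.asm
extern int get_cur_processor_id();			// defined in ia32.asm
#ifdef __cplusplus
}
#endif

#include <cstdio>

int main()
{
	ia32_control87(0, 0);	// mask = 0: reloads the current control word, a no-op
	printf("initial APIC ID: %d\n", get_cur_processor_id());
	return 0;
}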