ia32.asm: add explicit init function (simplifies other code)

memcpy no longer requires SSE yet is still efficient (clever trick,
hehe)

This was SVN commit r2757.
janwas 2005-09-20 04:05:23 +00:00
parent 2a8953452d
commit 8ba42d5a3c
3 changed files with 180 additions and 139 deletions

View File

@@ -1,3 +1,6 @@
; set 32-bit attribute once for all sections and activate .text
section .data use32
section .bss use32
section .text use32
;-------------------------------------------------------------------------------
@@ -9,6 +12,9 @@ section .text use32
; mem bandwidth (2000 MiB/s) for transfers >= 192KiB!
; Pentium III performance: about 3% faster in above small buffer benchmark.
;
; disables specialized large transfer (> 64KiB) implementations if SSE
; isn't available; we do assume MMX support, though (quite safe).
; *requires* (and does not verify the presence of) SSE instructions:
; prefetchnta and movntq. therefore, a P3+ or Athlon XP is required.
; rationale: older processors are too slow anyway and we don't bother.
@@ -35,7 +41,7 @@ BP_SIZE equ 8*1024
; rep movsb is a bit smaller but 6.9% slower; everything else is much worse.
%macro IC_MOVSD 0
mov eax, ecx
shr ecx, 2 ; dword count
shr ecx, 2 ; dword count
neg ecx
add ecx, %%movsd_table_end
jmp ecx
@@ -74,10 +80,8 @@ align 8
%macro IC_ALIGN 0
mov eax, 8
sub eax, edi
and eax, byte 7
cmp eax, ecx
cmova eax, ecx
sub ecx, eax
and eax, byte 7 ; eax = # misaligned bytes
sub ecx, eax ; reduce copy count
neg eax
add eax, %%align_table_end
jmp eax
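
A rough C sketch (illustrative only, the helper name is mine) of what IC_ALIGN now computes: the number of leading bytes to copy so the destination becomes 8-byte aligned. The old clamp against the remaining count could be dropped presumably because this path is only reached for transfers above IC_SIZE, so the count is always larger than the 0..7 byte prefix.

/* not the committed code: compute the misaligned prefix length,
 * by which the main copy count is then reduced */
#include <stdint.h>
#include <stddef.h>

static size_t misaligned_prefix_bytes(const void* dst)
{
    /* equivalent to the asm's (8 - edi) & 7: 0..7 bytes up to the next
     * 8-byte boundary, 0 if dst is already aligned */
    return (8 - (uintptr_t)dst) & 7;
}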
@@ -94,7 +98,7 @@ align 4
%endm
; > ecx = size (> 0)
; > ecx = size
; x edx
%macro IC_MOVQ 0
align 16
@@ -103,22 +107,22 @@ align 16
cmp ecx, edx
jb %%done
prefetchnta [esi + (200*64/34+192)]
movq mm0, [esi+0]
movq mm1, [esi+8]
movq [edi+0], mm0
movq [edi+8], mm1
movq mm2, [esi+16]
movq mm3, [esi+24]
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
movq mm0, [esi+0]
movq mm1, [esi+8]
movq [edi+0], mm0
movq [edi+8], mm1
movq mm2, [esi+16]
movq mm3, [esi+24]
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, edx
add edi, edx
sub ecx, edx
@@ -127,6 +131,37 @@ align 16
%endm
; > ecx = size (> 64)
; x
%macro UC_MOVNTQ 0
mov edx, 64
align 16
%%1:
prefetchnta [esi + (200*64/34+192)]
movq mm0,[esi+0]
add edi, edx
movq mm1,[esi+8]
add esi, edx
movq mm2,[esi-48]
movntq [edi-64], mm0
movq mm0,[esi-40]
movntq [edi-56], mm1
movq mm1,[esi-32]
movntq [edi-48], mm2
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
sub ecx, edx
movntq [edi-8], mm1
cmp ecx, edx
jae %%1
%endm
; > ecx = size (> 8KiB)
; x eax, edx
;
@@ -135,7 +170,7 @@ align 16
%%prefetch_and_copy_chunk:
; touch each cache line within chunk in reverse order (prevents HW prefetch)
push byte BP_SIZE/128 ; # iterations
push byte BP_SIZE/128 ; # iterations
pop eax
add esi, BP_SIZE
align 8
@@ -147,28 +182,28 @@ align 8
jnz %%prefetch_chunk
; copy 64 byte blocks
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
push byte 64
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
push byte 64
pop edx
align 8
%%copy_block:
movq mm0, [esi+ 0]
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
movq mm0, [esi+ 0]
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, edx
movntq [edi+ 0], mm0
movntq [edi+ 8], mm1
movntq [edi+16], mm2
movntq [edi+24], mm3
movntq [edi+32], mm4
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
movntq [edi+ 0], mm0
movntq [edi+ 8], mm1
movntq [edi+16], mm2
movntq [edi+24], mm3
movntq [edi+32], mm4
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, edx
dec eax
jnz %%copy_block
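
The structure of the block-prefetch macro above is easier to see in C. A minimal sketch of the idea, assuming SSE2 intrinsics and 16-byte-aligned destination (the committed code instead uses MMX movq/movntq with only 8-byte alignment, and the function name is hypothetical):

/* illustrative sketch, not the committed code: pull one chunk into the
 * cache by touching its cache lines in reverse order (keeps the hardware
 * prefetcher from running ahead), then stream it out with non-temporal
 * stores that bypass the cache */
#include <emmintrin.h>  /* SSE2; the asm uses MMX movntq instead */
#include <stdint.h>
#include <stddef.h>

#define BP_CHUNK   8192
#define CACHE_LINE 64

static void copy_chunk_block_prefetch(uint8_t* dst, const uint8_t* src)
{
    /* touch each cache line within the chunk, last line first */
    volatile uint8_t sink;
    for (ptrdiff_t ofs = BP_CHUNK - CACHE_LINE; ofs >= 0; ofs -= CACHE_LINE)
        sink = src[ofs];
    (void)sink;

    /* copy the now-cached chunk with non-temporal stores */
    for (size_t ofs = 0; ofs < BP_CHUNK; ofs += 16)
    {
        __m128i v = _mm_loadu_si128((const __m128i*)(src + ofs));
        _mm_stream_si128((__m128i*)(dst + ofs), v);  /* dst must be 16-byte aligned */
    }
    _mm_sfence();
}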
@@ -179,49 +214,28 @@ align 8
%endm
; > ecx = size (> 64)
; x
%macro UC_MOVNTQ 0
mov edx, 64
align 16
%%1:
prefetchnta [esi + (200*64/34+192)]
movq mm0,[esi+0]
add edi, edx
movq mm1,[esi+8]
add esi, edx
movq mm2,[esi-48]
movntq [edi-64], mm0
movq mm0,[esi-40]
movntq [edi-56], mm1
movq mm1,[esi-32]
movntq [edi-48], mm2
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
sub ecx, edx
movntq [edi-8], mm1
cmp ecx, edx
jae %%1
%endm
[section .bss]
; this is somewhat "clever". the 2 specialized transfer implementations
; that use SSE are jumped to if transfer size is greater than a threshold.
; we simply set the requested transfer size to 0 if the CPU doesn't
; support SSE so that those are never reached (done by masking with this).
sse_mask resd 1
__SECT__
; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
global _ia32_memcpy
_ia32_memcpy:
push edi
push esi
push edi
push esi
mov edi, [esp+8+4+0] ; dst
mov esi, [esp+8+4+4] ; src
mov ecx, [esp+8+4+8] ; nbytes
mov edi, [esp+8+4+0] ; dst
mov esi, [esp+8+4+4] ; src
mov ecx, [esp+8+4+8] ; nbytes
cmp ecx, byte IC_SIZE
ja .choose_large_method
ja .choose_larger_method
.ic_movsd:
IC_MOVSD
@@ -229,12 +243,21 @@ _ia32_memcpy:
pop edi
ret
.choose_large_method:
.choose_larger_method:
IC_ALIGN
cmp ecx, UC_THRESHOLD
jb near .ic_movq
cmp ecx, BP_THRESHOLD
jae .uc_bp_movntq
mov eax, [sse_mask]
mov edx, ecx
and edx, eax ; edx = (SSE)? remaining_bytes : 0
cmp edx, BP_THRESHOLD
jae near .uc_bp_movntq
cmp edx, UC_THRESHOLD
jae .uc_movntq
.ic_movq:
IC_MOVQ
emms
jmp .ic_movsd
.uc_movntq:
UC_MOVNTQ
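
The masking trick described in the sse_mask comment above, rendered as a rough C sketch (illustrative only; the threshold values are placeholders, the real equ definitions are not shown in this diff):

/* sse_mask is set once by ia32_init: ~0 if SSE is available, else 0.
 * Masking the size with it makes the SSE-based movntq paths unreachable
 * on CPUs without SSE, with no extra branch in the common case. */
#include <stdint.h>

#define UC_THRESHOLD  (64u*1024u)   /* placeholder values for illustration */
#define BP_THRESHOLD (192u*1024u)

static uint32_t sse_mask;           /* ~0u if SSE present, else 0 */

static int choose_large_method(uint32_t nbytes)
{
    uint32_t masked = nbytes & sse_mask;   /* 0 if SSE is unavailable */
    if (masked >= BP_THRESHOLD) return 2;  /* block-prefetch movntq path */
    if (masked >= UC_THRESHOLD) return 1;  /* uncached movntq path */
    return 0;                              /* plain MMX movq path */
}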
@@ -245,26 +268,21 @@ _ia32_memcpy:
.uc_bp_movntq:
UC_BP_MOVNTQ
sfence
; fall through
jmp .ic_movq
.ic_movq:
IC_MOVQ
emms
jmp .ic_movsd
;-------------------------------------------------------------------------------
; CPUID support
;-------------------------------------------------------------------------------
[section .data use32]
cpuid_available dd -1
[section .data]
[section .bss use32]
; no init needed - cpuid_available triggers init
max_func resd 1
max_ext_func resd 1
; these are actually max_func+1, i.e. the first invalid value.
; the idea here is to avoid a separate cpuid_available flag;
; using signed values doesn't work because ext_funcs are >= 0x80000000.
max_func dd 0
max_ext_func dd 0
__SECT__
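
The max_func bookkeeping described in the comment above, as a rough C sketch (names are mine, not the asm labels):

/* storing max+1 (the first invalid function number) lets one unsigned
 * compare reject both out-of-range requests and the case where
 * ia32_init has not run yet: both values start at 0 (the dd 0 initializers) */
#include <stdint.h>

static uint32_t max_func_plus_1;       /* 0 until ia32_init runs */
static uint32_t max_ext_func_plus_1;   /* same, for funcs >= 0x80000000 */

static int cpuid_func_supported(uint32_t func)
{
    uint32_t limit = (func & 0x80000000u) ? max_ext_func_plus_1
                                          : max_func_plus_1;
    return func < limit;   /* false if uninitialized or above the max */
}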
@@ -272,28 +290,21 @@ __SECT__
; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs)
global _ia32_cpuid
_ia32_cpuid:
; note: must preserve before .one_time_init because it does cpuid
push ebx
push edi
.retry:
; if unknown, detect; if not available, fail.
xor eax, eax ; return val on failure
cmp [cpuid_available], eax
jl .one_time_init
je .ret
push ebx
push edi
mov ecx, [esp+8+4+0] ; func
mov edi, [esp+8+4+4] ; -> regs
; compare against max supported func and fail if above
test ecx, ecx
xor eax, eax ; return value on failure
test ecx, ecx
mov edx, [max_ext_func]
js .is_ext_func
mov edx, [max_func]
.is_ext_func:
cmp ecx, edx
ja .ret
jae .ret ; (see max_func decl)
; issue CPUID and store result registers in array
mov eax, ecx
@@ -314,28 +325,6 @@ _ia32_cpuid:
pop ebx
ret
.one_time_init:
; check if CPUID is supported
pushfd
or byte [esp+2], 32
popfd
pushfd
pop eax
xor edx, edx
shr eax, 22 ; bit 21 toggled?
adc edx, 0
mov [cpuid_available], edx
; determine max supported function
xor eax, eax
cpuid
mov [max_func], eax
mov eax, 0x80000000
cpuid
mov [max_ext_func], eax
jmp .retry
;-------------------------------------------------------------------------------
; misc
@@ -344,16 +333,58 @@ _ia32_cpuid:
; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
global _ia32_control87
_ia32_control87:
push eax
fnstcw [esp]
pop eax ; old_cw
mov ecx, [esp+4] ; new_cw
mov edx, [esp+8] ; mask
and ecx, edx ; new_cw & mask
not edx ; ~mask
and eax, edx ; old_cw & ~mask
or eax, ecx ; (old_cw & ~mask) | (new_cw & mask)
push edx
fldcw [esp]
push eax
fnstcw [esp]
pop eax ; old_cw
mov ecx, [esp+4] ; new_cw
mov edx, [esp+8] ; mask
and ecx, edx ; new_cw & mask
not edx ; ~mask
and eax, edx ; old_cw & ~mask
or eax, ecx ; (old_cw & ~mask) | (new_cw & mask)
push edx
fldcw [esp]
pop edx
xor eax, eax ; return value
xor eax, eax ; return value
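
The control-word merge performed above is the usual read-modify-write pattern; a short C rendering with an assumed worked example (values are for illustration only):

/* keep every old control-word bit except those selected by mask,
 * which are taken from new_cw */
static unsigned merge_cw(unsigned old_cw, unsigned new_cw, unsigned mask)
{
    return (old_cw & ~mask) | (new_cw & mask);
}
/* e.g. old_cw = 0x027F, new_cw = 0x0C00, mask = 0x0C00 (rounding control
 * = truncate) yields 0x0E7F: only the RC bits change */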
;-------------------------------------------------------------------------------
; init
;-------------------------------------------------------------------------------
; extern "C" bool __cdecl ia32_init()
global _ia32_init
_ia32_init:
push ebx
; check if CPUID is supported
pushfd
or byte [esp+2], 32
popfd
pushfd
pop eax
xor edx, edx
shr eax, 22 ; bit 21 toggled?
jnc .no_cpuid
; determine max supported CPUID function
xor eax, eax
cpuid
inc eax ; (see max_func decl)
mov [max_func], eax
mov eax, 0x80000000
cpuid
inc eax ; (see max_func decl)
mov [max_ext_func], eax
.no_cpuid:
; check if SSE is supported (used by memcpy code)
extern _ia32_cap
push byte 32+25 ; ia32.h's SSE cap (won't change)
call _ia32_cap
pop edx ; remove stack param
neg eax ; SSE? ~0 : 0
mov [sse_mask], eax
pop ebx
ret

View File

@@ -31,6 +31,9 @@ extern "C" {
#endif
// call before any of the following functions
extern void ia32_init();
extern double _ceil(double);
extern u64 rdtsc(void);

View File

@@ -6,6 +6,9 @@
#include "lib/detect.h"
#include "lib/timer.h"
#include "lib/input.h"
#if CPU_IA32
# include "lib/sysdep/ia32.h"
#endif
#include "lib/res/res.h"
#include "lib/res/sound/snd.h"
#include "lib/res/graphics/tex.h"
@@ -688,6 +691,10 @@ void Init(int argc, char* argv[], bool setup_gfx, bool setup_gui)
debug_set_thread_name("main");
#if CPU_IA32
ia32_init();
#endif
// If you ever want to catch a particular allocation:
//_CrtSetBreakAlloc(187);