From 8ba42d5a3cd91267fc6073fa2dff54a1dd184c9d Mon Sep 17 00:00:00 2001 From: janwas Date: Tue, 20 Sep 2005 04:05:23 +0000 Subject: [PATCH] ia32.asm: add explicit init function (simplifies other code) memcpy no longer requires SSE yet is still efficient (clever trick, hehe) This was SVN commit r2757. --- source/lib/sysdep/ia32.asm | 309 ++++++++++++++++-------------- source/lib/sysdep/ia32.h | 3 + source/ps/GameSetup/GameSetup.cpp | 7 + 3 files changed, 180 insertions(+), 139 deletions(-) diff --git a/source/lib/sysdep/ia32.asm b/source/lib/sysdep/ia32.asm index d8c631b957..1a9b3b845b 100644 --- a/source/lib/sysdep/ia32.asm +++ b/source/lib/sysdep/ia32.asm @@ -1,3 +1,6 @@ +; set 32-bit attribute once for all sections and activate .text +section .data use32 +section .bss use32 section .text use32 ;------------------------------------------------------------------------------- @@ -9,6 +12,9 @@ section .text use32 ; mem bandwidth (2000 MiB/s) for transfers >= 192KiB! ; Pentium III performance: about 3% faster in above small buffer benchmark. ; +; disables specialized large transfer (> 64KiB) implementations if SSE +; isn't available; we do assume MMX support, though (quite safe). + ; *requires* (and does not verify the presence of) SSE instructions: ; prefetchnta and movntq. therefore, a P3+ or Athlon XP is required. ; rationale: older processors are too slow anyway and we don't bother. @@ -35,7 +41,7 @@ BP_SIZE equ 8*1024 ; rep movsb is a bit smaller but 6.9% slower; everything else is much worse. %macro IC_MOVSD 0 mov eax, ecx - shr ecx, 2 ; dword count + shr ecx, 2 ; dword count neg ecx add ecx, %%movsd_table_end jmp ecx @@ -74,10 +80,8 @@ align 8 %macro IC_ALIGN 0 mov eax, 8 sub eax, edi - and eax, byte 7 - cmp eax, ecx - cmova eax, ecx - sub ecx, eax + and eax, byte 7 ; eax = # misaligned bytes + sub ecx, eax ; reduce copy count neg eax add eax, %%align_table_end jmp eax @@ -94,7 +98,7 @@ align 4 %endm -; > ecx = size (> 0) +; > ecx = size ; x edx %macro IC_MOVQ 0 align 16 @@ -103,22 +107,22 @@ align 16 cmp ecx, edx jb %%done prefetchnta [esi + (200*64/34+192)] - movq mm0, [esi+0] - movq mm1, [esi+8] - movq [edi+0], mm0 - movq [edi+8], mm1 - movq mm2, [esi+16] - movq mm3, [esi+24] - movq [edi+16], mm2 - movq [edi+24], mm3 - movq mm0, [esi+32] - movq mm1, [esi+40] - movq [edi+32], mm0 - movq [edi+40], mm1 - movq mm2, [esi+48] - movq mm3, [esi+56] - movq [edi+48], mm2 - movq [edi+56], mm3 + movq mm0, [esi+0] + movq mm1, [esi+8] + movq [edi+0], mm0 + movq [edi+8], mm1 + movq mm2, [esi+16] + movq mm3, [esi+24] + movq [edi+16], mm2 + movq [edi+24], mm3 + movq mm0, [esi+32] + movq mm1, [esi+40] + movq [edi+32], mm0 + movq [edi+40], mm1 + movq mm2, [esi+48] + movq mm3, [esi+56] + movq [edi+48], mm2 + movq [edi+56], mm3 add esi, edx add edi, edx sub ecx, edx @@ -127,6 +131,37 @@ align 16 %endm +; > ecx = size (> 64) +; x +%macro UC_MOVNTQ 0 + mov edx, 64 +align 16 +%%1: + prefetchnta [esi + (200*64/34+192)] + movq mm0,[esi+0] + add edi, edx + movq mm1,[esi+8] + add esi, edx + movq mm2,[esi-48] + movntq [edi-64], mm0 + movq mm0,[esi-40] + movntq [edi-56], mm1 + movq mm1,[esi-32] + movntq [edi-48], mm2 + movq mm2,[esi-24] + movntq [edi-40], mm0 + movq mm0,[esi-16] + movntq [edi-32], mm1 + movq mm1,[esi-8] + movntq [edi-24], mm2 + movntq [edi-16], mm0 + sub ecx, edx + movntq [edi-8], mm1 + cmp ecx, edx + jae %%1 +%endm + + ; > ecx = size (> 8KiB) ; x eax, edx ; @@ -135,7 +170,7 @@ align 16 %%prefetch_and_copy_chunk: ; touch each cache line within chunk in reverse order (prevents HW prefetch) - push byte BP_SIZE/128 ; # iterations + push byte BP_SIZE/128 ; # iterations pop eax add esi, BP_SIZE align 8 @@ -147,28 +182,28 @@ align 8 jnz %%prefetch_chunk ; copy 64 byte blocks - mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit) - push byte 64 + mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit) + push byte 64 pop edx align 8 %%copy_block: - movq mm0, [esi+ 0] - movq mm1, [esi+ 8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] + movq mm0, [esi+ 0] + movq mm1, [esi+ 8] + movq mm2, [esi+16] + movq mm3, [esi+24] + movq mm4, [esi+32] + movq mm5, [esi+40] + movq mm6, [esi+48] + movq mm7, [esi+56] add esi, edx - movntq [edi+ 0], mm0 - movntq [edi+ 8], mm1 - movntq [edi+16], mm2 - movntq [edi+24], mm3 - movntq [edi+32], mm4 - movntq [edi+40], mm5 - movntq [edi+48], mm6 - movntq [edi+56], mm7 + movntq [edi+ 0], mm0 + movntq [edi+ 8], mm1 + movntq [edi+16], mm2 + movntq [edi+24], mm3 + movntq [edi+32], mm4 + movntq [edi+40], mm5 + movntq [edi+48], mm6 + movntq [edi+56], mm7 add edi, edx dec eax jnz %%copy_block @@ -179,49 +214,28 @@ align 8 %endm -; > ecx = size (> 64) -; x -%macro UC_MOVNTQ 0 - mov edx, 64 -align 16 -%%1: - prefetchnta [esi + (200*64/34+192)] - movq mm0,[esi+0] - add edi, edx - movq mm1,[esi+8] - add esi, edx - movq mm2,[esi-48] - movntq [edi-64], mm0 - movq mm0,[esi-40] - movntq [edi-56], mm1 - movq mm1,[esi-32] - movntq [edi-48], mm2 - movq mm2,[esi-24] - movntq [edi-40], mm0 - movq mm0,[esi-16] - movntq [edi-32], mm1 - movq mm1,[esi-8] - movntq [edi-24], mm2 - movntq [edi-16], mm0 - sub ecx, edx - movntq [edi-8], mm1 - cmp ecx, edx - jae %%1 -%endm +[section .bss] +; this is somewhat "clever". the 2 specialized transfer implementations +; that use SSE are jumped to if transfer size is greater than a threshold. +; we simply set the requested transfer size to 0 if the CPU doesn't +; support SSE so that those are never reached (done by masking with this). +sse_mask resd 1 + +__SECT__ ; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes) global _ia32_memcpy _ia32_memcpy: - push edi - push esi + push edi + push esi - mov edi, [esp+8+4+0] ; dst - mov esi, [esp+8+4+4] ; src - mov ecx, [esp+8+4+8] ; nbytes + mov edi, [esp+8+4+0] ; dst + mov esi, [esp+8+4+4] ; src + mov ecx, [esp+8+4+8] ; nbytes cmp ecx, byte IC_SIZE - ja .choose_large_method + ja .choose_larger_method .ic_movsd: IC_MOVSD @@ -229,12 +243,21 @@ _ia32_memcpy: pop edi ret -.choose_large_method: +.choose_larger_method: IC_ALIGN - cmp ecx, UC_THRESHOLD - jb near .ic_movq - cmp ecx, BP_THRESHOLD - jae .uc_bp_movntq + + mov eax, [sse_mask] + mov edx, ecx + and edx, eax ; edx = (SSE)? remaining_bytes : 0 + cmp edx, BP_THRESHOLD + jae near .uc_bp_movntq + cmp edx, UC_THRESHOLD + jae .uc_movntq + +.ic_movq: + IC_MOVQ + emms + jmp .ic_movsd .uc_movntq: UC_MOVNTQ @@ -245,26 +268,21 @@ _ia32_memcpy: .uc_bp_movntq: UC_BP_MOVNTQ sfence - ; fall through + jmp .ic_movq -.ic_movq: - IC_MOVQ - emms - jmp .ic_movsd ;------------------------------------------------------------------------------- ; CPUID support ;------------------------------------------------------------------------------- -[section .data use32] -cpuid_available dd -1 +[section .data] -[section .bss use32] - -; no init needed - cpuid_available triggers init -max_func resd 1 -max_ext_func resd 1 +; these are actually max_func+1, i.e. the first invalid value. +; the idea here is to avoid a separate cpuid_available flag; +; using signed values doesn't work because ext_funcs are >= 0x80000000. +max_func dd 0 +max_ext_func dd 0 __SECT__ @@ -272,28 +290,21 @@ __SECT__ ; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs) global _ia32_cpuid _ia32_cpuid: - ; note: must preserve before .one_time_init because it does cpuid - push ebx - push edi - -.retry: - ; if unknown, detect; if not available, fail. - xor eax, eax ; return val on failure - cmp [cpuid_available], eax - jl .one_time_init - je .ret + push ebx + push edi mov ecx, [esp+8+4+0] ; func mov edi, [esp+8+4+4] ; -> regs ; compare against max supported func and fail if above - test ecx, ecx + xor eax, eax ; return value on failure + test ecx, ecx mov edx, [max_ext_func] js .is_ext_func mov edx, [max_func] .is_ext_func: cmp ecx, edx - ja .ret + jae .ret ; (see max_func decl) ; issue CPUID and store result registers in array mov eax, ecx @@ -314,28 +325,6 @@ _ia32_cpuid: pop ebx ret -.one_time_init: - ; check if CPUID is supported - pushfd - or byte [esp+2], 32 - popfd - pushfd - pop eax - xor edx, edx - shr eax, 22 ; bit 21 toggled? - adc edx, 0 - mov [cpuid_available], edx - - ; determine max supported function - xor eax, eax - cpuid - mov [max_func], eax - mov eax, 0x80000000 - cpuid - mov [max_ext_func], eax - - jmp .retry - ;------------------------------------------------------------------------------- ; misc @@ -344,16 +333,58 @@ _ia32_cpuid: ; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask) global _ia32_control87 _ia32_control87: - push eax - fnstcw [esp] - pop eax ; old_cw - mov ecx, [esp+4] ; new_cw - mov edx, [esp+8] ; mask - and ecx, edx ; new_cw & mask - not edx ; ~mask - and eax, edx ; old_cw & ~mask - or eax, ecx ; (old_cw & ~mask) | (new_cw & mask) - push edx - fldcw [esp] + push eax + fnstcw [esp] + pop eax ; old_cw + mov ecx, [esp+4] ; new_cw + mov edx, [esp+8] ; mask + and ecx, edx ; new_cw & mask + not edx ; ~mask + and eax, edx ; old_cw & ~mask + or eax, ecx ; (old_cw & ~mask) | (new_cw & mask) + push edx + fldcw [esp] pop edx - xor eax, eax ; return value \ No newline at end of file + xor eax, eax ; return value + + +;------------------------------------------------------------------------------- +; init +;------------------------------------------------------------------------------- + +; extern "C" bool __cdecl ia32_init() +global _ia32_init +_ia32_init: + push ebx + + ; check if CPUID is supported + pushfd + or byte [esp+2], 32 + popfd + pushfd + pop eax + xor edx, edx + shr eax, 22 ; bit 21 toggled? + jnc .no_cpuid + + ; determine max supported CPUID function + xor eax, eax + cpuid + inc eax ; (see max_func decl) + mov [max_func], eax + mov eax, 0x80000000 + cpuid + inc eax ; (see max_func decl) + mov [max_ext_func], eax +.no_cpuid: + + ; check if SSE is supported (used by memcpy code) +extern _ia32_cap + push byte 32+25 ; ia32.h's SSE cap (won't change) + call _ia32_cap + pop edx ; remove stack param + neg eax ; SSE? ~0 : 0 + mov [sse_mask], eax + + pop ebx + ret \ No newline at end of file diff --git a/source/lib/sysdep/ia32.h b/source/lib/sysdep/ia32.h index 8e126cc0c8..d31f6c4513 100755 --- a/source/lib/sysdep/ia32.h +++ b/source/lib/sysdep/ia32.h @@ -31,6 +31,9 @@ extern "C" { #endif +// call before any of the following functions +extern void ia32_init(); + extern double _ceil(double); extern u64 rdtsc(void); diff --git a/source/ps/GameSetup/GameSetup.cpp b/source/ps/GameSetup/GameSetup.cpp index b4d131fb3f..b98b6e4d3d 100644 --- a/source/ps/GameSetup/GameSetup.cpp +++ b/source/ps/GameSetup/GameSetup.cpp @@ -6,6 +6,9 @@ #include "lib/detect.h" #include "lib/timer.h" #include "lib/input.h" +#if CPU_IA32 +# include "lib/sysdep/ia32.h" +#endif #include "lib/res/res.h" #include "lib/res/sound/snd.h" #include "lib/res/graphics/tex.h" @@ -688,6 +691,10 @@ void Init(int argc, char* argv[], bool setup_gfx, bool setup_gui) debug_set_thread_name("main"); +#if CPU_IA32 + ia32_init(); +#endif + // If you ever want to catch a particular allocation: //_CrtSetBreakAlloc(187);