diff --git a/source/lib/sysdep/ia32.asm b/source/lib/sysdep/ia32.asm
index 1a14a5ed44..604cbd7ce5 100644
--- a/source/lib/sysdep/ia32.asm
+++ b/source/lib/sysdep/ia32.asm
@@ -1,19 +1,44 @@
 section .text use32
 
-CACHEBLOCK equ 128
-BP_MIN_THRESHOLD_64 equ 192*1024
-MOVNTQ_MIN_THRESHOLD_64 equ 64*1024
+;-------------------------------------------------------------------------------
+; fast general memcpy
+;-------------------------------------------------------------------------------
+
+; optimized for Athlon XP: 7.3% faster (cumulative) than VC7.1's memcpy over
+; all 1..64 byte transfer lengths and misalignments. approaches maximum
+; mem bandwidth (2000 MiB/s) for transfers >= 192KiB!
+; Pentium III performance: about 3% faster in above small buffer benchmark.
+;
+; *requires* (and does not verify the presence of) SSE instructions:
+; prefetchnta and movntq. therefore, a P3+ or Athlon XP is required.
+; rationale: older processors are too slow anyway and we don't bother.
+
+; if memcpy size is greater than this,
+; .. it's too big for L1. use non-temporal instructions.
+UC_THRESHOLD equ 64*1024
+; .. it also blows L2. pull chunks into L1 ("block prefetch").
+BP_THRESHOLD equ 192*1024
+
+; maximum that can be copied by IC_MOVSD.
+; if you change this, be sure to expand the movs* table(s)!
+IC_SIZE equ 67
+
+; size of one block prefetch chunk.
+; if you change this, make sure "push byte BP_SIZE/128" doesn't overflow!
+BP_SIZE equ 8*1024
 
-
-%macro MC_UNROLLED_MOVSD 0
-    and ebx, 63
-    mov edx, ebx
-    shr edx, 2        ; dword count
-    neg edx
-    add edx, %%movsd_table_end
-    jmp edx
-
+; > ecx = size (<= IC_SIZE)
+; x eax, ecx
+;
+; determined to be fastest approach by testing. a movsd table followed by
+; rep movsb is a bit smaller but 6.9% slower; everything else is much worse.
+%macro IC_MOVSD 0
+    mov eax, ecx
+    shr ecx, 2        ; dword count
+    neg ecx
+    add ecx, %%movsd_table_end
+    jmp ecx
 align 8
     movsd
     movsd
@@ -33,12 +58,10 @@ align 8
     movsd
 %%movsd_table_end:
 
-    mov eax, ebx
-    and eax, 3
-    neg eax
-    add eax, %%movsb_table_end
-    jmp eax
-
+    and eax, 3
+    neg eax
+    add eax, %%movsb_table_end
+    jmp eax
     movsb
     movsb
     movsb
@@ -46,10 +69,12 @@
 %endm
 
-%macro MC_ALIGN 0
+; align destination address to multiple of 8.
+; not done for small transfers because it doesn't help IC_MOVSD.
+%macro IC_ALIGN 0
     mov eax, 8
     sub eax, edi
-    and eax, 7
+    and eax, byte 7
     cmp eax, ecx
     cmova eax, ecx
     sub ecx, eax
@@ -69,9 +94,14 @@ align 4
 %endm
 
-%macro MC_MOVQ 0
+; > ecx = size (> 0)
+; x edx
+%macro IC_MOVQ 0
 align 16
-%%1:
+    mov edx, 64
+%%loop:
+    cmp ecx, edx
+    jb %%done
     prefetchnta [esi + (200*64/34+192)]
     movq mm0, [esi+0]
     movq mm1, [esi+8]
     movq [edi+0], mm0
     movq [edi+8], mm1
     movq mm2, [esi+16]
     movq mm3, [esi+24]
     movq [edi+16], mm2
     movq [edi+24], mm3
     movq mm0, [esi+32]
     movq mm1, [esi+40]
     movq [edi+32], mm0
     movq [edi+40], mm1
     movq mm2, [esi+48]
@@ -89,28 +119,38 @@ align 16
     movq mm3, [esi+56]
     movq [edi+48], mm2
     movq [edi+56], mm3
-    add esi, 64
-    add edi, 64
-    dec ecx
-    jnz %%1
+    add esi, edx
+    add edi, edx
+    sub ecx, edx
+    jmp %%loop
+%%done:
 %endm
 
-; we have >= 8kb. until no more 8kb blocks
-%macro MC_BP_MOVNTQ 0
+; > ecx = size (> 8KiB)
+; x eax, edx
+;
+; somewhat optimized for size (futile attempt to avoid near jump)
+%macro UC_BP_MOVNTQ 0
 %%prefetch_and_copy_chunk:
-    mov eax, CACHEBLOCK / 2     ; block prefetch loop, unrolled 2X
-    add esi, CACHEBLOCK * 64    ; move to the top of the block
-align 16
-    ; touch each cache line in reverse order (prevents HW prefetch)
+
+    ; touch each cache line within chunk in reverse order (prevents HW prefetch)
+    push byte BP_SIZE/128       ; # iterations
+    pop eax
+    add esi, BP_SIZE
+align 8
 %%prefetch_chunk:
     mov edx, [esi-64]
     mov edx, [esi-128]
     sub esi, 128
     dec eax
     jnz %%prefetch_chunk
-    mov eax, CACHEBLOCK         ; now that it's in cache, do the copy
-align 16
+
+    ; copy 64 byte blocks
+    mov eax, BP_SIZE/64         ; # iterations (> signed 8 bit)
+    push byte 64
+    pop edx
+align 8
 %%copy_block:
     movq mm0, [esi+ 0]
     movq mm1, [esi+ 8]
@@ -120,7 +160,7 @@ align 16
     movq mm5, [esi+40]
     movq mm6, [esi+48]
     movq mm7, [esi+56]
-    add esi, 64
+    add esi, edx
     movntq [edi+ 0], mm0
     movntq [edi+ 8], mm1
     movntq [edi+16], mm2
@@ -129,24 +169,27 @@
     movntq [edi+40], mm5
     movntq [edi+48], mm6
     movntq [edi+56], mm7
-    add edi, 64
+    add edi, edx
     dec eax
     jnz %%copy_block
-    sub ecx, CACHEBLOCK         ; update the 64-byte block count
-    cmp ecx, CACHEBLOCK
-    jl %%prefetch_and_copy_chunk
+
+    sub ecx, BP_SIZE
+    cmp ecx, BP_SIZE
+    jae %%prefetch_and_copy_chunk
 %endm
 
-; we have >= 64, 64B BLOCKS
-%macro MC_MOVNTQ 0
+; > ecx = size (> 64)
+; x
+%macro UC_MOVNTQ 0
+    mov edx, 64
 align 16
 %%1:
     prefetchnta [esi + (200*64/34+192)]
     movq mm0,[esi+0]
-    add edi,64
+    add edi, edx
     movq mm1,[esi+8]
-    add esi,64
+    add esi, edx
     movq mm2,[esi-48]
     movntq [edi-64], mm0
     movq mm0,[esi-40]
@@ -160,75 +203,67 @@ align 16
     movq mm1,[esi-8]
     movntq [edi-24], mm2
     movntq [edi-16], mm0
-    dec ecx
+    sub ecx, edx
     movntq [edi-8], mm1
-    jnz %%1
+    cmp ecx, edx
+    jae %%1
 %endm
-
-
-
-
-
-
 ; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
-    mov ecx, [esp+4+8]    ; nbytes
-    mov esi, [esp+4+4]    ; src
-    mov edi, [esp+4+0]    ; dst
+global _ia32_memcpy
+_ia32_memcpy:
+    mov ecx, [esp+4+8]    ; nbytes
+    mov esi, [esp+4+4]    ; src
+    mov edi, [esp+4+0]    ; dst
 
-    MC_ALIGN
+    cmp ecx, byte IC_SIZE
+    ja .choose_large_method
 
-    mov ebx, ecx
-    shr ecx, 6        ; # blocks
-
-    mov eax, _bp
-    cmp ecx, BP_MIN_THRESHOLD_64
-    mov edx, _movntq
-    cmovb eax, edx
-    cmp ecx, MOVNTQ_MIN_THRESHOLD_64
-    mov edx, _mmx
-    cmovb eax, edx
-    cmp ecx, 64
-    jbe tiny
-    jmp eax
-
-tiny:
-    MC_UNROLLED_MOVSD
+.ic_movsd:
+    IC_MOVSD
     ret
 
-_mmx:
-    MC_MOVQ
-    emms
-    jmp tiny
+.choose_large_method:
+    IC_ALIGN
+    cmp ecx, UC_THRESHOLD
+    jb near .ic_movq
+    cmp ecx, BP_THRESHOLD
+    jae .uc_bp_movntq
 
-_bp:
-    MC_BP_MOVNTQ
+.uc_movntq:
+    UC_MOVNTQ
     sfence
     emms
-    ; protect routine below
-    cmp ecx, 0
-    jz tiny
-
-_movntq:
+    jmp .ic_movsd
+.uc_bp_movntq:
+    UC_BP_MOVNTQ
     sfence
+    ; fall through
+
+.ic_movq:
+    IC_MOVQ
     emms
-    jmp tiny
-
-
-
+    jmp .ic_movsd
+;-------------------------------------------------------------------------------
+; CPUID support
+;-------------------------------------------------------------------------------
 [section .data use32]
 
 cpuid_available    dd    -1
 
-; max supported CPUID functions. initialized to
-max_func        dd    0x7FFFFFFF
-max_ext_func    dd    0xFFFFFFFF
+[section .bss use32]
+
+; no init needed - cpuid_available triggers init
+max_func        resd    1
+max_ext_func    resd    1
+
 __SECT__
+
 ; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs)
 global _ia32_cpuid
 _ia32_cpuid:
@@ -297,15 +332,9 @@ _ia32_cpuid:
     jmp .retry
 
-
-
-
-
-
-
-
-
-
+;-------------------------------------------------------------------------------
+; misc
+;-------------------------------------------------------------------------------
 
 ; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
 global _ia32_control87
diff --git a/source/lib/sysdep/ia32.cpp b/source/lib/sysdep/ia32.cpp
index e251b38692..647155fbba 100755
--- a/source/lib/sysdep/ia32.cpp
+++ b/source/lib/sysdep/ia32.cpp
@@ -100,12 +100,6 @@ void ia32_debug_break()
 }
 
-void ia32_memcpy(void* dst, const void* src, size_t nbytes)
-{
-    memcpy(dst, src, nbytes);
-}
-
-
 //-----------------------------------------------------------------------------
 // support code for lock-free primitives
 //-----------------------------------------------------------------------------
diff --git a/source/lib/sysdep/ia32.h b/source/lib/sysdep/ia32.h
index 8dd2c814c5..8e126cc0c8 100755
--- a/source/lib/sysdep/ia32.h
+++ b/source/lib/sysdep/ia32.h
@@ -93,6 +93,10 @@ enum IA32Regs
     ECX,
     EDX
 };
+
+// try to call the specified CPUID sub-function. returns true on success or
+// false on failure (i.e. CPUID or the specific function not supported).
+// returns eax, ebx, ecx, edx registers in above order.
 extern bool ia32_cpuid(u32 func, u32* regs);
 
 #ifdef __cplusplus
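
Note (not part of the patch): for readers who prefer C to assembly, the size-tier
dispatch performed by the new _ia32_memcpy can be pictured roughly as the sketch
below. The constants mirror the equ values in the patch (IC_SIZE = 67,
UC_THRESHOLD = 64KiB, BP_THRESHOLD = 192KiB); the helper functions and the name
memcpy_dispatch_model are hypothetical stand-ins for the IC_MOVSD / IC_MOVQ /
UC_MOVNTQ / UC_BP_MOVNTQ macros and do not reproduce their MMX/SSE instruction mix.

/* Illustrative only -- models the tier selection, not the SIMD copy loops. */
#include <stddef.h>
#include <string.h>

enum
{
    IC_SIZE      = 67,         /* max size handled entirely by the movs* tables   */
    UC_THRESHOLD = 64*1024,    /* larger than this blows L1 -> non-temporal stores */
    BP_THRESHOLD = 192*1024    /* larger than this blows L2 -> block prefetch      */
};

/* stand-in for IC_MOVSD: small copies and the tails of the large paths */
static void copy_small(void* dst, const void* src, size_t nbytes)
{
    memcpy(dst, src, nbytes);
}

/* stand-in for the 64-byte block copiers (IC_MOVQ / UC_MOVNTQ / UC_BP_MOVNTQ);
 * returns how many bytes were handled (whole 64-byte blocks only). */
static size_t copy_blocks(void* dst, const void* src, size_t nbytes)
{
    const size_t done = nbytes & ~(size_t)63;
    memcpy(dst, src, done);
    return done;
}

void memcpy_dispatch_model(void* dst, const void* src, size_t nbytes)
{
    if(nbytes <= IC_SIZE)
    {
        copy_small(dst, src, nbytes);    /* .ic_movsd */
        return;
    }

    /* .choose_large_method (IC_ALIGN would align the destination first) */
    size_t done;
    if(nbytes < UC_THRESHOLD)
        done = copy_blocks(dst, src, nbytes);    /* .ic_movq: still cache-friendly, plain movq copy   */
    else if(nbytes < BP_THRESHOLD)
        done = copy_blocks(dst, src, nbytes);    /* .uc_movntq: too big for L1, movntq bypasses cache */
    else
        done = copy_blocks(dst, src, nbytes);    /* .uc_bp_movntq: prefetch 8KiB chunks, then movntq  */

    copy_small((char*)dst + done, (const char*)src + done, nbytes - done);
}

In the actual assembly the tail handling is slightly richer: the block-prefetch
path falls through to IC_MOVQ for the sub-8KiB remainder, and every large path
finishes the last < 64 bytes via the IC_MOVSD jump tables before returning.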