improvements to memcpy; cleaned up the whole thing and ia32 is now done (knock on wood)
executive summary: memcpy2 (a.k.a. ia32_memcpy) is a drop-in replacement that is much faster than memcpy (7% on small buffers, 300% on 192K buffers). This was SVN commit r2729.
This commit is contained in:
parent
82f663e5a2
commit
27539d25e8
@ -1,19 +1,44 @@
|
||||
section .text use32
|
||||
|
||||
CACHEBLOCK equ 128
|
||||
BP_MIN_THRESHOLD_64 equ 192*1024
|
||||
MOVNTQ_MIN_THRESHOLD_64 equ 64*1024
|
||||
;-------------------------------------------------------------------------------
|
||||
; fast general memcpy
|
||||
;-------------------------------------------------------------------------------
|
||||
|
||||
; optimized for Athlon XP: 7.3% faster (cumulative) than VC7.1's memcpy over
|
||||
; all 1..64 byte transfer lengths and misalignments. approaches maximum
|
||||
; mem bandwidth (2000 MiB/s) for transfers >= 192KiB!
|
||||
; Pentium III performance: about 3% faster in above small buffer benchmark.
|
||||
;
|
||||
; *requires* (and does not verify the presence of) SSE instructions:
|
||||
; prefetchnta and movntq. therefore, a P3+ or Athlon XP is required.
|
||||
; rationale: older processors are too slow anyway, so we don't bother supporting them.
|
||||
|
||||
; if memcpy size is greater than this,
|
||||
; .. it's too big for L1. use non-temporal instructions.
|
||||
UC_THRESHOLD equ 64*1024
|
||||
; .. it also blows L2. pull chunks into L1 ("block prefetch").
|
||||
BP_THRESHOLD equ 192*1024
|
||||
|
||||
; maximum that can be copied by IC_MOVSD.
|
||||
; if you change this, be sure to expand the movs* table(s)!
|
||||
IC_SIZE equ 67
|
||||
|
||||
; size of one block prefetch chunk.
|
||||
; if you change this, make sure "push byte BP_SIZE/128" doesn't overflow!
|
||||
BP_SIZE equ 8*1024
|
||||
|
||||
|
||||
|
||||
%macro MC_UNROLLED_MOVSD 0
|
||||
and ebx, 63
|
||||
mov edx, ebx
|
||||
shr edx, 2 ; dword count
|
||||
neg edx
|
||||
add edx, %%movsd_table_end
|
||||
jmp edx
|
||||
|
||||
; > ecx = size (<= IC_SIZE)
|
||||
; x eax, ecx
|
||||
;
|
||||
; determined to be fastest approach by testing. a movsd table followed by
|
||||
; rep movsb is a bit smaller but 6.9% slower; everything else is much worse.
|
||||
%macro IC_MOVSD 0
|
||||
mov eax, ecx
|
||||
shr ecx, 2 ; dword count
|
||||
neg ecx
|
||||
add ecx, %%movsd_table_end
|
||||
jmp ecx
|
||||
align 8
|
||||
movsd
|
||||
movsd
|
||||
@ -33,12 +58,10 @@ align 8
|
||||
movsd
|
||||
%%movsd_table_end:
|
||||
|
||||
mov eax, ebx
|
||||
and eax, 3
|
||||
neg eax
|
||||
add eax, %%movsb_table_end
|
||||
jmp eax
|
||||
|
||||
and eax, 3
|
||||
neg eax
|
||||
add eax, %%movsb_table_end
|
||||
jmp eax
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
@ -46,10 +69,12 @@ align 8
|
||||
%endm
|
||||
|
||||
|
||||
%macro MC_ALIGN 0
|
||||
; align destination address to multiple of 8.
|
||||
; not done for small transfers because it doesn't help IC_MOVSD.
|
||||
%macro IC_ALIGN 0
|
||||
mov eax, 8
|
||||
sub eax, edi
|
||||
and eax, 7
|
||||
and eax, byte 7
|
||||
cmp eax, ecx
|
||||
cmova eax, ecx
|
||||
sub ecx, eax
|
||||
@ -69,9 +94,14 @@ align 4
|
||||
%endm
|
||||
|
||||
|
||||
%macro MC_MOVQ 0
|
||||
; > ecx = size (> 0)
|
||||
; x edx
|
||||
%macro IC_MOVQ 0
|
||||
align 16
|
||||
%%1:
|
||||
mov edx, 64
|
||||
%%loop:
|
||||
cmp ecx, edx
|
||||
jb %%done
|
||||
prefetchnta [esi + (200*64/34+192)]
|
||||
movq mm0, [esi+0]
|
||||
movq mm1, [esi+8]
|
||||
@ -89,28 +119,38 @@ align 16
|
||||
movq mm3, [esi+56]
|
||||
movq [edi+48], mm2
|
||||
movq [edi+56], mm3
|
||||
add esi, 64
|
||||
add edi, 64
|
||||
dec ecx
|
||||
jnz %%1
|
||||
add esi, edx
|
||||
add edi, edx
|
||||
sub ecx, edx
|
||||
jmp %%loop
|
||||
%%done:
|
||||
%endm
|
||||
|
||||
|
||||
; we have >= 8kb. until no more 8kb blocks
|
||||
%macro MC_BP_MOVNTQ 0
|
||||
; > ecx = size (> 8KiB)
|
||||
; x eax, edx
|
||||
;
|
||||
; somewhat optimized for size (futile attempt to avoid near jump)
|
||||
%macro UC_BP_MOVNTQ 0
|
||||
%%prefetch_and_copy_chunk:
|
||||
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
|
||||
add esi, CACHEBLOCK * 64 ; move to the top of the block
|
||||
align 16
|
||||
; touch each cache line in reverse order (prevents HW prefetch)
|
||||
|
||||
; touch each cache line within chunk in reverse order (prevents HW prefetch)
|
||||
push byte BP_SIZE/128 ; # iterations
|
||||
pop eax
|
||||
add esi, BP_SIZE
|
||||
align 8
|
||||
%%prefetch_chunk:
|
||||
mov edx, [esi-64]
|
||||
mov edx, [esi-128]
|
||||
sub esi, 128
|
||||
dec eax
|
||||
jnz %%prefetch_chunk
|
||||
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
|
||||
align 16
|
||||
|
||||
; copy 64 byte blocks
|
||||
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
|
||||
push byte 64
|
||||
pop edx
|
||||
align 8
|
||||
%%copy_block:
|
||||
movq mm0, [esi+ 0]
|
||||
movq mm1, [esi+ 8]
|
||||
@ -120,7 +160,7 @@ align 16
|
||||
movq mm5, [esi+40]
|
||||
movq mm6, [esi+48]
|
||||
movq mm7, [esi+56]
|
||||
add esi, 64
|
||||
add esi, edx
|
||||
movntq [edi+ 0], mm0
|
||||
movntq [edi+ 8], mm1
|
||||
movntq [edi+16], mm2
|
||||
@ -129,24 +169,27 @@ align 16
|
||||
movntq [edi+40], mm5
|
||||
movntq [edi+48], mm6
|
||||
movntq [edi+56], mm7
|
||||
add edi, 64
|
||||
add edi, edx
|
||||
dec eax
|
||||
jnz %%copy_block
|
||||
sub ecx, CACHEBLOCK ; update the 64-byte block count
|
||||
cmp ecx, CACHEBLOCK
|
||||
jl %%prefetch_and_copy_chunk
|
||||
|
||||
sub ecx, BP_SIZE
|
||||
cmp ecx, BP_SIZE
|
||||
jae %%prefetch_and_copy_chunk
|
||||
%endm
|
||||
|
||||
|
||||
; we have >= 64, 64B BLOCKS
|
||||
%macro MC_MOVNTQ 0
|
||||
; > ecx = size (> 64)
|
||||
; x edx
|
||||
%macro UC_MOVNTQ 0
|
||||
mov edx, 64
|
||||
align 16
|
||||
%%1:
|
||||
prefetchnta [esi + (200*64/34+192)]
|
||||
movq mm0,[esi+0]
|
||||
add edi,64
|
||||
add edi, edx
|
||||
movq mm1,[esi+8]
|
||||
add esi,64
|
||||
add esi, edx
|
||||
movq mm2,[esi-48]
|
||||
movntq [edi-64], mm0
|
||||
movq mm0,[esi-40]
|
||||
@ -160,75 +203,67 @@ align 16
|
||||
movq mm1,[esi-8]
|
||||
movntq [edi-24], mm2
|
||||
movntq [edi-16], mm0
|
||||
dec ecx
|
||||
sub ecx, edx
|
||||
movntq [edi-8], mm1
|
||||
jnz %%1
|
||||
cmp ecx, edx
|
||||
jae %%1
|
||||
%endm
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
|
||||
mov ecx, [esp+4+8] ; nbytes
|
||||
mov esi, [esp+4+4] ; src
|
||||
mov edi, [esp+4+0] ; dst
|
||||
global _ia32_memcpy
|
||||
_ia32_memcpy:
|
||||
mov ecx, [esp+4+8] ; nbytes
|
||||
mov esi, [esp+4+4] ; src
|
||||
mov edi, [esp+4+0] ; dst
|
||||
|
||||
MC_ALIGN
|
||||
cmp ecx, byte IC_SIZE
|
||||
ja .choose_large_method
|
||||
|
||||
mov ebx, ecx
|
||||
shr ecx, 6 ; # blocks
|
||||
|
||||
mov eax, _bp
|
||||
cmp ecx, BP_MIN_THRESHOLD_64
|
||||
mov edx, _movntq
|
||||
cmovb eax, edx
|
||||
cmp ecx, MOVNTQ_MIN_THRESHOLD_64
|
||||
mov edx, _mmx
|
||||
cmovb eax, edx
|
||||
cmp ecx, 64
|
||||
jbe tiny
|
||||
jmp eax
|
||||
|
||||
tiny:
|
||||
MC_UNROLLED_MOVSD
|
||||
.ic_movsd:
|
||||
IC_MOVSD
|
||||
ret
|
||||
|
||||
_mmx:
|
||||
MC_MOVQ
|
||||
emms
|
||||
jmp tiny
|
||||
.choose_large_method:
|
||||
IC_ALIGN
|
||||
cmp ecx, UC_THRESHOLD
|
||||
jb near .ic_movq
|
||||
cmp ecx, BP_THRESHOLD
|
||||
jae .uc_bp_movntq
|
||||
|
||||
_bp:
|
||||
MC_BP_MOVNTQ
|
||||
.uc_movntq:
|
||||
UC_MOVNTQ
|
||||
sfence
|
||||
emms
|
||||
; protect routine below
|
||||
cmp ecx, 0
|
||||
jz tiny
|
||||
|
||||
_movntq:
|
||||
jmp .ic_movsd
|
||||
|
||||
.uc_bp_movntq:
|
||||
UC_BP_MOVNTQ
|
||||
sfence
|
||||
; fall through
|
||||
|
||||
.ic_movq:
|
||||
IC_MOVQ
|
||||
emms
|
||||
jmp tiny
|
||||
|
||||
|
||||
|
||||
jmp .ic_movsd
|
||||
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
; CPUID support
|
||||
;-------------------------------------------------------------------------------
|
||||
|
||||
[section .data use32]
|
||||
cpuid_available dd -1
|
||||
|
||||
; max supported CPUID functions. initialized to
|
||||
max_func dd 0x7FFFFFFF
|
||||
max_ext_func dd 0xFFFFFFFF
|
||||
[section .bss use32]
|
||||
|
||||
; no init needed - cpuid_available triggers init
|
||||
max_func resd 1
|
||||
max_ext_func resd 1
|
||||
|
||||
__SECT__
|
||||
|
||||
|
||||
; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs)
|
||||
global _ia32_cpuid
|
||||
_ia32_cpuid:
|
||||
@ -297,15 +332,9 @@ _ia32_cpuid:
|
||||
jmp .retry
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
; misc
|
||||
;-------------------------------------------------------------------------------
|
||||
|
||||
; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
|
||||
global _ia32_control87
|
||||
|
@ -100,12 +100,6 @@ void ia32_debug_break()
|
||||
}
|
||||
|
||||
|
||||
void ia32_memcpy(void* dst, const void* src, size_t nbytes)
|
||||
{
|
||||
memcpy(dst, src, nbytes);
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// support code for lock-free primitives
|
||||
//-----------------------------------------------------------------------------
|
||||
|
@ -93,6 +93,10 @@ enum IA32Regs
|
||||
ECX,
|
||||
EDX
|
||||
};
|
||||
|
||||
// try to call the specified CPUID sub-function. returns true on success or
|
||||
// false on failure (i.e. CPUID or the specific function not supported).
|
||||
// returns eax, ebx, ecx, edx registers in above order.
|
||||
extern bool ia32_cpuid(u32 func, u32* regs);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
Loading…
Reference in New Issue
Block a user