; =========================================================================
; File        : ia32_memcpy.asm
; Project     : 0 A.D.
; Description : highly optimized memory copy.
;
; @author Jan.Wassenberg@stud.uni-karlsruhe.de
; =========================================================================
; Copyright (c) 2004-2005 Jan Wassenberg
;
; Redistribution and/or modification are also permitted under the
; terms of the GNU General Public License as published by the
; Free Software Foundation (version 2 or later, at your option).
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

%include "ia32.inc"

; drop-in replacement for libc memcpy(). only requires CPU support for
; MMX (by now universal). highly optimized for Athlon and Pentium III
; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
; for details, see accompanying article.

; if transfer size is at least this much,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD    equ 64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD    equ 256*1024

; maximum that can be copied by IC_TINY.
IC_TINY_MAX     equ 63

; size of one block prefetch chunk.
BP_SIZE         equ 8*1024


;------------------------------------------------------------------------------

; [p3] replicating this instead of jumping to it from tailN
; saves 1 clock and costs (7-2)*2 bytes code.
%macro EPILOG 0
    pop esi
    pop edi
    mov eax, [esp+4]    ; return dst
    ret
%endm

align 64
tail1:
    mov al, [esi+ecx*4]
    mov [edi+ecx*4], al
align 4
tail0:
    EPILOG

align 8
tail3:
    ; [p3] 2 reads followed by 2 writes is better than
    ; R/W interleaved and RRR/WWW
    mov al, [esi+ecx*4+2]
    mov [edi+ecx*4+2], al
    ; already aligned to 8 due to above code
tail2:
    mov al, [esi+ecx*4]
    mov dl, [esi+ecx*4+1]
    mov [edi+ecx*4], al
    mov [edi+ecx*4+1], dl
    EPILOG

[section .data]
align 16
tail_table  dd tail0, tail1, tail2, tail3
__SECT__


; 15x unrolled copy loop - transfers DWORDs backwards.
; indexed via table of 8-bit offsets.
; rationale:
; - [p3] backwards vs. forwards makes no difference.
; - MOV is faster than MOVSD.
; - index table is needed because calculating end-6*i is slower than
;   a LUT and we wouldn't want to expand entries to 8 bytes
;   (that'd increase code footprint by 30 bytes).
; - a byte index accessed via MOVZX is better due to less dcache usage.
; - only unrolling 8x and 'reentering' the loop is possible but
;   slower due to fiddling with esi/ecx.

align 64
unrolled_copy_code_start:
%assign i 15
%rep 14     ; 15 entries; the base case uc_1 is handled below
uc_ %+ i:
    mov eax, [esi+i*4-4]
    mov [edi+i*4-4], eax
    %assign i i-1
%endrep
; base case: no displacement needed; skip it so that code will
; be aligned to 8 bytes after this.
uc_1:
    mov eax, [esi]
    mov [edi], eax
uc_0:
    jmp [tail_table+edx*4]

[section .data]
align 32
unrolled_copy_index_table:
%assign i 0
%rep 16
    db (uc_ %+ i) - unrolled_copy_code_start
    %assign i i+1
%endrep
__SECT__


;------------------------------------------------------------------------------

; tiny copy - handles all cases smaller than IC_MOVQ's 64 byte lower limit.
; > edx = number of bytes (<= IC_TINY_MAX)
; < does not return.
; x eax, ecx, edx
%macro IC_TINY 0
    mov ecx, edx
    shr ecx, 2
    ; calculating the jump target directly isn't possible because uc_1
    ; skips the displacement; even so, it'd require calculating -6*ecx,
    ; which is slower than the LUT.
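    ; for reference, what this macro accomplishes is roughly the following
    ; C sketch (illustrative only - dst/src stand for the current edi/esi
    ; positions and nbytes for edx; the real code instead dispatches into
    ; the unrolled loop above via the byte-offset LUT and the tailN stubs):
    ;   uint32_t dwords = nbytes >> 2, rest = nbytes & 3;   // ecx, edx
    ;   for(uint32_t i = dwords; i != 0; i--)               // uc_15 .. uc_1
    ;       ((uint32_t*)dst)[i-1] = ((const uint32_t*)src)[i-1];
    ;   for(uint32_t i = 0; i < rest; i++)                  // tail0 .. tail3
    ;       ((char*)dst)[dwords*4 + i] = ((const char*)src)[dwords*4 + i];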
    movzx eax, byte [unrolled_copy_index_table+ecx]
    and edx, byte 3
    add eax, unrolled_copy_code_start
    jmp eax
    ; never reached! the unrolled loop jumps into tailN, which
    ; then returns from the memcpy function.
%endm


;------------------------------------------------------------------------------

; align destination address to multiple of 8. important for large transfers,
; but doesn't affect the tiny technique.
; > esi, edi -> buffers (updated)
; > ecx, edx = transfer size (updated)
; x eax
%macro IC_ALIGN 0
    mov eax, edi
    and eax, byte 7     ; eax = # misaligned bytes
    jz already_aligned  ; early out
    lea eax, [align_table_start+eax*2]
    jmp eax

; [p3] this is no slower than a table of mov and much smaller/simpler
align 8
align_table_start:
%rep 8
    dec ecx
    movsb
%endrep
    mov edx, ecx
already_aligned:
%endm


;------------------------------------------------------------------------------

; MMX MOVQ technique. used for in-cache transfers of 64B..64KiB.
; must run on all CPUs, i.e. cannot use the SSE prefetchnta instruction.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of their respective buffers, i.e. &last_qword+8.
; < ecx = 0
; x mm0..mm7
%macro IC_MOVQ 0
align 16
%%loop:
    ; notes:
    ; - we can't use prefetch here - this codepath must support all CPUs.
    ;   [p3] that makes us 5..15% slower on 1KiB..4KiB transfers.
    ; - [p3] simple addressing without +ecx is 3.5% faster.
    ; - difference between RR/WW/RR/WW and R..R/W..W:
    ;   [p3] none (if simple addressing)
    ;   [axp] interleaved is better (with +ecx addressing)
    ; - enough time elapses between first and third pair of reads that we
    ;   could reuse MM0. there is no performance gain either way and
    ;   differing displacements make code compression futile anyway, so
    ;   we'll just use MM4..7 for clarity.
    movq mm0, [esi+ecx]
    movq mm1, [esi+ecx+8]
    movq [edi+ecx], mm0
    movq [edi+ecx+8], mm1
    movq mm2, [esi+ecx+16]
    movq mm3, [esi+ecx+24]
    movq [edi+ecx+16], mm2
    movq [edi+ecx+24], mm3
    movq mm4, [esi+ecx+32]
    movq mm5, [esi+ecx+40]
    movq [edi+ecx+32], mm4
    movq [edi+ecx+40], mm5
    movq mm6, [esi+ecx+48]
    movq mm7, [esi+ecx+56]
    movq [edi+ecx+48], mm6
    movq [edi+ecx+56], mm7
    add ecx, byte 64
    jnz %%loop
%endm


;------------------------------------------------------------------------------

; SSE MOVNTQ technique. used for transfers that do not fit in L1,
; i.e. sizes between UC_THRESHOLD and BP_THRESHOLD.
; requires Pentium III or Athlon; caller checks for this.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of their respective buffers, i.e. &last_qword+8.
; < ecx = 0
; x mm0..mm7
%macro UC_MOVNTQ 0
align 16
%%loop:
    ; notes:
    ; - the AMD optimization manual recommends prefetch distances according to
    ;   (200*BytesPerIter/ClocksPerIter+192), which comes out to ~560 here.
    ;   [p3] rounding down to 512 bytes makes for significant gains.
    ; - [p3] complex addressing with ecx is 1% faster than adding to esi/edi.
    prefetchnta [esi+ecx+512]
    movq mm0, [esi+ecx]
    movq mm1, [esi+ecx+8]
    movq mm2, [esi+ecx+16]
    movq mm3, [esi+ecx+24]
    movq mm4, [esi+ecx+32]
    movq mm5, [esi+ecx+40]
    movq mm6, [esi+ecx+48]
    movq mm7, [esi+ecx+56]
    movntq [edi+ecx], mm0
    movntq [edi+ecx+8], mm1
    movntq [edi+ecx+16], mm2
    movntq [edi+ecx+24], mm3
    movntq [edi+ecx+32], mm4
    movntq [edi+ecx+40], mm5
    movntq [edi+ecx+48], mm6
    movntq [edi+ecx+56], mm7
    add ecx, byte 64
    jnz %%loop
%endm


;------------------------------------------------------------------------------

; block prefetch technique. used for transfers that do not fit in L2,
; i.e. larger than BP_THRESHOLD. requires Pentium III or Athlon;
; caller checks for this.
; for theory behind this, see article.
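;
; in outline, each iteration does roughly the following (a hedged C sketch;
; 'copy64_nt' is a stand-in for the MOVNTQ block in the macro below, and the
; actual register conventions are given next):
;
;   for(size_t chunk = 0; chunk + BP_SIZE <= nbytes; chunk += BP_SIZE)
;   {
;       // 1) touch each cache line of the source chunk in reverse order
;       //    (reverse, to avoid triggering HW prefetch) so it lands in cache
;       volatile uint32_t sink;
;       for(size_t i = BP_SIZE; i != 0; i -= 64)
;           sink = *(const uint32_t*)((const char*)src + chunk + i - 64);
;       // 2) stream the now-cached chunk to dst with non-temporal stores,
;       //    64 bytes at a time, without polluting the caches
;       for(size_t i = 0; i < BP_SIZE; i += 64)
;           copy64_nt((char*)dst + chunk + i, (const char*)src + chunk + i);
;   }
;   // anything smaller than BP_SIZE is left over for the caller to finish
;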
; > ecx = -number_of_bytes (multiple of 64, <= -BP_SIZE)
; > esi, edi point to end of their respective buffers, i.e. &last_qword+8.
; < ecx = -remaining_bytes (multiple of 64, > -BP_SIZE)
; < eax = 0
%macro UC_BP_MOVNTQ 0
    push edx
align 4
%%prefetch_and_copy_chunk:
    ; pull chunk into cache by touching each cache line
    ; (in reverse order to prevent HW prefetches)
    mov eax, BP_SIZE/128    ; # iterations
    add esi, BP_SIZE
align 16
%%prefetch_loop:
    mov edx, [esi+ecx-64]
    mov edx, [esi+ecx-128]
    add esi, byte -128
    dec eax
    jnz %%prefetch_loop

    ; copy chunk in 64 byte pieces
    mov eax, BP_SIZE/64     ; # iterations (> signed 8 bit)
align 16
%%copy_loop:
    movq mm0, [esi+ecx]
    movq mm1, [esi+ecx+8]
    movq mm2, [esi+ecx+16]
    movq mm3, [esi+ecx+24]
    movq mm4, [esi+ecx+32]
    movq mm5, [esi+ecx+40]
    movq mm6, [esi+ecx+48]
    movq mm7, [esi+ecx+56]
    movntq [edi+ecx], mm0
    movntq [edi+ecx+8], mm1
    movntq [edi+ecx+16], mm2
    movntq [edi+ecx+24], mm3
    movntq [edi+ecx+32], mm4
    movntq [edi+ecx+40], mm5
    movntq [edi+ecx+48], mm6
    movntq [edi+ecx+56], mm7
    add ecx, byte 64
    dec eax
    jnz %%copy_loop

    ; if enough data left, process next chunk
    cmp ecx, -BP_SIZE
    jle %%prefetch_and_copy_chunk

    pop edx
%endm


;------------------------------------------------------------------------------

; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
; drop-in replacement for libc memcpy() (returns dst)
global sym(ia32_memcpy)
align 64
sym(ia32_memcpy):
    push edi
    push esi
    mov ecx, [esp+8+4+8]    ; nbytes
    mov edi, [esp+8+4+0]    ; dst
    mov esi, [esp+8+4+4]    ; src
    mov edx, ecx

    cmp ecx, byte IC_TINY_MAX
    ja choose_larger_method

ic_tiny:
    IC_TINY
    ; never reached - IC_TINY contains memcpy function epilog code

choose_larger_method:
    IC_ALIGN

    ; setup:
    ; eax = number of 64 byte chunks, or 0 if CPU doesn't support SSE.
    ;   used to choose copy technique.
    ; ecx = -number_of_bytes, multiple of 64. we jump to ic_tiny if
    ;   there's not enough left for a single 64 byte chunk, which can
    ;   happen on unaligned 64..71 byte transfers due to IC_ALIGN.
    ; edx = number of remainder bytes after qwords have been copied;
    ;   will be handled by IC_TINY.
    ; esi and edi point to end of the respective buffers (more precisely,
    ;   to buffer_start-ecx). this together with the ecx convention means
    ;   we only need one loop counter (instead of having to advance
    ;   that and esi/edi).

    ; this mask is applied to the transfer size. the 2 specialized copy
    ; techniques that use SSE are jumped to if size is greater than a
    ; threshold. we simply set the requested transfer size to 0 if the CPU
    ; doesn't support SSE so that those are never reached (done by masking
    ; with this).
    extern sym(ia32_memcpy_size_mask)
    mov eax, [sym(ia32_memcpy_size_mask)]

    and ecx, byte ~IC_TINY_MAX
    jz ic_tiny          ; < 64 bytes left (due to IC_ALIGN)
    add esi, ecx
    add edi, ecx
    and edx, byte IC_TINY_MAX
    and eax, ecx
    neg ecx
    cmp eax, BP_THRESHOLD
    jae near uc_bp_movntq
    cmp eax, UC_THRESHOLD
    jae uc_movntq

ic_movq:
    IC_MOVQ
    emms
    jmp ic_tiny

uc_movntq:
    UC_MOVNTQ
    sfence
    emms
    jmp ic_tiny

uc_bp_movntq:
    UC_BP_MOVNTQ
    sfence
    cmp ecx, byte -(IC_TINY_MAX+1)
    jle ic_movq
    emms
    jmp ic_tiny
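

;------------------------------------------------------------------------------

; usage from C (a hedged sketch; the exact prototype and the initialization of
; ia32_memcpy_size_mask live elsewhere in the ia32 sysdep code - the names here
; merely mirror the symbols referenced above):
;
;   extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
;   // per the masking logic above, presumably set once at startup:
;   // all bits set if the CPU supports SSE (enables the MOVNTQ and block
;   // prefetch paths), 0 otherwise (everything then uses the MMX MOVQ path).
;   extern uint32_t ia32_memcpy_size_mask;
;
;   void copy_example(void* dst, const void* src, size_t nbytes)
;   {
;       void* ret = ia32_memcpy(dst, src, nbytes);
;       assert(ret == dst);    // returns dst, like libc memcpy
;   }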