
# improvements to build system for asm files

split ia32_asm code up into separate memcpy and color conversion files
premake: add (Windows-only so far) support for NASM include paths -
required when using %include; see rationale in vs.c
refs #124

This was SVN commit r4039.
janwas 2006-06-25 20:58:03 +00:00
parent f2f4ff5fbe
commit aeed96dafa
7 changed files with 1297 additions and 1269 deletions

Binary file not shown.

File diff suppressed because it is too large


@ -0,0 +1,50 @@
%include "../lib/sysdep/ia32.inc"
;-------------------------------------------------------------------------------
; Color conversion (SSE)
;-------------------------------------------------------------------------------
; extern "C" u32 ConvertRGBColorTo4ub(const RGBColor& color)
[section .data]
align 16
zero:
dd 0.0
twofivefive:
dd 255.0
__SECT__
align 16
global sym(sse_ConvertRGBColorTo4ub)
sym(sse_ConvertRGBColorTo4ub):
mov eax, [esp+4]
; xmm0, 1, 2 = R, G, B
movss xmm4, [zero]
movss xmm0, [eax+8]
movss xmm1, [eax+4]
movss xmm2, [eax]
movss xmm5, [twofivefive]
; C = min(255, 255*max(C, 0)) ( == clamp(255*C, 0, 255) )
maxss xmm0, xmm4
maxss xmm1, xmm4
maxss xmm2, xmm4
mulss xmm0, xmm5
mulss xmm1, xmm5
mulss xmm2, xmm5
minss xmm0, xmm5
minss xmm1, xmm5
minss xmm2, xmm5
; convert to integer and combine channels using bit logic
cvtss2si eax, xmm0
cvtss2si ecx, xmm1
cvtss2si edx, xmm2
shl eax, 16
shl ecx, 8
or eax, 0xff000000
or edx, ecx
or eax, edx
ret
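
For reference, here is a plain C++ sketch of what sse_ConvertRGBColorTo4ub computes. The scalar helper name and the exact RGBColor layout (three floats at offsets 0/4/8) are assumptions inferred from the loads above, and cvtss2si rounds to nearest while the cast below truncates, so results may differ by 1 at rounding boundaries.

```cpp
#include <algorithm>
#include <cstdint>

// Assumed layout matching the [eax], [eax+4], [eax+8] loads in the asm.
struct RGBColor { float r, g, b; };

// Hypothetical scalar equivalent of sse_ConvertRGBColorTo4ub.
uint32_t ConvertRGBColorTo4ub_scalar(const RGBColor& c)
{
    // clamp(255*C, 0, 255) == min(255, 255*max(C, 0)), as in the asm comment
    auto clamp255 = [](float x) {
        return static_cast<uint32_t>(std::min(255.0f, 255.0f * std::max(x, 0.0f)));
    };
    // alpha is forced to 0xff; the channel loaded from offset 8 lands in bits
    // 16..23 (shl 16), offset 4 in bits 8..15 (shl 8), offset 0 in bits 0..7
    return 0xff000000u | (clamp255(c.b) << 16) | (clamp255(c.g) << 8) | clamp255(c.r);
}
```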


@ -99,8 +99,7 @@
#include <cctype>
#include <cerrno>
#include <cfloat>
//#include <ciso646>
// defines e.g. "and" to "&". unnecessary and causes trouble with asm.
//#include <ciso646> // defines e.g. "and" to "&". unnecessary and causes trouble with asm.
#include <climits>
#include <clocale>
#include <cmath>


@ -0,0 +1,17 @@
; set section attributes
section .data data align=32 use32
section .bss bss align=16 use32
section .text code align=64 use32
; activate .text (needs to be separate because __SECT__ will otherwise
; complain that the above definition is redeclaring attributes)
section .text
; Usage:
; use sym(ia32_cap) instead of _ia32_cap - on relevant platforms, sym() will add
; the underlines automagically, on others it won't
%ifdef DONT_USE_UNDERLINE
%define sym(a) a
%else
%define sym(a) _ %+ a
%endif
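
On the C++ side, the routines defined in these .asm files are declared with plain extern "C" names; the underscore that sym() prepends (unless DONT_USE_UNDERLINE is defined) matches the symbol decoration cdecl uses on the platforms that need it. A hedged sketch of such declarations follows (the actual headers are not part of this diff, and the engine's u32 typedef is approximated with uint32_t):

```cpp
#include <cstddef>
#include <cstdint>

struct RGBColor; // defined elsewhere in the engine

// Hypothetical declarations: the linker resolves these against _ia32_memcpy /
// _sse_ConvertRGBColorTo4ub where underscore decoration applies, or against
// the plain names where DONT_USE_UNDERLINE is in effect.
extern "C" void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
extern "C" uint32_t sse_ConvertRGBColorTo4ub(const RGBColor& color);
```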


@ -17,382 +17,7 @@
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
; set section attributes
section .data data align=32 use32
section .bss bss align=16 use32
section .text code align=64 use32
; activate .text (needs to be separate because __SECT__ will otherwise
; complain that the above definition is redeclaring attributes)
section .text
; Usage:
; use sym(ia32_cap) instead of _ia32_cap - on relevant platforms, sym() will add
; the underlines automagically, on others it won't
%ifdef DONT_USE_UNDERLINE
%define sym(a) a
%else
%define sym(a) _ %+ a
%endif
;-------------------------------------------------------------------------------
; fast general memcpy
;-------------------------------------------------------------------------------
; drop-in replacement for libc memcpy(). only requires CPU support for
; MMX (by now universal). highly optimized for Athlon and Pentium III
; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
; for details, see accompanying article.
; if transfer size is at least this much,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD equ 64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD equ 256*1024
; maximum that can be copied by IC_TINY.
IC_TINY_MAX equ 63
; size of one block prefetch chunk.
BP_SIZE equ 8*1024
;------------------------------------------------------------------------------
; [p3] replicating this instead of jumping to it from tailN
; saves 1 clock and costs (7-2)*2 bytes code.
%macro EPILOG 0
pop esi
pop edi
mov eax, [esp+4] ; return dst
ret
%endm
align 64
tail1:
mov al, [esi+ecx*4]
mov [edi+ecx*4], al
align 4
tail0:
EPILOG
align 8
tail3:
; [p3] 2 reads followed by 2 writes is better than
; R/W interleaved and RRR/WWW
mov al, [esi+ecx*4+2]
mov [edi+ecx*4+2], al
; already aligned to 8 due to above code
tail2:
mov al, [esi+ecx*4]
mov dl, [esi+ecx*4+1]
mov [edi+ecx*4], al
mov [edi+ecx*4+1], dl
EPILOG
[section .data]
align 16
tail_table dd tail0, tail1, tail2, tail3
__SECT__
; 15x unrolled copy loop - transfers DWORDs backwards.
; indexed via table of 8-bit offsets.
; rationale:
; - [p3] backwards vs. forwards makes no difference.
; - MOV is faster than MOVSD.
; - index table is needed because calculating end-6*i is slower than
; a LUT and we wouldn't want to expand entries to 8 bytes
; (that'd increase code footprint by 30 bytes)
; - a byte index accessed via MOVZX is better due to less dcache usage.
; - only unrolling 8x and 'reentering' the loop is possible but
; slower due to fiddling with esi/ecx.
align 64
unrolled_copy_code_start:
%assign i 15
%rep 14 ; 15 entries, 1 base case handled below
uc_ %+ i:
mov eax, [esi+i*4-4]
mov [edi+i*4-4], eax
%assign i i-1
%endrep
; base case: no displacement needed; skip it so that code will
; be aligned to 8 bytes after this.
uc_1:
mov eax, [esi]
mov [edi], eax
uc_0:
jmp [tail_table+edx*4]
[section .data]
align 32
unrolled_copy_index_table:
%assign i 0
%rep 16
db (uc_ %+ i) - unrolled_copy_code_start
%assign i i+1
%endrep
__SECT__
;------------------------------------------------------------------------------
; tiny copy - handles all cases smaller than IC_MOVQ's 64 byte lower limit.
; > edx = number of bytes (<= IC_TINY_MAX)
; < does not return.
; x eax, ecx, edx
%macro IC_TINY 0
mov ecx, edx
shr ecx, 2
; calculating this address isn't possible due to skipping displacement on uc1;
; even so, it'd require calculating -6*ecx, which is slower than LUT.
movzx eax, byte [unrolled_copy_index_table+ecx]
and edx, byte 3
add eax, unrolled_copy_code_start
jmp eax
; never reached! the unrolled loop jumps into tailN, which
; then returns from the memcpy function.
%endm
;------------------------------------------------------------------------------
; align destination address to multiple of 8. important for large transfers,
; but doesn't affect the tiny technique.
; > esi, edi -> buffers (updated)
; > ecx, edx = transfer size (updated)
; x eax
%macro IC_ALIGN 0
mov eax, edi
and eax, byte 7 ; eax = # misaligned bytes
jz already_aligned ; early out
lea eax, [align_table_start+eax*2]
jmp eax
; [p3] this is no slower than a table of mov and much smaller/simpler
align 8
align_table_start:
%rep 8
dec ecx
movsb
%endrep
mov edx, ecx
already_aligned:
%endm
;------------------------------------------------------------------------------
; MMX MOVQ technique. used for in-cache transfers of 64B..64KiB.
; must run on all CPUs, i.e. cannot use the SSE prefetchnta instruction.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro IC_MOVQ 0
align 16
%%loop:
; notes:
; - we can't use prefetch here - this codepath must support all CPUs.
; [p3] that makes us 5..15% slower on 1KiB..4KiB transfers.
; - [p3] simple addressing without +ecx is 3.5% faster.
; - difference between RR/WW/RR/WW and R..R/W..W:
; [p3] none (if simple addressing)
; [axp] interleaved is better (with +ecx addressing)
; - enough time elapses between first and third pair of reads that we
; could reuse MM0. there is no performance gain either way and
; differing displacements make code compression futile anyway, so
; we'll just use MM4..7 for clarity.
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq [edi+ecx], mm0
movq [edi+ecx+8], mm1
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq [edi+ecx+16], mm2
movq [edi+ecx+24], mm3
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq [edi+ecx+32], mm4
movq [edi+ecx+40], mm5
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movq [edi+ecx+48], mm6
movq [edi+ecx+56], mm7
add ecx, byte 64
jnz %%loop
%endm
;------------------------------------------------------------------------------
; SSE MOVNTQ technique. used for transfers that do not fit in L1,
; i.e. 64KiB..192KiB. requires Pentium III or Athlon; caller checks for this.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro UC_MOVNTQ 0
align 16
%%loop:
; notes:
; - the AMD optimization manual recommends prefetch distances according to
; (200*BytesPerIter/ClocksPerIter+192), which comes out to ~560 here.
; [p3] rounding down to 512 bytes makes for significant gains.
; - [p3] complex addressing with ecx is 1% faster than adding to esi/edi.
prefetchnta [esi+ecx+512]
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movntq [edi+ecx], mm0
movntq [edi+ecx+8], mm1
movntq [edi+ecx+16], mm2
movntq [edi+ecx+24], mm3
movntq [edi+ecx+32], mm4
movntq [edi+ecx+40], mm5
movntq [edi+ecx+48], mm6
movntq [edi+ecx+56], mm7
add ecx, byte 64
jnz %%loop
%endm
;------------------------------------------------------------------------------
; block prefetch technique. used for transfers that do not fit in L2,
; i.e. > 192KiB. requires Pentium III or Athlon; caller checks for this.
; for theory behind this, see article.
; > ecx = -number_of_bytes (multiple of 64, <= -BP_SIZE)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = -remaining_bytes (multiple of 64, > -BP_SIZE)
; < eax = 0
%macro UC_BP_MOVNTQ 0
push edx
align 4
%%prefetch_and_copy_chunk:
; pull chunk into cache by touching each cache line
; (in reverse order to prevent HW prefetches)
mov eax, BP_SIZE/128 ; # iterations
add esi, BP_SIZE
align 16
%%prefetch_loop:
mov edx, [esi+ecx-64]
mov edx, [esi+ecx-128]
add esi, byte -128
dec eax
jnz %%prefetch_loop
; copy chunk in 64 byte pieces
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
align 16
%%copy_loop:
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movntq [edi+ecx], mm0
movntq [edi+ecx+8], mm1
movntq [edi+ecx+16], mm2
movntq [edi+ecx+24], mm3
movntq [edi+ecx+32], mm4
movntq [edi+ecx+40], mm5
movntq [edi+ecx+48], mm6
movntq [edi+ecx+56], mm7
add ecx, byte 64
dec eax
jnz %%copy_loop
; if enough data left, process next chunk
cmp ecx, -BP_SIZE
jle %%prefetch_and_copy_chunk
pop edx
%endm
;------------------------------------------------------------------------------
; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
; drop-in replacement for libc memcpy() (returns dst)
global sym(ia32_memcpy)
align 64
sym(ia32_memcpy):
push edi
push esi
mov ecx, [esp+8+4+8] ; nbytes
mov edi, [esp+8+4+0] ; dst
mov esi, [esp+8+4+4] ; src
mov edx, ecx
cmp ecx, byte IC_TINY_MAX
ja choose_larger_method
ic_tiny:
IC_TINY
; never reached - IC_TINY contains memcpy function epilog code
choose_larger_method:
IC_ALIGN
; setup:
; eax = number of 64 byte chunks, or 0 if CPU doesn't support SSE.
; used to choose copy technique.
; ecx = -number_of_bytes, multiple of 64. we jump to ic_tiny if
; there's not enough left for a single 64 byte chunk, which can
; happen on unaligned 64..71 byte transfers due to IC_ALIGN.
; edx = number of remainder bytes after qwords have been copied;
; will be handled by IC_TINY.
; esi and edi point to end of the respective buffers (more precisely,
; to buffer_start-ecx). this together with the ecx convention means
; we only need one loop counter (instead of having to advance
; that and esi/edi).
; this mask is applied to the transfer size. the 2 specialized copy techniques
; that use SSE are jumped to if size is greater than a threshold.
; we simply set the requested transfer size to 0 if the CPU doesn't
; support SSE so that those are never reached (done by masking with this).
extern sym(ia32_memcpy_size_mask)
mov eax, [sym(ia32_memcpy_size_mask)]
and ecx, byte ~IC_TINY_MAX
jz ic_tiny ; < 64 bytes left (due to IC_ALIGN)
add esi, ecx
add edi, ecx
and edx, byte IC_TINY_MAX
and eax, ecx
neg ecx
cmp eax, BP_THRESHOLD
jae near uc_bp_movntq
cmp eax, UC_THRESHOLD
jae uc_movntq
ic_movq:
IC_MOVQ
emms
jmp ic_tiny
uc_movntq:
UC_MOVNTQ
sfence
emms
jmp ic_tiny
uc_bp_movntq:
UC_BP_MOVNTQ
sfence
cmp ecx, byte -(IC_TINY_MAX+1)
jle ic_movq
emms
jmp ic_tiny
%include "ia32.inc"
;-------------------------------------------------------------------------------
; CPUID support
@ -657,53 +282,3 @@ sym(ia32_asm_init):
pop ebx
ret
;-------------------------------------------------------------------------------
; Color conversion (SSE)
;-------------------------------------------------------------------------------
; extern "C" u32 ConvertRGBColorTo4ub(const RGBColor& color)
[section .data]
align 16
zero:
dd 0.0
twofivefive:
dd 255.0
__SECT__
align 16
global sym(sse_ConvertRGBColorTo4ub)
sym(sse_ConvertRGBColorTo4ub):
mov eax, [esp+4]
; xmm0, 1, 2 = R, G, B
movss xmm4, [zero]
movss xmm0, [eax+8]
movss xmm1, [eax+4]
movss xmm2, [eax]
movss xmm5, [twofivefive]
; C = min(255, 255*max(C, 0)) ( == clamp(255*C, 0, 255) )
maxss xmm0, xmm4
maxss xmm1, xmm4
maxss xmm2, xmm4
mulss xmm0, xmm5
mulss xmm1, xmm5
mulss xmm2, xmm5
minss xmm0, xmm5
minss xmm1, xmm5
minss xmm2, xmm5
; convert to integer and combine channels using bit logic
cvtss2si eax, xmm0
cvtss2si ecx, xmm1
cvtss2si edx, xmm2
shl eax, 16
shl ecx, 8
or eax, 0xff000000
or edx, ecx
or eax, edx
ret


@ -0,0 +1,374 @@
; =========================================================================
; File : ia32_memcpy.asm
; Project : 0 A.D.
; Description : highly optimized memory copy.
;
; @author Jan.Wassenberg@stud.uni-karlsruhe.de
; =========================================================================
; Copyright (c) 2004-2005 Jan Wassenberg
;
; Redistribution and/or modification are also permitted under the
; terms of the GNU General Public License as published by the
; Free Software Foundation (version 2 or later, at your option).
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
%include "ia32.inc"
; drop-in replacement for libc memcpy(). only requires CPU support for
; MMX (by now universal). highly optimized for Athlon and Pentium III
; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
; for details, see accompanying article.
; if transfer size is at least this much,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD equ 64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD equ 256*1024
; maximum that can be copied by IC_TINY.
IC_TINY_MAX equ 63
; size of one block prefetch chunk.
BP_SIZE equ 8*1024
;------------------------------------------------------------------------------
; [p3] replicating this instead of jumping to it from tailN
; saves 1 clock and costs (7-2)*2 bytes code.
%macro EPILOG 0
pop esi
pop edi
mov eax, [esp+4] ; return dst
ret
%endm
align 64
tail1:
mov al, [esi+ecx*4]
mov [edi+ecx*4], al
align 4
tail0:
EPILOG
align 8
tail3:
; [p3] 2 reads followed by 2 writes is better than
; R/W interleaved and RRR/WWW
mov al, [esi+ecx*4+2]
mov [edi+ecx*4+2], al
; already aligned to 8 due to above code
tail2:
mov al, [esi+ecx*4]
mov dl, [esi+ecx*4+1]
mov [edi+ecx*4], al
mov [edi+ecx*4+1], dl
EPILOG
[section .data]
align 16
tail_table dd tail0, tail1, tail2, tail3
__SECT__
; 15x unrolled copy loop - transfers DWORDs backwards.
; indexed via table of 8-bit offsets.
; rationale:
; - [p3] backwards vs. forwards makes no difference.
; - MOV is faster than MOVSD.
; - index table is needed because calculating end-6*i is slower than
; a LUT and we wouldn't want to expand entries to 8 bytes
; (that'd increase code footprint by 30 bytes)
; - a byte index accessed via MOVZX is better due to less dcache usage.
; - only unrolling 8x and 'reentering' the loop is possible but
; slower due to fiddling with esi/ecx.
align 64
unrolled_copy_code_start:
%assign i 15
%rep 14 ; 15 entries, 1 base case handled below
uc_ %+ i:
mov eax, [esi+i*4-4]
mov [edi+i*4-4], eax
%assign i i-1
%endrep
; base case: no displacement needed; skip it so that code will
; be aligned to 8 bytes after this.
uc_1:
mov eax, [esi]
mov [edi], eax
uc_0:
jmp [tail_table+edx*4]
[section .data]
align 32
unrolled_copy_index_table:
%assign i 0
%rep 16
db (uc_ %+ i) - unrolled_copy_code_start
%assign i i+1
%endrep
__SECT__
;------------------------------------------------------------------------------
; tiny copy - handles all cases smaller than IC_MOVQ's 64 byte lower limit.
; > edx = number of bytes (<= IC_TINY_MAX)
; < does not return.
; x eax, ecx, edx
%macro IC_TINY 0
mov ecx, edx
shr ecx, 2
; calculating this address isn't possible due to skipping displacement on uc1;
; even so, it'd require calculating -6*ecx, which is slower than LUT.
movzx eax, byte [unrolled_copy_index_table+ecx]
and edx, byte 3
add eax, unrolled_copy_code_start
jmp eax
; never reached! the unrolled loop jumps into tailN, which
; then returns from the memcpy function.
%endm
;------------------------------------------------------------------------------
; align destination address to multiple of 8. important for large transfers,
; but doesn't affect the tiny technique.
; > esi, edi -> buffers (updated)
; > ecx, edx = transfer size (updated)
; x eax
%macro IC_ALIGN 0
mov eax, edi
and eax, byte 7 ; eax = # misaligned bytes
jz already_aligned ; early out
lea eax, [align_table_start+eax*2]
jmp eax
; [p3] this is no slower than a table of mov and much smaller/simpler
align 8
align_table_start:
%rep 8
dec ecx
movsb
%endrep
mov edx, ecx
already_aligned:
%endm
;------------------------------------------------------------------------------
; MMX MOVQ technique. used for in-cache transfers of 64B..64KiB.
; must run on all CPUs, i.e. cannot use the SSE prefetchnta instruction.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro IC_MOVQ 0
align 16
%%loop:
; notes:
; - we can't use prefetch here - this codepath must support all CPUs.
; [p3] that makes us 5..15% slower on 1KiB..4KiB transfers.
; - [p3] simple addressing without +ecx is 3.5% faster.
; - difference between RR/WW/RR/WW and R..R/W..W:
; [p3] none (if simple addressing)
; [axp] interleaved is better (with +ecx addressing)
; - enough time elapses between first and third pair of reads that we
; could reuse MM0. there is no performance gain either way and
; differing displacements make code compression futile anyway, so
; we'll just use MM4..7 for clarity.
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq [edi+ecx], mm0
movq [edi+ecx+8], mm1
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq [edi+ecx+16], mm2
movq [edi+ecx+24], mm3
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq [edi+ecx+32], mm4
movq [edi+ecx+40], mm5
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movq [edi+ecx+48], mm6
movq [edi+ecx+56], mm7
add ecx, byte 64
jnz %%loop
%endm
;------------------------------------------------------------------------------
; SSE MOVNTQ technique. used for transfers that do not fit in L1,
; i.e. 64KiB..192KiB. requires Pentium III or Athlon; caller checks for this.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro UC_MOVNTQ 0
align 16
%%loop:
; notes:
; - the AMD optimization manual recommends prefetch distances according to
; (200*BytesPerIter/ClocksPerIter+192), which comes out to ~560 here.
; [p3] rounding down to 512 bytes makes for significant gains.
; - [p3] complex addressing with ecx is 1% faster than adding to esi/edi.
prefetchnta [esi+ecx+512]
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movntq [edi+ecx], mm0
movntq [edi+ecx+8], mm1
movntq [edi+ecx+16], mm2
movntq [edi+ecx+24], mm3
movntq [edi+ecx+32], mm4
movntq [edi+ecx+40], mm5
movntq [edi+ecx+48], mm6
movntq [edi+ecx+56], mm7
add ecx, byte 64
jnz %%loop
%endm
;------------------------------------------------------------------------------
; block prefetch technique. used for transfers that do not fit in L2,
; i.e. > 192KiB. requires Pentium III or Athlon; caller checks for this.
; for theory behind this, see article.
; > ecx = -number_of_bytes (multiple of 64, <= -BP_SIZE)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = -remaining_bytes (multiple of 64, > -BP_SIZE)
; < eax = 0
%macro UC_BP_MOVNTQ 0
push edx
align 4
%%prefetch_and_copy_chunk:
; pull chunk into cache by touching each cache line
; (in reverse order to prevent HW prefetches)
mov eax, BP_SIZE/128 ; # iterations
add esi, BP_SIZE
align 16
%%prefetch_loop:
mov edx, [esi+ecx-64]
mov edx, [esi+ecx-128]
add esi, byte -128
dec eax
jnz %%prefetch_loop
; copy chunk in 64 byte pieces
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
align 16
%%copy_loop:
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movntq [edi+ecx], mm0
movntq [edi+ecx+8], mm1
movntq [edi+ecx+16], mm2
movntq [edi+ecx+24], mm3
movntq [edi+ecx+32], mm4
movntq [edi+ecx+40], mm5
movntq [edi+ecx+48], mm6
movntq [edi+ecx+56], mm7
add ecx, byte 64
dec eax
jnz %%copy_loop
; if enough data left, process next chunk
cmp ecx, -BP_SIZE
jle %%prefetch_and_copy_chunk
pop edx
%endm
;------------------------------------------------------------------------------
; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
; drop-in replacement for libc memcpy() (returns dst)
global sym(ia32_memcpy)
align 64
sym(ia32_memcpy):
push edi
push esi
mov ecx, [esp+8+4+8] ; nbytes
mov edi, [esp+8+4+0] ; dst
mov esi, [esp+8+4+4] ; src
mov edx, ecx
cmp ecx, byte IC_TINY_MAX
ja choose_larger_method
ic_tiny:
IC_TINY
; never reached - IC_TINY contains memcpy function epilog code
choose_larger_method:
IC_ALIGN
; setup:
; eax = number of 64 byte chunks, or 0 if CPU doesn't support SSE.
; used to choose copy technique.
; ecx = -number_of_bytes, multiple of 64. we jump to ic_tiny if
; there's not enough left for a single 64 byte chunk, which can
; happen on unaligned 64..71 byte transfers due to IC_ALIGN.
; edx = number of remainder bytes after qwords have been copied;
; will be handled by IC_TINY.
; esi and edi point to end of the respective buffers (more precisely,
; to buffer_start-ecx). this together with the ecx convention means
; we only need one loop counter (instead of having to advance
; that and esi/edi).
; this mask is applied to the transfer size. the 2 specialized copy techniques
; that use SSE are jumped to if size is greater than a threshold.
; we simply set the requested transfer size to 0 if the CPU doesn't
; support SSE so that those are never reached (done by masking with this).
extern sym(ia32_memcpy_size_mask)
mov eax, [sym(ia32_memcpy_size_mask)]
and ecx, byte ~IC_TINY_MAX
jz ic_tiny ; < 64 bytes left (due to IC_ALIGN)
add esi, ecx
add edi, ecx
and edx, byte IC_TINY_MAX
and eax, ecx
neg ecx
cmp eax, BP_THRESHOLD
jae near uc_bp_movntq
cmp eax, UC_THRESHOLD
jae uc_movntq
ic_movq:
IC_MOVQ
emms
jmp ic_tiny
uc_movntq:
UC_MOVNTQ
sfence
emms
jmp ic_tiny
uc_bp_movntq:
UC_BP_MOVNTQ
sfence
cmp ecx, byte -(IC_TINY_MAX+1)
jle ic_movq
emms
jmp ic_tiny
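
To recap the technique selection documented in the comments above, here is a hedged C++ sketch of how ia32_memcpy picks a copy path based on transfer size. choose_technique and sse_size_mask are illustrative names only; in the asm, ia32_memcpy_size_mask is zero on CPUs without SSE so the MOVNTQ paths are never taken, and any sub-64-byte remainder is finished by IC_TINY.

```cpp
#include <cstddef>

// Thresholds taken from the .asm above.
const size_t IC_TINY_MAX  = 63;          // largest transfer IC_TINY handles by itself
const size_t UC_THRESHOLD = 64 * 1024;   // too big for L1: use non-temporal stores
const size_t BP_THRESHOLD = 256 * 1024;  // also blows L2: add block prefetch

enum CopyTechnique { TINY, MMX_MOVQ, SSE_MOVNTQ, SSE_BP_MOVNTQ };

// Illustrative dispatch mirroring the branches in sym(ia32_memcpy).
// sse_size_mask stands in for ia32_memcpy_size_mask: all-ones if SSE is
// available, zero otherwise, so non-SSE CPUs always stay on the MMX path.
CopyTechnique choose_technique(size_t nbytes, size_t sse_size_mask)
{
    if (nbytes <= IC_TINY_MAX)
        return TINY;                     // unrolled DWORD copy + byte tail
    const size_t masked = (nbytes & ~IC_TINY_MAX) & sse_size_mask;
    if (masked >= BP_THRESHOLD)
        return SSE_BP_MOVNTQ;            // UC_BP_MOVNTQ: block prefetch + movntq
    if (masked >= UC_THRESHOLD)
        return SSE_MOVNTQ;               // UC_MOVNTQ: prefetchnta + movntq
    return MMX_MOVQ;                     // IC_MOVQ: plain movq, runs on any CPU
}
```

The `& ~IC_TINY_MAX` mirrors the asm's `and ecx, byte ~IC_TINY_MAX`: only whole 64-byte chunks count toward the thresholds, and the leftover bytes go back through the tiny path afterwards.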