# improvements to build system for asm files

split ia32_asm code up into memcpy and color parts. premake: add (Windows-only so far) support for NASM include paths, required when using %include; see rationale in vs.c. refs #124.

This was SVN commit r4039.
parent f2f4ff5fbe
commit aeed96dafa
source/graphics/Color_asm.asm (new file, 50 lines)
@@ -0,0 +1,50 @@
%include "../lib/sysdep/ia32.inc"

;-------------------------------------------------------------------------------
; Color conversion (SSE)
;-------------------------------------------------------------------------------

; extern "C" u32 ConvertRGBColorTo4ub(const RGBColor& color)
[section .data]
align 16
zero:
    dd 0.0
twofivefive:
    dd 255.0

__SECT__
align 16
global sym(sse_ConvertRGBColorTo4ub)
sym(sse_ConvertRGBColorTo4ub):
    mov eax, [esp+4]

    ; xmm0, 1, 2 = R, G, B
    movss xmm4, [zero]
    movss xmm0, [eax+8]
    movss xmm1, [eax+4]
    movss xmm2, [eax]
    movss xmm5, [twofivefive]

    ; C = min(255, 255*max(C, 0)) ( == clamp(255*C, 0, 255) )
    maxss xmm0, xmm4
    maxss xmm1, xmm4
    maxss xmm2, xmm4
    mulss xmm0, xmm5
    mulss xmm1, xmm5
    mulss xmm2, xmm5
    minss xmm0, xmm5
    minss xmm1, xmm5
    minss xmm2, xmm5

    ; convert to integer and combine channels using bit logic
    cvtss2si eax, xmm0
    cvtss2si ecx, xmm1
    cvtss2si edx, xmm2
    shl eax, 16
    shl ecx, 8
    or eax, 0xff000000
    or edx, ecx
    or eax, edx

    ret
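For reference, the routine above is the SSE form of a simple clamp-and-pack; a scalar C++ sketch of the same logic (illustrative only - RGBColor is taken here as three floats in memory order, matching the [eax], [eax+4], [eax+8] loads, and the channel naming follows the asm comment):

```cpp
#include <algorithm>
#include <cstdint>

// Scalar sketch of sse_ConvertRGBColorTo4ub (not the shipped code).
// c points at the three floats of the RGBColor in memory order c[0..2].
static uint32_t ConvertRGBColorTo4ub_ref(const float* c)
{
    // C = min(255, 255*max(C, 0))  ==  clamp(255*C, 0, 255)
    // note: the asm's cvtss2si rounds to nearest, a cast truncates - close enough for a sketch
    auto to_byte = [](float f) -> uint32_t {
        return static_cast<uint32_t>(std::min(255.0f, 255.0f * std::max(f, 0.0f)));
    };
    // 0xff alpha in the top byte, then the three channels, mirroring the shl/or sequence
    return 0xff000000u | (to_byte(c[2]) << 16) | (to_byte(c[1]) << 8) | to_byte(c[0]);
}
```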
@@ -99,8 +99,7 @@
 #include <cctype>
 #include <cerrno>
 #include <cfloat>
-//#include <ciso646>
-// defines e.g. "and" to "&". unnecessary and causes trouble with asm.
+//#include <ciso646> // defines e.g. "and" to "&". unnecessary and causes trouble with asm.
 #include <climits>
 #include <clocale>
 #include <cmath>
source/lib/sysdep/ia32.inc (new file, 17 lines)
@@ -0,0 +1,17 @@
; set section attributes
section .data data align=32 use32
section .bss bss align=16 use32
section .text code align=64 use32
; activate .text (needs to be separate because __SECT__ will otherwise
; complain that the above definition is redeclaring attributes)
section .text

; Usage:
; use sym(ia32_cap) instead of _ia32_cap - on relevant platforms, sym() will add
; the underlines automagically, on others it won't
%ifdef DONT_USE_UNDERLINE
%define sym(a) a
%else
%define sym(a) _ %+ a
%endif
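On the C++ side these symbols are simply declared extern "C"; sym() only supplies the leading underscore that some platforms' C ABIs prepend (builds for the other platforms would define DONT_USE_UNDERLINE on the NASM command line, e.g. with -D). A hedged illustration - the project's actual headers may declare these differently:

```cpp
#include <cstddef>

extern "C" {
    // defined in ia32_memcpy.asm as sym(ia32_memcpy), i.e. _ia32_memcpy or ia32_memcpy
    void* ia32_memcpy(void* dst, const void* src, size_t nbytes);

    // defined in Color_asm.asm as sym(sse_ConvertRGBColorTo4ub); the real prototype
    // takes const RGBColor& - shown here with const float* to stay self-contained
    unsigned int sse_ConvertRGBColorTo4ub(const float* color);
}
```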
@@ -17,382 +17,7 @@
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

[removed lines omitted here: the section-attribute setup and the sym() macro, moved verbatim
into source/lib/sysdep/ia32.inc above, and the fast general memcpy implementation, moved
verbatim into source/lib/sysdep/ia32_memcpy.asm below]

%include "ia32.inc"

;-------------------------------------------------------------------------------
; CPUID support
@@ -657,53 +282,3 @@ sym(ia32_asm_init):

    pop ebx
    ret

[removed lines omitted here: the "Color conversion (SSE)" routine sse_ConvertRGBColorTo4ub,
moved verbatim into source/graphics/Color_asm.asm above]
source/lib/sysdep/ia32_memcpy.asm (new file, 374 lines)
@@ -0,0 +1,374 @@
; =========================================================================
; File        : ia32_memcpy.asm
; Project     : 0 A.D.
; Description : highly optimized memory copy.
;
; @author Jan.Wassenberg@stud.uni-karlsruhe.de
; =========================================================================

; Copyright (c) 2004-2005 Jan Wassenberg
;
; Redistribution and/or modification are also permitted under the
; terms of the GNU General Public License as published by the
; Free Software Foundation (version 2 or later, at your option).
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

%include "ia32.inc"

; drop-in replacement for libc memcpy(). only requires CPU support for
; MMX (by now universal). highly optimized for Athlon and Pentium III
; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
; for details, see accompanying article.

; if transfer size is at least this much,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD equ 64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD equ 256*1024

; maximum that can be copied by IC_TINY.
IC_TINY_MAX equ 63

; size of one block prefetch chunk.
BP_SIZE equ 8*1024

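In rough C++ terms, these constants drive the technique selection performed by sym(ia32_memcpy) further down. A sketch of that selection only - it ignores the destination-alignment prologue and the SSE capability mask, and stands each technique in with std::memcpy:

```cpp
#include <cstring>   // std::memcpy stands in for the asm techniques

void* ia32_memcpy_dispatch_sketch(void* dst, const void* src, size_t nbytes)
{
    const size_t IC_TINY_MAX  = 63;          // at most this much goes through IC_TINY
    const size_t UC_THRESHOLD = 64 * 1024;   // at least this much no longer fits in L1
    const size_t BP_THRESHOLD = 256 * 1024;  // at least this much also blows L2

    if (nbytes <= IC_TINY_MAX)
        std::memcpy(dst, src, nbytes);       // IC_TINY: unrolled dword copy plus 0..3 tail bytes
    else if (nbytes < UC_THRESHOLD)
        std::memcpy(dst, src, nbytes);       // IC_MOVQ: in-cache MMX copy, 64 bytes per iteration
    else if (nbytes < BP_THRESHOLD)
        std::memcpy(dst, src, nbytes);       // UC_MOVNTQ: prefetchnta + movntq streaming copy
    else
        std::memcpy(dst, src, nbytes);       // UC_BP_MOVNTQ: 8 KiB block prefetch, then stream out
    return dst;                              // like libc memcpy, dst is returned
}
```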
;------------------------------------------------------------------------------

; [p3] replicating this instead of jumping to it from tailN
; saves 1 clock and costs (7-2)*2 bytes code.
%macro EPILOG 0
    pop esi
    pop edi
    mov eax, [esp+4] ; return dst
    ret
%endm

align 64
tail1:
    mov al, [esi+ecx*4]
    mov [edi+ecx*4], al
align 4
tail0:
    EPILOG

align 8
tail3:
    ; [p3] 2 reads followed by 2 writes is better than
    ; R/W interleaved and RRR/WWW
    mov al, [esi+ecx*4+2]
    mov [edi+ecx*4+2], al
    ; already aligned to 8 due to above code
tail2:
    mov al, [esi+ecx*4]
    mov dl, [esi+ecx*4+1]
    mov [edi+ecx*4], al
    mov [edi+ecx*4+1], dl
    EPILOG

[section .data]
align 16
tail_table dd tail0, tail1, tail2, tail3
__SECT__

; 15x unrolled copy loop - transfers DWORDs backwards.
; indexed via table of 8-bit offsets.
; rationale:
; - [p3] backwards vs. forwards makes no difference.
; - MOV is faster than MOVSD.
; - index table is needed because calculating end-6*i is slower than
;   a LUT and we wouldn't want to expand entries to 8 bytes
;   (that'd increase code footprint by 30 bytes)
; - a byte index accessed via MOVZX is better due to less dcache usage.
; - only unrolling 8x and 'reentering' the loop is possible but
;   slower due to fiddling with esi/ecx.
align 64
unrolled_copy_code_start:
%assign i 15
%rep 14 ; 15 entries, 1 base case handled below
uc_ %+ i:
    mov eax, [esi+i*4-4]
    mov [edi+i*4-4], eax
%assign i i-1
%endrep
; base case: no displacement needed; skip it so that code will
; be aligned to 8 bytes after this.
uc_1:
    mov eax, [esi]
    mov [edi], eax
uc_0:
    jmp [tail_table+edx*4]

[section .data]
align 32
unrolled_copy_index_table:
%assign i 0
%rep 16
    db (uc_ %+ i) - unrolled_copy_code_start
%assign i i+1
%endrep
__SECT__


;------------------------------------------------------------------------------
; tiny copy - handles all cases smaller than IC_MOVQ's 64 byte lower limit.
; > edx = number of bytes (< IC_TINY_MAX)
; < does not return.
; x eax, ecx, edx
%macro IC_TINY 0
    mov ecx, edx
    shr ecx, 2
    ; calculating this address isn't possible due to skipping displacement on uc1;
    ; even so, it'd require calculating -6*ecx, which is slower than LUT.
    movzx eax, byte [unrolled_copy_index_table+ecx]
    and edx, byte 3
    add eax, unrolled_copy_code_start
    jmp eax
    ; never reached! the unrolled loop jumps into tailN, which
    ; then returns from the memcpy function.
%endm


;------------------------------------------------------------------------------
; align destination address to multiple of 8. important for large transfers,
; but doesn't affect the tiny technique.
; > esi, edi -> buffers (updated)
; > ecx, edx = transfer size (updated)
; x eax
%macro IC_ALIGN 0
    mov eax, edi
    and eax, byte 7 ; eax = # misaligned bytes
    jz already_aligned ; early out
    lea eax, [align_table_start+eax*2]
    jmp eax

    ; [p3] this is no slower than a table of mov and much smaller/simpler
align 8
align_table_start:
%rep 8
    dec ecx
    movsb
%endrep
    mov edx, ecx
already_aligned:
%endm


;------------------------------------------------------------------------------
; MMX MOVQ technique. used for in-cache transfers of 64B..64KiB.
; must run on all CPUs, i.e. cannot use the SSE prefetchnta instruction.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro IC_MOVQ 0

align 16
%%loop:

    ; notes:
    ; - we can't use prefetch here - this codepath must support all CPUs.
    ;   [p3] that makes us 5..15% slower on 1KiB..4KiB transfers.
    ; - [p3] simple addressing without +ecx is 3.5% faster.
    ; - difference between RR/WW/RR/WW and R..R/W..W:
    ;   [p3] none (if simple addressing)
    ;   [axp] interleaved is better (with +ecx addressing)
    ; - enough time elapses between first and third pair of reads that we
    ;   could reuse MM0. there is no performance gain either way and
    ;   differing displacements make code compression futile anyway, so
    ;   we'll just use MM4..7 for clarity.
    movq mm0, [esi+ecx]
    movq mm1, [esi+ecx+8]
    movq [edi+ecx], mm0
    movq [edi+ecx+8], mm1
    movq mm2, [esi+ecx+16]
    movq mm3, [esi+ecx+24]
    movq [edi+ecx+16], mm2
    movq [edi+ecx+24], mm3
    movq mm4, [esi+ecx+32]
    movq mm5, [esi+ecx+40]
    movq [edi+ecx+32], mm4
    movq [edi+ecx+40], mm5
    movq mm6, [esi+ecx+48]
    movq mm7, [esi+ecx+56]
    movq [edi+ecx+48], mm6
    movq [edi+ecx+56], mm7
    add ecx, byte 64
    jnz %%loop
%endm


;------------------------------------------------------------------------------
; SSE MOVNTQ technique. used for transfers that do not fit in L1,
; i.e. 64KiB..192KiB. requires Pentium III or Athlon; caller checks for this.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro UC_MOVNTQ 0

align 16
%%loop:
    ; notes:
    ; - the AMD optimization manual recommends prefetch distances according to
    ;   (200*BytesPerIter/ClocksPerIter+192), which comes out to ~560 here.
    ;   [p3] rounding down to 512 bytes makes for significant gains.
    ; - [p3] complex addressing with ecx is 1% faster than adding to esi/edi.
    prefetchnta [esi+ecx+512]
    movq mm0, [esi+ecx]
    movq mm1, [esi+ecx+8]
    movq mm2, [esi+ecx+16]
    movq mm3, [esi+ecx+24]
    movq mm4, [esi+ecx+32]
    movq mm5, [esi+ecx+40]
    movq mm6, [esi+ecx+48]
    movq mm7, [esi+ecx+56]
    movntq [edi+ecx], mm0
    movntq [edi+ecx+8], mm1
    movntq [edi+ecx+16], mm2
    movntq [edi+ecx+24], mm3
    movntq [edi+ecx+32], mm4
    movntq [edi+ecx+40], mm5
    movntq [edi+ecx+48], mm6
    movntq [edi+ecx+56], mm7
    add ecx, byte 64
    jnz %%loop
%endm

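The prefetchnta/movntq pattern above can also be expressed with compiler intrinsics; a hedged C++ sketch of the same streaming loop (32-bit x86 with MMX/SSE intrinsics assumed; the committed code stays in hand-written asm):

```cpp
#include <cstddef>
#include <mmintrin.h>   // __m64, _mm_empty (MMX)
#include <xmmintrin.h>  // _mm_prefetch, _mm_stream_pi, _mm_sfence (SSE)

// Streaming copy of nbytes (multiple of 64), same idea as UC_MOVNTQ (sketch only).
static void stream_copy_sketch(void* dst, const void* src, size_t nbytes)
{
    const __m64* s = static_cast<const __m64*>(src);
    __m64* d = static_cast<__m64*>(dst);
    for (size_t q = 0; q < nbytes / 8; q += 8)
    {
        // prefetch ~512 bytes ahead into a non-temporal way, as the asm does
        _mm_prefetch(reinterpret_cast<const char*>(s + q) + 512, _MM_HINT_NTA);
        for (size_t j = 0; j < 8; ++j)           // 8 qwords = 64 bytes per iteration
            _mm_stream_pi(d + q + j, s[q + j]);  // movntq: store bypassing the cache
    }
    _mm_sfence();   // flush the write-combining buffers
    _mm_empty();    // emms: leave MMX state before returning to FP code
}
```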

;------------------------------------------------------------------------------
; block prefetch technique. used for transfers that do not fit in L2,
; i.e. > 192KiB. requires Pentium III or Athlon; caller checks for this.
; for theory behind this, see article.
; > ecx = -number_of_bytes (multiple of 64, <= -BP_SIZE)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = -remaining_bytes (multiple of 64, > -BP_SIZE)
; < eax = 0
%macro UC_BP_MOVNTQ 0
    push edx

align 4
%%prefetch_and_copy_chunk:
    ; pull chunk into cache by touching each cache line
    ; (in reverse order to prevent HW prefetches)
    mov eax, BP_SIZE/128 ; # iterations
    add esi, BP_SIZE
align 16
%%prefetch_loop:
    mov edx, [esi+ecx-64]
    mov edx, [esi+ecx-128]
    add esi, byte -128
    dec eax
    jnz %%prefetch_loop

    ; copy chunk in 64 byte pieces
    mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
align 16
%%copy_loop:
    movq mm0, [esi+ecx]
    movq mm1, [esi+ecx+8]
    movq mm2, [esi+ecx+16]
    movq mm3, [esi+ecx+24]
    movq mm4, [esi+ecx+32]
    movq mm5, [esi+ecx+40]
    movq mm6, [esi+ecx+48]
    movq mm7, [esi+ecx+56]
    movntq [edi+ecx], mm0
    movntq [edi+ecx+8], mm1
    movntq [edi+ecx+16], mm2
    movntq [edi+ecx+24], mm3
    movntq [edi+ecx+32], mm4
    movntq [edi+ecx+40], mm5
    movntq [edi+ecx+48], mm6
    movntq [edi+ecx+56], mm7

    add ecx, byte 64
    dec eax
    jnz %%copy_loop

    ; if enough data left, process next chunk
    cmp ecx, -BP_SIZE
    jle %%prefetch_and_copy_chunk

    pop edx
%endm

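The block prefetch pass above warms an 8 KiB chunk by loading one word from every 64-byte cache line, walking backwards so the hardware prefetcher is not triggered, and only then streams the chunk out with movntq. A minimal C++ sketch of just the touching pass (illustrative only):

```cpp
#include <cstddef>
#include <cstdint>

// Touch one dword per 64-byte cache line of an 8 KiB chunk, back to front,
// so the whole chunk sits in cache before it is copied (cf. %%prefetch_loop).
static void block_prefetch_chunk_sketch(const void* chunk)
{
    const size_t BP_SIZE = 8 * 1024;
    const volatile uint32_t* p = static_cast<const volatile uint32_t*>(chunk);
    uint32_t sink = 0;
    for (size_t offset = BP_SIZE; offset != 0; offset -= 64)
        sink += p[(offset - 64) / 4];   // one load per cache line, reverse order
    (void)sink;
}
```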

;------------------------------------------------------------------------------

; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
; drop-in replacement for libc memcpy() (returns dst)
global sym(ia32_memcpy)
align 64
sym(ia32_memcpy):
    push edi
    push esi

    mov ecx, [esp+8+4+8] ; nbytes
    mov edi, [esp+8+4+0] ; dst
    mov esi, [esp+8+4+4] ; src

    mov edx, ecx
    cmp ecx, byte IC_TINY_MAX
    ja choose_larger_method

ic_tiny:
    IC_TINY
    ; never reached - IC_TINY contains memcpy function epilog code

choose_larger_method:
    IC_ALIGN

    ; setup:
    ; eax = number of 64 byte chunks, or 0 if CPU doesn't support SSE.
    ;   used to choose copy technique.
    ; ecx = -number_of_bytes, multiple of 64. we jump to ic_tiny if
    ;   there's not enough left for a single 64 byte chunk, which can
    ;   happen on unaligned 64..71 byte transfers due to IC_ALIGN.
    ; edx = number of remainder bytes after qwords have been copied;
    ;   will be handled by IC_TINY.
    ; esi and edi point to end of the respective buffers (more precisely,
    ;   to buffer_start-ecx). this together with the ecx convention means
    ;   we only need one loop counter (instead of having to advance
    ;   that and esi/edi).

    ; this mask is applied to the transfer size. the 2 specialized copy techniques
    ; that use SSE are jumped to if size is greater than a threshold.
    ; we simply set the requested transfer size to 0 if the CPU doesn't
    ; support SSE so that those are never reached (done by masking with this).
extern sym(ia32_memcpy_size_mask)
    mov eax, [sym(ia32_memcpy_size_mask)]
    and ecx, byte ~IC_TINY_MAX
    jz ic_tiny ; < 64 bytes left (due to IC_ALIGN)
    add esi, ecx
    add edi, ecx
    and edx, byte IC_TINY_MAX
    and eax, ecx
    neg ecx

    cmp eax, BP_THRESHOLD
    jae near uc_bp_movntq
    cmp eax, UC_THRESHOLD
    jae uc_movntq

ic_movq:
    IC_MOVQ
    emms
    jmp ic_tiny

uc_movntq:
    UC_MOVNTQ
    sfence
    emms
    jmp ic_tiny

uc_bp_movntq:
    UC_BP_MOVNTQ
    sfence
    cmp ecx, byte -(IC_TINY_MAX+1)
    jle ic_movq
    emms
    jmp ic_tiny
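One wiring note: sym(ia32_memcpy_size_mask) is declared extern here, so it is defined elsewhere in the project; per the comment above it must read as zero on CPUs without SSE, which keeps the masked size below both thresholds so the movntq paths are never taken (and presumably as all-ones when SSE is available). A hedged C++ sketch of how callers might see this - init_memcpy_mask_sketch and have_sse are illustrative names, not from this commit:

```cpp
#include <cstddef>
#include <cstdint>

extern "C" {
    // implemented above; returns dst like libc memcpy
    void* ia32_memcpy(void* dst, const void* src, size_t nbytes);

    // ANDed with the rounded transfer size before the threshold checks
    extern uint32_t ia32_memcpy_size_mask;
}

// illustrative initialization during CPU detection (not part of this commit)
static void init_memcpy_mask_sketch(bool have_sse)
{
    ia32_memcpy_size_mask = have_sse ? ~0u : 0u;
}
```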