# improvements to build system for asm files

split ia32_asm code up into memcpy and color parts. premake: add (Windows-only so far) support for NASM include paths, required when using %include; see rationale in vs.c. refs #124.

This was SVN commit r4039.
parent f2f4ff5fbe
commit aeed96dafa
source/graphics/Color_asm.asm (new file, 50 lines)
@@ -0,0 +1,50 @@
%include "../lib/sysdep/ia32.inc"

;-------------------------------------------------------------------------------
; Color conversion (SSE)
;-------------------------------------------------------------------------------

; extern "C" u32 ConvertRGBColorTo4ub(const RGBColor& color)
[section .data]
align 16
zero:
    dd 0.0
twofivefive:
    dd 255.0

__SECT__
align 16
global sym(sse_ConvertRGBColorTo4ub)
sym(sse_ConvertRGBColorTo4ub):
    mov eax, [esp+4]

    ; xmm0, 1, 2 = R, G, B
    movss xmm4, [zero]
    movss xmm0, [eax+8]
    movss xmm1, [eax+4]
    movss xmm2, [eax]
    movss xmm5, [twofivefive]

    ; C = min(255, 255*max(C, 0)) ( == clamp(255*C, 0, 255) )
    maxss xmm0, xmm4
    maxss xmm1, xmm4
    maxss xmm2, xmm4
    mulss xmm0, xmm5
    mulss xmm1, xmm5
    mulss xmm2, xmm5
    minss xmm0, xmm5
    minss xmm1, xmm5
    minss xmm2, xmm5

    ; convert to integer and combine channels using bit logic
    cvtss2si eax, xmm0
    cvtss2si ecx, xmm1
    cvtss2si edx, xmm2
    shl eax, 16
    shl ecx, 8
    or eax, 0xff000000
    or edx, ecx
    or eax, edx

    ret
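For reference, the routine above is the SSE form of a simple clamp-and-pack; a scalar C++ sketch of the same logic (illustrative only - RGBColor is taken here as three floats in memory order, matching the [eax], [eax+4], [eax+8] loads, and the channel naming follows the asm comment):

```cpp
#include <algorithm>
#include <cstdint>

// Scalar sketch of sse_ConvertRGBColorTo4ub (not the shipped code).
// c points at the three floats of the RGBColor in memory order c[0..2].
static uint32_t ConvertRGBColorTo4ub_ref(const float* c)
{
    // C = min(255, 255*max(C, 0))  ==  clamp(255*C, 0, 255)
    // note: the asm's cvtss2si rounds to nearest, a cast truncates - close enough for a sketch
    auto to_byte = [](float f) -> uint32_t {
        return static_cast<uint32_t>(std::min(255.0f, 255.0f * std::max(f, 0.0f)));
    };
    // 0xff alpha in the top byte, then the three channels, mirroring the shl/or sequence
    return 0xff000000u | (to_byte(c[2]) << 16) | (to_byte(c[1]) << 8) | to_byte(c[0]);
}
```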
@@ -99,8 +99,7 @@
 #include <cctype>
 #include <cerrno>
 #include <cfloat>
-//#include <ciso646>
-// defines e.g. "and" to "&". unnecessary and causes trouble with asm.
+//#include <ciso646> // defines e.g. "and" to "&". unnecessary and causes trouble with asm.
 #include <climits>
 #include <clocale>
 #include <cmath>
source/lib/sysdep/ia32.inc (new file, 17 lines)
@@ -0,0 +1,17 @@
; set section attributes
section .data data align=32 use32
section .bss bss align=16 use32
section .text code align=64 use32
; activate .text (needs to be separate because __SECT__ will otherwise
; complain that the above definition is redeclaring attributes)
section .text

; Usage:
; use sym(ia32_cap) instead of _ia32_cap - on relevant platforms, sym() will add
; the underlines automagically, on others it won't
%ifdef DONT_USE_UNDERLINE
%define sym(a) a
%else
%define sym(a) _ %+ a
%endif
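On the C++ side these symbols are simply declared extern "C"; sym() only supplies the leading underscore that some platforms' C ABIs prepend (builds for the other platforms would define DONT_USE_UNDERLINE on the NASM command line, e.g. with -D). A hedged illustration - the project's actual headers may declare these differently:

```cpp
#include <cstddef>

extern "C" {
    // defined in ia32_memcpy.asm as sym(ia32_memcpy), i.e. _ia32_memcpy or ia32_memcpy
    void* ia32_memcpy(void* dst, const void* src, size_t nbytes);

    // defined in Color_asm.asm as sym(sse_ConvertRGBColorTo4ub); the real prototype
    // takes const RGBColor& - shown here with const float* to stay self-contained
    unsigned int sse_ConvertRGBColorTo4ub(const float* color);
}
```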
@@ -17,382 +17,7 @@
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

[removed lines omitted here: the section-attribute setup and the sym() macro, moved verbatim
into source/lib/sysdep/ia32.inc above, and the fast general memcpy implementation, moved
verbatim into source/lib/sysdep/ia32_memcpy.asm below]

%include "ia32.inc"

;-------------------------------------------------------------------------------
; CPUID support
@@ -657,53 +282,3 @@ sym(ia32_asm_init):

    pop ebx
    ret

[removed lines omitted here: the "Color conversion (SSE)" routine sse_ConvertRGBColorTo4ub,
moved verbatim into source/graphics/Color_asm.asm above]
source/lib/sysdep/ia32_memcpy.asm (new file, 374 lines)
@@ -0,0 +1,374 @@
; =========================================================================
; File        : ia32_memcpy.asm
; Project     : 0 A.D.
; Description : highly optimized memory copy.
;
; @author Jan.Wassenberg@stud.uni-karlsruhe.de
; =========================================================================

; Copyright (c) 2004-2005 Jan Wassenberg
;
; Redistribution and/or modification are also permitted under the
; terms of the GNU General Public License as published by the
; Free Software Foundation (version 2 or later, at your option).
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

%include "ia32.inc"

; drop-in replacement for libc memcpy(). only requires CPU support for
; MMX (by now universal). highly optimized for Athlon and Pentium III
; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
; for details, see accompanying article.

; if transfer size is at least this much,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD equ 64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD equ 256*1024

; maximum that can be copied by IC_TINY.
IC_TINY_MAX equ 63

; size of one block prefetch chunk.
BP_SIZE equ 8*1024

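In rough C++ terms, these constants drive the technique selection performed by sym(ia32_memcpy) further down. A sketch of that selection only - it ignores the destination-alignment prologue and the SSE capability mask, and stands each technique in with std::memcpy:

```cpp
#include <cstring>   // std::memcpy stands in for the asm techniques

void* ia32_memcpy_dispatch_sketch(void* dst, const void* src, size_t nbytes)
{
    const size_t IC_TINY_MAX  = 63;          // at most this much goes through IC_TINY
    const size_t UC_THRESHOLD = 64 * 1024;   // at least this much no longer fits in L1
    const size_t BP_THRESHOLD = 256 * 1024;  // at least this much also blows L2

    if (nbytes <= IC_TINY_MAX)
        std::memcpy(dst, src, nbytes);       // IC_TINY: unrolled dword copy plus 0..3 tail bytes
    else if (nbytes < UC_THRESHOLD)
        std::memcpy(dst, src, nbytes);       // IC_MOVQ: in-cache MMX copy, 64 bytes per iteration
    else if (nbytes < BP_THRESHOLD)
        std::memcpy(dst, src, nbytes);       // UC_MOVNTQ: prefetchnta + movntq streaming copy
    else
        std::memcpy(dst, src, nbytes);       // UC_BP_MOVNTQ: 8 KiB block prefetch, then stream out
    return dst;                              // like libc memcpy, dst is returned
}
```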
;------------------------------------------------------------------------------

; [p3] replicating this instead of jumping to it from tailN
; saves 1 clock and costs (7-2)*2 bytes code.
%macro EPILOG 0
    pop esi
    pop edi
    mov eax, [esp+4] ; return dst
    ret
%endm

align 64
tail1:
    mov al, [esi+ecx*4]
    mov [edi+ecx*4], al
align 4
tail0:
    EPILOG

align 8
tail3:
    ; [p3] 2 reads followed by 2 writes is better than
    ; R/W interleaved and RRR/WWW
    mov al, [esi+ecx*4+2]
    mov [edi+ecx*4+2], al
    ; already aligned to 8 due to above code
tail2:
    mov al, [esi+ecx*4]
    mov dl, [esi+ecx*4+1]
    mov [edi+ecx*4], al
    mov [edi+ecx*4+1], dl
    EPILOG

[section .data]
align 16
tail_table dd tail0, tail1, tail2, tail3
__SECT__

; 15x unrolled copy loop - transfers DWORDs backwards.
; indexed via table of 8-bit offsets.
; rationale:
; - [p3] backwards vs. forwards makes no difference.
; - MOV is faster than MOVSD.
; - index table is needed because calculating end-6*i is slower than
;   a LUT and we wouldn't want to expand entries to 8 bytes
;   (that'd increase code footprint by 30 bytes)
; - a byte index accessed via MOVZX is better due to less dcache usage.
; - only unrolling 8x and 'reentering' the loop is possible but
;   slower due to fiddling with esi/ecx.
align 64
unrolled_copy_code_start:
%assign i 15
%rep 14 ; 15 entries, 1 base case handled below
uc_ %+ i:
    mov eax, [esi+i*4-4]
    mov [edi+i*4-4], eax
%assign i i-1
%endrep
; base case: no displacement needed; skip it so that code will
; be aligned to 8 bytes after this.
uc_1:
    mov eax, [esi]
    mov [edi], eax
uc_0:
    jmp [tail_table+edx*4]

[section .data]
align 32
unrolled_copy_index_table:
%assign i 0
%rep 16
    db (uc_ %+ i) - unrolled_copy_code_start
%assign i i+1
%endrep
__SECT__


;------------------------------------------------------------------------------
; tiny copy - handles all cases smaller than IC_MOVQ's 64 byte lower limit.
; > edx = number of bytes (< IC_TINY_MAX)
; < does not return.
; x eax, ecx, edx
%macro IC_TINY 0
    mov ecx, edx
    shr ecx, 2
    ; calculating this address isn't possible due to skipping displacement on uc1;
    ; even so, it'd require calculating -6*ecx, which is slower than LUT.
    movzx eax, byte [unrolled_copy_index_table+ecx]
    and edx, byte 3
    add eax, unrolled_copy_code_start
    jmp eax
    ; never reached! the unrolled loop jumps into tailN, which
    ; then returns from the memcpy function.
%endm


;------------------------------------------------------------------------------
; align destination address to multiple of 8. important for large transfers,
; but doesn't affect the tiny technique.
; > esi, edi -> buffers (updated)
; > ecx, edx = transfer size (updated)
; x eax
%macro IC_ALIGN 0
    mov eax, edi
    and eax, byte 7 ; eax = # misaligned bytes
    jz already_aligned ; early out
    lea eax, [align_table_start+eax*2]
    jmp eax

    ; [p3] this is no slower than a table of mov and much smaller/simpler
align 8
align_table_start:
%rep 8
    dec ecx
    movsb
%endrep
    mov edx, ecx
already_aligned:
%endm


;------------------------------------------------------------------------------
; MMX MOVQ technique. used for in-cache transfers of 64B..64KiB.
; must run on all CPUs, i.e. cannot use the SSE prefetchnta instruction.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro IC_MOVQ 0

align 16
%%loop:

    ; notes:
    ; - we can't use prefetch here - this codepath must support all CPUs.
    ;   [p3] that makes us 5..15% slower on 1KiB..4KiB transfers.
    ; - [p3] simple addressing without +ecx is 3.5% faster.
    ; - difference between RR/WW/RR/WW and R..R/W..W:
    ;   [p3] none (if simple addressing)
    ;   [axp] interleaved is better (with +ecx addressing)
    ; - enough time elapses between first and third pair of reads that we
    ;   could reuse MM0. there is no performance gain either way and
    ;   differing displacements make code compression futile anyway, so
    ;   we'll just use MM4..7 for clarity.
    movq mm0, [esi+ecx]
    movq mm1, [esi+ecx+8]
    movq [edi+ecx], mm0
    movq [edi+ecx+8], mm1
    movq mm2, [esi+ecx+16]
    movq mm3, [esi+ecx+24]
    movq [edi+ecx+16], mm2
    movq [edi+ecx+24], mm3
    movq mm4, [esi+ecx+32]
    movq mm5, [esi+ecx+40]
    movq [edi+ecx+32], mm4
    movq [edi+ecx+40], mm5
    movq mm6, [esi+ecx+48]
    movq mm7, [esi+ecx+56]
    movq [edi+ecx+48], mm6
    movq [edi+ecx+56], mm7
    add ecx, byte 64
    jnz %%loop
%endm


;------------------------------------------------------------------------------
; SSE MOVNTQ technique. used for transfers that do not fit in L1,
; i.e. 64KiB..192KiB. requires Pentium III or Athlon; caller checks for this.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro UC_MOVNTQ 0

align 16
%%loop:
    ; notes:
    ; - the AMD optimization manual recommends prefetch distances according to
    ;   (200*BytesPerIter/ClocksPerIter+192), which comes out to ~560 here.
    ;   [p3] rounding down to 512 bytes makes for significant gains.
    ; - [p3] complex addressing with ecx is 1% faster than adding to esi/edi.
    prefetchnta [esi+ecx+512]
    movq mm0, [esi+ecx]
    movq mm1, [esi+ecx+8]
    movq mm2, [esi+ecx+16]
    movq mm3, [esi+ecx+24]
    movq mm4, [esi+ecx+32]
    movq mm5, [esi+ecx+40]
    movq mm6, [esi+ecx+48]
    movq mm7, [esi+ecx+56]
    movntq [edi+ecx], mm0
    movntq [edi+ecx+8], mm1
    movntq [edi+ecx+16], mm2
    movntq [edi+ecx+24], mm3
    movntq [edi+ecx+32], mm4
    movntq [edi+ecx+40], mm5
    movntq [edi+ecx+48], mm6
    movntq [edi+ecx+56], mm7
    add ecx, byte 64
    jnz %%loop
%endm

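The prefetchnta/movntq pattern above can also be expressed with compiler intrinsics; a hedged C++ sketch of the same streaming loop (32-bit x86 with MMX/SSE intrinsics assumed; the committed code stays in hand-written asm):

```cpp
#include <cstddef>
#include <mmintrin.h>   // __m64, _mm_empty (MMX)
#include <xmmintrin.h>  // _mm_prefetch, _mm_stream_pi, _mm_sfence (SSE)

// Streaming copy of nbytes (multiple of 64), same idea as UC_MOVNTQ (sketch only).
static void stream_copy_sketch(void* dst, const void* src, size_t nbytes)
{
    const __m64* s = static_cast<const __m64*>(src);
    __m64* d = static_cast<__m64*>(dst);
    for (size_t q = 0; q < nbytes / 8; q += 8)
    {
        // prefetch ~512 bytes ahead into a non-temporal way, as the asm does
        _mm_prefetch(reinterpret_cast<const char*>(s + q) + 512, _MM_HINT_NTA);
        for (size_t j = 0; j < 8; ++j)           // 8 qwords = 64 bytes per iteration
            _mm_stream_pi(d + q + j, s[q + j]);  // movntq: store bypassing the cache
    }
    _mm_sfence();   // flush the write-combining buffers
    _mm_empty();    // emms: leave MMX state before returning to FP code
}
```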

;------------------------------------------------------------------------------
; block prefetch technique. used for transfers that do not fit in L2,
; i.e. > 192KiB. requires Pentium III or Athlon; caller checks for this.
; for theory behind this, see article.
; > ecx = -number_of_bytes (multiple of 64, <= -BP_SIZE)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = -remaining_bytes (multiple of 64, > -BP_SIZE)
; < eax = 0
%macro UC_BP_MOVNTQ 0
    push edx

align 4
%%prefetch_and_copy_chunk:
    ; pull chunk into cache by touching each cache line
    ; (in reverse order to prevent HW prefetches)
    mov eax, BP_SIZE/128 ; # iterations
    add esi, BP_SIZE
align 16
%%prefetch_loop:
    mov edx, [esi+ecx-64]
    mov edx, [esi+ecx-128]
    add esi, byte -128
    dec eax
    jnz %%prefetch_loop

    ; copy chunk in 64 byte pieces
    mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
align 16
%%copy_loop:
    movq mm0, [esi+ecx]
    movq mm1, [esi+ecx+8]
    movq mm2, [esi+ecx+16]
    movq mm3, [esi+ecx+24]
    movq mm4, [esi+ecx+32]
    movq mm5, [esi+ecx+40]
    movq mm6, [esi+ecx+48]
    movq mm7, [esi+ecx+56]
    movntq [edi+ecx], mm0
    movntq [edi+ecx+8], mm1
    movntq [edi+ecx+16], mm2
    movntq [edi+ecx+24], mm3
    movntq [edi+ecx+32], mm4
    movntq [edi+ecx+40], mm5
    movntq [edi+ecx+48], mm6
    movntq [edi+ecx+56], mm7

    add ecx, byte 64
    dec eax
    jnz %%copy_loop

    ; if enough data left, process next chunk
    cmp ecx, -BP_SIZE
    jle %%prefetch_and_copy_chunk

    pop edx
%endm

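The block prefetch pass above warms an 8 KiB chunk by loading one word from every 64-byte cache line, walking backwards so the hardware prefetcher is not triggered, and only then streams the chunk out with movntq. A minimal C++ sketch of just the touching pass (illustrative only):

```cpp
#include <cstddef>
#include <cstdint>

// Touch one dword per 64-byte cache line of an 8 KiB chunk, back to front,
// so the whole chunk sits in cache before it is copied (cf. %%prefetch_loop).
static void block_prefetch_chunk_sketch(const void* chunk)
{
    const size_t BP_SIZE = 8 * 1024;
    const volatile uint32_t* p = static_cast<const volatile uint32_t*>(chunk);
    uint32_t sink = 0;
    for (size_t offset = BP_SIZE; offset != 0; offset -= 64)
        sink += p[(offset - 64) / 4];   // one load per cache line, reverse order
    (void)sink;
}
```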

;------------------------------------------------------------------------------

; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
; drop-in replacement for libc memcpy() (returns dst)
global sym(ia32_memcpy)
align 64
sym(ia32_memcpy):
    push edi
    push esi

    mov ecx, [esp+8+4+8] ; nbytes
    mov edi, [esp+8+4+0] ; dst
    mov esi, [esp+8+4+4] ; src

    mov edx, ecx
    cmp ecx, byte IC_TINY_MAX
    ja choose_larger_method

ic_tiny:
    IC_TINY
    ; never reached - IC_TINY contains memcpy function epilog code

choose_larger_method:
    IC_ALIGN

    ; setup:
    ; eax = number of 64 byte chunks, or 0 if CPU doesn't support SSE.
    ;   used to choose copy technique.
    ; ecx = -number_of_bytes, multiple of 64. we jump to ic_tiny if
    ;   there's not enough left for a single 64 byte chunk, which can
    ;   happen on unaligned 64..71 byte transfers due to IC_ALIGN.
    ; edx = number of remainder bytes after qwords have been copied;
    ;   will be handled by IC_TINY.
    ; esi and edi point to end of the respective buffers (more precisely,
    ;   to buffer_start-ecx). this together with the ecx convention means
    ;   we only need one loop counter (instead of having to advance
    ;   that and esi/edi).

    ; this mask is applied to the transfer size. the 2 specialized copy techniques
    ; that use SSE are jumped to if size is greater than a threshold.
    ; we simply set the requested transfer size to 0 if the CPU doesn't
    ; support SSE so that those are never reached (done by masking with this).
extern sym(ia32_memcpy_size_mask)
    mov eax, [sym(ia32_memcpy_size_mask)]
    and ecx, byte ~IC_TINY_MAX
    jz ic_tiny ; < 64 bytes left (due to IC_ALIGN)
    add esi, ecx
    add edi, ecx
    and edx, byte IC_TINY_MAX
    and eax, ecx
    neg ecx

    cmp eax, BP_THRESHOLD
    jae near uc_bp_movntq
    cmp eax, UC_THRESHOLD
    jae uc_movntq

ic_movq:
    IC_MOVQ
    emms
    jmp ic_tiny

uc_movntq:
    UC_MOVNTQ
    sfence
    emms
    jmp ic_tiny

uc_bp_movntq:
    UC_BP_MOVNTQ
    sfence
    cmp ecx, byte -(IC_TINY_MAX+1)
    jle ic_movq
    emms
    jmp ic_tiny
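One wiring note: sym(ia32_memcpy_size_mask) is declared extern here, so it is defined elsewhere in the project; per the comment above it must read as zero on CPUs without SSE, which keeps the masked size below both thresholds so the movntq paths are never taken (and presumably as all-ones when SSE is available). A hedged C++ sketch of how callers might see this - init_memcpy_mask_sketch and have_sse are illustrative names, not from this commit:

```cpp
#include <cstddef>
#include <cstdint>

extern "C" {
    // implemented above; returns dst like libc memcpy
    void* ia32_memcpy(void* dst, const void* src, size_t nbytes);

    // ANDed with the rounded transfer size before the threshold checks
    extern uint32_t ia32_memcpy_size_mask;
}

// illustrative initialization during CPU detection (not part of this commit)
static void init_memcpy_mask_sketch(bool have_sse)
{
    ia32_memcpy_size_mask = have_sse ? ~0u : 0u;
}
```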