From 15ca12311b861a01b00a57a512993bad529b2b77 Mon Sep 17 00:00:00 2001 From: janwas Date: Mon, 12 Sep 2005 22:13:03 +0000 Subject: [PATCH] more WIP; initial memcpy code This was SVN commit r2700. --- source/lib/sysdep/ia32.cpp | 515 +++++++++++++++++++++---------------- 1 file changed, 298 insertions(+), 217 deletions(-) diff --git a/source/lib/sysdep/ia32.cpp b/source/lib/sysdep/ia32.cpp index 9692023d9f..e4f08c53a5 100755 --- a/source/lib/sysdep/ia32.cpp +++ b/source/lib/sysdep/ia32.cpp @@ -440,99 +440,35 @@ $memcpy_final: -/* -ALIGN + + // align to 8 bytes // this align may be slower in [32kb, 64kb] - mov ecx, 8 ; a trick that's faster than rep movsb... - sub ecx, edi ; align destination to qword - and ecx, 111b ; get the low bits - sub ebx, ecx ; update copy count - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_align_done - jmp ecx ; jump to array of movsb's - -align 4 - movsb - movsb - movsb - movsb - movsb - movsb - movsb - movsb + // rationale: always called - it may speed up TINY as well + // => we have to be careful to only copy min(8, 8-edi, size) +#define MEMCPY_ALIGN -MMX_MOVQ +// temporal (in-cache) copy; 64 bytes per iter // This is small block copy that uses the MMX registers to copy 8 bytes // at a time. It uses the "unrolled loop" optimization, and also uses // the software prefetch instruction to get the data into the cache. -align 16 -$memcpy_ic_1: ; 64-byte block copies, in-cache copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0, [esi+0] ; read 64 bits - movq mm1, [esi+8] - movq [edi+0], mm0 ; write 64 bits - movq [edi+8], mm1 ; note: the normal movq writes the - movq mm2, [esi+16] ; data to cache; a cache line will be - movq mm3, [esi+24] ; allocated as needed, to store the data - movq [edi+16], mm2 - movq [edi+24], mm3 - movq mm0, [esi+32] - movq mm1, [esi+40] - movq [edi+32], mm0 - movq [edi+40], mm1 - movq mm2, [esi+48] - movq mm3, [esi+56] - movq [edi+48], mm2 - movq [edi+56], mm3 - - add esi, 64 ; update source pointer - add edi, 64 ; update destination pointer - dec ecx ; count down - jnz $memcpy_ic_1 ; last 64-byte block? +#define MEMCPY_MOVQ -PREFETCH_MOVNTQ +// 64 bytes per iter; uncached non-temporal // For larger blocks, which will spill beyond the cache, it's faster to // use the Streaming Store instruction MOVNTQ. This write instruction // bypasses the cache and writes straight to main memory. This code also // uses the software prefetch instruction to pre-read the data. -align 16 -$memcpy_uc_1: ; 64-byte blocks, uncached copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0,[esi+0] ; read 64 bits - add edi,64 ; update destination pointer - movq mm1,[esi+8] - add esi,64 ; update source pointer - movq mm2,[esi-48] - movntq [edi-64], mm0 ; write 64 bits, bypassing the cache - movq mm0,[esi-40] ; note: movntq also prevents the CPU - movntq [edi-56], mm1 ; from READING the destination address - movq mm1,[esi-32] ; into the cache, only to be over-written - movntq [edi-48], mm2 ; so that also helps performance - movq mm2,[esi-24] - movntq [edi-40], mm0 - movq mm0,[esi-16] - movntq [edi-32], mm1 - movq mm1,[esi-8] - movntq [edi-24], mm2 - movntq [edi-16], mm0 - dec ecx - movntq [edi-8], mm1 - jnz $memcpy_uc_1 ; last 64-byte block? +#define MEMCPY_MOVNTQ -BLOCKPREFETCH_MOVNTQ // For the largest size blocks, a special technique called Block Prefetch // can be used to accelerate the read operations. 
Block Prefetch reads @@ -540,55 +476,81 @@ BLOCKPREFETCH_MOVNTQ // This is faster than using software prefetch, in this case. // The technique is great for getting maximum read bandwidth, // especially in DDR memory systems. +#define MEMCPY_BP_MOVNTQ + + + + + + + + +/* + + + +void amd_memcpy_skeleton(u8 *dest, const u8 *src, size_t n) +{ + __asm { + + mov ecx, [n] ; number of bytes to copy + mov edi, [dest] ; destination + mov esi, [src] ; source + mov ebx, ecx ; keep a copy of count + + cmp ecx, TINY_BLOCK_COPY + jb $memcpy_ic_3 ; tiny? skip mmx copy + + ALIGN + + mov ecx, ebx ; number of bytes left to copy + shr ecx, 6 ; get 64-byte block count + jz $memcpy_ic_2 ; finish the last few bytes + + cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy + jae $memcpy_uc_test + + MMX_MOVQ + +$memcpy_ic_2: + mov ecx, ebx ; has valid low 6 bits of the byte count +$memcpy_ic_3: + TINY(<=64) + + emms ; clean up the MMX state + sfence ; flush the write buffer + ret + +$memcpy_uc_test: + cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy + jae $memcpy_bp_1 + +$memcpy_64_test: + or ecx, ecx ; tail end of block prefetch will jump here + jz $memcpy_ic_2 ; no more 64-byte blocks left + +align 16 +$memcpy_uc_1: ; 64-byte blocks, uncached copy + PREFETCH_MOVNTQ + + jmp $memcpy_ic_2 ; almost done + $memcpy_bp_1: ; large blocks, block prefetch copy + BLOCKPREFETCH_MOVNTQ + // jumps to $memcpy_64_test when done + } +} - cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? - jl $memcpy_64_test ; no, back to regular uncached copy - - mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X - add esi, CACHEBLOCK * 64 ; move to the top of the block -align 16 -$memcpy_bp_2: - mov edx, [esi-64] ; grab one address per cache line - mov edx, [esi-128] ; grab one address per cache line - sub esi, 128 ; go reverse order - dec eax ; count down the cache lines - jnz $memcpy_bp_2 ; keep grabbing more lines into cache - - mov eax, CACHEBLOCK ; now that it's in cache, do the copy -align 16 -$memcpy_bp_3: - movq mm0, [esi ] ; read 64 bits - movq mm1, [esi+ 8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - add esi, 64 ; update source pointer - movntq [edi ], mm0 ; write 64 bits, bypassing cache - movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU - movntq [edi+16], mm2 ; from READING the destination address - movntq [edi+24], mm3 ; into the cache, only to be over-written, - movntq [edi+32], mm4 ; so that also helps performance - movntq [edi+40], mm5 - movntq [edi+48], mm6 - movntq [edi+56], mm7 - add edi, 64 ; update dest pointer - - dec eax ; count down - - jnz $memcpy_bp_3 ; keep copying - sub ecx, CACHEBLOCK ; update the 64-byte block count - jmp $memcpy_bp_1 ; keep processing chunks + total taken jmp (no ret, no tbl)+unconditional + 0..64 tiny 0 + 64..64k mmx 1+1 + 64k..192k movntq 1+1 + 192k..inf blockprefetch 1+1 */ -// note: tiny routine isn't entered if size > 64 (protects movsd array) - static i64 t0, t1; #define BEGIN\ @@ -605,6 +567,201 @@ static i64 t0, t1; __asm mov dword ptr t1, eax\ __asm mov dword ptr t1+4, edx\ __asm popad +static const size_t MOVNTQ_MIN_THRESHOLD_64 = 64*KiB / 64;// upper limit for movq/movq copy w/SW prefetch + +static const size_t BP_MIN_THRESHOLD_64 = 192*KiB / 64; // upper limit for movq/movntq w/SW prefetch + +// rationale: +// - prefer "jnz loop" style vs "jz end; jmp loop" to make it +// better for the branch prediction unit. 
+// - need __declspec(naked) because we ret in the middle of the function (avoids a jmp);
+//   disadvantage: the compiler cannot optimize parameter passing.
+
+void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
+{
+__asm {
+	BEGIN
+	mov		ecx, [esp+4+8]		// nbytes
+	mov		esi, [esp+4+4]		// src
+	mov		edi, [esp+4+0]		// dst
+
+	// align destination to 8 bytes: copy min((8 - edi) & 7, nbytes) head bytes
+	// by jumping into the movsb array below.
+	mov		eax, 8
+	sub		eax, edi
+	and		eax, 0x07
+	cmp		eax, ecx
+	cmova	eax, ecx
+	sub		ecx, eax
+	neg		eax
+	add		eax, offset $align_table_end
+	jmp		eax
+align 4
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+$align_table_end:
+
+	mov		ebx, ecx
+	shr		ecx, 6				// # whole 64-byte blocks
+
+tiny:
+	and		ebx, 63
+	mov		edx, ebx
+	shr		edx, 2				// dword count
+	neg		edx
+	add		edx, offset $movsd_table_end
+
+	mov		eax, _bp
+	cmp		ecx, BP_MIN_THRESHOLD_64
+	cmovb	eax, _movntq
+	cmp		ecx, MOVNTQ_MIN_THRESHOLD_64
+	cmovb	eax, _mmx
+	cmp		ecx, 64
+	cmovbe	eax, edx
+	jmp		eax
+
+align 8
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+$movsd_table_end:
+
+// TODO: move this calc into a register
+	mov		eax, ebx
+	and		eax, 11b
+	neg		eax
+	add		eax, offset $movsb_table_end
+	jmp		eax
+
+	movsb
+	movsb
+	movsb
+$movsb_table_end:
+	END
+	ret
+
+_mmx:
+	// we have >= 64 bytes (whole 64-byte blocks in ecx); in-cache movq copy
+align 16
+$movq_loop:
+	prefetchnta [esi + (200*64/34+192)]	// start reading ahead
+	movq	mm0, [esi+0]
+	movq	mm1, [esi+8]
+	movq	[edi+0], mm0
+	movq	[edi+8], mm1
+	movq	mm2, [esi+16]
+	movq	mm3, [esi+24]
+	movq	[edi+16], mm2
+	movq	[edi+24], mm3
+	movq	mm0, [esi+32]
+	movq	mm1, [esi+40]
+	movq	[edi+32], mm0
+	movq	[edi+40], mm1
+	movq	mm2, [esi+48]
+	movq	mm3, [esi+56]
+	movq	[edi+48], mm2
+	movq	[edi+56], mm3
+	add		esi, 64
+	add		edi, 64
+	dec		ecx
+	jnz		$movq_loop
+	emms
+	jmp		tiny
+
+_bp:
+	// we have >= 192 KiB; copy in 8 KiB chunks until less than one chunk remains
+$bp_process_chunk:
+	mov		eax, CACHEBLOCK / 2		// block prefetch loop, unrolled 2x
+	add		esi, CACHEBLOCK * 64	// move to the top of the block
+align 16
+	// touch one address per cache line, in reverse order (prevents HW prefetch)
+$bp_prefetch_chunk:
+	mov		edx, [esi-64]
+	mov		edx, [esi-128]
+	sub		esi, 128
+	dec		eax
+	jnz		$bp_prefetch_chunk
+	mov		eax, CACHEBLOCK			// now that it's in cache, do the copy
+align 16
+$bp_copy_block:
+	movq	mm0, [esi+ 0]
+	movq	mm1, [esi+ 8]
+	movq	mm2, [esi+16]
+	movq	mm3, [esi+24]
+	movq	mm4, [esi+32]
+	movq	mm5, [esi+40]
+	movq	mm6, [esi+48]
+	movq	mm7, [esi+56]
+	add		esi, 64
+	movntq	[edi+ 0], mm0
+	movntq	[edi+ 8], mm1
+	movntq	[edi+16], mm2
+	movntq	[edi+24], mm3
+	movntq	[edi+32], mm4
+	movntq	[edi+40], mm5
+	movntq	[edi+48], mm6
+	movntq	[edi+56], mm7
+	add		edi, 64
+	dec		eax
+	jnz		$bp_copy_block
+	sub		ecx, CACHEBLOCK			// update the 64-byte block count
+	cmp		ecx, CACHEBLOCK
+	jge		$bp_process_chunk		// at least one more full chunk left
+	sfence
+	emms
+	// don't fall into the movntq loop if no 64-byte blocks remain
+	cmp		ecx, 0
+	jz		tiny
+
+_movntq:
+	// we have >= 64 bytes left, in whole 64-byte blocks; streaming movntq copy
+align 16
+$movntq_loop:
+	prefetchnta [esi + (200*64/34+192)]	// start reading ahead
+	movq	mm0, [esi+0]
+	add		edi, 64
+	movq	mm1, [esi+8]
+	add		esi, 64
+	movq	mm2, [esi-48]
+	movntq	[edi-64], mm0
+	movq	mm0, [esi-40]
+	movntq	[edi-56], mm1
+	movq	mm1, [esi-32]
+	movntq	[edi-48], mm2
+	movq	mm2, [esi-24]
+	movntq	[edi-40], mm0
+	movq	mm0, [esi-16]
+	movntq	[edi-32], mm1
+	movq	mm1, [esi-8]
+	movntq	[edi-24], mm2
+	movntq	[edi-16], mm0
+	dec		ecx
+	movntq	[edi-8], mm1
+	jnz		$movntq_loop
+	sfence
+	emms
+	jmp		tiny
+}	// __asm
+}	// ia32_memcpy
+
+
 
 static void dtable_brep(u8* dst, const u8* src, size_t nbytes)
 {
@@ -751,7 +908,7 @@ rep movsb
 static void bloop(u8* dst, const u8* src, size_t nbytes)
 {
 	BEGIN
-	for(int i = 0; i < nbytes; i++)
+	for(size_t i = 0; i < nbytes; i++)
 		*dst++ = *src++;
 	END
 }
@@ -759,8 +916,8 @@
static void bloop(u8* dst, const u8* src, size_t nbytes) static void dloop_bloop(u8* dst, const u8* src, size_t nbytes) { BEGIN - int dwords = nbytes/4; - for(int i = 0; i < dwords; i++) + size_t dwords = nbytes/4; + for(size_t i = 0; i < dwords; i++) { *(u32*)dst = *(u32*)src; dst += 4; src += 4; @@ -803,7 +960,7 @@ static u8* setup_tiny_buf(int alignment, int misalign, bool is_dst) static void verify_tiny_buf(u8* p, size_t l, const char* culprit) { - for(int i = 0; i < l; i++) + for(size_t i = 0; i < l; i++) if(p[i] != i) debug_assert(0); @@ -853,7 +1010,7 @@ misalign 0 dtable_btable: 6670 drep_brep: 7384 -in release mode over all misaligns 0..63 +in release mode over all misaligns 0..63 and all sizes 1..64 total difference WRT baseline (%) dtable_btable: 434176 @@ -864,11 +1021,31 @@ brep: 546752 26 dloop_bloop: 679426 56.5 bloop: 1537061 -in debug mode over all misaligns 0..63 +in debug mode over all misaligns 0..63 and all sizes 1..64 dtable_btable: 425728 0 dtable_brep: 457153 7.3 drep_brep: 494368 16.1 brep: 538729 26.5 + +p3, debug mode over all misaligns 0..63 and all sizes 1..64 + dtable_btable: 1099605 + dtable_brep: 1111757 1.1 + memcpy: 1124093 2.2 + drep_brep: 1138089 + brep: 1313728 + dloop_bloop: 1756000 + bloop: 2405315 + +p3, release mode over all misaligns 0..63 and all sizes 1..64 + dtable_btable: 1092784 + dtable_brep: 1109136 1.5 + memcpy: 1116306 2.2 + drep_brep: 1127606 + dloop_bloop: 1129105 + brep: 1308052 + bloop: 1588062 + + */ static void test_with_misalign(int misalign) @@ -948,106 +1125,10 @@ static int test() -/* -void* amd_memcpy(void* dst, const void* src, size_t nbytes) -{ - __asm { - - mov ecx, [n] ; number of bytes to copy - mov edi, [dest] ; destination - mov esi, [src] ; source - mov ebx, ecx ; keep a copy of count - - cmp ecx, TINY_BLOCK_COPY - jb $memcpy_ic_3 ; tiny? skip mmx copy - - **ALIGN** - - ; destination is dword aligned - mov ecx, ebx ; number of bytes left to copy - shr ecx, 6 ; get 64-byte block count - jz $memcpy_ic_2 ; finish the last few bytes - - cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy - jae $memcpy_uc_test - - **MMX_MOVQ** - -$memcpy_ic_2: - mov ecx, ebx ; has valid low 6 bits of the byte count -$memcpy_ic_3: - shr ecx, 2 ; dword count - and ecx, 1111b ; only look at the "remainder" bits - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_last_few - jmp ecx ; jump to array of movsd's - -$memcpy_uc_test: - cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy - jae $memcpy_bp_1 - -$memcpy_64_test: - or ecx, ecx ; tail end of block prefetch will jump here - jz $memcpy_ic_2 ; no more 64-byte blocks left - - **PREFETCH_MOVNTQ** - jmp $memcpy_ic_2 ; almost done - -$memcpy_bp_1: - **BLOCKPREFETCH_MOVNTQ** - -// The smallest copy uses the X86 "movsd" instruction, in an optimized -// form which is an "unrolled loop". Then it handles the last few bytes. 
-align 4 - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - -$memcpy_last_few: ; dword aligned from before movsd's - mov ecx, ebx ; has valid low 2 bits of the byte count - and ecx, 11b ; the last few cows must come home - jz $memcpy_final ; no more, let's leave - rep movsb ; the last 1, 2, or 3 bytes - -$memcpy_final: - emms ; clean up the MMX state - sfence ; flush the write buffer - mov eax, [dest] ; ret value = destination pointer - - } -} -*/ - - -void ia32_memcpy(void* dst, const void* src, size_t nbytes) -{ - // large - if(nbytes >= 64*KiB) - ia32_memcpy_nt(dst, src, nbytes); - // small - // TODO: implement small memcpy - else - memcpy(dst, src, nbytes); -} - - //----------------------------------------------------------------------------- // support code for lock-free primitives //-----------------------------------------------------------------------------
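
Note on the copy-path selection in the new ia32_memcpy: the dispatch is easier to follow in plain C++. The following is a minimal sketch, not the committed routine; it assumes the thresholds defined in the patch (tiny path for the sub-64-byte head/tail, movq/movq in-cache copy below 64 KiB, movq/movntq streaming copy below 192 KiB, block prefetch above that), and copy_tiny, copy_mmx, copy_movntq and copy_block_prefetch are hypothetical stand-ins for the four assembly paths.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical stand-ins for the four assembly paths; they all defer to
// memcpy here so the sketch is self-contained and runnable.
static void copy_tiny          (void* d, const void* s, size_t n) { std::memcpy(d, s, n); }
static void copy_mmx           (void* d, const void* s, size_t n) { std::memcpy(d, s, n); }
static void copy_movntq        (void* d, const void* s, size_t n) { std::memcpy(d, s, n); }
static void copy_block_prefetch(void* d, const void* s, size_t n) { std::memcpy(d, s, n); }

static const size_t KiB = 1024;

// Mirrors the selection policy: align the destination to 8 bytes first,
// then choose a technique from the number of whole 64-byte blocks.
void ia32_memcpy_sketch(void* dst, const void* src, size_t nbytes)
{
	uint8_t* d = (uint8_t*)dst;
	const uint8_t* s = (const uint8_t*)src;

	// head: at most min((8 - dst) & 7, nbytes) bytes, so that the main
	// loops see an 8-byte-aligned destination.
	size_t head = (8 - (uintptr_t)d) & 7;
	if(head > nbytes)
		head = nbytes;
	copy_tiny(d, s, head);
	d += head; s += head; nbytes -= head;

	const size_t blocks = nbytes / 64;	// whole 64-byte blocks
	const size_t tail   = nbytes % 64;	// 0..63 bytes, tiny path again

	if(blocks != 0)
	{
		if(blocks < 64*KiB / 64)				// MOVNTQ_MIN_THRESHOLD_64
			copy_mmx(d, s, blocks*64);			// movq loads + movq stores (in-cache)
		else if(blocks < 192*KiB / 64)			// BP_MIN_THRESHOLD_64
			copy_movntq(d, s, blocks*64);		// movq + movntq with SW prefetch
		else
			copy_block_prefetch(d, s, blocks*64);
	}
	copy_tiny(d + blocks*64, s + blocks*64, tail);
}

The head computation follows the patch's rationale of copying only min(8, 8-edi, size) bytes before the main loops, so the alignment step can never run past a tiny buffer.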
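
The computed jumps into the movsb/movsd arrays (negate the count, add the end-of-array label, jmp) are the assembly form of a fall-through switch: exactly count single-byte instructions execute, with no loop and no per-byte branch. A hedged C++ equivalent of the 0..7-byte head copy, using the hypothetical name copy_head:

#include <cstddef>
#include <cstdint>

// Copies n (0..7) bytes without a loop by falling through a switch,
// the C++ analogue of jumping into the patch's array of movsb instructions.
// The bytes are written in descending index order, which is fine for
// non-overlapping buffers.
static void copy_head(uint8_t* dst, const uint8_t* src, size_t n)
{
	switch(n)
	{
	case 7: dst[6] = src[6]; // fall through
	case 6: dst[5] = src[5]; // fall through
	case 5: dst[4] = src[4]; // fall through
	case 4: dst[3] = src[3]; // fall through
	case 3: dst[2] = src[2]; // fall through
	case 2: dst[1] = src[1]; // fall through
	case 1: dst[0] = src[0]; // fall through
	case 0: break;
	}
}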
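
Finally, the block prefetch idea for very large copies: first touch one word of every 64-byte cache line of an 8 KiB chunk, walking backwards so the hardware prefetcher does not compete for the bus, then write the now-cached chunk out with non-temporal stores. The sketch below illustrates the technique with SSE2 intrinsics rather than the patch's MMX movq/movntq; bp_copy and bp_copy_chunk are hypothetical names, and the code assumes a 16-byte-aligned destination.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <emmintrin.h>	// SSE2: _mm_loadu_si128, _mm_stream_si128, _mm_sfence

// One ~8 KiB chunk (the patch's CACHEBLOCK * 64 bytes).
// Assumes dst is 16-byte aligned and chunk_bytes is a multiple of 64.
static void bp_copy_chunk(uint8_t* dst, const uint8_t* src, size_t chunk_bytes)
{
	// Pass 1: read one dword per 64-byte cache line, in reverse order.
	// Walking backwards keeps the hardware prefetcher out of the way, so the
	// chunk streams into cache at close to peak read bandwidth.
	volatile uint32_t sink = 0;
	for(size_t off = chunk_bytes; off != 0; off -= 64)
		sink += *(const uint32_t*)(src + off - 64);

	// Pass 2: the chunk is now cached; write it with non-temporal stores
	// (the intrinsic form of movntq/movntdq), bypassing the cache on the
	// destination side.
	for(size_t off = 0; off < chunk_bytes; off += 16)
	{
		const __m128i v = _mm_loadu_si128((const __m128i*)(src + off));
		_mm_stream_si128((__m128i*)(dst + off), v);
	}
}

static void bp_copy(uint8_t* dst, const uint8_t* src, size_t nbytes)
{
	const size_t chunk_bytes = 8*1024;
	while(nbytes >= chunk_bytes)
	{
		bp_copy_chunk(dst, src, chunk_bytes);
		dst += chunk_bytes; src += chunk_bytes; nbytes -= chunk_bytes;
	}
	std::memcpy(dst, src, nbytes);	// remainder: an ordinary copy is fine here
	_mm_sfence();	// make the non-temporal stores visible before returning
}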