From 15ca12311b861a01b00a57a512993bad529b2b77 Mon Sep 17 00:00:00 2001 From: janwas Date: Mon, 12 Sep 2005 22:13:03 +0000 Subject: [PATCH] more WIP; initial memcpy code This was SVN commit r2700. --- source/lib/sysdep/ia32.cpp | 515 +++++++++++++++++++++---------------- 1 file changed, 298 insertions(+), 217 deletions(-) diff --git a/source/lib/sysdep/ia32.cpp b/source/lib/sysdep/ia32.cpp index 9692023d9f..e4f08c53a5 100755 --- a/source/lib/sysdep/ia32.cpp +++ b/source/lib/sysdep/ia32.cpp @@ -440,99 +440,35 @@ $memcpy_final: -/* -ALIGN + + // align to 8 bytes // this align may be slower in [32kb, 64kb] - mov ecx, 8 ; a trick that's faster than rep movsb... - sub ecx, edi ; align destination to qword - and ecx, 111b ; get the low bits - sub ebx, ecx ; update copy count - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_align_done - jmp ecx ; jump to array of movsb's - -align 4 - movsb - movsb - movsb - movsb - movsb - movsb - movsb - movsb + // rationale: always called - it may speed up TINY as well + // => we have to be careful to only copy min(8, 8-edi, size) +#define MEMCPY_ALIGN -MMX_MOVQ +// temporal (in-cache) copy; 64 bytes per iter // This is small block copy that uses the MMX registers to copy 8 bytes // at a time. It uses the "unrolled loop" optimization, and also uses // the software prefetch instruction to get the data into the cache. -align 16 -$memcpy_ic_1: ; 64-byte block copies, in-cache copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0, [esi+0] ; read 64 bits - movq mm1, [esi+8] - movq [edi+0], mm0 ; write 64 bits - movq [edi+8], mm1 ; note: the normal movq writes the - movq mm2, [esi+16] ; data to cache; a cache line will be - movq mm3, [esi+24] ; allocated as needed, to store the data - movq [edi+16], mm2 - movq [edi+24], mm3 - movq mm0, [esi+32] - movq mm1, [esi+40] - movq [edi+32], mm0 - movq [edi+40], mm1 - movq mm2, [esi+48] - movq mm3, [esi+56] - movq [edi+48], mm2 - movq [edi+56], mm3 - - add esi, 64 ; update source pointer - add edi, 64 ; update destination pointer - dec ecx ; count down - jnz $memcpy_ic_1 ; last 64-byte block? +#define MEMCPY_MOVQ -PREFETCH_MOVNTQ +// 64 bytes per iter; uncached non-temporal // For larger blocks, which will spill beyond the cache, it's faster to // use the Streaming Store instruction MOVNTQ. This write instruction // bypasses the cache and writes straight to main memory. This code also // uses the software prefetch instruction to pre-read the data. -align 16 -$memcpy_uc_1: ; 64-byte blocks, uncached copy - - prefetchnta [esi + (200*64/34+192)] ; start reading ahead - - movq mm0,[esi+0] ; read 64 bits - add edi,64 ; update destination pointer - movq mm1,[esi+8] - add esi,64 ; update source pointer - movq mm2,[esi-48] - movntq [edi-64], mm0 ; write 64 bits, bypassing the cache - movq mm0,[esi-40] ; note: movntq also prevents the CPU - movntq [edi-56], mm1 ; from READING the destination address - movq mm1,[esi-32] ; into the cache, only to be over-written - movntq [edi-48], mm2 ; so that also helps performance - movq mm2,[esi-24] - movntq [edi-40], mm0 - movq mm0,[esi-16] - movntq [edi-32], mm1 - movq mm1,[esi-8] - movntq [edi-24], mm2 - movntq [edi-16], mm0 - dec ecx - movntq [edi-8], mm1 - jnz $memcpy_uc_1 ; last 64-byte block? +#define MEMCPY_MOVNTQ -BLOCKPREFETCH_MOVNTQ // For the largest size blocks, a special technique called Block Prefetch // can be used to accelerate the read operations. 
Block Prefetch reads @@ -540,55 +476,81 @@ BLOCKPREFETCH_MOVNTQ // This is faster than using software prefetch, in this case. // The technique is great for getting maximum read bandwidth, // especially in DDR memory systems. +#define MEMCPY_BP_MOVNTQ + + + + + + + + +/* + + + +void amd_memcpy_skeleton(u8 *dest, const u8 *src, size_t n) +{ + __asm { + + mov ecx, [n] ; number of bytes to copy + mov edi, [dest] ; destination + mov esi, [src] ; source + mov ebx, ecx ; keep a copy of count + + cmp ecx, TINY_BLOCK_COPY + jb $memcpy_ic_3 ; tiny? skip mmx copy + + ALIGN + + mov ecx, ebx ; number of bytes left to copy + shr ecx, 6 ; get 64-byte block count + jz $memcpy_ic_2 ; finish the last few bytes + + cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy + jae $memcpy_uc_test + + MMX_MOVQ + +$memcpy_ic_2: + mov ecx, ebx ; has valid low 6 bits of the byte count +$memcpy_ic_3: + TINY(<=64) + + emms ; clean up the MMX state + sfence ; flush the write buffer + ret + +$memcpy_uc_test: + cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy + jae $memcpy_bp_1 + +$memcpy_64_test: + or ecx, ecx ; tail end of block prefetch will jump here + jz $memcpy_ic_2 ; no more 64-byte blocks left + +align 16 +$memcpy_uc_1: ; 64-byte blocks, uncached copy + PREFETCH_MOVNTQ + + jmp $memcpy_ic_2 ; almost done + $memcpy_bp_1: ; large blocks, block prefetch copy + BLOCKPREFETCH_MOVNTQ + // jumps to $memcpy_64_test when done + } +} - cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? - jl $memcpy_64_test ; no, back to regular uncached copy - - mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X - add esi, CACHEBLOCK * 64 ; move to the top of the block -align 16 -$memcpy_bp_2: - mov edx, [esi-64] ; grab one address per cache line - mov edx, [esi-128] ; grab one address per cache line - sub esi, 128 ; go reverse order - dec eax ; count down the cache lines - jnz $memcpy_bp_2 ; keep grabbing more lines into cache - - mov eax, CACHEBLOCK ; now that it's in cache, do the copy -align 16 -$memcpy_bp_3: - movq mm0, [esi ] ; read 64 bits - movq mm1, [esi+ 8] - movq mm2, [esi+16] - movq mm3, [esi+24] - movq mm4, [esi+32] - movq mm5, [esi+40] - movq mm6, [esi+48] - movq mm7, [esi+56] - add esi, 64 ; update source pointer - movntq [edi ], mm0 ; write 64 bits, bypassing cache - movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU - movntq [edi+16], mm2 ; from READING the destination address - movntq [edi+24], mm3 ; into the cache, only to be over-written, - movntq [edi+32], mm4 ; so that also helps performance - movntq [edi+40], mm5 - movntq [edi+48], mm6 - movntq [edi+56], mm7 - add edi, 64 ; update dest pointer - - dec eax ; count down - - jnz $memcpy_bp_3 ; keep copying - sub ecx, CACHEBLOCK ; update the 64-byte block count - jmp $memcpy_bp_1 ; keep processing chunks + total taken jmp (no ret, no tbl)+unconditional + 0..64 tiny 0 + 64..64k mmx 1+1 + 64k..192k movntq 1+1 + 192k..inf blockprefetch 1+1 */ -// note: tiny routine isn't entered if size > 64 (protects movsd array) - static i64 t0, t1; #define BEGIN\ @@ -605,6 +567,201 @@ static i64 t0, t1; __asm mov dword ptr t1, eax\ __asm mov dword ptr t1+4, edx\ __asm popad +static const size_t MOVNTQ_MIN_THRESHOLD_64 = 64*KiB / 64;// upper limit for movq/movq copy w/SW prefetch + +static const size_t BP_MIN_THRESHOLD_64 = 192*KiB / 64; // upper limit for movq/movntq w/SW prefetch + +// rationale: +// - prefer "jnz loop" style vs "jz end; jmp loop" to make it +// better for the branch prediction unit. 
+// - need __declspec(naked) because we ret in the middle of the function (avoids a jmp);
+//   disadvantage: the compiler cannot optimize parameter passing.
+
+void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
+{
+__asm {
+	BEGIN
+	mov		ecx, [esp+4+8]		// nbytes
+	mov		esi, [esp+4+4]		// src
+	mov		edi, [esp+4+0]		// dst
+
+	// align destination to 8 bytes: copy min((8 - edi) & 7, nbytes) head bytes
+	// by jumping into the movsb array below.
+	mov		eax, 8
+	sub		eax, edi
+	and		eax, 0x07
+	cmp		eax, ecx
+	cmova	eax, ecx
+	sub		ecx, eax
+	neg		eax
+	add		eax, offset $align_table_end
+	jmp		eax
+align 4
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+	movsb
+$align_table_end:
+
+	mov		ebx, ecx
+	shr		ecx, 6				// # whole 64-byte blocks
+
+tiny:
+	and		ebx, 63
+	mov		edx, ebx
+	shr		edx, 2				// dword count
+	neg		edx
+	add		edx, offset $movsd_table_end
+
+	mov		eax, _bp
+	cmp		ecx, BP_MIN_THRESHOLD_64
+	cmovb	eax, _movntq
+	cmp		ecx, MOVNTQ_MIN_THRESHOLD_64
+	cmovb	eax, _mmx
+	cmp		ecx, 64
+	cmovbe	eax, edx
+	jmp		eax
+
+align 8
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+	movsd
+$movsd_table_end:
+
+// TODO: move this calc into a register
+	mov		eax, ebx
+	and		eax, 11b
+	neg		eax
+	add		eax, offset $movsb_table_end
+	jmp		eax
+
+	movsb
+	movsb
+	movsb
+$movsb_table_end:
+	END
+	ret
+
+_mmx:
+	// we have >= 64 bytes (whole 64-byte blocks in ecx); in-cache movq copy
+align 16
+$movq_loop:
+	prefetchnta [esi + (200*64/34+192)]	// start reading ahead
+	movq	mm0, [esi+0]
+	movq	mm1, [esi+8]
+	movq	[edi+0], mm0
+	movq	[edi+8], mm1
+	movq	mm2, [esi+16]
+	movq	mm3, [esi+24]
+	movq	[edi+16], mm2
+	movq	[edi+24], mm3
+	movq	mm0, [esi+32]
+	movq	mm1, [esi+40]
+	movq	[edi+32], mm0
+	movq	[edi+40], mm1
+	movq	mm2, [esi+48]
+	movq	mm3, [esi+56]
+	movq	[edi+48], mm2
+	movq	[edi+56], mm3
+	add		esi, 64
+	add		edi, 64
+	dec		ecx
+	jnz		$movq_loop
+	emms
+	jmp		tiny
+
+_bp:
+	// we have >= 192 KiB; copy in 8 KiB chunks until less than one chunk remains
+$bp_process_chunk:
+	mov		eax, CACHEBLOCK / 2		// block prefetch loop, unrolled 2x
+	add		esi, CACHEBLOCK * 64	// move to the top of the block
+align 16
+	// touch one address per cache line, in reverse order (prevents HW prefetch)
+$bp_prefetch_chunk:
+	mov		edx, [esi-64]
+	mov		edx, [esi-128]
+	sub		esi, 128
+	dec		eax
+	jnz		$bp_prefetch_chunk
+	mov		eax, CACHEBLOCK			// now that it's in cache, do the copy
+align 16
+$bp_copy_block:
+	movq	mm0, [esi+ 0]
+	movq	mm1, [esi+ 8]
+	movq	mm2, [esi+16]
+	movq	mm3, [esi+24]
+	movq	mm4, [esi+32]
+	movq	mm5, [esi+40]
+	movq	mm6, [esi+48]
+	movq	mm7, [esi+56]
+	add		esi, 64
+	movntq	[edi+ 0], mm0
+	movntq	[edi+ 8], mm1
+	movntq	[edi+16], mm2
+	movntq	[edi+24], mm3
+	movntq	[edi+32], mm4
+	movntq	[edi+40], mm5
+	movntq	[edi+48], mm6
+	movntq	[edi+56], mm7
+	add		edi, 64
+	dec		eax
+	jnz		$bp_copy_block
+	sub		ecx, CACHEBLOCK			// update the 64-byte block count
+	cmp		ecx, CACHEBLOCK
+	jge		$bp_process_chunk		// at least one more full chunk left
+	sfence
+	emms
+	// don't fall into the movntq loop if no 64-byte blocks remain
+	cmp		ecx, 0
+	jz		tiny
+
+_movntq:
+	// we have >= 64 bytes left, in whole 64-byte blocks; streaming movntq copy
+align 16
+$movntq_loop:
+	prefetchnta [esi + (200*64/34+192)]	// start reading ahead
+	movq	mm0, [esi+0]
+	add		edi, 64
+	movq	mm1, [esi+8]
+	add		esi, 64
+	movq	mm2, [esi-48]
+	movntq	[edi-64], mm0
+	movq	mm0, [esi-40]
+	movntq	[edi-56], mm1
+	movq	mm1, [esi-32]
+	movntq	[edi-48], mm2
+	movq	mm2, [esi-24]
+	movntq	[edi-40], mm0
+	movq	mm0, [esi-16]
+	movntq	[edi-32], mm1
+	movq	mm1, [esi-8]
+	movntq	[edi-24], mm2
+	movntq	[edi-16], mm0
+	dec		ecx
+	movntq	[edi-8], mm1
+	jnz		$movntq_loop
+	sfence
+	emms
+	jmp		tiny
+}	// __asm
+}	// ia32_memcpy
+
+
 
 static void dtable_brep(u8* dst, const u8* src, size_t nbytes)
 {
@@ -751,7 +908,7 @@ rep movsb
 static void bloop(u8* dst, const u8* src, size_t nbytes)
 {
 	BEGIN
-	for(int i = 0; i < nbytes; i++)
+	for(size_t i = 0; i < nbytes; i++)
 		*dst++ = *src++;
 	END
 }
@@ -759,8 +916,8 @@
static void bloop(u8* dst, const u8* src, size_t nbytes) static void dloop_bloop(u8* dst, const u8* src, size_t nbytes) { BEGIN - int dwords = nbytes/4; - for(int i = 0; i < dwords; i++) + size_t dwords = nbytes/4; + for(size_t i = 0; i < dwords; i++) { *(u32*)dst = *(u32*)src; dst += 4; src += 4; @@ -803,7 +960,7 @@ static u8* setup_tiny_buf(int alignment, int misalign, bool is_dst) static void verify_tiny_buf(u8* p, size_t l, const char* culprit) { - for(int i = 0; i < l; i++) + for(size_t i = 0; i < l; i++) if(p[i] != i) debug_assert(0); @@ -853,7 +1010,7 @@ misalign 0 dtable_btable: 6670 drep_brep: 7384 -in release mode over all misaligns 0..63 +in release mode over all misaligns 0..63 and all sizes 1..64 total difference WRT baseline (%) dtable_btable: 434176 @@ -864,11 +1021,31 @@ brep: 546752 26 dloop_bloop: 679426 56.5 bloop: 1537061 -in debug mode over all misaligns 0..63 +in debug mode over all misaligns 0..63 and all sizes 1..64 dtable_btable: 425728 0 dtable_brep: 457153 7.3 drep_brep: 494368 16.1 brep: 538729 26.5 + +p3, debug mode over all misaligns 0..63 and all sizes 1..64 + dtable_btable: 1099605 + dtable_brep: 1111757 1.1 + memcpy: 1124093 2.2 + drep_brep: 1138089 + brep: 1313728 + dloop_bloop: 1756000 + bloop: 2405315 + +p3, release mode over all misaligns 0..63 and all sizes 1..64 + dtable_btable: 1092784 + dtable_brep: 1109136 1.5 + memcpy: 1116306 2.2 + drep_brep: 1127606 + dloop_bloop: 1129105 + brep: 1308052 + bloop: 1588062 + + */ static void test_with_misalign(int misalign) @@ -948,106 +1125,10 @@ static int test() -/* -void* amd_memcpy(void* dst, const void* src, size_t nbytes) -{ - __asm { - - mov ecx, [n] ; number of bytes to copy - mov edi, [dest] ; destination - mov esi, [src] ; source - mov ebx, ecx ; keep a copy of count - - cmp ecx, TINY_BLOCK_COPY - jb $memcpy_ic_3 ; tiny? skip mmx copy - - **ALIGN** - - ; destination is dword aligned - mov ecx, ebx ; number of bytes left to copy - shr ecx, 6 ; get 64-byte block count - jz $memcpy_ic_2 ; finish the last few bytes - - cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy - jae $memcpy_uc_test - - **MMX_MOVQ** - -$memcpy_ic_2: - mov ecx, ebx ; has valid low 6 bits of the byte count -$memcpy_ic_3: - shr ecx, 2 ; dword count - and ecx, 1111b ; only look at the "remainder" bits - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_last_few - jmp ecx ; jump to array of movsd's - -$memcpy_uc_test: - cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy - jae $memcpy_bp_1 - -$memcpy_64_test: - or ecx, ecx ; tail end of block prefetch will jump here - jz $memcpy_ic_2 ; no more 64-byte blocks left - - **PREFETCH_MOVNTQ** - jmp $memcpy_ic_2 ; almost done - -$memcpy_bp_1: - **BLOCKPREFETCH_MOVNTQ** - -// The smallest copy uses the X86 "movsd" instruction, in an optimized -// form which is an "unrolled loop". Then it handles the last few bytes. 
-align 4 - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - movsd - -$memcpy_last_few: ; dword aligned from before movsd's - mov ecx, ebx ; has valid low 2 bits of the byte count - and ecx, 11b ; the last few cows must come home - jz $memcpy_final ; no more, let's leave - rep movsb ; the last 1, 2, or 3 bytes - -$memcpy_final: - emms ; clean up the MMX state - sfence ; flush the write buffer - mov eax, [dest] ; ret value = destination pointer - - } -} -*/ - - -void ia32_memcpy(void* dst, const void* src, size_t nbytes) -{ - // large - if(nbytes >= 64*KiB) - ia32_memcpy_nt(dst, src, nbytes); - // small - // TODO: implement small memcpy - else - memcpy(dst, src, nbytes); -} - - //----------------------------------------------------------------------------- // support code for lock-free primitives //-----------------------------------------------------------------------------
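
Note on the copy-path selection in the new ia32_memcpy: the dispatch is easier to follow in plain C++. The following is a minimal sketch, not the committed routine; it assumes the thresholds defined in the patch (tiny path for the sub-64-byte head/tail, movq/movq in-cache copy below 64 KiB, movq/movntq streaming copy below 192 KiB, block prefetch above that), and copy_tiny, copy_mmx, copy_movntq and copy_block_prefetch are hypothetical stand-ins for the four assembly paths.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical stand-ins for the four assembly paths; they all defer to
// memcpy here so the sketch is self-contained and runnable.
static void copy_tiny          (void* d, const void* s, size_t n) { std::memcpy(d, s, n); }
static void copy_mmx           (void* d, const void* s, size_t n) { std::memcpy(d, s, n); }
static void copy_movntq        (void* d, const void* s, size_t n) { std::memcpy(d, s, n); }
static void copy_block_prefetch(void* d, const void* s, size_t n) { std::memcpy(d, s, n); }

static const size_t KiB = 1024;

// Mirrors the selection policy: align the destination to 8 bytes first,
// then choose a technique from the number of whole 64-byte blocks.
void ia32_memcpy_sketch(void* dst, const void* src, size_t nbytes)
{
	uint8_t* d = (uint8_t*)dst;
	const uint8_t* s = (const uint8_t*)src;

	// head: at most min((8 - dst) & 7, nbytes) bytes, so that the main
	// loops see an 8-byte-aligned destination.
	size_t head = (8 - (uintptr_t)d) & 7;
	if(head > nbytes)
		head = nbytes;
	copy_tiny(d, s, head);
	d += head; s += head; nbytes -= head;

	const size_t blocks = nbytes / 64;	// whole 64-byte blocks
	const size_t tail   = nbytes % 64;	// 0..63 bytes, tiny path again

	if(blocks != 0)
	{
		if(blocks < 64*KiB / 64)				// MOVNTQ_MIN_THRESHOLD_64
			copy_mmx(d, s, blocks*64);			// movq loads + movq stores (in-cache)
		else if(blocks < 192*KiB / 64)			// BP_MIN_THRESHOLD_64
			copy_movntq(d, s, blocks*64);		// movq + movntq with SW prefetch
		else
			copy_block_prefetch(d, s, blocks*64);
	}
	copy_tiny(d + blocks*64, s + blocks*64, tail);
}

The head computation follows the patch's rationale of copying only min(8, 8-edi, size) bytes before the main loops, so the alignment step can never run past a tiny buffer.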
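
The computed jumps into the movsb/movsd arrays (negate the count, add the end-of-array label, jmp) are the assembly form of a fall-through switch: exactly count single-byte instructions execute, with no loop and no per-byte branch. A hedged C++ equivalent of the 0..7-byte head copy, using the hypothetical name copy_head:

#include <cstddef>
#include <cstdint>

// Copies n (0..7) bytes without a loop by falling through a switch,
// the C++ analogue of jumping into the patch's array of movsb instructions.
// The bytes are written in descending index order, which is fine for
// non-overlapping buffers.
static void copy_head(uint8_t* dst, const uint8_t* src, size_t n)
{
	switch(n)
	{
	case 7: dst[6] = src[6]; // fall through
	case 6: dst[5] = src[5]; // fall through
	case 5: dst[4] = src[4]; // fall through
	case 4: dst[3] = src[3]; // fall through
	case 3: dst[2] = src[2]; // fall through
	case 2: dst[1] = src[1]; // fall through
	case 1: dst[0] = src[0]; // fall through
	case 0: break;
	}
}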
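
Finally, the block prefetch idea for very large copies: first touch one word of every 64-byte cache line of an 8 KiB chunk, walking backwards so the hardware prefetcher does not compete for the bus, then write the now-cached chunk out with non-temporal stores. The sketch below illustrates the technique with SSE2 intrinsics rather than the patch's MMX movq/movntq; bp_copy and bp_copy_chunk are hypothetical names, and the code assumes a 16-byte-aligned destination.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <emmintrin.h>	// SSE2: _mm_loadu_si128, _mm_stream_si128, _mm_sfence

// One ~8 KiB chunk (the patch's CACHEBLOCK * 64 bytes).
// Assumes dst is 16-byte aligned and chunk_bytes is a multiple of 64.
static void bp_copy_chunk(uint8_t* dst, const uint8_t* src, size_t chunk_bytes)
{
	// Pass 1: read one dword per 64-byte cache line, in reverse order.
	// Walking backwards keeps the hardware prefetcher out of the way, so the
	// chunk streams into cache at close to peak read bandwidth.
	volatile uint32_t sink = 0;
	for(size_t off = chunk_bytes; off != 0; off -= 64)
		sink += *(const uint32_t*)(src + off - 64);

	// Pass 2: the chunk is now cached; write it with non-temporal stores
	// (the intrinsic form of movntq/movntdq), bypassing the cache on the
	// destination side.
	for(size_t off = 0; off < chunk_bytes; off += 16)
	{
		const __m128i v = _mm_loadu_si128((const __m128i*)(src + off));
		_mm_stream_si128((__m128i*)(dst + off), v);
	}
}

static void bp_copy(uint8_t* dst, const uint8_t* src, size_t nbytes)
{
	const size_t chunk_bytes = 8*1024;
	while(nbytes >= chunk_bytes)
	{
		bp_copy_chunk(dst, src, chunk_bytes);
		dst += chunk_bytes; src += chunk_bytes; nbytes -= chunk_bytes;
	}
	std::memcpy(dst, src, nbytes);	// remainder: an ordinary copy is fine here
	_mm_sfence();	// make the non-temporal stores visible before returning
}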