
more WIP; initial memcpy code

This was SVN commit r2700.
commit 15ca12311b (parent aa118403bb)
janwas, 2005-09-12 22:13:03 +00:00


@@ -440,99 +440,35 @@ $memcpy_final:
/*
ALIGN
// align to 8 bytes
// this align may be slower in [32kb, 64kb]
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_align_done
jmp ecx ; jump to array of movsb's
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
// rationale: always called - it may speed up TINY as well
// => we have to be careful to copy only min((8 - edi) & 7, size) bytes
#define MEMCPY_ALIGN
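// The alignment step as a minimal C sketch (align_head is a hypothetical
// helper name, not part of this commit; the typedefs are reused by the
// later sketches). It copies min((8 - (dst & 7)) & 7, nbytes) head bytes
// so that the main loop sees an 8-byte-aligned destination.
#include <stddef.h>	// size_t
#include <stdint.h>	// uintptr_t
typedef unsigned char u8;	// matching the types used in this file
typedef unsigned int u32;
static size_t align_head(u8** dst, const u8** src, size_t nbytes)
{
	size_t head = (8 - ((uintptr_t)*dst & 7)) & 7;	// bytes to 8B boundary
	if(head > nbytes)
		head = nbytes;		// copy at most nbytes - see rationale above
	nbytes -= head;
	while(head--)			// stands in for the movsb jump table
		*(*dst)++ = *(*src)++;
	return nbytes;			// bytes left for the main loop
}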
MMX_MOVQ
// temporal (in-cache) copy; 64 bytes per iter
// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0, [esi+0] ; read 64 bits
movq mm1, [esi+8]
movq [edi+0], mm0 ; write 64 bits
movq [edi+8], mm1 ; note: the normal movq writes the
movq mm2, [esi+16] ; data to cache; a cache line will be
movq mm3, [esi+24] ; allocated as needed, to store the data
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_ic_1 ; last 64-byte block?
#define MEMCPY_MOVQ
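// Roughly the same in-cache loop as a C sketch with MMX/SSE intrinsics,
// assuming a 32-bit target where __m64 is available (copy64_in_cache is
// a hypothetical helper name; continues the sketch above):
#include <xmmintrin.h>	// _mm_prefetch, _mm_sfence, _mm_stream_pi; pulls in __m64/_mm_empty
static void copy64_in_cache(u8* dst, const u8* src, size_t blocks)
{
	__m64* d = (__m64*)dst;
	const __m64* s = (const __m64*)src;
	for(; blocks != 0; blocks--)
	{
		_mm_prefetch((const char*)s + (200*64/34+192), _MM_HINT_NTA);
		for(int i = 0; i < 8; i++)	// 8 x 8 bytes = one 64-byte block
			d[i] = s[i];			// movq load + movq store (through cache)
		d += 8;
		s += 8;
	}
	_mm_empty();	// emms: reset the MMX state
}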
PREFETCH_MOVNTQ
// 64 bytes per iter; uncached non-temporal
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0,[esi+0] ; read 64 bits
add edi,64 ; update destination pointer
movq mm1,[esi+8]
add esi,64 ; update source pointer
movq mm2,[esi-48]
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
movq mm0,[esi-40] ; note: movntq also prevents the CPU
movntq [edi-56], mm1 ; from READING the destination address
movq mm1,[esi-32] ; into the cache, only to be over-written
movntq [edi-48], mm2 ; so that also helps performance
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
movntq [edi-8], mm1
jnz $memcpy_uc_1 ; last 64-byte block?
#define MEMCPY_MOVNTQ
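// The non-temporal variant as a C sketch: _mm_stream_pi is the movntq
// store, and the trailing sfence orders the streaming stores
// (copy64_non_temporal is a hypothetical helper name; reuses the
// includes and types from the sketches above):
static void copy64_non_temporal(u8* dst, const u8* src, size_t blocks)
{
	__m64* d = (__m64*)dst;
	const __m64* s = (const __m64*)src;
	for(; blocks != 0; blocks--)
	{
		_mm_prefetch((const char*)s + (200*64/34+192), _MM_HINT_NTA);
		for(int i = 0; i < 8; i++)
			_mm_stream_pi(d + i, s[i]);	// movntq: write around the cache
		d += 8;
		s += 8;
	}
	_mm_sfence();	// flush the write-combining buffers
	_mm_empty();
}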
BLOCKPREFETCH_MOVNTQ
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
@@ -540,55 +476,81 @@ BLOCKPREFETCH_MOVNTQ
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
#define MEMCPY_BP_MOVNTQ
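// Block prefetch as a C sketch, building on copy64_non_temporal above;
// CACHEBLOCK = 128 blocks (8KB chunks) is an assumed value, not one
// fixed by this commit:
static void copy_block_prefetch(u8* dst, const u8* src, size_t blocks)
{
	enum { CACHEBLOCK = 128 };	// 64-byte blocks per chunk (assumed: 8KB)
	while(blocks >= CACHEBLOCK)
	{
		// touch one dword per cache line, in reverse order so the
		// hardware prefetcher does not interfere
		for(int line = CACHEBLOCK-1; line >= 0; line--)
			(void)*(const volatile u32*)(src + line*64);
		// the chunk is now in cache; stream it out with movntq
		copy64_non_temporal(dst, src, CACHEBLOCK);
		dst += CACHEBLOCK*64;
		src += CACHEBLOCK*64;
		blocks -= CACHEBLOCK;
	}
	if(blocks)	// leftover partial chunk: plain movntq loop
		copy64_non_temporal(dst, src, blocks);
}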
/*
void amd_memcpy_skeleton(u8 *dest, const u8 *src, size_t n)
{
__asm {
mov ecx, [n] ; number of bytes to copy
mov edi, [dest] ; destination
mov esi, [src] ; source
mov ebx, ecx ; keep a copy of count
cmp ecx, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy
ALIGN
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes
cmp ecx, IN_CACHE_COPY/64 ; too big for cache? use uncached copy
jae $memcpy_uc_test
MMX_MOVQ
$memcpy_ic_2:
mov ecx, ebx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
TINY(<=64)
emms ; clean up the MMX state
sfence ; flush the write buffer
ret
$memcpy_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_bp_1
$memcpy_64_test:
or ecx, ecx ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy
PREFETCH_MOVNTQ
jmp $memcpy_ic_2 ; almost done
$memcpy_bp_1: ; large blocks, block prefetch copy
BLOCKPREFETCH_MOVNTQ
// jumps to $memcpy_64_test when done
}
}
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
jl $memcpy_64_test ; no, back to regular uncached copy
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
mov edx, [esi-64] ; grab one address per cache line
mov edx, [esi-128] ; grab one address per cache line
sub esi, 128 ; go reverse order
dec eax ; count down the cache lines
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
movq mm0, [esi ] ; read 64 bits
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64 ; update source pointer
movntq [edi ], mm0 ; write 64 bits, bypassing cache
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
movntq [edi+16], mm2 ; from READING the destination address
movntq [edi+24], mm3 ; into the cache, only to be over-written,
movntq [edi+32], mm4 ; so that also helps performance
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64 ; update dest pointer
dec eax ; count down
jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing chunks
size        routine         total taken jmp (no ret, no tbl) + unconditional
0..64       tiny            0
64..64k     mmx             1+1
64k..192k   movntq          1+1
192k..inf   blockprefetch   1+1
*/
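// Put together, the dispatch the table above describes looks like this
// as a C sketch, using the hypothetical helpers from the earlier
// sketches (thresholds in bytes rather than 64-byte block counts):
static void memcpy_dispatch(u8* dst, const u8* src, size_t nbytes)
{
	size_t blocks = nbytes / 64;	// whole 64-byte blocks (0 if tiny)
	if(blocks != 0)
	{
		if(nbytes < 64*1024)
			copy64_in_cache(dst, src, blocks);		// fits in cache
		else if(nbytes < 192*1024)
			copy64_non_temporal(dst, src, blocks);	// spills: movntq
		else
			copy_block_prefetch(dst, src, blocks);	// huge: block prefetch
		dst += blocks*64;
		src += blocks*64;
		nbytes &= 63;
	}
	while(nbytes--)	// tail: stands in for the movsd/movsb jump tables
		*dst++ = *src++;
}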
// note: tiny routine isn't entered if size > 64 (protects movsd array)
static i64 t0, t1;
#define BEGIN\
@@ -605,6 +567,201 @@ static i64 t0, t1;
__asm mov dword ptr t1, eax\
__asm mov dword ptr t1+4, edx\
__asm popad
static const size_t MOVNTQ_MIN_THRESHOLD_64 = 64*KiB / 64;// upper limit for movq/movq copy w/SW prefetch
static const size_t BP_MIN_THRESHOLD_64 = 192*KiB / 64; // upper limit for movq/movntq w/SW prefetch
// rationale:
// - prefer "jnz loop" over "jz end; jmp loop": one conditional branch
//   per iteration is easier on the branch prediction unit.
// - need __declspec(naked) because we ret in the middle of the function
//   (avoids a jmp); disadvantage: the compiler cannot optimize
//   parameter passing.
void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
{
__asm {
BEGIN
mov ecx, [esp+4+8] ;// nbytes
mov esi, [esp+4+4] ;// src
mov edi, [esp+4+0] ;// dst
mov eax, 8
sub eax, edi
and eax, 0x07
cmp eax, ecx
cmova eax, ecx
sub ecx, eax
neg eax
add eax, offset $align_table_end
jmp eax
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$align_table_end:
mov ebx, ecx
shr ecx, 6 ; # blocks
tiny:
and ebx, 63
mov edx, ebx
shr edx, 2 ; dword count
neg edx
add edx, offset $movsd_table_end
mov eax, _bp
cmp ecx, BP_MIN_THRESHOLD_64
cmovb eax, _movntq
cmp ecx, MOVNTQ_MIN_THRESHOLD_64
cmovb eax, _mmx
cmp ecx, 0 ; no whole 64-byte blocks?
cmove eax, edx ; then only the tail tables below are needed
jmp eax
align 8
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
$movsd_table_end:
// TODO: move this calc into register
mov eax, ebx
and eax, 11b
neg eax
add eax, offset $movsb_table_end
jmp eax
movsb
movsb
movsb
$movsb_table_end:
END
ret
_mmx:
// we have >= 64
align 16
$movq_loop:
prefetchnta [esi + (200*64/34+192)]
movq mm0, [esi+0]
movq mm1, [esi+8]
movq [edi+0], mm0
movq [edi+8], mm1
movq mm2, [esi+16]
movq mm3, [esi+24]
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64
add edi, 64
dec ecx
jnz $movq_loop
emms
jmp tiny
_bp:
// we have >= 8KB; loop until no more full 8KB chunks remain
$bp_process_chunk:
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
/* touch each cache line in reverse order (prevents HW prefetch) */
$bp_prefetch_chunk:
mov edx, [esi-64]
mov edx, [esi-128]
sub esi, 128
dec eax
jnz $bp_prefetch_chunk
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$bp_copy_block:
movq mm0, [esi+ 0]
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64
movntq [edi+ 0], mm0
movntq [edi+ 8], mm1
movntq [edi+16], mm2
movntq [edi+24], mm3
movntq [edi+32], mm4
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64
dec eax
jnz $bp_copy_block
sub ecx, CACHEBLOCK ; update the 64-byte block count
cmp ecx, CACHEBLOCK ; another full 8KB chunk left?
jge $bp_process_chunk ; yes: keep block-prefetching
sfence
emms
// protect the movntq routine below: skip it if no 64-byte blocks remain
cmp ecx, 0
jz tiny
_movntq:
// we have >= 64 bytes, in whole 64-byte blocks (ecx = block count)
align 16
$movntq_loop:
prefetchnta [esi + (200*64/34+192)]
movq mm0,[esi+0]
add edi,64
movq mm1,[esi+8]
add esi,64
movq mm2,[esi-48]
movntq [edi-64], mm0
movq mm0,[esi-40]
movntq [edi-56], mm1
movq mm1,[esi-32]
movntq [edi-48], mm2
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
movntq [edi-8], mm1
jnz $movntq_loop
sfence
emms
jmp tiny
} // __asm
} // ia32_memcpy
static void dtable_brep(u8* dst, const u8* src, size_t nbytes)
{
@@ -751,7 +908,7 @@ rep movsb
static void bloop(u8* dst, const u8* src, size_t nbytes)
{
BEGIN
for(int i = 0; i < nbytes; i++)
for(size_t i = 0; i < nbytes; i++)
*dst++ = *src++;
END
}
@@ -759,8 +916,8 @@ static void bloop(u8* dst, const u8* src, size_t nbytes)
static void dloop_bloop(u8* dst, const u8* src, size_t nbytes)
{
BEGIN
int dwords = nbytes/4;
for(int i = 0; i < dwords; i++)
size_t dwords = nbytes/4;
for(size_t i = 0; i < dwords; i++)
{
*(u32*)dst = *(u32*)src;
dst += 4; src += 4;
@@ -803,7 +960,7 @@ static u8* setup_tiny_buf(int alignment, int misalign, bool is_dst)
static void verify_tiny_buf(u8* p, size_t l, const char* culprit)
{
for(int i = 0; i < l; i++)
for(size_t i = 0; i < l; i++)
if(p[i] != i)
debug_assert(0);
@@ -853,7 +1010,7 @@ misalign 0
dtable_btable: 6670
drep_brep: 7384
in release mode over all misaligns 0..63
in release mode over all misaligns 0..63 and all sizes 1..64
total difference WRT baseline (%)
dtable_btable: 434176
@@ -864,11 +1021,31 @@ brep: 546752 26
dloop_bloop: 679426 56.5
bloop: 1537061
in debug mode over all misaligns 0..63
in debug mode over all misaligns 0..63 and all sizes 1..64
dtable_btable: 425728 0
dtable_brep: 457153 7.3
drep_brep: 494368 16.1
brep: 538729 26.5
p3, debug mode over all misaligns 0..63 and all sizes 1..64
dtable_btable: 1099605
dtable_brep: 1111757 1.1
memcpy: 1124093 2.2
drep_brep: 1138089
brep: 1313728
dloop_bloop: 1756000
bloop: 2405315
p3, release mode over all misaligns 0..63 and all sizes 1..64
dtable_btable: 1092784
dtable_brep: 1109136 1.5
memcpy: 1116306 2.2
drep_brep: 1127606
dloop_bloop: 1129105
brep: 1308052
bloop: 1588062
*/
static void test_with_misalign(int misalign)
@@ -948,106 +1125,10 @@ static int test()
/*
void* amd_memcpy(void* dst, const void* src, size_t nbytes)
{
__asm {
mov ecx, [nbytes] ; number of bytes to copy
mov edi, [dst] ; destination
mov esi, [src] ; source
mov ebx, ecx ; keep a copy of count
cmp ecx, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy
ALIGN
; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes
cmp ecx, IN_CACHE_COPY/64 ; too big for cache? use uncached copy
jae $memcpy_uc_test
MMX_MOVQ
$memcpy_ic_2:
mov ecx, ebx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
$memcpy_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_bp_1
$memcpy_64_test:
or ecx, ecx ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left
PREFETCH_MOVNTQ
jmp $memcpy_ic_2 ; almost done
$memcpy_bp_1:
BLOCKPREFETCH_MOVNTQ
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
align 4
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
$memcpy_last_few: ; dword aligned from before movsd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
emms ; clean up the MMX state
sfence ; flush the write buffer
mov eax, [dst] ; ret value = destination pointer
}
}
*/
void ia32_memcpy(void* dst, const void* src, size_t nbytes)
{
// large
if(nbytes >= 64*KiB)
ia32_memcpy_nt(dst, src, nbytes);
// small
// TODO: implement small memcpy
else
memcpy(dst, src, nbytes);
}
//-----------------------------------------------------------------------------
// support code for lock-free primitives
//-----------------------------------------------------------------------------