; forked from 0ad/0ad — commit 2ab9584840 (olsner)
; premake stuff: call assembler objects *.asm.obj instead of *.obj (so that
; ia32.asm doesn't collide with ia32.cpp). ia32.asm: use .text section for
; proper linkage. This was SVN commit r2709.
; 268 lines, 4.1 KiB, NASM
CACHEBLOCK equ 128
|
|
BP_MIN_THRESHOLD_64 equ 192*1024
|
|
MOVNTQ_MIN_THRESHOLD_64 equ 64*1024
|
|
|
|
section .text
|
|
|
|
%macro MC_UNROLLED_MOVSD 0
|
|
and ebx, 63
|
|
mov edx, ebx
|
|
shr edx, 2 ; dword count
|
|
neg edx
|
|
add edx, %%movsd_table_end
|
|
jmp edx
|
|
|
|
align 8
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
movsd
|
|
%%movsd_table_end:
|
|
|
|
mov eax, ebx
|
|
and eax, 3
|
|
neg eax
|
|
add eax, %%movsb_table_end
|
|
jmp eax
|
|
|
|
movsb
|
|
movsb
|
|
movsb
|
|
%%movsb_table_end:
|
|
%endm
|
|
|
|
|
|
%macro MC_ALIGN 0
|
|
mov eax, 8
|
|
sub eax, edi
|
|
and eax, 7
|
|
cmp eax, ecx
|
|
cmova eax, ecx
|
|
sub ecx, eax
|
|
neg eax
|
|
add eax, %%align_table_end
|
|
jmp eax
|
|
align 4
|
|
movsb
|
|
movsb
|
|
movsb
|
|
movsb
|
|
movsb
|
|
movsb
|
|
movsb
|
|
movsb
|
|
%%align_table_end:
|
|
%endm
|
|
|
|
|
|
%macro MC_MOVQ 0
|
|
align 16
|
|
%%1:
|
|
prefetchnta [esi + (200*64/34+192)]
|
|
movq mm0, [esi+0]
|
|
movq mm1, [esi+8]
|
|
movq [edi+0], mm0
|
|
movq [edi+8], mm1
|
|
movq mm2, [esi+16]
|
|
movq mm3, [esi+24]
|
|
movq [edi+16], mm2
|
|
movq [edi+24], mm3
|
|
movq mm0, [esi+32]
|
|
movq mm1, [esi+40]
|
|
movq [edi+32], mm0
|
|
movq [edi+40], mm1
|
|
movq mm2, [esi+48]
|
|
movq mm3, [esi+56]
|
|
movq [edi+48], mm2
|
|
movq [edi+56], mm3
|
|
add esi, 64
|
|
add edi, 64
|
|
dec ecx
|
|
jnz %%1
|
|
%endm
|
|
|
|
|
|
; we have >= 8kb. until no more 8kb blocks
|
|
%macro MC_BP_MOVNTQ 0
|
|
%%prefetch_and_copy_chunk:
|
|
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
|
|
add esi, CACHEBLOCK * 64 ; move to the top of the block
|
|
align 16
|
|
; touch each cache line in reverse order (prevents HW prefetch)
|
|
%%prefetch_chunk:
|
|
mov edx, [esi-64]
|
|
mov edx, [esi-128]
|
|
sub esi, 128
|
|
dec eax
|
|
jnz %%prefetch_chunk
|
|
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
|
|
align 16
|
|
%%copy_block:
|
|
movq mm0, [esi+ 0]
|
|
movq mm1, [esi+ 8]
|
|
movq mm2, [esi+16]
|
|
movq mm3, [esi+24]
|
|
movq mm4, [esi+32]
|
|
movq mm5, [esi+40]
|
|
movq mm6, [esi+48]
|
|
movq mm7, [esi+56]
|
|
add esi, 64
|
|
movntq [edi+ 0], mm0
|
|
movntq [edi+ 8], mm1
|
|
movntq [edi+16], mm2
|
|
movntq [edi+24], mm3
|
|
movntq [edi+32], mm4
|
|
movntq [edi+40], mm5
|
|
movntq [edi+48], mm6
|
|
movntq [edi+56], mm7
|
|
add edi, 64
|
|
dec eax
|
|
jnz %%copy_block
|
|
sub ecx, CACHEBLOCK ; update the 64-byte block count
|
|
cmp ecx, CACHEBLOCK
|
|
jl %%prefetch_and_copy_chunk
|
|
%endm
|
|
|
|
|
|
; we have >= 64, 64B BLOCKS
|
|
%macro MC_MOVNTQ 0
|
|
align 16
|
|
%%1:
|
|
prefetchnta [esi + (200*64/34+192)]
|
|
movq mm0,[esi+0]
|
|
add edi,64
|
|
movq mm1,[esi+8]
|
|
add esi,64
|
|
movq mm2,[esi-48]
|
|
movntq [edi-64], mm0
|
|
movq mm0,[esi-40]
|
|
movntq [edi-56], mm1
|
|
movq mm1,[esi-32]
|
|
movntq [edi-48], mm2
|
|
movq mm2,[esi-24]
|
|
movntq [edi-40], mm0
|
|
movq mm0,[esi-16]
|
|
movntq [edi-32], mm1
|
|
movq mm1,[esi-8]
|
|
movntq [edi-24], mm2
|
|
movntq [edi-16], mm0
|
|
dec ecx
|
|
movntq [edi-8], mm1
|
|
jnz %%1
|
|
%endm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
|
|
mov ecx, [esp+4+8] ; nbytes
|
|
mov esi, [esp+4+4] ; src
|
|
mov edi, [esp+4+0] ; dst
|
|
|
|
MC_ALIGN
|
|
|
|
mov ebx, ecx
|
|
shr ecx, 6 ; # blocks
|
|
|
|
mov eax, _bp
|
|
cmp ecx, BP_MIN_THRESHOLD_64
|
|
mov edx, _movntq
|
|
cmovb eax, edx
|
|
cmp ecx, MOVNTQ_MIN_THRESHOLD_64
|
|
mov edx, _mmx
|
|
cmovb eax, edx
|
|
cmp ecx, 64
|
|
jbe tiny
|
|
jmp eax
|
|
|
|
tiny:
|
|
MC_UNROLLED_MOVSD
|
|
ret
|
|
|
|
_mmx:
|
|
MC_MOVQ
|
|
emms
|
|
jmp tiny
|
|
|
|
_bp:
|
|
MC_BP_MOVNTQ
|
|
sfence
|
|
emms
|
|
; protect routine below
|
|
cmp ecx, 0
|
|
jz tiny
|
|
|
|
_movntq:
|
|
|
|
sfence
|
|
emms
|
|
jmp tiny
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
; extern "C" int __cdecl get_cur_processor_id();
|
|
global _get_cur_processor_id
|
|
_get_cur_processor_id:
|
|
push ebx
|
|
push 1
|
|
pop eax
|
|
cpuid
|
|
shr ebx, 24
|
|
mov eax, ebx ; ebx[31:24]
|
|
pop ebx
|
|
|
|
|
|
|
|
|
|
|
|
; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
|
|
global _ia32_control87
|
|
_ia32_control87:
|
|
push eax
|
|
fnstcw [esp]
|
|
pop eax ; old_cw
|
|
mov ecx, [esp+4] ; new_cw
|
|
mov edx, [esp+8] ; mask
|
|
and ecx, edx ; new_cw & mask
|
|
not edx ; ~mask
|
|
and eax, edx ; old_cw & ~mask
|
|
or eax, ecx ; (old_cw & ~mask) | (new_cw & mask)
|
|
push edx
|
|
fldcw [esp]
|
|
pop edx
|
|
xor eax, eax ; return value |