; source/lib/sysdep/ia32.asm (forked from 0ad/0ad)
; set section attributes
section .data data align=32 use32
section .bss bss align=16 use32
section .text code align=64 use32
; activate .text (needs to be separate because __SECT__ will otherwise
; complain that the above definition is redeclaring attributes)
section .text
; Usage:
; use sym(ia32_cap) instead of _ia32_cap - on platforms whose ABI prefixes
; C symbols with an underscore, sym() adds it automatically; on others it doesn't.
%ifdef DONT_USE_UNDERLINE
%define sym(a) a
%else
%define sym(a) _ %+ a
%endif
;-------------------------------------------------------------------------------
; fast general memcpy
;-------------------------------------------------------------------------------
; drop-in replacement for libc memcpy(). only requires CPU support for
; MMX (by now universal). highly optimized for Athlon and Pentium III
; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
; for details, see accompanying article.
; if transfer size is at least this much,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD equ 64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD equ 256*1024
; maximum that can be copied by IC_TINY.
IC_TINY_MAX equ 63
; size of one block prefetch chunk.
BP_SIZE equ 8*1024
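; illustrative C-level summary of how these thresholds partition transfer sizes
; (a rough sketch, not part of the original; the helper names are hypothetical
; stand-ins for the code paths in ia32_memcpy below, and the real comparisons
; use the size rounded down to a multiple of 64 and masked by SSE support):
;   if (nbytes <= IC_TINY_MAX)            /* 0..63 B: unrolled dword/byte copy        */
;       ic_tiny();
;   else if (nbytes >= BP_THRESHOLD)      /* >= 256 KiB: misses L2 -> block prefetch  */
;       uc_bp_movntq();
;   else if (nbytes >= UC_THRESHOLD)      /* >= 64 KiB: misses L1 -> MOVNTQ+prefetch  */
;       uc_movntq();
;   else                                  /* 64 B..64 KiB: fits in L1 -> MMX MOVQ     */
;       ic_movq();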
;------------------------------------------------------------------------------
; [p3] replicating this instead of jumping to it from tailN
; saves 1 clock and costs (7-2)*2 bytes code.
%macro EPILOG 0
pop esi
pop edi
mov eax, [esp+4] ; return dst
ret
%endm
align 64
tail1:
mov al, [esi+ecx*4]
mov [edi+ecx*4], al
align 4
tail0:
EPILOG
align 8
tail3:
; [p3] 2 reads followed by 2 writes is better than
; R/W interleaved and RRR/WWW
mov al, [esi+ecx*4+2]
mov [edi+ecx*4+2], al
; already aligned to 8 due to above code
tail2:
mov al, [esi+ecx*4]
mov dl, [esi+ecx*4+1]
mov [edi+ecx*4], al
mov [edi+ecx*4+1], dl
EPILOG
[section .data]
align 16
tail_table dd tail0, tail1, tail2, tail3
__SECT__
; 15x unrolled copy loop - transfers DWORDs backwards.
; indexed via table of 8-bit offsets.
; rationale:
; - [p3] backwards vs. forwards makes no difference.
; - MOV is faster than MOVSD.
; - index table is needed because calculating end-6*i is slower than
; a LUT and we wouldn't want to expand entries to 8 bytes
; (that'd increase code footprint by 30 bytes)
; - a byte index accessed via MOVZX is better due to less dcache usage.
; - only unrolling 8x and 'reentering' the loop is possible but
; slower due to fiddling with esi/ecx.
align 64
unrolled_copy_code_start:
%assign i 15
%rep 14 ; generates uc_15..uc_2; uc_1 (base case) and uc_0 are written out below
uc_ %+ i:
mov eax, [esi+i*4-4]
mov [edi+i*4-4], eax
%assign i i-1
%endrep
; base case: no displacement needed; skip it so that code will
; be aligned to 8 bytes after this.
uc_1:
mov eax, [esi]
mov [edi], eax
uc_0:
jmp [tail_table+edx*4]
[section .data]
align 32
unrolled_copy_index_table:
%assign i 0
%rep 16
db (uc_ %+ i) - unrolled_copy_code_start
%assign i i+1
%endrep
__SECT__
;------------------------------------------------------------------------------
; tiny copy - handles all cases smaller than IC_MOVQ's 64 byte lower limit.
; > edx = number of bytes (<= IC_TINY_MAX)
; < does not return.
; x eax, ecx, edx
%macro IC_TINY 0
mov ecx, edx
shr ecx, 2
; calculating this address directly isn't possible because uc_1 omits its
; displacement; even if it were, it'd require computing -6*ecx, which is
; slower than the LUT.
movzx eax, byte [unrolled_copy_index_table+ecx]
and edx, byte 3
add eax, unrolled_copy_code_start
jmp eax
; never reached! the unrolled loop jumps into tailN, which
; then returns from the memcpy function.
%endm
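; taken together, the unrolled entries, tail_table and IC_TINY behave roughly
; like the following C (an illustrative sketch, ignoring strict aliasing; the
; real code jumps into the unrolled entries via the byte offset table instead
; of looping):
;   #include <stdint.h>
;   #include <stddef.h>
;   static void ic_tiny(uint8_t* dst, const uint8_t* src, size_t nbytes) {
;       size_t dwords = nbytes / 4;                 /* ecx: 0..15 */
;       size_t tail   = nbytes & 3;                 /* edx: 0..3  */
;       for (size_t i = dwords; i != 0; i--)        /* uc_15 .. uc_1 (backwards) */
;           ((uint32_t*)dst)[i-1] = ((const uint32_t*)src)[i-1];
;       for (size_t k = 0; k < tail; k++)           /* tail1 .. tail3 */
;           dst[dwords*4 + k] = src[dwords*4 + k];
;   }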
;------------------------------------------------------------------------------
; align destination address to multiple of 8. important for large transfers,
; but doesn't affect the tiny technique.
; > esi, edi -> buffers (updated)
; > ecx, edx = transfer size (updated)
; x eax
%macro IC_ALIGN 0
mov eax, edi
and eax, byte 7 ; eax = # misaligned bytes
jz already_aligned ; early out
lea eax, [align_table_start+eax*2]
jmp eax
; [p3] this is no slower than a table of mov and much smaller/simpler
align 8
align_table_start:
%rep 8
dec ecx
movsb
%endrep
mov edx, ecx
already_aligned:
%endm
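; roughly equivalent C (illustrative sketch): copy single bytes until the
; destination is 8-byte aligned; safe because the caller only invokes this
; for transfers of more than IC_TINY_MAX bytes.
;   size_t misalign = (uintptr_t)dst & 7;           /* eax */
;   if (misalign) {
;       size_t head = 8 - misalign;                 /* dec/movsb pairs executed */
;       for (size_t i = 0; i < head; i++) {
;           *dst++ = *src++;
;           nbytes--;
;       }
;   }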
;------------------------------------------------------------------------------
; MMX MOVQ technique. used for in-cache transfers of 64 B..64 KiB.
; must run on all CPUs, i.e. cannot use the SSE prefetchnta instruction.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro IC_MOVQ 0
align 16
%%loop:
; notes:
; - we can't use prefetch here - this codepath must support all CPUs.
; [p3] that makes us 5..15% slower on 1KiB..4KiB transfers.
; - [p3] simple addressing without +ecx is 3.5% faster.
; - difference between RR/WW/RR/WW and R..R/W..W:
; [p3] none (if simple addressing)
; [axp] interleaved is better (with +ecx addressing)
; - enough time elapses between first and third pair of reads that we
; could reuse MM0. there is no performance gain either way and
; differing displacements make code compression futile anyway, so
; we'll just use MM4..7 for clarity.
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq [edi+ecx], mm0
movq [edi+ecx+8], mm1
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq [edi+ecx+16], mm2
movq [edi+ecx+24], mm3
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq [edi+ecx+32], mm4
movq [edi+ecx+40], mm5
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movq [edi+ecx+48], mm6
movq [edi+ecx+56], mm7
add ecx, byte 64
jnz %%loop
%endm
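; the loop convention above in C terms (illustrative sketch; "nbytes" is the
; 64-byte-multiple portion of the transfer): esi/edi point one past the last
; byte and ecx counts up from -nbytes to 0, so one register serves as both
; loop counter and (negative) offset.
;   const uint8_t* src_end = src + nbytes;          /* esi after "add esi, ecx" */
;   uint8_t*       dst_end = dst + nbytes;          /* edi after "add edi, ecx" */
;   for (ptrdiff_t i = -(ptrdiff_t)nbytes; i != 0; i += 64)
;       memcpy(dst_end + i, src_end + i, 64);       /* stands in for the 8 MOVQ pairs */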
;------------------------------------------------------------------------------
; SSE MOVNTQ technique. used for transfers that no longer fit in L1,
; i.e. UC_THRESHOLD..BP_THRESHOLD (64 KiB..256 KiB). requires Pentium III
; or Athlon; caller checks for this.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro UC_MOVNTQ 0
align 16
%%loop:
; notes:
; - the AMD optimization manual recommends prefetch distances according to
; (200*BytesPerIter/ClocksPerIter+192), which comes out to ~560 here.
; [p3] rounding down to 512 bytes makes for significant gains.
; - [p3] complex addressing with ecx is 1% faster than adding to esi/edi.
prefetchnta [esi+ecx+512]
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movntq [edi+ecx], mm0
movntq [edi+ecx+8], mm1
movntq [edi+ecx+16], mm2
movntq [edi+ecx+24], mm3
movntq [edi+ecx+32], mm4
movntq [edi+ecx+40], mm5
movntq [edi+ecx+48], mm6
movntq [edi+ecx+56], mm7
add ecx, byte 64
jnz %%loop
%endm
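; working through the AMD formula quoted above with this loop's numbers
; (BytesPerIter = 64, so the ~560 figure implies roughly 35 clocks per iteration):
;   distance = 200*64/35 + 192  ~=  558  ->  rounded down to 512,
; a multiple of both the 64-byte cache line size and the loop stride.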
;------------------------------------------------------------------------------
; block prefetch technique. used for transfers that do not fit in L2,
; i.e. > BP_THRESHOLD (256 KiB). requires Pentium III or Athlon; caller checks for this.
; for theory behind this, see article.
; > ecx = -number_of_bytes (multiple of 64, <= -BP_SIZE)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = -remaining_bytes (multiple of 64, > -BP_SIZE)
; < eax = 0
%macro UC_BP_MOVNTQ 0
push edx
align 4
%%prefetch_and_copy_chunk:
; pull chunk into cache by touching each cache line
; (in reverse order to prevent HW prefetches)
mov eax, BP_SIZE/128 ; # iterations
add esi, BP_SIZE
align 16
%%prefetch_loop:
mov edx, [esi+ecx-64]
mov edx, [esi+ecx-128]
add esi, byte -128
dec eax
jnz %%prefetch_loop
; copy chunk in 64 byte pieces
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
align 16
%%copy_loop:
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movntq [edi+ecx], mm0
movntq [edi+ecx+8], mm1
movntq [edi+ecx+16], mm2
movntq [edi+ecx+24], mm3
movntq [edi+ecx+32], mm4
movntq [edi+ecx+40], mm5
movntq [edi+ecx+48], mm6
movntq [edi+ecx+56], mm7
add ecx, byte 64
dec eax
jnz %%copy_loop
; if enough data left, process next chunk
cmp ecx, -BP_SIZE
jle %%prefetch_and_copy_chunk
pop edx
%endm
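; roughly equivalent C (illustrative sketch, assuming 64-byte cache lines;
; copy_chunk_with_movntq is a hypothetical helper standing in for the
; %%copy_loop above): each 8 KiB chunk is first pulled into cache by touching
; its lines back-to-front (so the hardware prefetcher isn't triggered), then
; streamed out with non-temporal stores.
;   while (remaining >= BP_SIZE) {
;       for (size_t ofs = BP_SIZE; ofs != 0; ofs -= 128) {   /* 2 lines per iteration */
;           (void)*(volatile uint32_t*)(src + ofs - 64);
;           (void)*(volatile uint32_t*)(src + ofs - 128);
;       }
;       copy_chunk_with_movntq(dst, src, BP_SIZE);
;       src += BP_SIZE; dst += BP_SIZE; remaining -= BP_SIZE;
;   }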
;------------------------------------------------------------------------------
; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
; drop-in replacement for libc memcpy() (returns dst)
global sym(ia32_memcpy)
align 64
sym(ia32_memcpy):
push edi
push esi
mov ecx, [esp+8+4+8] ; nbytes
mov edi, [esp+8+4+0] ; dst
mov esi, [esp+8+4+4] ; src
mov edx, ecx
cmp ecx, byte IC_TINY_MAX
ja choose_larger_method
ic_tiny:
IC_TINY
; never reached - IC_TINY contains memcpy function epilog code
choose_larger_method:
IC_ALIGN
; setup:
; eax = transfer size rounded down to a multiple of 64 bytes, or 0 if the
; CPU doesn't support SSE. used to choose the copy technique.
; ecx = -number_of_bytes, multiple of 64. we jump to ic_tiny if
; there's not enough left for a single 64 byte chunk, which can
; happen on unaligned 64..71 byte transfers due to IC_ALIGN.
; edx = number of remainder bytes after qwords have been copied;
; will be handled by IC_TINY.
; esi and edi point to end of the respective buffers (more precisely,
; to buffer_start-ecx). this together with the ecx convention means
; we only need one loop counter (instead of having to advance
; that and esi/edi).
; ia32_memcpy_size_mask is ANDed with the (rounded) transfer size. the two
; specialized SSE copy techniques are entered only if the masked size reaches
; their thresholds; if the CPU doesn't support SSE the mask is 0, so the masked
; size is 0 and those paths are never reached. (see the C sketch after this function.)
extern sym(ia32_memcpy_size_mask)
mov eax, [sym(ia32_memcpy_size_mask)]
and ecx, byte ~IC_TINY_MAX
jz ic_tiny ; < 64 bytes left (due to IC_ALIGN)
add esi, ecx
add edi, ecx
and edx, byte IC_TINY_MAX
and eax, ecx
neg ecx
cmp eax, BP_THRESHOLD
jae near uc_bp_movntq
cmp eax, UC_THRESHOLD
jae uc_movntq
ic_movq:
IC_MOVQ
emms
jmp ic_tiny
uc_movntq:
UC_MOVNTQ
sfence
emms
jmp ic_tiny
uc_bp_movntq:
UC_BP_MOVNTQ
sfence
cmp ecx, byte -(IC_TINY_MAX+1)
jle ic_movq
emms
jmp ic_tiny
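; the dispatch above in C terms (illustrative sketch; ia32_memcpy_size_mask is
; the real extern, presumed all-ones when SSE is available and 0 otherwise):
;   size_t rounded = nbytes & ~(size_t)IC_TINY_MAX;     /* multiple of 64 */
;   size_t masked  = rounded & ia32_memcpy_size_mask;   /* 0 without SSE  */
;   if      (masked >= BP_THRESHOLD) uc_bp_movntq();    /* then ic_movq for what's left */
;   else if (masked >= UC_THRESHOLD) uc_movntq();
;   else                             ic_movq();
;   ic_tiny();                                          /* final 0..63 bytes */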
;-------------------------------------------------------------------------------
; CPUID support
;-------------------------------------------------------------------------------
[section .data]
; these are actually max_func+1, i.e. the first invalid value.
; the idea here is to avoid a separate cpuid_available flag;
; using signed values doesn't work because ext_funcs are >= 0x80000000.
max_func dd 0
max_ext_func dd 0
__SECT__
; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs)
global sym(ia32_cpuid)
sym(ia32_cpuid):
push ebx
push edi
mov ecx, [esp+8+4+0] ; func
mov edi, [esp+8+4+4] ; -> regs
; compare against max supported func and fail if above
xor eax, eax ; return value on failure
test ecx, ecx
mov edx, [max_ext_func]
js .is_ext_func
mov edx, [max_func]
.is_ext_func:
cmp ecx, edx
jae .ret ; (see max_func decl)
; issue CPUID and store result registers in array
mov eax, ecx
cpuid
stosd
xchg eax, ebx
stosd
xchg eax, ecx
stosd
xchg eax, edx
stosd
; success
xor eax, eax
inc eax
.ret:
pop edi
pop ebx
ret
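; hedged usage sketch from C (illustrative; "u32" stands for the project's
; 32-bit unsigned type, and bit 25 of EDX is the architecturally defined SSE
; feature flag of CPUID function 1):
;   u32 regs[4];                                    /* EAX, EBX, ECX, EDX */
;   if (ia32_cpuid(1, regs)) {
;       const bool have_sse = (regs[3] & (1u << 25)) != 0;
;   }
;   /* extended functions work the same way, e.g. ia32_cpuid(0x80000000, regs) */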
;-------------------------------------------------------------------------------
; lock-free support routines
;-------------------------------------------------------------------------------
extern sym(cpus)
; extern "C" void __cdecl atomic_add(intptr_t* location, intptr_t increment);
global sym(atomic_add)
sym(atomic_add):
cmp byte [sym(cpus)], 1
mov edx, [esp+4] ; location
mov eax, [esp+8] ; increment
je .no_lock
db 0xf0 ; LOCK prefix
.no_lock:
add [edx], eax
ret
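; hedged usage sketch from C (illustrative):
;   intptr_t refcount = 1;          /* hypothetical shared counter */
;   atomic_add(&refcount, 1);       /* LOCK ADD on MP systems, plain ADD on UP */
;   atomic_add(&refcount, -1);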
; notes:
; - this is called via CAS macro, which silently casts its inputs for
; convenience. mixing up the <expected> and <location> parameters would
; go unnoticed; we therefore perform a basic sanity check on <location> and
; raise a warning if it is invalid.
; - a 486 or later processor is required since we use CMPXCHG.
; there's no feature flag we can check, and the ia32 code doesn't
; bother detecting anything < Pentium, so this'll crash and burn if
; run on 386. we could fall back to simple MOVs there (since 386 CPUs
; aren't MP-capable), but it's not worth the trouble.
; extern "C" __declspec(naked) bool __cdecl CAS_(uintptr_t* location, uintptr_t expected, uintptr_t new_value);
global sym(CAS_)
sym(CAS_):
mov edx, [esp+4] ; location
cmp edx, 0x10000 ; .. valid pointer?
jb .invalid_location ; no - raise warning
; test cpus only after the pointer check, so that the flags consumed by the
; JE below come from this comparison rather than from the CMP above
cmp byte [sym(cpus)], 1
mov eax, [esp+8] ; expected
mov ecx, [esp+12] ; new_value
je .no_lock
db 0xf0 ; LOCK prefix
.no_lock:
cmpxchg [edx], ecx
sete al
movzx eax, al
ret
; NOTE: nasm 0.98.39 doesn't support generating debug info for win32
; output format. that means this code may be misattributed to other
; functions, which makes tracking it down very difficult.
; we therefore raise an "Invalid Opcode" exception, which is rather distinct.
.invalid_location:
ud2
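; hedged usage sketch from C (illustrative; real callers go through the CAS
; macro mentioned in the notes above, which casts its arguments):
;   uintptr_t* location = &some_shared_word;        /* hypothetical shared variable */
;   uintptr_t old, desired;
;   do {
;       old     = *location;                        /* snapshot current value */
;       desired = old + 1;                          /* value we want to install */
;   } while (!CAS_(location, old, desired));        /* retry if another CPU got there first */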
;-------------------------------------------------------------------------------
; misc
;-------------------------------------------------------------------------------
; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
global sym(ia32_control87)
sym(ia32_control87):
push eax
fnstcw [esp]
pop eax ; old_cw
mov ecx, [esp+4] ; new_cw
mov edx, [esp+8] ; mask
and ecx, edx ; new_cw & mask
not edx ; ~mask
and eax, edx ; old_cw & ~mask
or eax, ecx ; (old_cw & ~mask) | (new_cw & mask)
push eax ; = combined control word
fldcw [esp]
pop eax
xor eax, eax ; return value
ret
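; worked example (illustrative, using the x87 rounding-control field):
;   old_cw = 0x027F (power-on default), new_cw = 0x0400 (round toward -inf),
;   mask   = 0x0C00 (RC bits)
;   loaded control word = (0x027F & ~0x0C00) | (0x0400 & 0x0C00) = 0x067F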
; write the current execution state (e.g. all register values) into
; (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
; optimized for size; this must be straight asm because __declspec(naked)
; is compiler-specific and compiler-generated prolog code inserted before
; inline asm trashes EBP and ESP (unacceptable).
; extern "C" void ia32_get_current_context(void* pcontext)
global sym(ia32_get_current_context)
sym(ia32_get_current_context):
pushad
pushfd
mov edi, [esp+4+32+4] ; pcontext
; ContextFlags
mov eax, 0x10007 ; segs, int, control
stosd
; DRx and FloatSave
; rationale: we can't access the debug registers from Ring3, and
; the FPU save area is irrelevant, so zero them.
xor eax, eax
push byte 6+8+20
pop ecx
rep stosd
; CONTEXT_SEGMENTS
mov ax, gs
stosd
mov ax, fs
stosd
mov ax, es
stosd
mov ax, ds
stosd
; CONTEXT_INTEGER
mov eax, [esp+4+32-32] ; edi
stosd
xchg eax, esi
stosd
xchg eax, ebx
stosd
xchg eax, edx
stosd
mov eax, [esp+4+32-8] ; ecx
stosd
mov eax, [esp+4+32-4] ; eax
stosd
; CONTEXT_CONTROL
xchg eax, ebp ; ebp restored by POPAD
stosd
mov eax, [esp+4+32] ; return address
sub eax, 5 ; skip CALL instruction -> call site.
stosd
xor eax, eax
mov ax, cs
stosd
pop eax ; eflags
stosd
lea eax, [esp+32+4+4] ; esp
stosd
xor eax, eax
mov ax, ss
stosd
; ExtendedRegisters (eax still holds the SS selector here, so the area is
; filled with that value rather than 0; harmless, since
; CONTEXT_EXTENDED_REGISTERS isn't requested in ContextFlags)
xor ecx, ecx
mov cl, 512/4
rep stosd
popad
ret
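; for reference, the stores above follow the Win32 x86 CONTEXT layout
; (excerpt from winnt.h; sizes in bytes):
;   DWORD ContextFlags;                           /*   4          */
;   DWORD Dr0, Dr1, Dr2, Dr3, Dr6, Dr7;           /*  24, zeroed  */
;   FLOATING_SAVE_AREA FloatSave;                 /* 112, zeroed  */
;   DWORD SegGs, SegFs, SegEs, SegDs;             /*  16          */
;   DWORD Edi, Esi, Ebx, Edx, Ecx, Eax;           /*  24          */
;   DWORD Ebp, Eip, SegCs, EFlags, Esp, SegSs;    /*  24          */
;   BYTE  ExtendedRegisters[512];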
;-------------------------------------------------------------------------------
; init
;-------------------------------------------------------------------------------
; extern "C" bool __cdecl ia32_asm_init()
global sym(ia32_asm_init)
sym(ia32_asm_init):
push ebx
; check if CPUID is supported
pushfd
or byte [esp+2], 32
popfd
pushfd
pop eax
xor edx, edx
shr eax, 22 ; shift bit 21 (the ID flag) into CF - was it writable?
jnc .no_cpuid
; determine max supported CPUID function
xor eax, eax
cpuid
inc eax ; (see max_func decl)
mov [max_func], eax
mov eax, 0x80000000
cpuid
inc eax ; (see max_func decl)
mov [max_ext_func], eax
.no_cpuid:
pop ebx
ret
;-------------------------------------------------------------------------------
; Color conversion (SSE)
;-------------------------------------------------------------------------------
; extern "C" u32 ConvertRGBColorTo4ub(const RGBColor& color)
[section .data]
align 16
zero:
dd 0.0
twofivefive:
dd 255.0
__SECT__
align 16
global sym(sse_ConvertRGBColorTo4ub)
sym(sse_ConvertRGBColorTo4ub):
mov eax, [esp+4]
; xmm0, 1, 2 = B, G, R (loaded from offsets 8, 4, 0 of an {r,g,b} RGBColor)
movss xmm4, [zero]
movss xmm0, [eax+8]
movss xmm1, [eax+4]
movss xmm2, [eax]
movss xmm5, [twofivefive]
; C = min(255, 255*max(C, 0)) ( == clamp(255*C, 0, 255) )
maxss xmm0, xmm4
maxss xmm1, xmm4
maxss xmm2, xmm4
mulss xmm0, xmm5
mulss xmm1, xmm5
mulss xmm2, xmm5
minss xmm0, xmm5
minss xmm1, xmm5
minss xmm2, xmm5
; convert to integer and combine channels using bit logic
cvtss2si eax, xmm0
cvtss2si ecx, xmm1
cvtss2si edx, xmm2
shl eax, 16
shl ecx, 8
or eax, 0xff000000
or edx, ecx
or eax, edx
ret
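; roughly equivalent C (illustrative sketch; assumes RGBColor holds floats
; r, g, b at offsets 0, 4, 8, and that u8/u32 are the project's fixed-width
; types; the SSE code rounds via CVTSS2SI rather than truncating):
;   static u8 clamp255(float c) {
;       float x = (c < 0.f) ? 0.f : c;
;       x *= 255.f;
;       return (u8)((x > 255.f) ? 255.f : x);
;   }
;   u32 sse_ConvertRGBColorTo4ub(const RGBColor& c) {
;       return 0xff000000u                  /* A in the top byte              */
;            | (clamp255(c.b) << 16)        /* eax path: offset 8, shifted 16 */
;            | (clamp255(c.g) <<  8)        /* ecx path: offset 4, shifted 8  */
;            |  clamp255(c.r);              /* edx path: offset 0             */
;   }
;   /* memory byte order on little-endian x86 is R, G, B, A */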