; forked from 0ad/0ad
; author: prefect
; revision: 6fc1f45fa6
; Added float-to-byte color conversion, including an SSE assembler version.
; Model renderer: Push UV coordinates into a shared vertex array and use bytes
; instead of floats for the color array, thereby significantly reducing the
; total size of vertex arrays. This was SVN commit r2827.
; 450 lines
; 9.5 KiB
; NASM
; set 32-bit attribute once for all sections and activate .text
section .data use32

section .bss use32

section .text use32


; Usage:
; use sym(ia32_cap) instead of _ia32_cap - on relevant platforms, sym() will add
; the underlines automagically, on others it won't
;
; (some ABIs, e.g. Win32 and older Mach-O, decorate C symbols with a leading
; underscore; ELF does not. the build system defines DONT_USE_UNDERLINE on
; platforms without decoration.)

%ifdef DONT_USE_UNDERLINE
%define sym(a) a
%else
%define sym(a) _ %+ a
%endif
|
|
|
|
;-------------------------------------------------------------------------------
; fast general memcpy
;-------------------------------------------------------------------------------

; optimized for Athlon XP: 7.3% faster (cumulative) than VC7.1's memcpy over
; all 1..64 byte transfer lengths and misalignments. approaches maximum
; mem bandwidth (2000 MiB/s) for transfers >= 192KiB!
; Pentium III performance: about 3% faster in above small buffer benchmark.
;
; disables specialized large transfer (> 64KiB) implementations if SSE
; isn't available; we do assume MMX support, though (quite safe).

; *requires* (and does not verify the presence of) SSE instructions:
; prefetchnta and movntq. therefore, a P3+ or Athlon XP is required.
; rationale: older processors are too slow anyway and we don't bother.

; if memcpy size is greater than this,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD	equ	64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD	equ	192*1024

; maximum that can be copied by IC_MOVSD.
; if you change this, be sure to expand the movs* table(s)!
; (67 = 16 dwords + 3 tail bytes, matching the 16-entry movsd table
; and the 3-entry movsb table in IC_MOVSD)
IC_SIZE		equ	67

; size of one block prefetch chunk.
; if you change this, make sure "push byte BP_SIZE/128" doesn't overflow!
; (the value must fit in a signed 8-bit immediate; 8192/128 = 64 is fine)
BP_SIZE		equ	8*1024
|
|
|
|
|
|
; copy up to IC_SIZE bytes ("in cache" small-copy path).
; works by jumping into a table of 1-byte movsd instructions so that exactly
; (size/4) of them execute, then likewise into a movsb table for the 0..3
; remaining tail bytes. this relies on movsd/movsb each encoding to exactly
; one byte, so "table_end - count" is a valid entry address.
;
; > ecx = size (<= IC_SIZE)
; > esi/edi = src/dst (implicit operands of movsd/movsb; advanced as it copies)
; x eax, ecx
;
; determined to be fastest approach by testing. a movsd table followed by
; rep movsb is a bit smaller but 6.9% slower; everything else is much worse.
%macro IC_MOVSD 0
	mov	eax, ecx		; save size; low 2 bits = tail byte count
	shr	ecx, 2			; dword count
	neg	ecx
	add	ecx, %%movsd_table_end	; ecx = table_end - dword_count
	jmp	ecx			; fall through the last <dword_count> movsd's
align 8
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
%%movsd_table_end:

	; now copy the 0..3 remaining bytes the same way
	and	eax, 3			; eax = size mod 4
	neg	eax
	add	eax, %%movsb_table_end	; eax = table_end - tail_count
	jmp	eax
	movsb
	movsb
	movsb
%%movsb_table_end:
%endm
|
|
|
|
|
|
; align destination address to multiple of 8.
; not done for small transfers because it doesn't help IC_MOVSD.
;
; > esi/edi = src/dst (advanced by the movsb's executed)
; > ecx = size (reduced by the number of alignment bytes copied;
;   callers only use this for sizes > IC_SIZE, so ecx cannot underflow)
; x eax
;
; uses the same computed-jump-into-a-movsb-table trick as IC_MOVSD
; (each movsb encodes to exactly one byte).
%macro IC_ALIGN 0
	mov	eax, 8
	sub	eax, edi
	and	eax, byte 7		; eax = # misaligned bytes
	sub	ecx, eax		; reduce copy count
	neg	eax
	add	eax, %%align_table_end	; eax = table_end - misaligned_bytes
	jmp	eax
align 4
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
%%align_table_end:
%endm
|
|
|
|
|
|
; copy 64-byte blocks via MMX registers ("in cache" medium-copy path).
; loops until fewer than 64 bytes remain; the tail is left for IC_MOVSD.
; caller must execute emms afterwards to restore FPU state.
;
; > ecx = size (reduced by the amount copied; < 64 on exit)
; > esi/edi = src/dst (advanced by the amount copied)
; x edx, mm0-mm3
%macro IC_MOVQ 0
align 16
	mov	edx, 64			; block size / loop stride
%%loop:
	cmp	ecx, edx
	jb	%%done			; < 64 bytes left -> done
	; prefetch well ahead of the copy position into L1
	prefetchnta [esi + (200*64/34+192)]
	; 8 movq pairs = 64 bytes, interleaved load/store
	movq	mm0, [esi+0]
	movq	mm1, [esi+8]
	movq	[edi+0], mm0
	movq	[edi+8], mm1
	movq	mm2, [esi+16]
	movq	mm3, [esi+24]
	movq	[edi+16], mm2
	movq	[edi+24], mm3
	movq	mm0, [esi+32]
	movq	mm1, [esi+40]
	movq	[edi+32], mm0
	movq	[edi+40], mm1
	movq	mm2, [esi+48]
	movq	mm3, [esi+56]
	movq	[edi+48], mm2
	movq	[edi+56], mm3
	add	esi, edx
	add	edi, edx
	sub	ecx, edx
	jmp	%%loop
%%done:
%endm
|
|
|
|
|
|
; copy 64-byte blocks with non-temporal stores (movntq), bypassing the cache;
; used for transfers that exceed L1 (> UC_THRESHOLD). loads and stores are
; software-pipelined (each store writes a value loaded one step earlier) to
; hide load latency. esi is advanced early, so later loads/stores use
; negative offsets. caller must execute sfence + emms afterwards.
;
; > ecx = size (> 64; reduced by amount copied, < 64 on exit)
; > esi/edi = src/dst (advanced by the amount copied)
; x edx, mm0-mm2
%macro UC_MOVNTQ 0
	mov	edx, 64			; block size / loop stride
align 16
%%1:
	prefetchnta [esi + (200*64/34+192)]
	movq	mm0, [esi+0]
	add	edi, edx		; advance dst up-front
	movq	mm1, [esi+8]
	add	esi, edx		; advance src; remaining refs are negative
	movq	mm2, [esi-48]
	movntq	[edi-64], mm0
	movq	mm0, [esi-40]
	movntq	[edi-56], mm1
	movq	mm1, [esi-32]
	movntq	[edi-48], mm2
	movq	mm2, [esi-24]
	movntq	[edi-40], mm0
	movq	mm0, [esi-16]
	movntq	[edi-32], mm1
	movq	mm1, [esi-8]
	movntq	[edi-24], mm2
	movntq	[edi-16], mm0
	sub	ecx, edx
	movntq	[edi-8], mm1
	cmp	ecx, edx
	jae	%%1			; loop while >= 64 bytes remain
%endm
|
|
|
|
|
|
; "block prefetch" copy for huge transfers (> BP_THRESHOLD, i.e. larger than
; L2): first touch each cache line of an 8 KiB chunk to pull it into cache,
; then stream it out with non-temporal stores. the chunk is touched in
; *reverse* order to keep the hardware prefetcher from interfering.
; caller must execute sfence afterwards (emms is handled by the IC_MOVQ
; fall-through in ia32_memcpy).
;
; > ecx = size (> 8KiB; reduced by amount copied, < BP_SIZE on exit)
; > esi/edi = src/dst (advanced by the amount copied)
; x eax, edx, mm0-mm7
;
; somewhat optimized for size (futile attempt to avoid near jump)
%macro UC_BP_MOVNTQ 0
%%prefetch_and_copy_chunk:

	; touch each cache line within chunk in reverse order (prevents HW prefetch)
	push	byte BP_SIZE/128	; # iterations (push/pop is smaller than mov)
	pop	eax
	add	esi, BP_SIZE		; start at end of chunk, walk backwards
align 8
%%prefetch_chunk:
	; two dummy loads touch two 64-byte lines per iteration
	mov	edx, [esi-64]
	mov	edx, [esi-128]
	sub	esi, 128
	dec	eax
	jnz	%%prefetch_chunk

	; copy 64 byte blocks
	mov	eax, BP_SIZE/64		; # iterations (> signed 8 bit)
	push	byte 64
	pop	edx			; edx = block stride
align 8
%%copy_block:
	movq	mm0, [esi+ 0]
	movq	mm1, [esi+ 8]
	movq	mm2, [esi+16]
	movq	mm3, [esi+24]
	movq	mm4, [esi+32]
	movq	mm5, [esi+40]
	movq	mm6, [esi+48]
	movq	mm7, [esi+56]
	add	esi, edx
	movntq	[edi+ 0], mm0
	movntq	[edi+ 8], mm1
	movntq	[edi+16], mm2
	movntq	[edi+24], mm3
	movntq	[edi+32], mm4
	movntq	[edi+40], mm5
	movntq	[edi+48], mm6
	movntq	[edi+56], mm7
	add	edi, edx
	dec	eax
	jnz	%%copy_block

	sub	ecx, BP_SIZE
	cmp	ecx, BP_SIZE
	jae	%%prefetch_and_copy_chunk	; loop while a full chunk remains
%endm
|
|
|
|
|
|
; [section] (bracketed form) switches sections without changing the default;
; __SECT__ below returns to the previously active section (.text).
[section .bss]

; this is somewhat "clever". the 2 specialized transfer implementations
; that use SSE are jumped to if transfer size is greater than a threshold.
; we simply set the requested transfer size to 0 if the CPU doesn't
; support SSE so that those are never reached (done by masking with this).
; value: ~0 if SSE is supported, else 0 (set by ia32_init).
sse_mask resd 1

__SECT__
|
|
|
|
; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
;
; cdecl; dispatches to one of the copy strategies above based on size:
;   <= IC_SIZE           -> IC_MOVSD only
;   <= UC_THRESHOLD      -> IC_MOVQ, tail via IC_MOVSD
;   <= BP_THRESHOLD      -> UC_MOVNTQ, tail via IC_MOVSD
;   >  BP_THRESHOLD      -> UC_BP_MOVNTQ, then IC_MOVQ + IC_MOVSD for the rest
; the SSE paths are skipped entirely if sse_mask is 0 (no SSE support).
; preserves esi/edi (callee-saved); clobbers eax, ecx, edx, mm0-mm7, flags.
global sym(ia32_memcpy)
sym(ia32_memcpy):
	push	edi
	push	esi

	; [esp + 8 (two pushes) + 4 (return address) + arg offset]
	mov	edi, [esp+8+4+0]	; dst
	mov	esi, [esp+8+4+4]	; src
	mov	ecx, [esp+8+4+8]	; nbytes

	cmp	ecx, byte IC_SIZE
	ja	.choose_larger_method

.ic_movsd:
	; small copy; also serves as the tail handler for all larger paths
	IC_MOVSD
	pop	esi
	pop	edi
	ret

.choose_larger_method:
	IC_ALIGN			; align dst to 8; adjusts ecx/esi/edi

	mov	eax, [sse_mask]
	mov	edx, ecx
	and	edx, eax		; edx = (SSE)? remaining_bytes : 0
	cmp	edx, BP_THRESHOLD
	jae	near .uc_bp_movntq
	cmp	edx, UC_THRESHOLD
	jae	.uc_movntq

.ic_movq:
	IC_MOVQ
	emms				; restore FPU state after MMX use
	jmp	.ic_movsd		; copy remaining < 64 bytes

.uc_movntq:
	UC_MOVNTQ
	sfence				; flush write-combining buffers
	emms
	jmp	.ic_movsd		; copy remaining < 64 bytes

.uc_bp_movntq:
	UC_BP_MOVNTQ
	sfence
	jmp	.ic_movq		; < BP_SIZE left; finish with cached copy
|
|
|
|
|
|
|
|
;-------------------------------------------------------------------------------
; CPUID support
;-------------------------------------------------------------------------------

[section .data]

; these are actually max_func+1, i.e. the first invalid value.
; the idea here is to avoid a separate cpuid_available flag;
; using signed values doesn't work because ext_funcs are >= 0x80000000.
; both remain 0 if CPUID is unsupported, so every "cmp func, max" fails.
max_func	dd 0
max_ext_func	dd 0

__SECT__
|
|
|
|
|
|
; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs)
;
; executes CPUID with eax = func and stores the 4 result registers
; (eax, ebx, ecx, edx, in that order) into regs[0..3].
; returns 1 on success, 0 if func exceeds the maximum supported function
; (as recorded by ia32_init). preserves ebx/edi (callee-saved; ebx is
; clobbered by CPUID itself, hence saved here).
global sym(ia32_cpuid)
sym(ia32_cpuid):
	push	ebx
	push	edi

	mov	ecx, [esp+8+4+0]	; func
	mov	edi, [esp+8+4+4]	; -> regs

	; compare against max supported func and fail if above
	xor	eax, eax		; return value on failure
	test	ecx, ecx		; sign set <=> func >= 0x80000000 (extended)
	mov	edx, [max_ext_func]
	js	.is_ext_func
	mov	edx, [max_func]		; basic func: use basic maximum instead
.is_ext_func:
	cmp	ecx, edx
	jae	.ret			; (see max_func decl)

	; issue CPUID and store result registers in array
	mov	eax, ecx
	cpuid
	stosd				; regs[0] = eax
	xchg	eax, ebx
	stosd				; regs[1] = ebx
	xchg	eax, ecx
	stosd				; regs[2] = ecx
	xchg	eax, edx
	stosd				; regs[3] = edx

	; success
	xor	eax, eax
	inc	eax			; eax = 1
.ret:
	pop	edi
	pop	ebx
	ret
|
|
|
|
|
|
;-------------------------------------------------------------------------------
; misc
;-------------------------------------------------------------------------------

; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
;
; updates the x87 FPU control word: bits selected by <mask> are replaced with
; the corresponding bits of <new_cw>, all other bits are kept.
; returns 0 (callers treat nonzero as failure). clobbers eax, ecx, edx, flags.
global sym(ia32_control87)
sym(ia32_control87):
	push	eax			; reserve a stack slot for fnstcw
	fnstcw	[esp]			; store current control word (16 bits)
	pop	eax			; old_cw
	mov	ecx, [esp+4]		; new_cw
	mov	edx, [esp+8]		; mask
	and	ecx, edx		; new_cw & mask
	not	edx			; ~mask
	and	eax, edx		; old_cw & ~mask
	or	eax, ecx		; (old_cw & ~mask) | (new_cw & mask)
	; BUGFIX: was "push edx" — that loaded ~mask into the control word,
	; discarding the merged value just computed in eax.
	push	eax			; merged control word -> stack for fldcw
	fldcw	[esp]			; load the new control word
	pop	eax			; restore esp (value no longer needed)
	xor	eax, eax		; return value
	ret
|
|
|
|
;-------------------------------------------------------------------------------
; init
;-------------------------------------------------------------------------------

; extern "C" bool __cdecl ia32_init()
;
; detects CPUID support, records the maximum basic and extended CPUID
; functions (see max_func decl), and sets sse_mask (~0 if SSE is available,
; else 0) for use by ia32_memcpy. ebx is saved because CPUID clobbers it.
; NOTE(review): always returns via the same path regardless of outcome;
; eax's value on return is whatever the last operation left — confirm
; callers ignore the declared bool result.
global sym(ia32_init)
sym(ia32_init):
	push	ebx

	; check if CPUID is supported:
	; set EFLAGS.ID (bit 21 = bit 5 of the byte at [esp+2]) and see whether
	; the change sticks; writable ID <=> CPUID supported.
	pushfd
	or	byte [esp+2], 32
	popfd
	pushfd
	pop	eax
	xor	edx, edx		; NOTE(review): appears unused — leftover?
	shr	eax, 22			; bit 21 toggled? (shifted into CF)
	jnc	.no_cpuid

	; determine max supported CPUID function
	xor	eax, eax		; func 0 returns max basic func in eax
	cpuid
	inc	eax			; (see max_func decl)
	mov	[max_func], eax
	mov	eax, 0x80000000		; returns max extended func in eax
	cpuid
	inc	eax			; (see max_func decl)
	mov	[max_ext_func], eax
.no_cpuid:

	; check if SSE is supported (used by memcpy code)
	extern	sym(ia32_cap)
	push	byte 32+25		; ia32.h's SSE cap (won't change)
	call	sym(ia32_cap)
	pop	edx			; remove stack param
	neg	eax			; SSE? ~0 : 0  (neg of bool 1 -> all ones)
	mov	[sse_mask], eax

	pop	ebx
	ret
|
|
|
|
;-------------------------------------------------------------------------------
; Color conversion (SSE)
;-------------------------------------------------------------------------------

; extern "C" u32 ConvertRGBColorTo4ub(const RGBColor& color)
[section .data]
align 16
zero:
	dd 0.0
twofivefive:
	dd 255.0


__SECT__
align 16
global sym(sse_ConvertRGBColorTo4ub)
sym(sse_ConvertRGBColorTo4ub):
	mov	eax, [esp+4]		; eax -> color (3 consecutive floats)

	; xmm0, 1, 2 = R, G, B
	; NOTE(review): labeling assumes the struct stores the channels such
	; that [eax+8] is the component destined for bits 16-23 — confirm
	; against RGBColor's member order and the consumer's expected layout.
	movss	xmm4, [zero]
	movss	xmm0, [eax+8]
	movss	xmm1, [eax+4]
	movss	xmm2, [eax]
	movss	xmm5, [twofivefive]

	; C = min(255, 255*max(C, 0)) ( == clamp(255*C, 0, 255) )
	maxss	xmm0, xmm4
	maxss	xmm1, xmm4
	maxss	xmm2, xmm4
	mulss	xmm0, xmm5
	mulss	xmm1, xmm5
	mulss	xmm2, xmm5
	minss	xmm0, xmm5
	minss	xmm1, xmm5
	minss	xmm2, xmm5

	; convert to integer and combine channels using bit logic
	; (cvtss2si uses the current MXCSR rounding mode — presumably the
	; default round-to-nearest; values are already clamped to [0, 255])
	cvtss2si eax, xmm0
	cvtss2si ecx, xmm1
	cvtss2si edx, xmm2
	shl	eax, 16			; -> bits 16-23
	shl	ecx, 8			; -> bits 8-15
	or	eax, 0xff000000		; alpha = 255 in bits 24-31
	or	edx, ecx
	or	eax, edx		; eax = 0xFF'c0'c1'c2

	ret
|