; set 32-bit attribute once for all sections and activate .text
section .data use32
section .bss use32
section .text use32

; Usage:
; use sym(ia32_cap) instead of _ia32_cap - on platforms that require it, sym()
; adds the leading underscore automatically; on others it doesn't.

%ifdef DONT_USE_UNDERLINE
%define sym(a) a
%else
%define sym(a) _ %+ a
%endif
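
; for example, "call sym(ia32_cap)" assembles to "call _ia32_cap" when the
; underscore prefix is in use (e.g. Win32), and to "call ia32_cap" when
; DONT_USE_UNDERLINE is defined (typical of ELF targets).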

;-------------------------------------------------------------------------------
; fast general memcpy
;-------------------------------------------------------------------------------

; optimized for Athlon XP: 7.3% faster (cumulative) than VC7.1's memcpy over
; all 1..64 byte transfer lengths and misalignments. approaches maximum
; mem bandwidth (2000 MiB/s) for transfers >= 192KiB!
; Pentium III performance: about 3% faster in above small buffer benchmark.
;
; disables specialized large transfer (> 64KiB) implementations if SSE
; isn't available; we do assume MMX support, though (quite safe).

; *requires* (and does not verify the presence of) SSE instructions:
; prefetchnta and movntq. therefore, a P3+ or Athlon XP is required.
; rationale: older processors are too slow anyway and we don't bother.

; if memcpy size is greater than this,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD equ 64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD equ 192*1024

; maximum that can be copied by IC_MOVSD.
; if you change this, be sure to expand the movs* table(s)!
IC_SIZE equ 67

; size of one block prefetch chunk.
; if you change this, make sure "push byte BP_SIZE/128" doesn't overflow!
BP_SIZE equ 8*1024
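
; rough dispatch performed by ia32_memcpy below, assuming SSE is available:
;   size <= IC_SIZE                       -> IC_MOVSD only
;   size <  UC_THRESHOLD                  -> IC_MOVQ, remainder via IC_MOVSD
;   UC_THRESHOLD <= size < BP_THRESHOLD   -> UC_MOVNTQ, remainder via IC_MOVSD
;   size >= BP_THRESHOLD                  -> UC_BP_MOVNTQ, then IC_MOVQ/IC_MOVSD
; without SSE, everything above IC_SIZE falls back to IC_MOVQ + IC_MOVSD.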


; > ecx = size (<= IC_SIZE)
; x eax, ecx
;
; determined to be fastest approach by testing. a movsd table followed by
; rep movsb is a bit smaller but 6.9% slower; everything else is much worse.
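; the trick: movsd and movsb are single-byte instructions, so jumping to
; (table_end - count) executes exactly <count> of them - a computed jump into
; a run of copies instead of a loop.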
%macro IC_MOVSD 0
    mov eax, ecx
    shr ecx, 2 ; dword count
    neg ecx
    add ecx, %%movsd_table_end
    jmp ecx
    align 8
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
%%movsd_table_end:

    and eax, 3
    neg eax
    add eax, %%movsb_table_end
    jmp eax
    movsb
    movsb
    movsb
%%movsb_table_end:
%endm


; align destination address to multiple of 8.
; not done for small transfers because it doesn't help IC_MOVSD.
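; > edi = dst, ecx = size (reduced by the number of alignment bytes copied)
; x eax
; uses the same computed-jump-into-movsb trick as IC_MOVSD above.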
%macro IC_ALIGN 0
    mov eax, 8
    sub eax, edi
    and eax, byte 7 ; eax = # misaligned bytes
    sub ecx, eax ; reduce copy count
    neg eax
    add eax, %%align_table_end
    jmp eax
    align 4
    movsb
    movsb
    movsb
    movsb
    movsb
    movsb
    movsb
    movsb
%%align_table_end:
%endm


; > ecx = size
; x edx
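; copies 64 bytes per iteration with ordinary (cached) MMX loads and stores;
; the remainder (< 64 bytes) is left in ecx for the caller to finish.
; the prefetchnta distance (200*64/34+192 = 568 bytes ahead) appears to be an
; empirically tuned constant (cf. AMD's Athlon optimization guide block copy).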
%macro IC_MOVQ 0
    align 16
    mov edx, 64
%%loop:
    cmp ecx, edx
    jb %%done
    prefetchnta [esi + (200*64/34+192)]
    movq mm0, [esi+0]
    movq mm1, [esi+8]
    movq [edi+0], mm0
    movq [edi+8], mm1
    movq mm2, [esi+16]
    movq mm3, [esi+24]
    movq [edi+16], mm2
    movq [edi+24], mm3
    movq mm0, [esi+32]
    movq mm1, [esi+40]
    movq [edi+32], mm0
    movq [edi+40], mm1
    movq mm2, [esi+48]
    movq mm3, [esi+56]
    movq [edi+48], mm2
    movq [edi+56], mm3
    add esi, edx
    add edi, edx
    sub ecx, edx
    jmp %%loop
%%done:
%endm


; > ecx = size (> 64)
; x edx
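; movntq is a non-temporal (write-combining) store: data goes straight to
; memory instead of displacing cache lines - a win once the buffer exceeds L1.
; loads and stores are staggered, presumably to keep both in flight; the
; caller must issue sfence afterwards because these stores are weakly ordered.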
%macro UC_MOVNTQ 0
    mov edx, 64
    align 16
%%1:
    prefetchnta [esi + (200*64/34+192)]
    movq mm0,[esi+0]
    add edi, edx
    movq mm1,[esi+8]
    add esi, edx
    movq mm2,[esi-48]
    movntq [edi-64], mm0
    movq mm0,[esi-40]
    movntq [edi-56], mm1
    movq mm1,[esi-32]
    movntq [edi-48], mm2
    movq mm2,[esi-24]
    movntq [edi-40], mm0
    movq mm0,[esi-16]
    movntq [edi-32], mm1
    movq mm1,[esi-8]
    movntq [edi-24], mm2
    movntq [edi-16], mm0
    sub ecx, edx
    movntq [edi-8], mm1
    cmp ecx, edx
    jae %%1
%endm


; > ecx = size (> 8KiB)
; x eax, edx
;
; somewhat optimized for size (futile attempt to avoid near jump)
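; block prefetch: each BP_SIZE chunk is first pulled into the cache by the
; dummy loads below, then streamed to the destination with non-temporal
; stores; doing all reads in one burst avoids constantly alternating between
; source reads and destination writes (cf. AMD's block prefetch technique).
; push byte/pop is used instead of mov reg, imm32 to save a few bytes (see
; the size note above). any tail < BP_SIZE is left in ecx for the caller.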
%macro UC_BP_MOVNTQ 0
%%prefetch_and_copy_chunk:

    ; touch each cache line within chunk in reverse order (prevents HW prefetch)
    push byte BP_SIZE/128 ; # iterations
    pop eax
    add esi, BP_SIZE
    align 8
%%prefetch_chunk:
    mov edx, [esi-64]
    mov edx, [esi-128]
    sub esi, 128
    dec eax
    jnz %%prefetch_chunk

    ; copy 64 byte blocks
    mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
    push byte 64
    pop edx
    align 8
%%copy_block:
    movq mm0, [esi+ 0]
    movq mm1, [esi+ 8]
    movq mm2, [esi+16]
    movq mm3, [esi+24]
    movq mm4, [esi+32]
    movq mm5, [esi+40]
    movq mm6, [esi+48]
    movq mm7, [esi+56]
    add esi, edx
    movntq [edi+ 0], mm0
    movntq [edi+ 8], mm1
    movntq [edi+16], mm2
    movntq [edi+24], mm3
    movntq [edi+32], mm4
    movntq [edi+40], mm5
    movntq [edi+48], mm6
    movntq [edi+56], mm7
    add edi, edx
    dec eax
    jnz %%copy_block

    sub ecx, BP_SIZE
    cmp ecx, BP_SIZE
    jae %%prefetch_and_copy_chunk
%endm


[section .bss]
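; (the bracketed [section] form is NASM's primitive directive: it switches
; sections without updating __SECT__, so the __SECT__ below returns to the
; previously active .text section.)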

; this is somewhat "clever". the 2 specialized transfer implementations
; that use SSE are jumped to if transfer size is greater than a threshold.
; we simply set the requested transfer size to 0 if the CPU doesn't
; support SSE so that those are never reached (done by masking with this).
sse_mask resd 1

__SECT__

; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
global sym(ia32_memcpy)
sym(ia32_memcpy):
    push edi
    push esi
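    ; stack layout at this point: [esp] = saved esi, [esp+4] = saved edi,
    ; [esp+8] = return address, so the cdecl arguments start at esp+8+4.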

    mov edi, [esp+8+4+0] ; dst
    mov esi, [esp+8+4+4] ; src
    mov ecx, [esp+8+4+8] ; nbytes

    cmp ecx, byte IC_SIZE
    ja .choose_larger_method

.ic_movsd:
    IC_MOVSD
    pop esi
    pop edi
    ret

.choose_larger_method:
    IC_ALIGN

    mov eax, [sse_mask]
    mov edx, ecx
    and edx, eax ; edx = (SSE)? remaining_bytes : 0
    cmp edx, BP_THRESHOLD
    jae near .uc_bp_movntq
    cmp edx, UC_THRESHOLD
    jae .uc_movntq

.ic_movq:
    IC_MOVQ
    emms
    jmp .ic_movsd

.uc_movntq:
    UC_MOVNTQ
    sfence
    emms
    jmp .ic_movsd

.uc_bp_movntq:
    UC_BP_MOVNTQ
    sfence
    jmp .ic_movq




;-------------------------------------------------------------------------------
; CPUID support
;-------------------------------------------------------------------------------

[section .data]

; these are actually max_func+1, i.e. the first invalid value.
; the idea here is to avoid a separate cpuid_available flag;
; using signed values doesn't work because ext_funcs are >= 0x80000000.
max_func dd 0
max_ext_func dd 0

__SECT__


; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs)
global sym(ia32_cpuid)
sym(ia32_cpuid):
    push ebx
    push edi

    mov ecx, [esp+8+4+0] ; func
    mov edi, [esp+8+4+4] ; -> regs

    ; compare against max supported func and fail if above
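    ; note: extended functions (0x8xxxxxxx) have the sign bit set, so the
    ; test/js pair selects the proper limit without an explicit compare
    ; against 0x80000000.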
    xor eax, eax ; return value on failure
    test ecx, ecx
    mov edx, [max_ext_func]
    js .is_ext_func
    mov edx, [max_func]
.is_ext_func:
    cmp ecx, edx
    jae .ret ; (see max_func decl)

    ; issue CPUID and store result registers in array
    mov eax, ecx
    cpuid
    stosd
    xchg eax, ebx
    stosd
    xchg eax, ecx
    stosd
    xchg eax, edx
    stosd

    ; success
    xor eax, eax
    inc eax
.ret:
    pop edi
    pop ebx
    ret


;-------------------------------------------------------------------------------
; misc
;-------------------------------------------------------------------------------

; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
global sym(ia32_control87)
sym(ia32_control87):
    push eax
    fnstcw [esp]
    pop eax ; old_cw
    mov ecx, [esp+4] ; new_val
    mov edx, [esp+8] ; mask
    and ecx, edx ; new_val & mask
    not edx ; ~mask
    and eax, edx ; old_cw & ~mask
    or eax, ecx ; (old_cw & ~mask) | (new_val & mask)
    push eax ; = new_cw
    fldcw [esp]
    pop eax
    xor eax, eax ; return value
    ret

;-------------------------------------------------------------------------------
; init
;-------------------------------------------------------------------------------

; extern "C" bool __cdecl ia32_init()
global sym(ia32_init)
sym(ia32_init):
    push ebx

    ; check if CPUID is supported
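    ; EFLAGS bit 21 ("ID") can only be modified if the CPU supports CPUID.
    ; the byte at [esp+2] covers EFLAGS bits 16..23, so or-ing in 32 sets
    ; bit 21; shr eax, 22 then shifts that bit into CF for the jnc test.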
    pushfd
    or byte [esp+2], 32
    popfd
    pushfd
    pop eax
    xor edx, edx
    shr eax, 22 ; bit 21 toggled?
    jnc .no_cpuid

    ; determine max supported CPUID function
    xor eax, eax
    cpuid
    inc eax ; (see max_func decl)
    mov [max_func], eax
    mov eax, 0x80000000
    cpuid
    inc eax ; (see max_func decl)
    mov [max_ext_func], eax
.no_cpuid:

    ; check if SSE is supported (used by memcpy code)
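    ; ia32_cap returns a bool (0 or 1); neg turns 1 into ~0 (all bits set),
    ; i.e. a mask that leaves transfer sizes unchanged, while 0 forces
    ; ia32_memcpy to skip its SSE-only paths.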
    extern sym(ia32_cap)
    push byte 32+25 ; ia32.h's SSE cap (won't change)
    call sym(ia32_cap)
    pop edx ; remove stack param
    neg eax ; SSE? ~0 : 0
    mov [sse_mask], eax

    pop ebx
    ret

;-------------------------------------------------------------------------------
; Color conversion (SSE)
;-------------------------------------------------------------------------------

; extern "C" u32 ConvertRGBColorTo4ub(const RGBColor& color)
[section .data]
align 16
zero:
    dd 0.0
twofivefive:
    dd 255.0


__SECT__
align 16
global sym(sse_ConvertRGBColorTo4ub)
sym(sse_ConvertRGBColorTo4ub):
    mov eax, [esp+4]

    ; xmm0, 1, 2 = B, G, R (loaded from offsets 8, 4, 0)
    movss xmm4, [zero]
    movss xmm0, [eax+8]
    movss xmm1, [eax+4]
    movss xmm2, [eax]
    movss xmm5, [twofivefive]

    ; C = min(255, 255*max(C, 0)) ( == clamp(255*C, 0, 255) )
    maxss xmm0, xmm4
    maxss xmm1, xmm4
    maxss xmm2, xmm4
    mulss xmm0, xmm5
    mulss xmm1, xmm5
    mulss xmm2, xmm5
    minss xmm0, xmm5
    minss xmm1, xmm5
    minss xmm2, xmm5

    ; convert to integer and combine channels using bit logic
    cvtss2si eax, xmm0
    cvtss2si ecx, xmm1
    cvtss2si edx, xmm2
    shl eax, 16
    shl ecx, 8
    or eax, 0xff000000
    or edx, ecx
    or eax, edx
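    ; result: eax = 0xFF000000 | (b << 16) | (g << 8) | r - assuming RGBColor
    ; stores r, g, b at offsets 0, 4, 8, this is R,G,B,0xFF byte order in
    ; memory on little-endian targets. cvtss2si rounds to nearest under the
    ; default MXCSR rounding mode.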

    ret