forked from 0ad/0ad
ia32.asm: add explicit init function (simplifies other code)
memcpy no longer requires SSE yet is still efficient (clever trick, hehe) This was SVN commit r2757.
This commit is contained in:
parent
2a8953452d
commit
8ba42d5a3c
@ -1,3 +1,6 @@
|
||||
; set 32-bit attribute once for all sections and activate .text
|
||||
section .data use32
|
||||
section .bss use32
|
||||
section .text use32
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
@ -9,6 +12,9 @@ section .text use32
|
||||
; mem bandwidth (2000 MiB/s) for transfers >= 192KiB!
|
||||
; Pentium III performance: about 3% faster in above small buffer benchmark.
|
||||
;
|
||||
; disables specialized large transfer (> 64KiB) implementations if SSE
|
||||
; isn't available; we do assume MMX support, though (quite safe).
|
||||
|
||||
; *requires* (and does not verify the presence of) SSE instructions:
|
||||
; prefetchnta and movntq. therefore, a P3+ or Athlon XP is required.
|
||||
; rationale: older processors are too slow anyway and we don't bother.
|
||||
@ -35,7 +41,7 @@ BP_SIZE equ 8*1024
|
||||
; rep movsb is a bit smaller but 6.9% slower; everything else is much worse.
|
||||
%macro IC_MOVSD 0
|
||||
mov eax, ecx
|
||||
shr ecx, 2 ; dword count
|
||||
shr ecx, 2 ; dword count
|
||||
neg ecx
|
||||
add ecx, %%movsd_table_end
|
||||
jmp ecx
|
||||
@ -74,10 +80,8 @@ align 8
|
||||
%macro IC_ALIGN 0
|
||||
mov eax, 8
|
||||
sub eax, edi
|
||||
and eax, byte 7
|
||||
cmp eax, ecx
|
||||
cmova eax, ecx
|
||||
sub ecx, eax
|
||||
and eax, byte 7 ; eax = # misaligned bytes
|
||||
sub ecx, eax ; reduce copy count
|
||||
neg eax
|
||||
add eax, %%align_table_end
|
||||
jmp eax
|
||||
@ -94,7 +98,7 @@ align 4
|
||||
%endm
|
||||
|
||||
|
||||
; > ecx = size (> 0)
|
||||
; > ecx = size
|
||||
; x edx
|
||||
%macro IC_MOVQ 0
|
||||
align 16
|
||||
@ -103,22 +107,22 @@ align 16
|
||||
cmp ecx, edx
|
||||
jb %%done
|
||||
prefetchnta [esi + (200*64/34+192)]
|
||||
movq mm0, [esi+0]
|
||||
movq mm1, [esi+8]
|
||||
movq [edi+0], mm0
|
||||
movq [edi+8], mm1
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
movq [edi+16], mm2
|
||||
movq [edi+24], mm3
|
||||
movq mm0, [esi+32]
|
||||
movq mm1, [esi+40]
|
||||
movq [edi+32], mm0
|
||||
movq [edi+40], mm1
|
||||
movq mm2, [esi+48]
|
||||
movq mm3, [esi+56]
|
||||
movq [edi+48], mm2
|
||||
movq [edi+56], mm3
|
||||
movq mm0, [esi+0]
|
||||
movq mm1, [esi+8]
|
||||
movq [edi+0], mm0
|
||||
movq [edi+8], mm1
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
movq [edi+16], mm2
|
||||
movq [edi+24], mm3
|
||||
movq mm0, [esi+32]
|
||||
movq mm1, [esi+40]
|
||||
movq [edi+32], mm0
|
||||
movq [edi+40], mm1
|
||||
movq mm2, [esi+48]
|
||||
movq mm3, [esi+56]
|
||||
movq [edi+48], mm2
|
||||
movq [edi+56], mm3
|
||||
add esi, edx
|
||||
add edi, edx
|
||||
sub ecx, edx
|
||||
@ -127,6 +131,37 @@ align 16
|
||||
%endm
|
||||
|
||||
|
||||
; > ecx = size (> 64)
|
||||
; x
|
||||
%macro UC_MOVNTQ 0
|
||||
mov edx, 64
|
||||
align 16
|
||||
%%1:
|
||||
prefetchnta [esi + (200*64/34+192)]
|
||||
movq mm0,[esi+0]
|
||||
add edi, edx
|
||||
movq mm1,[esi+8]
|
||||
add esi, edx
|
||||
movq mm2,[esi-48]
|
||||
movntq [edi-64], mm0
|
||||
movq mm0,[esi-40]
|
||||
movntq [edi-56], mm1
|
||||
movq mm1,[esi-32]
|
||||
movntq [edi-48], mm2
|
||||
movq mm2,[esi-24]
|
||||
movntq [edi-40], mm0
|
||||
movq mm0,[esi-16]
|
||||
movntq [edi-32], mm1
|
||||
movq mm1,[esi-8]
|
||||
movntq [edi-24], mm2
|
||||
movntq [edi-16], mm0
|
||||
sub ecx, edx
|
||||
movntq [edi-8], mm1
|
||||
cmp ecx, edx
|
||||
jae %%1
|
||||
%endm
|
||||
|
||||
|
||||
; > ecx = size (> 8KiB)
|
||||
; x eax, edx
|
||||
;
|
||||
@ -135,7 +170,7 @@ align 16
|
||||
%%prefetch_and_copy_chunk:
|
||||
|
||||
; touch each cache line within chunk in reverse order (prevents HW prefetch)
|
||||
push byte BP_SIZE/128 ; # iterations
|
||||
push byte BP_SIZE/128 ; # iterations
|
||||
pop eax
|
||||
add esi, BP_SIZE
|
||||
align 8
|
||||
@ -147,28 +182,28 @@ align 8
|
||||
jnz %%prefetch_chunk
|
||||
|
||||
; copy 64 byte blocks
|
||||
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
|
||||
push byte 64
|
||||
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
|
||||
push byte 64
|
||||
pop edx
|
||||
align 8
|
||||
%%copy_block:
|
||||
movq mm0, [esi+ 0]
|
||||
movq mm1, [esi+ 8]
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
movq mm4, [esi+32]
|
||||
movq mm5, [esi+40]
|
||||
movq mm6, [esi+48]
|
||||
movq mm7, [esi+56]
|
||||
movq mm0, [esi+ 0]
|
||||
movq mm1, [esi+ 8]
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
movq mm4, [esi+32]
|
||||
movq mm5, [esi+40]
|
||||
movq mm6, [esi+48]
|
||||
movq mm7, [esi+56]
|
||||
add esi, edx
|
||||
movntq [edi+ 0], mm0
|
||||
movntq [edi+ 8], mm1
|
||||
movntq [edi+16], mm2
|
||||
movntq [edi+24], mm3
|
||||
movntq [edi+32], mm4
|
||||
movntq [edi+40], mm5
|
||||
movntq [edi+48], mm6
|
||||
movntq [edi+56], mm7
|
||||
movntq [edi+ 0], mm0
|
||||
movntq [edi+ 8], mm1
|
||||
movntq [edi+16], mm2
|
||||
movntq [edi+24], mm3
|
||||
movntq [edi+32], mm4
|
||||
movntq [edi+40], mm5
|
||||
movntq [edi+48], mm6
|
||||
movntq [edi+56], mm7
|
||||
add edi, edx
|
||||
dec eax
|
||||
jnz %%copy_block
|
||||
@ -179,49 +214,28 @@ align 8
|
||||
%endm
|
||||
|
||||
|
||||
; > ecx = size (> 64)
|
||||
; x
|
||||
%macro UC_MOVNTQ 0
|
||||
mov edx, 64
|
||||
align 16
|
||||
%%1:
|
||||
prefetchnta [esi + (200*64/34+192)]
|
||||
movq mm0,[esi+0]
|
||||
add edi, edx
|
||||
movq mm1,[esi+8]
|
||||
add esi, edx
|
||||
movq mm2,[esi-48]
|
||||
movntq [edi-64], mm0
|
||||
movq mm0,[esi-40]
|
||||
movntq [edi-56], mm1
|
||||
movq mm1,[esi-32]
|
||||
movntq [edi-48], mm2
|
||||
movq mm2,[esi-24]
|
||||
movntq [edi-40], mm0
|
||||
movq mm0,[esi-16]
|
||||
movntq [edi-32], mm1
|
||||
movq mm1,[esi-8]
|
||||
movntq [edi-24], mm2
|
||||
movntq [edi-16], mm0
|
||||
sub ecx, edx
|
||||
movntq [edi-8], mm1
|
||||
cmp ecx, edx
|
||||
jae %%1
|
||||
%endm
|
||||
[section .bss]
|
||||
|
||||
; this is somewhat "clever". the 2 specialized transfer implementations
|
||||
; that use SSE are jumped to if transfer size is greater than a threshold.
|
||||
; we simply set the requested transfer size to 0 if the CPU doesn't
|
||||
; support SSE so that those are never reached (done by masking with this).
|
||||
sse_mask resd 1
|
||||
|
||||
__SECT__
|
||||
|
||||
; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
|
||||
global _ia32_memcpy
|
||||
_ia32_memcpy:
|
||||
push edi
|
||||
push esi
|
||||
push edi
|
||||
push esi
|
||||
|
||||
mov edi, [esp+8+4+0] ; dst
|
||||
mov esi, [esp+8+4+4] ; src
|
||||
mov ecx, [esp+8+4+8] ; nbytes
|
||||
mov edi, [esp+8+4+0] ; dst
|
||||
mov esi, [esp+8+4+4] ; src
|
||||
mov ecx, [esp+8+4+8] ; nbytes
|
||||
|
||||
cmp ecx, byte IC_SIZE
|
||||
ja .choose_large_method
|
||||
ja .choose_larger_method
|
||||
|
||||
.ic_movsd:
|
||||
IC_MOVSD
|
||||
@ -229,12 +243,21 @@ _ia32_memcpy:
|
||||
pop edi
|
||||
ret
|
||||
|
||||
.choose_large_method:
|
||||
.choose_larger_method:
|
||||
IC_ALIGN
|
||||
cmp ecx, UC_THRESHOLD
|
||||
jb near .ic_movq
|
||||
cmp ecx, BP_THRESHOLD
|
||||
jae .uc_bp_movntq
|
||||
|
||||
mov eax, [sse_mask]
|
||||
mov edx, ecx
|
||||
and edx, eax ; edx = (SSE)? remaining_bytes : 0
|
||||
cmp edx, BP_THRESHOLD
|
||||
jae near .uc_bp_movntq
|
||||
cmp edx, UC_THRESHOLD
|
||||
jae .uc_movntq
|
||||
|
||||
.ic_movq:
|
||||
IC_MOVQ
|
||||
emms
|
||||
jmp .ic_movsd
|
||||
|
||||
.uc_movntq:
|
||||
UC_MOVNTQ
|
||||
@ -245,26 +268,21 @@ _ia32_memcpy:
|
||||
.uc_bp_movntq:
|
||||
UC_BP_MOVNTQ
|
||||
sfence
|
||||
; fall through
|
||||
jmp .ic_movq
|
||||
|
||||
.ic_movq:
|
||||
IC_MOVQ
|
||||
emms
|
||||
jmp .ic_movsd
|
||||
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
; CPUID support
|
||||
;-------------------------------------------------------------------------------
|
||||
|
||||
[section .data use32]
|
||||
cpuid_available dd -1
|
||||
[section .data]
|
||||
|
||||
[section .bss use32]
|
||||
|
||||
; no init needed - cpuid_available triggers init
|
||||
max_func resd 1
|
||||
max_ext_func resd 1
|
||||
; these are actually max_func+1, i.e. the first invalid value.
|
||||
; the idea here is to avoid a separate cpuid_available flag;
|
||||
; using signed values doesn't work because ext_funcs are >= 0x80000000.
|
||||
max_func dd 0
|
||||
max_ext_func dd 0
|
||||
|
||||
__SECT__
|
||||
|
||||
@ -272,28 +290,21 @@ __SECT__
|
||||
; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs)
|
||||
global _ia32_cpuid
|
||||
_ia32_cpuid:
|
||||
; note: must preserve before .one_time_init because it does cpuid
|
||||
push ebx
|
||||
push edi
|
||||
|
||||
.retry:
|
||||
; if unknown, detect; if not available, fail.
|
||||
xor eax, eax ; return val on failure
|
||||
cmp [cpuid_available], eax
|
||||
jl .one_time_init
|
||||
je .ret
|
||||
push ebx
|
||||
push edi
|
||||
|
||||
mov ecx, [esp+8+4+0] ; func
|
||||
mov edi, [esp+8+4+4] ; -> regs
|
||||
|
||||
; compare against max supported func and fail if above
|
||||
test ecx, ecx
|
||||
xor eax, eax ; return value on failure
|
||||
test ecx, ecx
|
||||
mov edx, [max_ext_func]
|
||||
js .is_ext_func
|
||||
mov edx, [max_func]
|
||||
.is_ext_func:
|
||||
cmp ecx, edx
|
||||
ja .ret
|
||||
jae .ret ; (see max_func decl)
|
||||
|
||||
; issue CPUID and store result registers in array
|
||||
mov eax, ecx
|
||||
@ -314,28 +325,6 @@ _ia32_cpuid:
|
||||
pop ebx
|
||||
ret
|
||||
|
||||
.one_time_init:
|
||||
; check if CPUID is supported
|
||||
pushfd
|
||||
or byte [esp+2], 32
|
||||
popfd
|
||||
pushfd
|
||||
pop eax
|
||||
xor edx, edx
|
||||
shr eax, 22 ; bit 21 toggled?
|
||||
adc edx, 0
|
||||
mov [cpuid_available], edx
|
||||
|
||||
; determine max supported function
|
||||
xor eax, eax
|
||||
cpuid
|
||||
mov [max_func], eax
|
||||
mov eax, 0x80000000
|
||||
cpuid
|
||||
mov [max_ext_func], eax
|
||||
|
||||
jmp .retry
|
||||
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
; misc
|
||||
@ -344,16 +333,58 @@ _ia32_cpuid:
|
||||
; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
|
||||
global _ia32_control87
|
||||
_ia32_control87:
|
||||
push eax
|
||||
fnstcw [esp]
|
||||
pop eax ; old_cw
|
||||
mov ecx, [esp+4] ; new_cw
|
||||
mov edx, [esp+8] ; mask
|
||||
and ecx, edx ; new_cw & mask
|
||||
not edx ; ~mask
|
||||
and eax, edx ; old_cw & ~mask
|
||||
or eax, ecx ; (old_cw & ~mask) | (new_cw & mask)
|
||||
push edx
|
||||
fldcw [esp]
|
||||
push eax
|
||||
fnstcw [esp]
|
||||
pop eax ; old_cw
|
||||
mov ecx, [esp+4] ; new_cw
|
||||
mov edx, [esp+8] ; mask
|
||||
and ecx, edx ; new_cw & mask
|
||||
not edx ; ~mask
|
||||
and eax, edx ; old_cw & ~mask
|
||||
or eax, ecx ; (old_cw & ~mask) | (new_cw & mask)
|
||||
push edx
|
||||
fldcw [esp]
|
||||
pop edx
|
||||
xor eax, eax ; return value
|
||||
xor eax, eax ; return value
|
||||
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
; init
|
||||
;-------------------------------------------------------------------------------
|
||||
|
||||
; extern "C" bool __cdecl ia32_init()
|
||||
global _ia32_init
|
||||
_ia32_init:
|
||||
push ebx
|
||||
|
||||
; check if CPUID is supported
|
||||
pushfd
|
||||
or byte [esp+2], 32
|
||||
popfd
|
||||
pushfd
|
||||
pop eax
|
||||
xor edx, edx
|
||||
shr eax, 22 ; bit 21 toggled?
|
||||
jnc .no_cpuid
|
||||
|
||||
; determine max supported CPUID function
|
||||
xor eax, eax
|
||||
cpuid
|
||||
inc eax ; (see max_func decl)
|
||||
mov [max_func], eax
|
||||
mov eax, 0x80000000
|
||||
cpuid
|
||||
inc eax ; (see max_func decl)
|
||||
mov [max_ext_func], eax
|
||||
.no_cpuid:
|
||||
|
||||
; check if SSE is supported (used by memcpy code)
|
||||
extern _ia32_cap
|
||||
push byte 32+25 ; ia32.h's SSE cap (won't change)
|
||||
call _ia32_cap
|
||||
pop edx ; remove stack param
|
||||
neg eax ; SSE? ~0 : 0
|
||||
mov [sse_mask], eax
|
||||
|
||||
pop ebx
|
||||
ret
|
@ -31,6 +31,9 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
// call before any of the following functions
|
||||
extern void ia32_init();
|
||||
|
||||
extern double _ceil(double);
|
||||
|
||||
extern u64 rdtsc(void);
|
||||
|
@ -6,6 +6,9 @@
|
||||
#include "lib/detect.h"
|
||||
#include "lib/timer.h"
|
||||
#include "lib/input.h"
|
||||
#if CPU_IA32
|
||||
# include "lib/sysdep/ia32.h"
|
||||
#endif
|
||||
#include "lib/res/res.h"
|
||||
#include "lib/res/sound/snd.h"
|
||||
#include "lib/res/graphics/tex.h"
|
||||
@ -688,6 +691,10 @@ void Init(int argc, char* argv[], bool setup_gfx, bool setup_gui)
|
||||
|
||||
debug_set_thread_name("main");
|
||||
|
||||
#if CPU_IA32
|
||||
ia32_init();
|
||||
#endif
|
||||
|
||||
// If you ever want to catch a particular allocation:
|
||||
//_CrtSetBreakAlloc(187);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user