2006-04-12 01:59:08 +02:00
|
|
|
; =========================================================================
|
2006-04-20 03:33:57 +02:00
|
|
|
; File : ia32_asm.asm
|
2006-04-12 01:59:08 +02:00
|
|
|
; Project : 0 A.D.
|
|
|
|
; Description : optimized assembly code for IA-32. not provided as
|
|
|
|
; : inline assembly because that's compiler-specific.
|
|
|
|
; =========================================================================
|
|
|
|
|
2007-05-07 18:33:24 +02:00
|
|
|
; license: GPL; see lib/license.txt
|
2006-04-12 01:59:08 +02:00
|
|
|
|
2006-06-25 22:58:03 +02:00
|
|
|
%include "ia32.inc"
|
2005-09-13 06:00:41 +02:00
|
|
|
|
2007-04-25 20:19:35 +02:00
|
|
|
; note: pure asm functions prevent inlining but also avoid redundant
|
|
|
|
; store/loads generated by VC inline asm (ugh).
|
|
|
|
|
|
|
|
|
2005-09-15 02:51:59 +02:00
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
; CPUID support
|
|
|
|
;-------------------------------------------------------------------------------
|
2005-09-13 06:00:41 +02:00
|
|
|
|
2005-09-20 06:05:23 +02:00
|
|
|
[section .data]

; cached CPUID limits, filled in by ia32_asm_cpuid_init.
; each one holds <highest supported function number> + 1, i.e. the first
; INVALID function number. both start out as 0, so every request is
; rejected until they have been set - this avoids a separate
; cpuid_available flag.
; they must be compared as unsigned values: signed comparisons don't work
; because extended function numbers are >= 0x80000000.
max_func:		dd	0
max_ext_func:	dd	0

__SECT__
|
2005-09-13 06:00:41 +02:00
|
|
|
|
2005-09-15 02:51:59 +02:00
|
|
|
|
2007-04-25 20:19:35 +02:00
|
|
|
; extern "C" void __cdecl ia32_asm_cpuid_init()
;
; detects CPUID support and caches the basic and extended function limits
; in max_func / max_ext_func (each stored as <value returned in EAX> + 1).
; if CPUID is not available, both variables are left untouched.
; EBX is preserved; EAX, ECX, EDX and the flags are not.
global sym(ia32_asm_cpuid_init)
sym(ia32_asm_cpuid_init):
	push	ebx						; callee-saved, but overwritten by CPUID

	; CPUID is considered available if EFLAGS bit 21 (ID) can be set.
	pushfd
	or		byte [esp+2], 1<<5		; bit 5 of byte 2 = EFLAGS bit 21
	popfd							; try to write the modified flags ..
	pushfd
	pop		eax						; .. and read back what actually stuck
	xor		edx, edx				; (EDX is not read below)
	shr		eax, 22					; CF := EFLAGS bit 21
	jnc		.no_cpuid				; bit did not stick -> no CPUID

	; basic range: function 0 returns the highest function number in EAX
	xor		eax, eax
	cpuid
	inc		eax						; -> first invalid value (see max_func decl)
	mov		[max_func], eax

	; extended range: same query, issued at 0x80000000
	mov		eax, 0x80000000
	cpuid
	inc		eax						; -> first invalid value (see max_func decl)
	mov		[max_ext_func], eax

.no_cpuid:
	pop		ebx
	ret
|
|
|
|
|
|
|
|
|
|
|
|
; extern "C" bool __cdecl ia32_asm_cpuid(u32 func, u32* regs)
;
; executes CPUID function <func> (with ECX = 0) and stores the resulting
; EAX, EBX, ECX, EDX - in that order - to regs[0..3].
; returns 1 on success. returns 0 without writing to <regs> if <func> is
; not below the applicable limit (max_func or max_ext_func).
global sym(ia32_asm_cpuid)
sym(ia32_asm_cpuid):
	push	ebx
	push	edi

	mov		edi, [esp+8+4+4]		; -> regs
	mov		edx, [esp+8+4+0]		; func

	; pick the limit that applies to func (bit 31 set => extended range)
	; and fail if func is at or above it.
	xor		eax, eax				; return value on failure
	mov		ebx, [max_func]
	test	edx, edx
	jns		.have_limit
	mov		ebx, [max_ext_func]
.have_limit:
	cmp		edx, ebx
	jae		.ret					; unsigned compare (see max_func decl)

	; issue CPUID and copy the result registers into the array
	mov		eax, edx
	xor		ecx, ecx				; CPUID.4 requires ECX = 0..2
	cpuid
	mov		[edi+0], eax
	mov		[edi+4], ebx
	mov		[edi+8], ecx
	mov		[edi+12], edx

	; success
	mov		eax, 1

.ret:
	pop		edi
	pop		ebx
	ret
|
2005-09-13 06:00:41 +02:00
|
|
|
|
2005-09-20 06:05:23 +02:00
|
|
|
|
2005-11-18 06:16:43 +01:00
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
; lock-free support routines
|
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
|
2007-05-02 14:07:08 +02:00
|
|
|
; extern "C" void __cdecl ia32_asm_AtomicAdd(intptr_t* location, intptr_t increment);
;
; atomically performs *location += increment by means of a LOCK-prefixed
; ADD. nothing is returned.
global sym(ia32_asm_AtomicAdd)
sym(ia32_asm_AtomicAdd):
	mov		eax, [esp+8]			; increment
	mov		edx, [esp+4]			; location
	lock add	[edx], eax
	ret
|
|
|
|
|
|
|
|
|
|
|
|
; atomic compare-and-swap: if *location == expected, new_value is stored
; there and 1 is returned; otherwise the value at <location> is not changed
; and 0 is returned.
;
; notes:
; - callers go through the CAS macro, which silently casts its inputs for
;   convenience. mixing up the <expected> and <location> parameters would
;   therefore go unnoticed, so <location> gets a basic sanity check here:
;   addresses below 0x10000 are considered invalid and end up at UD2.
; - a 486 or later processor is required since we use CMPXCHG.
;   there's no feature flag we can check, and the ia32 code doesn't
;   bother detecting anything < Pentium, so this will crash and burn if
;   run on a 386. falling back to simple MOVs would be possible there
;   (386 CPUs aren't MP-capable), but it's not worth the trouble.
; - the LOCK prefix is emitted unconditionally, i.e. also on
;   single-processor systems. the branch needed to skip it may be
;   well-predicted, but the difference in performance still isn't expected
;   to be enough to justify the effort.
;
; extern "C" bool __cdecl ia32_asm_CAS(uintptr_t* location, uintptr_t expected, uintptr_t new_value);
global sym(ia32_asm_CAS)
sym(ia32_asm_CAS):
	mov		edx, [esp+4]			; location
	mov		eax, [esp+8]			; expected (CMPXCHG compares against EAX)
	cmp		edx, 0x10000			; plausible address?
	jb		.invalid_location		; no - trap
	mov		ecx, [esp+12]			; new_value
	lock cmpxchg	[edx], ecx		; ZF := (*location == expected)
	sete	al
	movzx	eax, al					; widen the bool result to all of EAX
	ret

; NOTE: nasm 0.98.39 doesn't support generating debug info for the win32
; output format. that means this code may be misattributed to other
; functions, which makes tracking it down very difficult.
; we therefore raise an "Invalid Opcode" exception, which is rather distinct.
.invalid_location:
	ud2
|
|
|
|
|
|
|
|
|
2005-09-20 06:05:23 +02:00
|
|
|
;-------------------------------------------------------------------------------
|
2006-05-05 07:54:00 +02:00
|
|
|
; FPU
|
2005-09-20 06:05:23 +02:00
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
|
2007-04-25 20:19:35 +02:00
|
|
|
[section .data]

; bias that the float -> integer conversion routines subtract before
; FISTP. FISTP rounds according to the FPU's current rounding mode;
; assuming that is the IA-32 default (round-to-nearest), subtracting just
; under 0.5 first yields the truncate/"chop" result that the fallback
; implementation (a C cast) produces for non-negative inputs.
; NOTE(review): for negative non-integral inputs, bias + round-to-nearest
; rounds toward -infinity (e.g. -0.3 -> -1) whereas a C cast yields 0 -
; confirm that callers don't depend on matching the cast there.
round_bias:		dd	0.4999999

__SECT__
|
|
|
|
|
|
|
|
; extern "C" uint __cdecl ia32_asm_control87(uint new_cw, uint mask);
;
; replaces those bits of the FPU control word that are selected by <mask>
; with the corresponding bits of <new_cw>; all other bits keep their
; current value. always returns 0.
global sym(ia32_asm_control87)
sym(ia32_asm_control87):
	mov		ecx, [esp+4]			; new_cw
	mov		edx, [esp+8]			; mask

	push	eax						; reserve a stack slot
	fnstcw	[esp]					; (only the low 16 bits are written)
	pop		eax						; old_cw

	; merged = old_cw ^ ((old_cw ^ new_cw) & mask)
	;        = (old_cw & ~mask) | (new_cw & mask)
	xor		ecx, eax
	and		ecx, edx
	xor		eax, ecx

	push	eax						; = merged control word
	fldcw	[esp]					; (only the low 16 bits are read)
	pop		eax

	xor		eax, eax				; return value
	ret
|
2005-10-19 05:06:54 +02:00
|
|
|
|
2005-10-24 02:06:08 +02:00
|
|
|
|
2006-05-05 07:54:00 +02:00
|
|
|
; FXAM reports the class of ST0 via condition-code bits of the FPU status
; word (not the control word). this mask keeps C3 | C2 | C0, whose
; single-bit patterns are: C0 = NAN, C2 = NORMAL, C3 = ZERO.
FP_CLASSIFY_MASK	equ	0x4000 | 0x0400 | 0x0100
|
|
|
|
|
2007-04-30 21:58:04 +02:00
|
|
|
; extern "C" uint __cdecl ia32_asm_fpclassifyd(double d);
;
; returns the FXAM classification of <d>, i.e. the FPU status word after
; FXAM, masked with FP_CLASSIFY_MASK.
global sym(ia32_asm_fpclassifyd)
sym(ia32_asm_fpclassifyd):
	fld		qword [esp+4]			; ST0 = d
	fxam							; classify ST0
	fnstsw	ax						; AX = status word
	and		eax, FP_CLASSIFY_MASK	; (also clears the stale upper half of EAX)
	fstp	st0						; pop d again
	ret
|
|
|
|
|
2007-04-25 20:19:35 +02:00
|
|
|
; extern "C" uint __cdecl ia32_asm_fpclassifyf(float f);
;
; returns the FXAM classification of <f>, i.e. the FPU status word after
; FXAM, masked with FP_CLASSIFY_MASK.
global sym(ia32_asm_fpclassifyf)
sym(ia32_asm_fpclassifyf):
	fld		dword [esp+4]			; ST0 = f
	fxam							; classify ST0
	fnstsw	ax						; AX = status word
	and		eax, FP_CLASSIFY_MASK	; (also clears the stale upper half of EAX)
	fstp	st0						; pop f again
	ret
|
|
|
|
|
|
|
|
|
2007-04-25 20:19:35 +02:00
|
|
|
; extern "C" float __cdecl ia32_asm_rintf(float)
;
; rounds the argument to an integral value by means of FRNDINT (which
; uses the FPU's current rounding mode); the result is left in ST0.
global sym(ia32_asm_rintf)
sym(ia32_asm_rintf):
	fld		dword [esp+4]			; ST0 = argument
	frndint
	ret
|
|
|
|
|
|
|
|
; extern "C" double __cdecl ia32_asm_rint(double)
;
; rounds the argument to an integral value by means of FRNDINT (which
; uses the FPU's current rounding mode); the result is left in ST0.
global sym(ia32_asm_rint)
sym(ia32_asm_rint):
	fld		qword [esp+4]			; ST0 = argument
	frndint
	ret
|
|
|
|
|
|
|
|
|
|
|
|
; extern "C" float __cdecl ia32_asm_fminf(float, float)
;
; returns the smaller of the two arguments (a, b) in ST0.
; uses FCOMI/FCMOVcc, which were introduced with the P6 family.
; NOTE(review): if the compare is unordered (NaN operand), CF is set and
; the conditional move is skipped, so <b> would be returned - confirm.
global sym(ia32_asm_fminf)
sym(ia32_asm_fminf):
	fld		dword [esp+4]			; a
	fld		dword [esp+8]			; ST0 = b, ST1 = a
	fcomi	st0, st1				; CF := (b < a)
	fcmovnb	st0, st1				; b >= a: ST0 = a, hence ST0 = min
	fstp	st1						; discard the other value, result stays in ST0
	ret
|
|
|
|
|
|
|
|
; extern "C" float __cdecl ia32_asm_fmaxf(float, float)
;
; returns the larger of the two arguments (a, b) in ST0.
; uses FCOMI/FCMOVcc, which were introduced with the P6 family.
; NOTE(review): if the compare is unordered (NaN operand), CF is set and
; the conditional move is taken, so <a> would be returned - confirm.
global sym(ia32_asm_fmaxf)
sym(ia32_asm_fmaxf):
	fld		dword [esp+4]			; a
	fld		dword [esp+8]			; ST0 = b, ST1 = a
	fcomi	st0, st1				; CF := (b < a)
	fcmovb	st0, st1				; b < a: ST0 = a, hence ST0 = max
	fstp	st1						; discard the other value, result stays in ST0
	ret
|
|
|
|
|
|
|
|
|
2007-05-02 14:07:08 +02:00
|
|
|
; extern "C" i32 __cdecl ia32_asm_i32FromFloat(float f)
;
; converts <f> to a 32-bit integer: round_bias is subtracted first, then
; FISTP rounds the difference using the FPU's current rounding mode.
global sym(ia32_asm_i32FromFloat)
sym(ia32_asm_i32FromFloat):
	sub		esp, 4					; scratch slot for the integer result
	fld		dword [esp+4+4]			; f
	fsub	dword [round_bias]
	fistp	dword [esp]
	pop		eax						; return value
	ret
|
|
|
|
|
2007-05-02 14:07:08 +02:00
|
|
|
; extern "C" i32 __cdecl ia32_asm_i32FromDouble(double d)
;
; converts <d> to a 32-bit integer: round_bias is subtracted first, then
; FISTP rounds the difference using the FPU's current rounding mode.
global sym(ia32_asm_i32FromDouble)
sym(ia32_asm_i32FromDouble):
	sub		esp, 4					; scratch slot for the integer result
	fld		qword [esp+4+4]			; d
	fsub	dword [round_bias]
	fistp	dword [esp]
	pop		eax						; return value
	ret
|
|
|
|
|
2007-05-02 14:07:08 +02:00
|
|
|
; extern "C" i64 __cdecl ia32_asm_i64FromDouble(double d)
;
; converts <d> to a 64-bit integer (returned in EDX:EAX): round_bias is
; subtracted first, then FISTP rounds the difference using the FPU's
; current rounding mode.
global sym(ia32_asm_i64FromDouble)
sym(ia32_asm_i64FromDouble):
	sub		esp, 8					; scratch slot for the 64-bit result
	fld		qword [esp+8+4]			; d
	fsub	dword [round_bias]
	fistp	qword [esp]
	pop		eax						; low half
	pop		edx						; high half
	ret
|
|
|
|
|
|
|
|
|
2006-05-05 07:54:00 +02:00
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
; misc
|
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
|
2006-07-26 16:04:52 +02:00
|
|
|
; reads the time stamp counter and returns it as a 64-bit value in EDX:EAX.
;
; rationale: the common return convention for 64-bit values is in edx:eax.
; with inline asm, we'd have to MOV data to a temporary and return that;
; this is less efficient (-> important for low-overhead profiling) than
; making use of the convention.
;
; however, speed is not the main reason for providing this routine.
; xcode complains about CPUID clobbering ebx, so we use external asm
; where possible (IA-32 CPUs).
;
; EBX is preserved; ECX is clobbered by CPUID.
;
; extern "C" u64 ia32_asm_rdtsc_edx_eax()
global sym(ia32_asm_rdtsc_edx_eax)
sym(ia32_asm_rdtsc_edx_eax):
	push	ebx						; callee-saved, but overwritten by CPUID
	; CPUID's results are discarded (it is presumably executed only for its
	; serializing effect). select function 0 explicitly instead of using
	; whatever happens to be in EAX, so that the same function - and thus
	; the same overhead - is incurred on every call.
	xor		eax, eax
	cpuid
	pop		ebx
	rdtsc							; EDX:EAX = time stamp counter
	ret
|
|
|
|
|
|
|
|
|
2007-05-09 23:01:11 +02:00
|
|
|
; extern "C" int ia32_asm_log2_of_pow2(uint n)
;
; returns the base-2 logarithm of <n> if it is a power of two (POT),
; otherwise -1. n = 0 also yields -1.
global sym(ia32_asm_log2_of_pow2)
sym(ia32_asm_log2_of_pow2):
	mov		ecx, [esp+4]			; n
	mov		eax, -1					; result unless n turns out to be a POT
	lea		edx, [ecx-1]			; n-1
	test	ecx, ecx
	jz		.done					; 0 is not a POT
	test	edx, ecx				; n & (n-1) is nonzero if several bits are set
	jnz		.done
	bsf		eax, ecx				; index of the only set bit
.done:
	ret
|
|
|
|
|
|
|
|
|
2005-10-24 02:06:08 +02:00
|
|
|
; write the current execution state (e.g. all register values) into
; (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
; optimized for size. this must be a straight asm routine because
; compiler-generated prolog code inserted before inline asm trashes
; EBP and ESP (unacceptable).
;
; the fields are written front to back via STOSD, which relies on the
; direction flag being clear.
;
; stack layout after PUSHAD + PUSHFD (offsets relative to ESP):
;   +0 eflags  +4 edi   +8 esi  +12 ebp  +16 esp  +20 ebx
;  +24 edx    +28 ecx  +32 eax  +36 return address  +40 pcontext
;
; extern "C" void ia32_asm_GetCurrentContext(void* pcontext)
global sym(ia32_asm_GetCurrentContext)
sym(ia32_asm_GetCurrentContext):
	pushad
	pushfd
	mov		edi, [esp+4+32+4]	; pcontext

	; ContextFlags
	mov		eax, 0x10007		; segs, int, control
	stosd

	; DRx and FloatSave
	; rationale: we can't access the debug registers from Ring3, and
	; the FPU save area is irrelevant, so zero them.
	xor		eax, eax
	push	byte 6+8+20			; number of dwords to clear
	pop		ecx
	rep stosd

	; CONTEXT_SEGMENTS
	; (the upper half of EAX is still zero, so each selector is stored
	; zero-extended)
	mov		ax, gs
	stosd
	mov		ax, fs
	stosd
	mov		ax, es
	stosd
	mov		ax, ds
	stosd

	; CONTEXT_INTEGER
	; EDI, ECX and EAX have been overwritten above and are therefore taken
	; from the PUSHAD frame; the others are still live and are fetched via
	; XCHG (which trashes them, but POPAD restores everything).
	mov		eax, [esp+4+32-32]	; edi
	stosd
	xchg	eax, esi
	stosd
	xchg	eax, ebx
	stosd
	xchg	eax, edx
	stosd
	mov		eax, [esp+4+32-8]	; ecx
	stosd
	mov		eax, [esp+4+32-4]	; eax
	stosd

	; CONTEXT_CONTROL
	xchg	eax, ebp			; ebp restored by POPAD
	stosd
	mov		eax, [esp+4+32]		; return address
	sub		eax, 5				; skip CALL instruction -> call site.
								; (assumes a 5-byte CALL)
	stosd
	xor		eax, eax
	mov		ax, cs
	stosd
	pop		eax					; eflags
	stosd
	; ESP has moved up by 4 due to the POP; the value stored is the address
	; just above the pcontext argument.
	lea		eax, [esp+32+4+4]	; esp
	stosd
	xor		eax, eax
	mov		ax, ss
	stosd

	; ExtendedRegisters
	; EAX still holds the SS selector at this point; clear it so that this
	; area is zero-filled (like DRx/FloatSave above) instead of being
	; filled with copies of SS.
	xor		eax, eax
	xor		ecx, ecx
	mov		cl, 512/4			; number of dwords to clear
	rep stosd

	popad
	ret
|