298 lines
7.8 KiB
NASM
298 lines
7.8 KiB
NASM
; Copyright (c) 2010 Wildfire Games
|
|
;
|
|
; Permission is hereby granted, free of charge, to any person obtaining
|
|
; a copy of this software and associated documentation files (the
|
|
; "Software"), to deal in the Software without restriction, including
|
|
; without limitation the rights to use, copy, modify, merge, publish,
|
|
; distribute, sublicense, and/or sell copies of the Software, and to
|
|
; permit persons to whom the Software is furnished to do so, subject to
|
|
; the following conditions:
|
|
;
|
|
; The above copyright notice and this permission notice shall be included
|
|
; in all copies or substantial portions of the Software.
|
|
;
|
|
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
|
|
; optimized assembly code for IA-32. not provided as
|
|
; inline assembly because that's compiler-specific.
|
|
|
|
%include "ia32.inc"
|
|
|
|
; note: pure asm functions prevent inlining but also avoid redundant
|
|
; store/loads generated by VC inline asm (ugh).
|
|
|
|
|
|
;-------------------------------------------------------------------------------
|
|
; CPUID support
|
|
;-------------------------------------------------------------------------------
|
|
|
|
; extern "C" void __cdecl ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
;
; executes CPUID with EAX and ECX taken from regs (offsets 0 and 8) and
; writes the resulting EAX, EBX, ECX, EDX back to regs at offsets
; 0, 4, 8 and 12.
global sym(ia32_asm_cpuid)
sym(ia32_asm_cpuid):
    push    ebx                     ; CPUID overwrites EBX; restored before returning
    push    edi                     ; holds regs across CPUID (eax..edx are all overwritten)

    mov     edi, [esp+2*4+4]        ; regs (above the two saved registers and the return address)

    mov     eax, [edi]              ; function
    mov     ecx, [edi+8]            ; count
    cpuid

    mov     [edi], eax
    mov     [edi+4], ebx
    mov     [edi+8], ecx
    mov     [edi+12], edx

    pop     edi
    pop     ebx
    ret
|
|
|
|
|
|
;-------------------------------------------------------------------------------
|
|
; lock-free support routines
|
|
;-------------------------------------------------------------------------------
|
|
|
|
; extern "C" void __cdecl ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
;
; adds increment to *location as a single LOCKed read-modify-write.
global sym(ia32_asm_AtomicAdd)
sym(ia32_asm_AtomicAdd):
    mov     eax, [esp+8]            ; increment
    mov     edx, [esp+4]            ; location
    lock add [edx], eax             ; same encoding as an explicit 0xF0 prefix byte + ADD
    ret
|
|
|
|
|
|
; notes:
|
|
; - a 486 or later processor is required since we use CMPXCHG.
|
|
; there's no feature flag we can check, and the ia32 code doesn't
|
|
; bother detecting anything < Pentium, so this'll crash and burn if
|
|
; run on 386. we could fall back to simple MOVs there (since 386 CPUs
|
|
; aren't MP-capable), but it's not worth the trouble.
|
|
; - nor do we bother skipping the LOCK prefix on single-processor systems.
|
|
; the branch may be well-predicted, but difference in performance still
|
|
; isn't expected to be enough to justify the effort.
|
|
; extern "C" bool __cdecl ia32_asm_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
;
; atomically: if *location == expected, store new_value there.
; returns 1 in EAX if the store happened, otherwise 0.
global sym(ia32_asm_CAS)
sym(ia32_asm_CAS):
    mov     eax, [esp+8]            ; expected (CMPXCHG compares against EAX)
    mov     edx, [esp+12]           ; new_value
    mov     ecx, [esp+4]            ; location
    lock cmpxchg [ecx], edx         ; ZF=1 iff *location matched and was replaced
    sete    al
    movzx   eax, al                 ; widen the bool to a full register
    ret
|
|
|
|
|
|
;-------------------------------------------------------------------------------
|
|
; FPU
|
|
;-------------------------------------------------------------------------------
|
|
|
|
[section .data]

; bias for approximating truncate/"chop" conversion (what a C cast does)
; while the FPU rounds to nearest: subtracting it from a value before
; rounding pulls the result toward the next lower integer.
; this relies on RC being the IA-32 default (round-to-nearest) and is
; only an approximation: negative non-integers and values whose
; fraction exceeds the bias do not end up truncated.
round_bias:     dd  0.4999999

__SECT__
|
|
|
|
; extern "C" u32 __cdecl ia32_asm_control87(u32 new_cw, u32 mask);
;
; replaces the FPU control word bits selected by mask with the
; corresponding bits of new_cw; all other bits keep their current value.
; always returns 0.
global sym(ia32_asm_control87)
sym(ia32_asm_control87):
    sub     esp, 4                  ; scratch slot for the control word
    fnstcw  [esp]
    movzx   eax, word [esp]         ; old_cw

    mov     ecx, [esp+4+4]          ; new_val
    mov     edx, [esp+4+8]          ; mask

    ; old_cw ^ ((old_cw ^ new_val) & mask)
    ;   == (old_cw & ~mask) | (new_val & mask)
    xor     ecx, eax
    and     ecx, edx
    xor     eax, ecx                ; = new_cw

    mov     [esp], eax
    fldcw   [esp]                   ; only the low 16 bits are loaded
    add     esp, 4

    xor     eax, eax                ; return value
    ret
|
|
|
|
|
|
; FPU status word condition-code bits that can be set after FXAM and
; that we report: C3 (bit 14, ZERO) | C2 (bit 10, NORMAL) | C0 (bit 8, NAN).
FP_CLASSIFY_MASK    equ (1 << 14) | (1 << 10) | (1 << 8)   ; = 0x4500
|
|
|
|
; extern "C" size_t __cdecl ia32_asm_fpclassifyd(double d);
;
; returns the FXAM classification of d: the FPU status word masked
; with FP_CLASSIFY_MASK.
global sym(ia32_asm_fpclassifyd)
sym(ia32_asm_fpclassifyd):
    fld     qword [esp+4]           ; d
    fxam                            ; classify st0 into the condition codes
    fnstsw  ax                      ; grab them before the pop below
    and     eax, FP_CLASSIFY_MASK   ; also clears the upper half of EAX
    fstp    st0                     ; leave the FPU stack as we found it
    ret
|
|
|
|
; extern "C" size_t __cdecl ia32_asm_fpclassifyf(float f);
;
; returns the FXAM classification of f: the FPU status word masked
; with FP_CLASSIFY_MASK.
global sym(ia32_asm_fpclassifyf)
sym(ia32_asm_fpclassifyf):
    fld     dword [esp+4]           ; f
    fxam                            ; classify st0 into the condition codes
    fnstsw  ax                      ; grab them before the pop below
    and     eax, FP_CLASSIFY_MASK   ; also clears the upper half of EAX
    fstp    st0                     ; leave the FPU stack as we found it
    ret
|
|
|
|
|
|
; extern "C" float __cdecl ia32_asm_rintf(float);
;
; rounds the argument to an integral value using the rounding mode
; currently selected in the FPU control word; result is returned in st0.
global sym(ia32_asm_rintf)
sym(ia32_asm_rintf):
    fld     dword [esp+4]           ; argument
    frndint                         ; st0 = rounded value (= return value)
    ret
|
|
|
|
; extern "C" double __cdecl ia32_asm_rint(double);
;
; rounds the argument to an integral value using the rounding mode
; currently selected in the FPU control word; result is returned in st0.
global sym(ia32_asm_rint)
sym(ia32_asm_rint):
    fld     qword [esp+4]           ; argument
    frndint                         ; st0 = rounded value (= return value)
    ret
|
|
|
|
|
|
; extern "C" float __cdecl ia32_asm_fminf(float, float);
;
; returns the smaller of the two arguments in st0.
global sym(ia32_asm_fminf)
sym(ia32_asm_fminf):
    fld     dword [esp+4]           ; first argument
    fld     dword [esp+8]           ; second argument -> st0 = second, st1 = first
    fcomi   st0, st1                ; CF = (second < first)
    fcmovnb st0, st1                ; second is not below first: take first
    fstp    st1                     ; drop the other operand, result ends up in st0
    ret
|
|
|
|
; extern "C" float __cdecl ia32_asm_fmaxf(float, float);
;
; returns the larger of the two arguments in st0.
global sym(ia32_asm_fmaxf)
sym(ia32_asm_fmaxf):
    fld     dword [esp+4]           ; first argument
    fld     dword [esp+8]           ; second argument -> st0 = second, st1 = first
    fcomi   st0, st1                ; CF = (second < first)
    fcmovb  st0, st1                ; second is below first: take first
    fstp    st1                     ; drop the other operand, result ends up in st0
    ret
|
|
|
|
|
|
; extern "C" i32 __cdecl ia32_asm_i32FromFloat(float f);
;
; converts f to a 32-bit integer, truncating toward zero like the C cast
; (i32)f does.
; rationale: the previous bias-and-round-to-nearest trick did not
; truncate negative non-integers (-0.5 became -1) nor values just below
; an integer (0.99999994 became 1), and it silently depended on the
; caller's rounding mode. we instead switch RC to "chop" for the
; duration of the store and then restore the caller's control word.
global sym(ia32_asm_i32FromFloat)
sym(ia32_asm_i32FromFloat):
    sub     esp, 8                  ; [esp+0] result, [esp+4] saved CW, [esp+6] chop CW

    fnstcw  word [esp+4]            ; remember the caller's control word
    movzx   eax, word [esp+4]
    or      eax, 0x0C00             ; RC field (bits 10..11) = 11b: truncate
    mov     [esp+6], ax

    fld     dword [esp+8+4]         ; f
    fldcw   word [esp+6]
    fistp   dword [esp]             ; store truncated value and pop
    fldcw   word [esp+4]            ; restore the caller's control word

    mov     eax, [esp]              ; return value
    add     esp, 8
    ret
|
|
|
|
; extern "C" i32 __cdecl ia32_asm_i32FromDouble(double d);
;
; converts d to a 32-bit integer, truncating toward zero like the C cast
; (i32)d does.
; rationale: the previous bias-and-round-to-nearest trick did not
; truncate negative non-integers (-0.5 became -1) nor values whose
; fraction exceeds the bias (0.99999995 became 1), and it silently
; depended on the caller's rounding mode. we instead switch RC to "chop"
; for the duration of the store and then restore the caller's control word.
global sym(ia32_asm_i32FromDouble)
sym(ia32_asm_i32FromDouble):
    sub     esp, 8                  ; [esp+0] result, [esp+4] saved CW, [esp+6] chop CW

    fnstcw  word [esp+4]            ; remember the caller's control word
    movzx   eax, word [esp+4]
    or      eax, 0x0C00             ; RC field (bits 10..11) = 11b: truncate
    mov     [esp+6], ax

    fld     qword [esp+8+4]         ; d
    fldcw   word [esp+6]
    fistp   dword [esp]             ; store truncated value and pop
    fldcw   word [esp+4]            ; restore the caller's control word

    mov     eax, [esp]              ; return value
    add     esp, 8
    ret
|
|
|
|
; extern "C" i64 __cdecl ia32_asm_i64FromDouble(double d);
;
; converts d to a 64-bit integer (returned in EDX:EAX), truncating
; toward zero like the C cast (i64)d does.
; rationale: the previous bias-and-round-to-nearest trick did not
; truncate negative non-integers (-0.5 became -1) nor values whose
; fraction exceeds the bias, and it silently depended on the caller's
; rounding mode. we instead switch RC to "chop" for the duration of the
; store and then restore the caller's control word.
global sym(ia32_asm_i64FromDouble)
sym(ia32_asm_i64FromDouble):
    sub     esp, 12                 ; [esp+0] 64-bit result, [esp+8] saved CW, [esp+10] chop CW

    fnstcw  word [esp+8]            ; remember the caller's control word
    movzx   eax, word [esp+8]
    or      eax, 0x0C00             ; RC field (bits 10..11) = 11b: truncate
    mov     [esp+10], ax

    fld     qword [esp+12+4]        ; d
    fldcw   word [esp+10]
    fistp   qword [esp]             ; store truncated value and pop
    fldcw   word [esp+8]            ; restore the caller's control word

    mov     eax, [esp]              ; low half of the result
    mov     edx, [esp+4]            ; high half of the result
    add     esp, 12
    ret
|
|
|
|
|
|
;-------------------------------------------------------------------------------
|
|
; misc
|
|
;-------------------------------------------------------------------------------
|
|
|
|
; write the current execution state (e.g. all register values) into
; (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
; optimized for size; this must be straight asm because naked-function
; attributes are compiler-specific and compiler-generated prolog code
; inserted before inline asm trashes EBP and ESP (unacceptable).
;
; the fields are written front to back with STOSD, so the store order
; below is the field order of the structure. all general registers are
; restored by POPAD before returning.
; extern "C" void ia32_asm_GetCurrentContext(void* pcontext);
global sym(ia32_asm_GetCurrentContext)
sym(ia32_asm_GetCurrentContext):
    pushad
    pushfd
    mov     edi, [esp+4+32+4]       ; pcontext (above EFLAGS, PUSHAD block, return address)

; ContextFlags
    mov     eax, 0x10007            ; segs, int, control
    stosd

; DRx and FloatSave
; rationale: we can't access the debug registers from Ring3, and
; the FPU save area is irrelevant, so zero them.
    xor     eax, eax
    push    byte 6+8+20             ; dword count (PUSH imm8 + POP is shorter than MOV)
    pop     ecx
    rep stosd

; CONTEXT_SEGMENTS
; (upper half of EAX is still zero from above)
    mov     ax, gs
    stosd
    mov     ax, fs
    stosd
    mov     ax, es
    stosd
    mov     ax, ds
    stosd

; CONTEXT_INTEGER
; EDI, ECX and EAX have been overwritten by now, so their entry values
; are fetched from the PUSHAD block; the others are still live.
    mov     eax, [esp+4+32-32]      ; edi
    stosd
    xchg    eax, esi
    stosd
    xchg    eax, ebx
    stosd
    xchg    eax, edx
    stosd
    mov     eax, [esp+4+32-8]       ; ecx
    stosd
    mov     eax, [esp+4+32-4]       ; eax
    stosd

; CONTEXT_CONTROL
    xchg    eax, ebp                ; ebp restored by POPAD
    stosd
    mov     eax, [esp+4+32]         ; return address
    sub     eax, 5                  ; skip CALL instruction -> call site.
    stosd
    xor     eax, eax
    mov     ax, cs
    stosd
    pop     eax                     ; eflags
    stosd
    lea     eax, [esp+32+4+4]       ; esp: just above PUSHAD block, return address and argument
    stosd
    xor     eax, eax
    mov     ax, ss
    stosd

; ExtendedRegisters
; zero-fill. EAX still holds the SS selector at this point, so it must
; be cleared first or that value would be replicated across the area.
    xor     eax, eax
    xor     ecx, ecx
    mov     cl, 512/4
    rep stosd

    popad
    ret
|