2006-04-12 01:59:08 +02:00
|
|
|
; =========================================================================
|
2006-04-20 03:33:57 +02:00
|
|
|
; File : ia32_asm.asm
|
2006-04-12 01:59:08 +02:00
|
|
|
; Project : 0 A.D.
|
|
|
|
; Description : optimized assembly code for IA-32. not provided as
|
|
|
|
; : inline assembly because that's compiler-specific.
|
|
|
|
; =========================================================================
|
|
|
|
|
2007-05-07 18:33:24 +02:00
|
|
|
; license: GPL; see lib/license.txt
|
2006-04-12 01:59:08 +02:00
|
|
|
|
2006-06-25 22:58:03 +02:00
|
|
|
%include "ia32.inc"
|
2005-09-13 06:00:41 +02:00
|
|
|
|
2007-04-25 20:19:35 +02:00
|
|
|
; note: pure asm functions prevent inlining but also avoid redundant
|
|
|
|
; store/loads generated by VC inline asm (ugh).
|
|
|
|
|
|
|
|
|
2005-09-15 02:51:59 +02:00
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
; CPUID support
|
|
|
|
;-------------------------------------------------------------------------------
|
2005-09-13 06:00:41 +02:00
|
|
|
|
2008-05-12 20:15:08 +02:00
|
|
|
; extern "C" void __cdecl ia32_asm_cpuid(x86_x64_CpuidRegs* regs);
;
; run CPUID with inputs taken from *regs and write all four result
; registers back to it.
; in:  [regs+0] = eax (function), [regs+8] = ecx (count)
; out: [regs+0], [regs+4], [regs+8], [regs+12] = eax, ebx, ecx, edx
global sym(ia32_asm_cpuid)
sym(ia32_asm_cpuid):
    push    ebx                     ; overwritten by CPUID; restored below
    push    edi                     ; base pointer must live outside eax..edx

    mov     edi, [esp+8+4]          ; regs (above the two saved registers + return address)
    mov     eax, [edi+0]            ; function
    mov     ecx, [edi+8]            ; count
    cpuid

    ; store the results as four consecutive dwords.
    mov     [edi+0], eax
    mov     [edi+4], ebx
    mov     [edi+8], ecx
    mov     [edi+12], edx

    pop     edi
    pop     ebx
    ret
|
2005-09-13 06:00:41 +02:00
|
|
|
|
2005-09-20 06:05:23 +02:00
|
|
|
|
2005-11-18 06:16:43 +01:00
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
; lock-free support routines
|
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
|
2008-05-01 17:41:42 +02:00
|
|
|
; extern "C" void __cdecl ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
;
; atomically perform *location += increment (LOCK-prefixed ADD to memory).
; in:  [esp+4] = location, [esp+8] = increment
; out: nothing
global sym(ia32_asm_AtomicAdd)
sym(ia32_asm_AtomicAdd):
    mov     ecx, [esp+4]            ; location
    mov     edx, [esp+8]            ; increment
    lock add [ecx], edx             ; same encoding as a 0xF0 prefix byte + ADD
    ret
|
|
|
|
|
|
|
|
|
|
|
|
; notes:
|
|
|
|
; - a 486 or later processor is required since we use CMPXCHG.
|
|
|
|
; there's no feature flag we can check, and the ia32 code doesn't
|
|
|
|
; bother detecting anything < Pentium, so this'll crash and burn if
|
|
|
|
; run on 386. we could fall back to simple MOVs there (since 386 CPUs
|
|
|
|
; aren't MP-capable), but it's not worth the trouble.
|
2007-04-25 20:19:35 +02:00
|
|
|
; - nor do we bother skipping the LOCK prefix on single-processor systems.
|
|
|
|
; the branch may be well-predicted, but difference in performance still
|
|
|
|
; isn't expected to be enough to justify the effort.
|
2008-05-01 17:41:42 +02:00
|
|
|
; extern "C" bool __cdecl ia32_asm_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
;
; atomic compare-and-swap: if *location == expected, store new_value there.
; in:  [esp+4] = location, [esp+8] = expected, [esp+12] = new_value
; out: eax = 1 if the exchange took place, otherwise 0
global sym(ia32_asm_CAS)
sym(ia32_asm_CAS):
    mov     eax, [esp+8]            ; expected (CMPXCHG compares against eax)
    mov     ecx, [esp+12]           ; new_value
    mov     edx, [esp+4]            ; location
    lock cmpxchg [edx], ecx         ; ZF=1 iff [edx] matched eax and was replaced
    setz    al
    movzx   eax, al                 ; widen the flag to a clean 0/1 in eax
    ret
|
|
|
|
|
2007-09-23 17:36:29 +02:00
|
|
|
|
2005-09-20 06:05:23 +02:00
|
|
|
;-------------------------------------------------------------------------------
|
2006-05-05 07:54:00 +02:00
|
|
|
; FPU
|
2005-09-20 06:05:23 +02:00
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
|
2007-04-25 20:19:35 +02:00
|
|
|
[section .data]
|
|
|
|
|
|
|
|
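; to conform with the fallback implementation (a C cast), we need to
; end up with truncate/"chop" rounding. subtracting this bias before a
; round-to-nearest FIST(P) approximates that, assuming RC is the IA-32
; default round-to-nearest mode.
; caveat: this only matches truncation for non-negative inputs; negative
; non-integers end up rounded toward -infinity (e.g. -1.5 -> -2, not -1).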
|
|
|
|
round_bias dd 0.4999999
|
|
|
|
|
|
|
|
__SECT__
|
|
|
|
|
2008-05-12 20:15:08 +02:00
|
|
|
; extern "C" u32 __cdecl ia32_asm_control87(u32 new_cw, u32 mask);
;
; replace the bits of the FPU control word selected by mask with the
; corresponding bits of new_cw:
;   cw = (old_cw & ~mask) | (new_cw & mask)
; in:  [esp+4] = new_cw, [esp+8] = mask
; out: eax = 0 (always)
global sym(ia32_asm_control87)
sym(ia32_asm_control87):
    sub     esp, 4                  ; scratch slot for the 16-bit control word
    fnstcw  [esp]
    movzx   eax, word [esp]         ; old_cw

    mov     edx, [esp+4+8]          ; mask
    mov     ecx, [esp+4+4]          ; new_cw
    and     ecx, edx                ; new_cw & mask
    not     edx                     ; ~mask
    and     eax, edx                ; old_cw & ~mask
    or      eax, ecx                ; merged control word

    mov     [esp], ax               ; FLDCW only reads the low 16 bits
    fldcw   [esp]
    add     esp, 4

    xor     eax, eax                ; return value
    ret
|
2005-10-19 05:06:54 +02:00
|
|
|
|
2005-10-24 02:06:08 +02:00
|
|
|
|
2006-05-05 07:54:00 +02:00
|
|
|
; IA-32 FPU status word condition bits set by FXAM: C0 (NaN) | C2 (normal) | C3 (zero)
|
|
|
|
FP_CLASSIFY_MASK equ 0x4500
|
|
|
|
|
had to remove uint and ulong from lib/types.h due to conflict with other library.
this snowballed into a massive search+destroy of the hodgepodge of
mostly equivalent types we had in use (int, uint, unsigned, unsigned
int, i32, u32, ulong, uintN).
it is more efficient to use 64-bit types in 64-bit mode, so the
preferred default is size_t (for anything remotely resembling a size or
index). tile coordinates are ssize_t to allow more efficient conversion
to/from floating point. flags are int because we almost never need more
than 15 distinct bits, bit test/set is not slower and int is fastest to
type. finally, some data that is pretty much directly passed to OpenGL
is now typed accordingly.
after several hours, the code now requires fewer casts and less
guesswork.
other changes:
- unit and player IDs now have an "invalid id" constant in the
respective class to avoid casting and -1
- fix some endian/64-bit bugs in the map (un)packing. added a
convenience function to write/read a size_t.
- ia32: change CPUID interface to allow passing in ecx (required for
cache topology detection, which I need at work). remove some unneeded
functions from asm, replace with intrinsics where possible.
This was SVN commit r5942.
2008-05-11 20:48:32 +02:00
|
|
|
; extern "C" size_t __cdecl ia32_asm_fpclassifyd(double d);
;
; classify d via FXAM.
; in:  [esp+4] = d (8 bytes)
; out: eax = FPU status word after FXAM, masked with FP_CLASSIFY_MASK
global sym(ia32_asm_fpclassifyd)
sym(ia32_asm_fpclassifyd):
    fld     qword [esp+4]           ; st0 = d
    fxam                            ; examine st0 -> condition bits in the status word
    fnstsw  ax                      ; must be read before the pop below
    fstp    st0                     ; discard d, leaving the FPU stack balanced
    and     eax, FP_CLASSIFY_MASK   ; also clears the stale upper half of eax
    ret
|
|
|
|
|
had to remove uint and ulong from lib/types.h due to conflict with other library.
this snowballed into a massive search+destroy of the hodgepodge of
mostly equivalent types we had in use (int, uint, unsigned, unsigned
int, i32, u32, ulong, uintN).
it is more efficient to use 64-bit types in 64-bit mode, so the
preferred default is size_t (for anything remotely resembling a size or
index). tile coordinates are ssize_t to allow more efficient conversion
to/from floating point. flags are int because we almost never need more
than 15 distinct bits, bit test/set is not slower and int is fastest to
type. finally, some data that is pretty much directly passed to OpenGL
is now typed accordingly.
after several hours, the code now requires fewer casts and less
guesswork.
other changes:
- unit and player IDs now have an "invalid id" constant in the
respective class to avoid casting and -1
- fix some endian/64-bit bugs in the map (un)packing. added a
convenience function to write/read a size_t.
- ia32: change CPUID interface to allow passing in ecx (required for
cache topology detection, which I need at work). remove some unneeded
functions from asm, replace with intrinsics where possible.
This was SVN commit r5942.
2008-05-11 20:48:32 +02:00
|
|
|
; extern "C" size_t __cdecl ia32_asm_fpclassifyf(float f);
;
; classify f via FXAM.
; in:  [esp+4] = f (4 bytes)
; out: eax = FPU status word after FXAM, masked with FP_CLASSIFY_MASK
global sym(ia32_asm_fpclassifyf)
sym(ia32_asm_fpclassifyf):
    fld     dword [esp+4]           ; st0 = f
    fxam                            ; examine st0 -> condition bits in the status word
    fnstsw  ax                      ; must be read before the pop below
    fstp    st0                     ; discard f, leaving the FPU stack balanced
    and     eax, FP_CLASSIFY_MASK   ; also clears the stale upper half of eax
    ret
|
|
|
|
|
|
|
|
|
2008-05-01 17:41:42 +02:00
|
|
|
; extern "C" float __cdecl ia32_asm_rintf(float);
;
; round the argument to an integral value using the FPU's current
; rounding mode (FRNDINT).
; in:  [esp+4] = value (4 bytes)
; out: st0 = rounded value
global sym(ia32_asm_rintf)
sym(ia32_asm_rintf):
    fld     dword [esp+4]           ; st0 = value
    frndint                         ; st0 = value rounded per the control word's RC field
    ret                             ; result is left in st0
|
|
|
|
|
2008-05-01 17:41:42 +02:00
|
|
|
; extern "C" double __cdecl ia32_asm_rint(double);
;
; round the argument to an integral value using the FPU's current
; rounding mode (FRNDINT).
; in:  [esp+4] = value (8 bytes)
; out: st0 = rounded value
global sym(ia32_asm_rint)
sym(ia32_asm_rint):
    fld     qword [esp+4]           ; st0 = value
    frndint                         ; st0 = value rounded per the control word's RC field
    ret                             ; result is left in st0
|
|
|
|
|
|
|
|
|
2008-05-01 17:41:42 +02:00
|
|
|
; extern "C" float __cdecl ia32_asm_fminf(float, float);
;
; return the smaller of the two arguments (branch-free, via FCOMI/FCMOV).
; in:  [esp+4] = a, [esp+8] = b
; out: st0 = a if b >= a, otherwise b (b is also returned if the
;      comparison is unordered, since FCOMI then sets CF)
global sym(ia32_asm_fminf)
sym(ia32_asm_fminf):
    fld     dword [esp+4]           ; st0 = a
    fld     dword [esp+8]           ; st0 = b, st1 = a
    fcomi   st0, st1                ; CF = (b < a) or unordered
    fcmovnb st0, st1                ; b >= a: st0 = a
    fstp    st1                     ; move the result over a and pop -> st0 = result
    ret
|
|
|
|
|
2008-05-01 17:41:42 +02:00
|
|
|
; extern "C" float __cdecl ia32_asm_fmaxf(float, float);
;
; return the larger of the two arguments (branch-free, via FCOMI/FCMOV).
; in:  [esp+4] = a, [esp+8] = b
; out: st0 = a if b < a (or if the comparison is unordered, since FCOMI
;      then sets CF), otherwise b
global sym(ia32_asm_fmaxf)
sym(ia32_asm_fmaxf):
    fld     dword [esp+4]           ; st0 = a
    fld     dword [esp+8]           ; st0 = b, st1 = a
    fcomi   st0, st1                ; CF = (b < a) or unordered
    fcmovb  st0, st1                ; CF set: st0 = a
    fstp    st1                     ; move the result over a and pop -> st0 = result
    ret
|
|
|
|
|
|
|
|
|
2008-05-01 17:41:42 +02:00
|
|
|
; extern "C" i32 __cdecl ia32_asm_i32FromFloat(float f);
;
; convert f to a 32-bit integer with truncation toward zero, i.e. the same
; result as a C cast.
; in:  [esp+4] = f
; out: eax = (i32)f
;
; the rounding-control field of the FPU control word is switched to "chop"
; just for the FISTP and restored afterwards. (subtracting a ~0.5 bias and
; relying on round-to-nearest only matches truncation for non-negative
; inputs: -1.5 would become -2.)
global sym(ia32_asm_i32FromFloat)
sym(ia32_asm_i32FromFloat):
    sub     esp, 8                  ; [esp+0]: caller's CW, [esp+2]: chop CW, [esp+4]: result
    fnstcw  [esp]
    movzx   eax, word [esp]
    or      eax, 0x0C00             ; RC (bits 10..11) = 11b: round toward zero
    mov     [esp+2], ax

    fld     dword [esp+8+4]         ; f
    fldcw   [esp+2]
    fistp   dword [esp+4]
    fldcw   [esp]                   ; restore the caller's rounding mode

    mov     eax, [esp+4]
    add     esp, 8
    ret
|
|
|
|
|
2008-05-01 17:41:42 +02:00
|
|
|
; extern "C" i32 __cdecl ia32_asm_i32FromDouble(double d);
;
; convert d to a 32-bit integer with truncation toward zero, i.e. the same
; result as a C cast.
; in:  [esp+4] = d (8 bytes)
; out: eax = (i32)d
;
; the rounding-control field of the FPU control word is switched to "chop"
; just for the FISTP and restored afterwards. (subtracting a ~0.5 bias and
; relying on round-to-nearest only matches truncation for non-negative
; inputs: -1.5 would become -2.)
global sym(ia32_asm_i32FromDouble)
sym(ia32_asm_i32FromDouble):
    sub     esp, 8                  ; [esp+0]: caller's CW, [esp+2]: chop CW, [esp+4]: result
    fnstcw  [esp]
    movzx   eax, word [esp]
    or      eax, 0x0C00             ; RC (bits 10..11) = 11b: round toward zero
    mov     [esp+2], ax

    fld     qword [esp+8+4]         ; d
    fldcw   [esp+2]
    fistp   dword [esp+4]
    fldcw   [esp]                   ; restore the caller's rounding mode

    mov     eax, [esp+4]
    add     esp, 8
    ret
|
|
|
|
|
2008-05-01 17:41:42 +02:00
|
|
|
; extern "C" i64 __cdecl ia32_asm_i64FromDouble(double d);
;
; convert d to a 64-bit integer with truncation toward zero, i.e. the same
; result as a C cast.
; in:  [esp+4] = d (8 bytes)
; out: edx:eax = (i64)d
;
; the rounding-control field of the FPU control word is switched to "chop"
; just for the FISTP and restored afterwards. (subtracting a ~0.5 bias and
; relying on round-to-nearest only matches truncation for non-negative
; inputs: -1.5 would become -2.)
global sym(ia32_asm_i64FromDouble)
sym(ia32_asm_i64FromDouble):
    sub     esp, 12                 ; [esp+0]: caller's CW, [esp+2]: chop CW, [esp+4..11]: result
    fnstcw  [esp]
    movzx   eax, word [esp]
    or      eax, 0x0C00             ; RC (bits 10..11) = 11b: round toward zero
    mov     [esp+2], ax

    fld     qword [esp+12+4]        ; d
    fldcw   [esp+2]
    fistp   qword [esp+4]
    fldcw   [esp]                   ; restore the caller's rounding mode

    mov     eax, [esp+4]            ; low dword
    mov     edx, [esp+8]            ; high dword
    add     esp, 12
    ret
|
|
|
|
|
|
|
|
|
2006-05-05 07:54:00 +02:00
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
; misc
|
|
|
|
;-------------------------------------------------------------------------------
|
|
|
|
|
2005-10-24 02:06:08 +02:00
|
|
|
; write the current execution state (e.g. all register values) into
; (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
; optimized for size; this must be straight asm: the inline-asm alternatives
; are compiler-specific, and compiler-generated prolog code inserted before
; inline asm trashes EBP and ESP (unacceptable).
;
; in:  [esp+4] = pcontext
; out: *pcontext filled in field order via STOSD; all general-purpose
;      registers are restored by POPAD before returning.
;
; stack layout after PUSHAD + PUSHFD (offsets from esp):
;   +0 eflags | +4 edi | +8 esi | +12 ebp | +16 esp | +20 ebx | +24 edx
;   +28 ecx | +32 eax | +36 return address | +40 pcontext
; extern "C" void ia32_asm_GetCurrentContext(void* pcontext);
global sym(ia32_asm_GetCurrentContext)
sym(ia32_asm_GetCurrentContext):
    pushad
    pushfd
    mov     edi, [esp+4+32+4]       ; pcontext

    ; ContextFlags
    mov     eax, 0x10007            ; segs, int, control
    stosd

    ; DRx and FloatSave
    ; rationale: we can't access the debug registers from Ring3, and
    ; the FPU save area is irrelevant, so zero them.
    xor     eax, eax
    push    byte 6+8+20             ; dword count (size-optimized load of ecx)
    pop     ecx
    rep stosd

    ; CONTEXT_SEGMENTS
    ; (eax is still 0 here, so each 16-bit selector is stored zero-extended)
    mov     ax, gs
    stosd
    mov     ax, fs
    stosd
    mov     ax, es
    stosd
    mov     ax, ds
    stosd

    ; CONTEXT_INTEGER
    ; edi, ecx and eax have been overwritten above, so their entry values
    ; come from the PUSHAD frame; esi, ebx and edx are still live.
    mov     eax, [esp+4+32-32]      ; edi
    stosd
    xchg    eax, esi                ; (clobbers esi; restored by POPAD)
    stosd
    xchg    eax, ebx                ; (clobbers ebx; restored by POPAD)
    stosd
    xchg    eax, edx                ; (clobbers edx; restored by POPAD)
    stosd
    mov     eax, [esp+4+32-8]       ; ecx
    stosd
    mov     eax, [esp+4+32-4]       ; eax
    stosd

    ; CONTEXT_CONTROL
    xchg    eax, ebp                ; ebp restored by POPAD
    stosd
    mov     eax, [esp+4+32]         ; return address
    sub     eax, 5                  ; skip CALL instruction -> call site.
                                    ; (assumes the 5-byte near relative CALL encoding)
    stosd
    xor     eax, eax
    mov     ax, cs
    stosd
    pop     eax                     ; eflags (esp-relative offsets shrink by 4 from here on)
    stosd
    lea     eax, [esp+32+4+4]       ; esp (address just above the pcontext argument)
    stosd
    xor     eax, eax
    mov     ax, ss
    stosd

    ; ExtendedRegisters
    ; zero the area; without clearing eax first, it would be filled with
    ; the SS selector value left over from the store above.
    xor     eax, eax
    xor     ecx, ecx
    mov     cl, 512/4               ; dword count (128 does not fit a sign-extended PUSH imm8)
    rep stosd

    popad
    ret
|