; 0ad/source/lib/sysdep/ia32/ia32_asm.asm

; =========================================================================
; File        : ia32_asm.asm
; Project     : 0 A.D.
; Description : optimized assembly code for IA-32. not provided as
;             : inline assembly because that's compiler-specific.
;
; @author Jan.Wassenberg@stud.uni-karlsruhe.de
; =========================================================================

; Copyright (c) 2004-2005 Jan Wassenberg
;
; Redistribution and/or modification are also permitted under the
; terms of the GNU General Public License as published by the
; Free Software Foundation (version 2 or later, at your option).
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

%include "ia32.inc"

; note: pure asm functions prevent inlining but also avoid redundant
; store/loads generated by VC inline asm (ugh).


;-------------------------------------------------------------------------------
; CPUID support
;-------------------------------------------------------------------------------

[section .data]

; limits used by ia32_asm_cpuid to validate the requested function number:
; max_func applies to standard functions, max_ext_func to extended ones.
; these are actually max_func+1, i.e. the first invalid value.
; the idea here is to avoid a separate cpuid_available flag: both start
; out as 0 and are only written by ia32_asm_cpuid_init, so until that has
; found CPUID support, the "func >= limit" check rejects every request.
; using signed values doesn't work because ext_funcs are >= 0x80000000.
max_func		dd	0
max_ext_func	dd	0

__SECT__


; extern "C" void __cdecl ia32_asm_cpuid_init()
;
; detects whether the CPUID instruction is available and, if so, records
; the highest supported standard and extended function numbers (each +1)
; in max_func / max_ext_func. if CPUID is unavailable, both variables
; are left untouched.
global sym(ia32_asm_cpuid_init)
sym(ia32_asm_cpuid_init):
	push	ebx							; callee-saved, but overwritten by CPUID

	; CPUID is supported if software is able to set EFLAGS.ID (bit 21):
	; set it in a stack copy of EFLAGS, load that and read EFLAGS back.
	pushfd
	or		dword [esp], 0x00200000		; set bit 21 in the copy
	popfd
	pushfd
	pop		eax							; eax = EFLAGS as accepted by the CPU
	xor		edx, edx					; (not read again before CPUID overwrites it)
	test	eax, 0x00200000				; did bit 21 stick?
	jz		.done						;   no - CPUID unavailable

	; function 0 reports the highest standard function in eax
	xor		eax, eax
	cpuid
	inc		eax							; store first invalid value (see max_func decl)
	mov		[max_func], eax

	; function 0x80000000 reports the highest extended function in eax
	mov		eax, 0x80000000
	cpuid
	inc		eax							; store first invalid value (see max_func decl)
	mov		[max_ext_func], eax

.done:
	pop		ebx
	ret


; extern "C" bool __cdecl ia32_asm_cpuid(u32 func, u32* regs)
;
; executes CPUID function <func> (with ECX = 0) and stores the resulting
; eax, ebx, ecx, edx - in that order - into regs[0..3].
; returns 1 on success; returns 0 without touching <regs> if <func> is
; not below the applicable limit recorded by ia32_asm_cpuid_init
; (max_func, or max_ext_func for functions with the top bit set).
global sym(ia32_asm_cpuid)
sym(ia32_asm_cpuid):
	push	ebx							; callee-saved, overwritten by CPUID
	push	edi							; callee-saved, used as output pointer

	; arguments sit above the 2 saved registers and the return address
	mov		edx, [esp+8+4+0]			; func
	mov		edi, [esp+8+4+4]			; -> regs

	; pick the limit that applies to this function number
	mov		ebx, [max_func]
	test	edx, edx
	jns		.limit_chosen				; top bit clear: standard function
	mov		ebx, [max_ext_func]
.limit_chosen:

	; fail if the request is at or beyond the first invalid value
	xor		eax, eax					; return value on failure
	cmp		edx, ebx					; unsigned compare (see max_func decl)
	jae		.done

	; issue CPUID and store result registers in array
	mov		eax, edx
	xor		ecx, ecx					; CPUID.4 requires ECX = 0..2
	cpuid
	mov		[edi+0], eax
	mov		[edi+4], ebx
	mov		[edi+8], ecx
	mov		[edi+12], edx

	; success
	mov		eax, 1
.done:
	pop		edi
	pop		ebx
	ret


;-------------------------------------------------------------------------------
; lock-free support routines
;-------------------------------------------------------------------------------

; extern "C" void __cdecl ia32_asm_atomic_add(intptr_t* location, intptr_t increment);
;
; atomically performs *location += increment.
global sym(ia32_asm_atomic_add)
sym(ia32_asm_atomic_add):
	mov		eax, [esp+8]				; increment
	mov		edx, [esp+4]				; location
	lock add	[edx], eax				; LOCK makes the read-modify-write atomic
	ret


; atomic compare-and-swap: if *location == expected, store new_value there.
; returns 1 if the store happened, otherwise 0.
;
; notes:
; - this is called via CAS macro, which silently casts its inputs for
;   convenience. mixing up the <expected> and <location> parameters would
;   go unnoticed; we therefore perform a basic sanity check on <location>
;   (it must be >= 0x10000) and raise an exception if it is invalid.
; - a 486 or later processor is required since we use CMPXCHG.
;   there's no feature flag we can check, and the ia32 code doesn't
;   bother detecting anything < Pentium, so this'll crash and burn if
;   run on 386. we could fall back to simple MOVs there (since 386 CPUs
;   aren't MP-capable), but it's not worth the trouble.
; - nor do we bother skipping the LOCK prefix on single-processor systems.
;   the branch may be well-predicted, but difference in performance still
;   isn't expected to be enough to justify the effort.
; extern "C" bool __cdecl ia32_asm_CAS(uintptr_t* location, uintptr_t expected, uintptr_t new_value);
global sym(ia32_asm_CAS)
sym(ia32_asm_CAS):
	mov		eax, [esp+8]				; expected (CMPXCHG compares against eax)
	mov		edx, [esp+4]				; location
	cmp		edx, 0x10000				; plausible pointer?
	jb		.bad_location				;   no - trap
	mov		ecx, [esp+12]				; new_value
	lock cmpxchg	[edx], ecx			; if [edx] == eax: [edx] = ecx and ZF is set
	setz	al
	movzx	eax, al						; widen the flag to the full return register
	ret

; NOTE: nasm 0.98.39 doesn't support generating debug info for win32
; output format. that means this code may be misattributed to other
; functions, which makes tracking it down very difficult.
; we therefore raise an "Invalid Opcode" exception, which is rather distinct.
.bad_location:
	ud2


;-------------------------------------------------------------------------------
; FPU
;-------------------------------------------------------------------------------

[section .data]

; bias subtracted from the value before FISTP in the
; ia32_asm_i32_from_* / ia32_asm_i64_from_* routines.
; to conform with the fallback implementation (a C cast), we need to
; end up with truncate/"chop" rounding. subtracting does the trick,
; assuming RC is the IA-32 default round-to-nearest mode.
; NOTE(review): this only matches a C cast for non-negative inputs.
; a negative non-integral value ends up rounded toward minus infinity
; (e.g. -0.3 - 0.4999999 rounds to -1, whereas a cast yields 0) -
; confirm that callers never rely on truncation of negative values.
round_bias		dd 0.4999999

__SECT__

; extern "C" uint __cdecl ia32_asm_control87(uint new_cw, uint mask);
;
; replaces those bits of the FPU control word that are selected by <mask>
; with the corresponding bits of <new_cw>; all other bits keep their
; current value. always returns 0.
global sym(ia32_asm_control87)
sym(ia32_asm_control87):
	; read the current control word via a temporary stack slot.
	; FNSTCW only writes 16 bits; the upper half of eax is leftover
	; data, which is harmless because FLDCW only reads 16 bits as well.
	push	eax
	fnstcw	[esp]
	pop		eax							; old_cw

	; merged = old_cw ^ ((old_cw ^ new_cw) & mask)
	; (same result as (old_cw & ~mask) | (new_cw & mask))
	mov		ecx, [esp+4]				; new_cw
	xor		ecx, eax					; bits in which old and new differ
	and		ecx, [esp+8]				; .. restricted to those in mask
	xor		eax, ecx					; flip exactly those bits in old_cw

	; load the merged control word, again via the stack
	push	eax
	fldcw	[esp]
	pop		eax

	xor		eax, eax					; return value
	ret


; FPU status word condition-code bits in which FXAM reports the class of
; its operand: C3 (bit 14) = ZERO, C2 (bit 10) = NORMAL, C0 (bit 8) = NAN.
FP_CLASSIFY_MASK	equ (1 << 14) | (1 << 10) | (1 << 8)

; extern "C" uint __cdecl ia32_asm_fpclassify(double d);
;
; returns the FXAM classification of <d>: the FPU status word masked
; with FP_CLASSIFY_MASK.
global sym(ia32_asm_fpclassify)
sym(ia32_asm_fpclassify):
	fld		qword [esp+4]				; st0 = d
	fxam								; classify st0 into the condition codes
	fnstsw	ax							; ax = FPU status word
	and		eax, FP_CLASSIFY_MASK		; keep class bits; clears stale upper half of eax
	fstp	st0							; pop d again, FPU stack is balanced
	ret

; extern "C" uint __cdecl ia32_asm_fpclassifyf(float f);
;
; single-precision variant: returns the FXAM classification of <f>,
; i.e. the FPU status word masked with FP_CLASSIFY_MASK.
global sym(ia32_asm_fpclassifyf)
sym(ia32_asm_fpclassifyf):
	fld		dword [esp+4]				; st0 = f
	fxam								; classify st0 into the condition codes
	fnstsw	ax							; ax = FPU status word
	and		eax, FP_CLASSIFY_MASK		; keep class bits; clears stale upper half of eax
	fstp	st0							; pop f again, FPU stack is balanced
	ret


; extern "C" float __cdecl ia32_asm_rintf(float)
;
; rounds the argument to an integral value using FRNDINT, i.e. according
; to the rounding mode currently selected in the FPU control word.
; the result is left in st0.
global sym(ia32_asm_rintf)
sym(ia32_asm_rintf):
	fld		dword [esp+4]
	frndint
	ret

; extern "C" double __cdecl ia32_asm_rint(double)
;
; rounds the argument to an integral value using FRNDINT, i.e. according
; to the rounding mode currently selected in the FPU control word.
; the result is left in st0.
global sym(ia32_asm_rint)
sym(ia32_asm_rint):
	fld		qword [esp+4]
	frndint
	ret


; extern "C" float __cdecl ia32_asm_fminf(float, float)
;
; returns the smaller of the two arguments in st0. if they compare equal,
; the first argument is returned; if the comparison is unordered (NaN
; involved), the second one is.
; uses FCOMI/FCMOVcc, which require a P6-class or later processor.
global sym(ia32_asm_fminf)
sym(ia32_asm_fminf):
	fld		dword [esp+4]				; st0 = a
	fld		dword [esp+8]				; st0 = b, st1 = a
	fcomi	st0, st1					; CF set if b < a (or unordered)
	fcmovnb	st0, st1					; CF clear (b >= a): st0 = a
	fstp	st1							; drop st1 while keeping st0's value as result
	ret

; extern "C" float __cdecl ia32_asm_fmaxf(float, float)
;
; returns the larger of the two arguments in st0. if they compare equal,
; the second argument is returned; if the comparison is unordered (NaN
; involved), the first one is.
; uses FCOMI/FCMOVcc, which require a P6-class or later processor.
global sym(ia32_asm_fmaxf)
sym(ia32_asm_fmaxf):
	fld		dword [esp+4]				; st0 = a
	fld		dword [esp+8]				; st0 = b, st1 = a
	fcomi	st0, st1					; CF set if b < a (or unordered)
	fcmovb	st0, st1					; CF set: st0 = a
	fstp	st1							; drop st1 while keeping st0's value as result
	ret


; extern "C" i32 __cdecl ia32_asm_i32_from_float(float f)
;
; converts <f> to a 32-bit integer: round_bias is subtracted and the
; difference is stored with FISTP, which rounds according to the
; current FPU rounding mode (see round_bias decl).
global sym(ia32_asm_i32_from_float)
sym(ia32_asm_i32_from_float):
	sub		esp, 4						; scratch slot for the integer result
	fld		dword [esp+4+4]				; f (above scratch slot and return address)
	fsub	dword [round_bias]
	fistp	dword [esp]					; convert and pop the FPU stack
	pop		eax							; fetch result, release scratch slot
	ret

; extern "C" i32 __cdecl ia32_asm_i32_from_double(double d)
;
; converts <d> to a 32-bit integer: round_bias is subtracted and the
; difference is stored with FISTP, which rounds according to the
; current FPU rounding mode (see round_bias decl).
global sym(ia32_asm_i32_from_double)
sym(ia32_asm_i32_from_double):
	sub		esp, 4						; scratch slot for the integer result
	fld		qword [esp+4+4]				; d (above scratch slot and return address)
	fsub	dword [round_bias]
	fistp	dword [esp]					; convert and pop the FPU stack
	pop		eax							; fetch result, release scratch slot
	ret

; extern "C" i64 __cdecl ia32_asm_i64_from_double(double d)
;
; converts <d> to a 64-bit integer returned in edx:eax: round_bias is
; subtracted and the difference is stored with FISTP, which rounds
; according to the current FPU rounding mode (see round_bias decl).
global sym(ia32_asm_i64_from_double)
sym(ia32_asm_i64_from_double):
	sub		esp, 8						; scratch space for the 64-bit result
	fld		qword [esp+8+4]				; d (above scratch space and return address)
	fsub	dword [round_bias]
	fistp	qword [esp]					; convert and pop the FPU stack
	pop		eax							; low dword
	pop		edx							; high dword
	ret


;-------------------------------------------------------------------------------
; misc
;-------------------------------------------------------------------------------

; returns the current time stamp counter value as a 64-bit integer.
;
; rationale: the common return convention for 64-bit values is in edx:eax.
; with inline asm, we'd have to MOV data to a temporary and return that;
; this is less efficient (-> important for low-overhead profiling) than
; making use of the convention.
;
; however, speed is not the main reason for providing this routine.
; xcode complains about CPUID clobbering ebx, so we use external asm
; where possible (IA-32 CPUs).
;
; NOTE(review): CPUID is executed with whatever values the caller left in
; eax/ecx, i.e. an arbitrary function is queried and its results are
; discarded. presumably it is only there to serialize execution before
; RDTSC - confirm.
;
; extern "C" u64 ia32_asm_rdtsc_edx_eax()
global sym(ia32_asm_rdtsc_edx_eax)
sym(ia32_asm_rdtsc_edx_eax):
	push	ebx							; callee-saved, but overwritten by CPUID
	cpuid								; results are not used
	pop		ebx
	rdtsc								; edx:eax = time stamp counter
	ret


; write the current execution state (e.g. all register values) into
; (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
; optimized for size; this must be straight asm because naked-function
; attributes (e.g. __declspec(naked)) are compiler-specific and
; compiler-generated prolog code inserted before inline asm trashes
; EBP and ESP (unacceptable).
;
; notes:
; - the structure is filled front to back with STOSD, so the order of the
;   stores below must match the field order of CONTEXT, and the direction
;   flag must be clear on entry.
; - general-purpose registers are taken from the PUSHAD frame or from the
;   live registers; all of them are restored by POPAD before returning.
; - the recorded Eip is the return address minus 5, which assumes the
;   caller used a 5-byte CALL instruction.
; - the recorded Esp is the value of esp before the caller pushed
;   <pcontext> and the return address.
; - DRx, FloatSave and ExtendedRegisters are zeroed.
;
; extern "C" void ia32_asm_get_current_context(void* pcontext)
global sym(ia32_asm_get_current_context)
sym(ia32_asm_get_current_context):
	pushad
	pushfd
	; stack layout from here on:
	;   [esp]            eflags
	;   [esp+4 .. +36)   PUSHAD frame: edi, esi, ebp, esp, ebx, edx, ecx, eax
	;   [esp+4+32]       return address
	;   [esp+4+32+4]     pcontext
	mov		edi, [esp+4+32+4]	; pcontext

	; ContextFlags
	mov		eax, 0x10007		; segs, int, control
	stosd

	; DRx and FloatSave
	; rationale: we can't access the debug registers from Ring3, and
	; the FPU save area is irrelevant, so zero them.
	xor		eax, eax
	push	byte 6+8+20			; number of dwords to clear
	pop		ecx
rep	stosd

	; CONTEXT_SEGMENTS
	; (the upper half of eax is still 0 here, so each 16-bit selector is
	; stored zero-extended to a dword)
	mov		ax, gs
	stosd
	mov		ax, fs
	stosd
	mov		ax, es
	stosd
	mov		ax, ds
	stosd

	; CONTEXT_INTEGER
	; edi, ecx and eax have been overwritten above, so their original
	; values come from the PUSHAD frame; the others are still live and
	; are swapped into eax (POPAD restores them afterwards).
	mov		eax, [esp+4+32-32]	; edi
	stosd
	xchg	eax, esi
	stosd
	xchg	eax, ebx
	stosd
	xchg	eax, edx
	stosd
	mov		eax, [esp+4+32-8]	; ecx
	stosd
	mov		eax, [esp+4+32-4]	; eax
	stosd

	; CONTEXT_CONTROL
	xchg	eax, ebp			; ebp restored by POPAD
	stosd
	mov		eax, [esp+4+32]		; return address
	sub		eax, 5				; skip CALL instruction -> call site.
	stosd
	xor		eax, eax
	mov		ax, cs
	stosd
	pop		eax					; eflags (esp now points at the PUSHAD frame)
	stosd
	lea		eax, [esp+32+4+4]	; esp
	stosd
	xor		eax, eax
	mov		ax, ss
	stosd

	; ExtendedRegisters
	; not covered by the ContextFlags set above. zero the area like
	; DRx/FloatSave; eax must be cleared first because it still holds
	; the SS selector, which would otherwise be replicated 128 times.
	xor		eax, eax
	xor		ecx, ecx
	mov		cl, 512/4
rep	stosd

	popad
	ret