; set 32-bit attribute once for all sections and activate .text
section .data use32
section .bss use32
section .text use32

; Usage:
; use sym(ia32_cap) instead of _ia32_cap - on relevant platforms, sym() will add
; the underscores automagically, on others it won't
%ifdef DONT_USE_UNDERLINE
%define sym(a) a
%else
%define sym(a) _ %+ a
%endif

;-------------------------------------------------------------------------------
; fast general memcpy
;-------------------------------------------------------------------------------

; optimized for Athlon XP: 7.3% faster (cumulative) than VC7.1's memcpy over
; all 1..64 byte transfer lengths and misalignments. approaches maximum
; mem bandwidth (2000 MiB/s) for transfers >= 192KiB!
; Pentium III performance: about 3% faster in above small buffer benchmark.
;
; disables specialized large transfer (> 64KiB) implementations if SSE
; isn't available; we do assume MMX support, though (quite safe).

; *requires* (and does not verify the presence of) SSE instructions:
; prefetchnta and movntq. therefore, a P3+ or Athlon XP is required.
; rationale: older processors are too slow anyway and we don't bother.

; if memcpy size is greater than this,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD    equ 64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD    equ 192*1024

; maximum that can be copied by IC_MOVSD.
; if you change this, be sure to expand the movs* table(s)!
IC_SIZE         equ 67

; size of one block prefetch chunk.
; if you change this, make sure "push byte BP_SIZE/128" doesn't overflow!
BP_SIZE         equ 8*1024

; > ecx = size (<= IC_SIZE)
; x eax, ecx
;
; determined to be fastest approach by testing. a movsd table followed by
; rep movsb is a bit smaller but 6.9% slower; everything else is much worse.
%macro IC_MOVSD 0
    mov     eax, ecx
    shr     ecx, 2              ; dword count
    neg     ecx
    add     ecx, %%movsd_table_end
    jmp     ecx
align 8
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
%%movsd_table_end:
    and     eax, 3
    neg     eax
    add     eax, %%movsb_table_end
    jmp     eax
    movsb
    movsb
    movsb
%%movsb_table_end:
%endm
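
; the computed jumps above are essentially an unrolled switch: we jump backwards
; into a table of 1-byte movsd/movsb instructions so that exactly the required
; number of them execute, with no loop overhead. a minimal C sketch of the same
; idea (small_copy and its parameters are invented for illustration; the real
; macro operates on esi/edi/ecx directly):
;
;   #include <stdint.h>
;   #include <stddef.h>
;   #include <string.h>
;
;   /* copy nbytes (<= 67) as whole dwords, then the 0..3 trailing bytes;
;    * the switch fallthrough mirrors jumping into the movsb table. */
;   static void small_copy(uint8_t* dst, const uint8_t* src, size_t nbytes)
;   {
;       size_t dwords = nbytes / 4;
;       while(dwords--)
;       {
;           memcpy(dst, src, 4);    /* one "movsd" */
;           dst += 4; src += 4;
;       }
;       switch(nbytes & 3)          /* deliberate fallthrough */
;       {
;       case 3: *dst++ = *src++;
;       case 2: *dst++ = *src++;
;       case 1: *dst++ = *src++;
;       }
;   }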
; align destination address to multiple of 8.
; not done for small transfers because it doesn't help IC_MOVSD.
%macro IC_ALIGN 0
    mov     eax, 8
    sub     eax, edi
    and     eax, byte 7         ; eax = # misaligned bytes
    sub     ecx, eax            ; reduce copy count
    neg     eax
    add     eax, %%align_table_end
    jmp     eax
align 4
    movsb
    movsb
    movsb
    movsb
    movsb
    movsb
    movsb
    movsb
%%align_table_end:
%endm

; > ecx = size
; x edx
%macro IC_MOVQ 0
align 16
    mov     edx, 64
%%loop:
    cmp     ecx, edx
    jb      %%done
    prefetchnta [esi + (200*64/34+192)]
    movq    mm0, [esi+0]
    movq    mm1, [esi+8]
    movq    [edi+0], mm0
    movq    [edi+8], mm1
    movq    mm2, [esi+16]
    movq    mm3, [esi+24]
    movq    [edi+16], mm2
    movq    [edi+24], mm3
    movq    mm0, [esi+32]
    movq    mm1, [esi+40]
    movq    [edi+32], mm0
    movq    [edi+40], mm1
    movq    mm2, [esi+48]
    movq    mm3, [esi+56]
    movq    [edi+48], mm2
    movq    [edi+56], mm3
    add     esi, edx
    add     edi, edx
    sub     ecx, edx
    jmp     %%loop
%%done:
%endm

; > ecx = size (> 64)
; x
%macro UC_MOVNTQ 0
    mov     edx, 64
align 16
%%1:
    prefetchnta [esi + (200*64/34+192)]
    movq    mm0, [esi+0]
    add     edi, edx
    movq    mm1, [esi+8]
    add     esi, edx
    movq    mm2, [esi-48]
    movntq  [edi-64], mm0
    movq    mm0, [esi-40]
    movntq  [edi-56], mm1
    movq    mm1, [esi-32]
    movntq  [edi-48], mm2
    movq    mm2, [esi-24]
    movntq  [edi-40], mm0
    movq    mm0, [esi-16]
    movntq  [edi-32], mm1
    movq    mm1, [esi-8]
    movntq  [edi-24], mm2
    movntq  [edi-16], mm0
    sub     ecx, edx
    movntq  [edi-8], mm1
    cmp     ecx, edx
    jae     %%1
%endm

; > ecx = size (> 8KiB)
; x eax, edx
;
; somewhat optimized for size (futile attempt to avoid near jump)
%macro UC_BP_MOVNTQ 0
%%prefetch_and_copy_chunk:
    ; touch each cache line within chunk in reverse order (prevents HW prefetch)
    push    byte BP_SIZE/128    ; # iterations
    pop     eax
    add     esi, BP_SIZE
align 8
%%prefetch_chunk:
    mov     edx, [esi-64]
    mov     edx, [esi-128]
    sub     esi, 128
    dec     eax
    jnz     %%prefetch_chunk

    ; copy 64 byte blocks
    mov     eax, BP_SIZE/64     ; # iterations (> signed 8 bit)
    push    byte 64
    pop     edx
align 8
%%copy_block:
    movq    mm0, [esi+ 0]
    movq    mm1, [esi+ 8]
    movq    mm2, [esi+16]
    movq    mm3, [esi+24]
    movq    mm4, [esi+32]
    movq    mm5, [esi+40]
    movq    mm6, [esi+48]
    movq    mm7, [esi+56]
    add     esi, edx
    movntq  [edi+ 0], mm0
    movntq  [edi+ 8], mm1
    movntq  [edi+16], mm2
    movntq  [edi+24], mm3
    movntq  [edi+32], mm4
    movntq  [edi+40], mm5
    movntq  [edi+48], mm6
    movntq  [edi+56], mm7
    add     edi, edx
    dec     eax
    jnz     %%copy_block

    sub     ecx, BP_SIZE
    cmp     ecx, BP_SIZE
    jae     %%prefetch_and_copy_chunk
%endm

[section .bss]

; this is somewhat "clever". the 2 specialized transfer implementations
; that use SSE are jumped to if transfer size is greater than a threshold.
; we simply set the requested transfer size to 0 if the CPU doesn't
; support SSE so that those are never reached (done by masking with this).
sse_mask    resd 1

__SECT__

; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
global sym(ia32_memcpy)
sym(ia32_memcpy):
    push    edi
    push    esi
    mov     edi, [esp+8+4+0]    ; dst
    mov     esi, [esp+8+4+4]    ; src
    mov     ecx, [esp+8+4+8]    ; nbytes

    cmp     ecx, byte IC_SIZE
    ja      .choose_larger_method

.ic_movsd:
    IC_MOVSD
    pop     esi
    pop     edi
    ret

.choose_larger_method:
    IC_ALIGN

    mov     eax, [sse_mask]
    mov     edx, ecx
    and     edx, eax            ; edx = (SSE)? remaining_bytes : 0
    cmp     edx, BP_THRESHOLD
    jae     near .uc_bp_movntq
    cmp     edx, UC_THRESHOLD
    jae     .uc_movntq

.ic_movq:
    IC_MOVQ
    emms
    jmp     .ic_movsd

.uc_movntq:
    UC_MOVNTQ
    sfence
    emms
    jmp     .ic_movsd

.uc_bp_movntq:
    UC_BP_MOVNTQ
    sfence
    jmp     .ic_movq
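
; for reference, the size-based dispatch performed above can be modelled in C
; roughly as follows. the threshold values are the ones defined above;
; choose_strategy, its return codes and the local sse_mask copy are invented
; purely for illustration, and the masking line is the sse_mask trick from the
; .bss comment (non-SSE CPUs see a "size" of 0 and stay on the MMX path):
;
;   #include <stddef.h>
;   #include <stdint.h>
;
;   enum { IC_SIZE = 67, UC_THRESHOLD = 64*1024, BP_THRESHOLD = 192*1024 };
;
;   /* ~0 if SSE is available, else 0 (the asm keeps this in .bss,
;    * filled in by ia32_init) */
;   static uint32_t sse_mask;
;
;   static int choose_strategy(size_t nbytes)
;   {
;       if(nbytes <= IC_SIZE)
;           return 0;               /* movsd/movsb jump tables (IC_MOVSD)     */
;       size_t gated = nbytes & sse_mask;
;       if(gated >= BP_THRESHOLD)
;           return 3;               /* block prefetch + movntq (UC_BP_MOVNTQ) */
;       if(gated >= UC_THRESHOLD)
;           return 2;               /* streaming movntq (UC_MOVNTQ)           */
;       return 1;                   /* cacheable MMX movq loop (IC_MOVQ)      */
;   }
;
; note the streaming paths only copy whole 64-byte (or 8KiB) chunks and then
; jump back into the smaller routines to finish the tail, which is why the
; .uc_* labels above end with jumps to .ic_movq / .ic_movsd.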
;-------------------------------------------------------------------------------
; CPUID support
;-------------------------------------------------------------------------------

[section .data]

; these are actually max_func+1, i.e. the first invalid value.
; the idea here is to avoid a separate cpuid_available flag;
; using signed values doesn't work because ext_funcs are >= 0x80000000.
max_func        dd 0
max_ext_func    dd 0

__SECT__

; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs)
global sym(ia32_cpuid)
sym(ia32_cpuid):
    push    ebx
    push    edi

    mov     ecx, [esp+8+4+0]    ; func
    mov     edi, [esp+8+4+4]    ; -> regs

    ; compare against max supported func and fail if above
    xor     eax, eax            ; return value on failure
    test    ecx, ecx
    mov     edx, [max_ext_func]
    js      .is_ext_func
    mov     edx, [max_func]
.is_ext_func:
    cmp     ecx, edx
    jae     .ret                ; (see max_func decl)

    ; issue CPUID and store result registers in array
    mov     eax, ecx
    cpuid
    stosd
    xchg    eax, ebx
    stosd
    xchg    eax, ecx
    stosd
    xchg    eax, edx
    stosd

    ; success
    xor     eax, eax
    inc     eax
.ret:
    pop     edi
    pop     ebx
    ret

;-------------------------------------------------------------------------------
; misc
;-------------------------------------------------------------------------------

; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
global sym(ia32_control87)
sym(ia32_control87):
    push    eax
    fnstcw  [esp]
    pop     eax                 ; old_cw
    mov     ecx, [esp+4]        ; new_val
    mov     edx, [esp+8]        ; mask
    and     ecx, edx            ; new_val & mask
    not     edx                 ; ~mask
    and     eax, edx            ; old_cw & ~mask
    or      eax, ecx            ; (old_cw & ~mask) | (new_val & mask)
    push    eax                 ; = new_cw
    fldcw   [esp]
    pop     eax
    xor     eax, eax            ; return value
    ret

;-------------------------------------------------------------------------------
; init
;-------------------------------------------------------------------------------

; extern "C" bool __cdecl ia32_init()
global sym(ia32_init)
sym(ia32_init):
    push    ebx

    ; check if CPUID is supported
    pushfd
    or      byte [esp+2], 32
    popfd
    pushfd
    pop     eax
    xor     edx, edx
    shr     eax, 22             ; bit 21 toggled?
    jnc     .no_cpuid

    ; determine max supported CPUID function
    xor     eax, eax
    cpuid
    inc     eax                 ; (see max_func decl)
    mov     [max_func], eax
    mov     eax, 0x80000000
    cpuid
    inc     eax                 ; (see max_func decl)
    mov     [max_ext_func], eax
.no_cpuid:

    ; check if SSE is supported (used by memcpy code)
extern sym(ia32_cap)
    push    byte 32+25          ; ia32.h's SSE cap (won't change)
    call    sym(ia32_cap)
    pop     edx                 ; remove stack param
    neg     eax                 ; SSE? ~0 : 0
    mov     [sse_mask], eax

    pop     ebx
    ret

;-------------------------------------------------------------------------------
; Color conversion (SSE)
;-------------------------------------------------------------------------------

; extern "C" u32 ConvertRGBColorTo4ub(const RGBColor& color)
[section .data]
align 16
zero:           dd 0.0
twofivefive:    dd 255.0
__SECT__

align 16
global sym(sse_ConvertRGBColorTo4ub)
sym(sse_ConvertRGBColorTo4ub):
    mov     eax, [esp+4]

    ; xmm0, 1, 2 = R, G, B
    movss   xmm4, [zero]
    movss   xmm0, [eax+8]
    movss   xmm1, [eax+4]
    movss   xmm2, [eax]
    movss   xmm5, [twofivefive]

    ; C = min(255, 255*max(C, 0)) ( == clamp(255*C, 0, 255) )
    maxss   xmm0, xmm4
    maxss   xmm1, xmm4
    maxss   xmm2, xmm4
    mulss   xmm0, xmm5
    mulss   xmm1, xmm5
    mulss   xmm2, xmm5
    minss   xmm0, xmm5
    minss   xmm1, xmm5
    minss   xmm2, xmm5

    ; convert to integer and combine channels using bit logic
    cvtss2si    eax, xmm0
    cvtss2si    ecx, xmm1
    cvtss2si    edx, xmm2
    shl     eax, 16
    shl     ecx, 8
    or      eax, 0xff000000
    or      edx, ecx
    or      eax, edx
    ret
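
; for documentation purposes, a scalar C model of the conversion above; the
; helper names and the assumption that the three channel floats live at
; struct offsets 0, 4 and 8 are ours, and a plain cast is used where cvtss2si
; would round to nearest, so treat this as a sketch rather than a bit-exact
; reference:
;
;   #include <stdint.h>
;
;   /* clamp one channel the way the SSE code does: min(255, 255*max(c, 0)) */
;   static uint32_t clamp255(float c)
;   {
;       float x = 255.0f * (c > 0.0f ? c : 0.0f);
;       return (uint32_t)(x > 255.0f ? 255.0f : x);
;   }
;
;   /* the SSE routine reads the floats at offsets 8, 4 and 0 of the color
;    * argument and packs them into bits 16..23, 8..15 and 0..7 of the
;    * result, with 0xff as the alpha byte. */
;   static uint32_t convert_to_4ub(const float color[3])
;   {
;       return 0xff000000u
;            | (clamp255(color[2]) << 16)
;            | (clamp255(color[1]) <<  8)
;            |  clamp255(color[0]);
;   }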