; set 32-bit attribute once for all sections and activate .text
section .data use32
section .bss use32
section .text use32

; Usage:
; use sym(ia32_cap) instead of _ia32_cap - on relevant platforms, sym() will
; add the leading underscore automagically, on others it won't
%ifdef DONT_USE_UNDERLINE
%define sym(a) a
%else
%define sym(a) _ %+ a
%endif
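
; example (illustrative): with the default underscore convention,
;     global sym(ia32_memcpy)
; expands to "global _ia32_memcpy"; assembling with -DDONT_USE_UNDERLINE
; (e.g. for ELF targets) yields plain "global ia32_memcpy".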

;-------------------------------------------------------------------------------
; fast general memcpy
;-------------------------------------------------------------------------------

; drop-in replacement for libc memcpy(). only requires CPU support for
; MMX (by now universal). highly optimized for Athlon and Pentium III
; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
; for details, see accompanying article.

; if transfer size is at least this much,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD equ 64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD equ 192*1024

; maximum that can be copied by IC_TINY.
IC_TINY_MAX equ 63

; size of one block prefetch chunk.
BP_SIZE equ 8*1024
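
; overview of the resulting dispatch (paraphrasing the setup code below;
; sizes refer to the transfer size rounded down to a multiple of 64):
;   size <= IC_TINY_MAX (63)  -> IC_TINY only
;   size <  UC_THRESHOLD      -> IC_MOVQ      (in-cache MMX copy)
;   size <  BP_THRESHOLD      -> UC_MOVNTQ    (non-temporal stores)
;   size >= BP_THRESHOLD      -> UC_BP_MOVNTQ (block prefetch + MOVNTQ)
; the last 0..63 bytes always fall through to IC_TINY.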

;------------------------------------------------------------------------------

; [p3] replicating this instead of jumping to it from tailN
; saves 1 clock and costs (7-2)*2 bytes code.
%macro EPILOG 0
pop esi
pop edi
mov eax, [esp+4] ; return dst
ret
%endm

align 64
tail1:
mov al, [esi+ecx*4]
mov [edi+ecx*4], al
align 4
tail0:
EPILOG

align 8
tail3:
; [p3] 2 reads followed by 2 writes is better than
; R/W interleaved and RRR/WWW
mov al, [esi+ecx*4+2]
mov [edi+ecx*4+2], al
; already aligned to 8 due to above code
tail2:
mov al, [esi+ecx*4]
mov dl, [esi+ecx*4+1]
mov [edi+ecx*4], al
mov [edi+ecx*4+1], dl
EPILOG

[section .data]
align 16
tail_table dd tail0, tail1, tail2, tail3
__SECT__

; 15x unrolled copy loop - transfers DWORDs backwards.
; indexed via table of 8-bit offsets.
; rationale:
; - [p3] backwards vs. forwards makes no difference.
; - MOV is faster than MOVSD.
; - index table is needed because calculating end-6*i is slower than
;   a LUT and we wouldn't want to expand entries to 8 bytes
;   (that'd increase code footprint by 30 bytes)
; - a byte index accessed via MOVZX is better due to less dcache usage.
; - only unrolling 8x and 'reentering' the loop is possible but
;   slower due to fiddling with esi/ecx.
align 64
unrolled_copy_code_start:
%assign i 15
%rep 14 ; 15 entries, 1 base case handled below
uc_ %+ i:
mov eax, [esi+i*4-4]
mov [edi+i*4-4], eax
%assign i i-1
%endrep
; base case: no displacement needed; skip it so that code will
; be aligned to 8 bytes after this.
uc_1:
mov eax, [esi]
mov [edi], eax
uc_0:
jmp [tail_table+edx*4]

[section .data]
align 32
unrolled_copy_index_table:
%assign i 0
%rep 16
db (uc_ %+ i) - unrolled_copy_code_start
%assign i i+1
%endrep
__SECT__
;------------------------------------------------------------------------------
|
|
; tiny copy - handles all cases smaller than IC_MOVQ's 64 byte lower limit.
|
|
; > edx = number of bytes (< IC_TINY_MAX)
|
|
; < does not return.
|
|
; x eax, ecx, edx
|
|
%macro IC_TINY 0
|
|
mov ecx, edx
|
|
shr ecx, 2
|
|
; calculating this address isn't possible due to skipping displacement on uc1;
|
|
; even so, it'd require calculating -6*ecx, which is slower than LUT.
|
|
movzx eax, byte [unrolled_copy_index_table+ecx]
|
|
and edx, byte 3
|
|
add eax, unrolled_copy_code_start
|
|
jmp eax
|
|
; never reached! the unrolled loop jumps into tailN, which
|
|
; then returns from the memcpy function.
|
|
%endm
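
; worked example (illustrative): a 23 byte copy enters with edx = 23.
; ecx = 23>>2 = 5 dword copies, edx &= 3 leaves 3 tail bytes. we jump to
; uc_5, which copies dwords at offsets 16,12,8,4,0; "jmp [tail_table+edx*4]"
; then dispatches to tail3, which moves the bytes at offsets 20..22
; (the tails index via ecx*4 = 20 for exactly this reason).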

;------------------------------------------------------------------------------
; align destination address to multiple of 8. important for large transfers,
; but doesn't affect the tiny technique.
; > esi, edi -> buffers (updated)
; > ecx, edx = transfer size (updated)
; x eax
%macro IC_ALIGN 0
mov eax, edi
and eax, byte 7 ; eax = # misaligned bytes
jz already_aligned ; early out
lea eax, [align_table_start+eax*2]
jmp eax

; [p3] this is no slower than a table of mov and much smaller/simpler
align 8
align_table_start:
%rep 8
dec ecx
movsb
%endrep
mov edx, ecx
already_aligned:
%endm
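
; worked example (illustrative): each "dec ecx / movsb" pair assembles to
; 2 bytes, so jumping to align_table_start+eax*2 skips the first eax pairs.
; e.g. edi & 7 == 3 executes the remaining 5 pairs, copying the 5 bytes
; that bring edi up to the next multiple of 8.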

;------------------------------------------------------------------------------
; MMX MOVQ technique. used for in-cache transfers of 64B..64KiB.
; must run on all CPUs, i.e. cannot use the SSE prefetchnta instruction.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to the end of their buffers, i.e. &last_qword+8.
; < ecx = 0
; x
%macro IC_MOVQ 0

; see notes below. TODO: if simple addressing is better on Athlons as well,
; prevent this from happening in setup code when not doing large transfers
add esi, ecx
add edi, ecx

align 16
%%loop:

; notes:
; - we can't use prefetch here - this codepath must support all CPUs.
;   [p3] that makes us 5..15% slower on 1KiB..4KiB transfers.
; - [p3] simple addressing without +ecx is 3.5% faster.
; - [p3] there's no difference between RR/WW/RR/WW and R..R/W..W
;   with simple addressing and no prefetch.
; - enough time elapses between the first and third pair of reads that we
;   could reuse MM0. there is no performance gain either way and
;   differing displacements make code compression futile, so
;   we'll just use MM4..7 for clarity.
movq mm0, [esi]
movq mm1, [esi+8]
movq [edi], mm0
movq [edi+8], mm1
movq mm2, [esi+16]
movq mm3, [esi+24]
movq [edi+16], mm2
movq [edi+24], mm3
movq mm4, [esi+32]
movq mm5, [esi+40]
movq [edi+32], mm4
movq [edi+40], mm5
movq mm6, [esi+48]
movq mm7, [esi+56]
movq [edi+48], mm6
movq [edi+56], mm7
add esi, byte 64
add edi, byte 64
add ecx, byte 64
jnz %%loop
%endm

;------------------------------------------------------------------------------
; SSE MOVNTQ technique. used for transfers that do not fit in L1,
; i.e. 64KiB..192KiB. requires Pentium III or Athlon; caller checks for this.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to the end of their buffers, i.e. &last_qword+8.
; < ecx = 0
; x
%macro UC_MOVNTQ 0

align 16
%%loop:
; notes:
; - the AMD optimization manual recommends prefetch distances according to
;   (200*BytesPerIter/ClocksPerIter+192), which comes out to ~560 here.
;   [p3] rounding down to 512 bytes makes for significant gains.
; - [p3] complex addressing with ecx is 1% faster than adding to esi/edi.
prefetchnta [esi+ecx+512]
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movntq [edi+ecx], mm0
movntq [edi+ecx+8], mm1
movntq [edi+ecx+16], mm2
movntq [edi+ecx+24], mm3
movntq [edi+ecx+32], mm4
movntq [edi+ecx+40], mm5
movntq [edi+ecx+48], mm6
movntq [edi+ecx+56], mm7
add ecx, byte 64
jnz %%loop
%endm
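
; sanity check of the prefetch-distance formula above (illustrative): with
; BytesPerIter = 64, the stated ~560 corresponds to
; 200*64/ClocksPerIter + 192 = 560, i.e. roughly 35 clocks per iteration.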

;------------------------------------------------------------------------------
; block prefetch technique. used for transfers that do not fit in L2,
; i.e. > 192KiB. requires Pentium III or Athlon; caller checks for this.
; for the theory behind this, see the article.
; > ecx = -number_of_bytes (multiple of 64, <= -BP_SIZE)
; > esi, edi point to the end of their buffers, i.e. &last_qword+8.
; < ecx = -remaining_bytes (multiple of 64, > -BP_SIZE)
; < eax = 0
%macro UC_BP_MOVNTQ 0
push edx
align 4
%%prefetch_and_copy_chunk:

; pull chunk into cache by touching each cache line
; (in reverse order to prevent HW prefetches)
mov eax, BP_SIZE/128 ; # iterations
add esi, BP_SIZE
align 16
%%prefetch_loop:
mov edx, [esi+ecx-64]
mov edx, [esi+ecx-128]
add esi, byte -128
dec eax
jnz %%prefetch_loop

; copy chunk in 64 byte pieces
mov eax, BP_SIZE/64 ; # iterations (doesn't fit in signed 8 bit)
align 16
%%copy_loop:
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movntq [edi+ecx], mm0
movntq [edi+ecx+8], mm1
movntq [edi+ecx+16], mm2
movntq [edi+ecx+24], mm3
movntq [edi+ecx+32], mm4
movntq [edi+ecx+40], mm5
movntq [edi+ecx+48], mm6
movntq [edi+ecx+56], mm7

add ecx, byte 64
dec eax
jnz %%copy_loop

; if enough data left, process next chunk
cmp ecx, -BP_SIZE
jle %%prefetch_and_copy_chunk

pop edx
%endm
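
; worked numbers (illustrative): BP_SIZE = 8KiB, so each chunk's prefetch
; loop runs 8192/128 = 64 iterations, touching two 64-byte cache lines per
; iteration (128 lines = the whole chunk); the copy loop then streams it
; out in 8192/64 = 128 MOVNTQ iterations.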

;------------------------------------------------------------------------------

; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
; drop-in replacement for libc memcpy() (returns dst)
global sym(ia32_memcpy)
align 64
sym(ia32_memcpy):
push edi
push esi

mov ecx, [esp+8+4+8] ; nbytes
mov edi, [esp+8+4+0] ; dst
mov esi, [esp+8+4+4] ; src

mov edx, ecx
cmp ecx, byte IC_TINY_MAX
ja choose_larger_method

ic_tiny:
IC_TINY
; never reached - IC_TINY contains memcpy function epilog code

choose_larger_method:
IC_ALIGN

; setup:
; eax = transfer size rounded down to a multiple of 64 (in bytes), or 0 if
;   the CPU doesn't support SSE. used to choose copy technique.
; ecx = -number_of_bytes, multiple of 64. we jump to ic_tiny if
;   there's not enough left for a single 64 byte chunk, which can
;   happen on unaligned 64..71 byte transfers due to IC_ALIGN.
; edx = number of remainder bytes after qwords have been copied;
;   will be handled by IC_TINY.
; esi and edi point to the end of the respective buffers (more precisely,
;   to buffer_start-ecx). this together with the ecx convention means
;   we only need one loop counter (instead of having to advance
;   that and esi/edi).

; this mask is applied to the transfer size. the 2 specialized copy techniques
; that use SSE are jumped to if size is greater than a threshold.
; we simply set the requested transfer size to 0 if the CPU doesn't
; support SSE so that those are never reached (done by masking with this).
extern sym(ia32_memcpy_size_mask)
mov eax, [sym(ia32_memcpy_size_mask)]
and ecx, byte ~IC_TINY_MAX
jz ic_tiny ; < 64 bytes left (due to IC_ALIGN)
add esi, ecx
add edi, ecx
and edx, byte IC_TINY_MAX
and eax, ecx
neg ecx

cmp eax, BP_THRESHOLD
jae near uc_bp_movntq
cmp eax, UC_THRESHOLD
jae uc_movntq

ic_movq:
IC_MOVQ
emms
jmp ic_tiny

uc_movntq:
UC_MOVNTQ
sfence
emms
jmp ic_tiny

uc_bp_movntq:
UC_BP_MOVNTQ
sfence
cmp ecx, byte -(IC_TINY_MAX+1)
jle ic_movq
emms
jmp ic_tiny
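
; worked dispatch example (illustrative): an aligned 100000 byte transfer
; leaves edx = 100000 & 63 = 32 tail bytes and eax = 100000 & ~63 = 99968.
; that is >= UC_THRESHOLD (64KiB) but < BP_THRESHOLD (192KiB), so UC_MOVNTQ
; streams the 99968 bytes and IC_TINY finishes the remaining 32.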

;-------------------------------------------------------------------------------
; CPUID support
;-------------------------------------------------------------------------------

[section .data]

; these are actually max_func+1, i.e. the first invalid value.
; the idea here is to avoid a separate cpuid_available flag;
; using signed values doesn't work because ext_funcs are >= 0x80000000.
max_func dd 0
max_ext_func dd 0

__SECT__

; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs)
global sym(ia32_cpuid)
sym(ia32_cpuid):
push ebx
push edi

mov ecx, [esp+8+4+0] ; func
mov edi, [esp+8+4+4] ; -> regs

; compare against max supported func and fail if above
xor eax, eax ; return value on failure
test ecx, ecx
mov edx, [max_ext_func]
js .is_ext_func
mov edx, [max_func]
.is_ext_func:
cmp ecx, edx
jae .ret ; (see max_func decl)

; issue CPUID and store result registers in array
mov eax, ecx
cpuid
stosd
xchg eax, ebx
stosd
xchg eax, ecx
stosd
xchg eax, edx
stosd

; success
xor eax, eax
inc eax
.ret:
pop edi
pop ebx
ret
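
; usage sketch from C (illustrative; matches the prototype above - the four
; STOSDs store EAX,EBX,ECX,EDX into regs[0..3]):
;   u32 regs[4];
;   if(ia32_cpuid(1, regs))
;       have_sse = (regs[3] >> 25) & 1;  // CPUID.1 EDX bit 25 = SSE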

;-------------------------------------------------------------------------------
; lock-free support routines
;-------------------------------------------------------------------------------

extern sym(cpus)

; extern "C" void __cdecl atomic_add(intptr_t* location, intptr_t increment);
global sym(atomic_add)
sym(atomic_add):
cmp byte [sym(cpus)], 1
mov edx, [esp+4] ; location
mov eax, [esp+8] ; increment
je .no_lock
db 0xf0 ; LOCK prefix
.no_lock:
add [edx], eax
ret
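
; note (added for clarity): on a single-CPU system the JE skips over the
; 0xf0 byte, so the ADD executes without the (expensive) LOCK prefix; on
; MP systems the byte is executed and prefixes the following ADD.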

; notes:
; - this is called via the CAS macro, which silently casts its inputs for
;   convenience. mixing up the <expected> and <location> parameters would
;   go unnoticed; we therefore perform a basic sanity check on <location>
;   and raise a warning if it is invalid.
; - a 486 or later processor is required since we use CMPXCHG.
;   there's no feature flag we can check, and the ia32 code doesn't
;   bother detecting anything < Pentium, so this'll crash and burn if
;   run on a 386. we could fall back to simple MOVs there (since 386 CPUs
;   aren't MP-capable), but it's not worth the trouble.
; extern "C" __declspec(naked) bool __cdecl CAS_(uintptr_t* location, uintptr_t expected, uintptr_t new_value);
global sym(CAS_)
sym(CAS_):
mov edx, [esp+4] ; location
cmp edx, 0x10000 ; .. valid pointer?
jb .invalid_location ; no - raise warning
mov eax, [esp+8] ; expected
mov ecx, [esp+12] ; new_value
; the single-CPU check comes last so that JE sees ZF from this CMP
; rather than from the pointer check above.
cmp byte [sym(cpus)], 1
je .no_lock
db 0xf0 ; LOCK prefix
.no_lock:
cmpxchg [edx], ecx
sete al
movzx eax, al
ret
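
; usage sketch from C (illustrative; matches the prototype above):
;   // atomically replace *location with new_value iff it still equals
;   // expected; returns true on success.
;   if(CAS_(&flag, 0, 1))
;       /* we won the race */;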

; NOTE: nasm 0.98.39 doesn't support generating debug info for the win32
; output format. that means this code may be misattributed to other
; functions, which makes tracking it down very difficult.
; we therefore raise an "Invalid Opcode" exception, which is rather distinct.
.invalid_location:
ud2

;-------------------------------------------------------------------------------
; misc
;-------------------------------------------------------------------------------

; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
global sym(ia32_control87)
sym(ia32_control87):
push eax
fnstcw [esp]
pop eax ; old_cw
mov ecx, [esp+4] ; new_cw
mov edx, [esp+8] ; mask
and ecx, edx ; new_cw & mask
not edx ; ~mask
and eax, edx ; old_cw & ~mask
or eax, ecx ; (old_cw & ~mask) | (new_cw & mask)
push eax ; = merged control word
fldcw [esp]
pop eax
xor eax, eax ; return value
ret
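
; worked example (illustrative; assumes the standard x87 control word
; layout, where bits 10..11 are the rounding-control field):
; ia32_control87(0x0C00, 0x0C00) switches the FPU to round-toward-zero
; while leaving the exception masks and precision control untouched.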

; write the current execution state (e.g. all register values) into
; (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
; optimized for size; this must be straight asm because __declspec(naked)
; is compiler-specific and compiler-generated prolog code inserted before
; inline asm trashes EBP and ESP (unacceptable).
; extern "C" void ia32_get_current_context(void* pcontext)
global sym(ia32_get_current_context)
sym(ia32_get_current_context):
pushad
pushfd
mov edi, [esp+4+32+4] ; pcontext

; ContextFlags
mov eax, 0x10007 ; segs, int, control
stosd

; DRx and FloatSave
; rationale: we can't access the debug registers from Ring3, and
; the FPU save area is irrelevant, so zero them.
xor eax, eax
push byte 6+8+20
pop ecx
rep stosd

; CONTEXT_SEGMENTS
mov ax, gs
stosd
mov ax, fs
stosd
mov ax, es
stosd
mov ax, ds
stosd

; CONTEXT_INTEGER
mov eax, [esp+4+32-32] ; edi
stosd
xchg eax, esi
stosd
xchg eax, ebx
stosd
xchg eax, edx
stosd
mov eax, [esp+4+32-8] ; ecx
stosd
mov eax, [esp+4+32-4] ; eax
stosd

; CONTEXT_CONTROL
xchg eax, ebp ; ebp restored by POPAD
stosd
mov eax, [esp+4+32] ; return address
sub eax, 5 ; skip the (5 byte near) CALL instruction -> call site.
stosd
xor eax, eax
mov ax, cs
stosd
pop eax ; eflags
stosd
lea eax, [esp+32+4+4] ; esp
stosd
xor eax, eax
mov ax, ss
stosd

; ExtendedRegisters
xor ecx, ecx
mov cl, 512/4
rep stosd

popad
ret

;-------------------------------------------------------------------------------
; init
;-------------------------------------------------------------------------------

; extern "C" bool __cdecl ia32_asm_init()
global sym(ia32_asm_init)
sym(ia32_asm_init):
push ebx

; check if CPUID is supported
pushfd
or byte [esp+2], 32 ; set EFLAGS.ID (bit 21)
popfd
pushfd
pop eax
xor edx, edx
shr eax, 22 ; did bit 21 stick? (shifted out into CF)
jnc .no_cpuid

; determine max supported CPUID function
xor eax, eax
cpuid
inc eax ; (see max_func decl)
mov [max_func], eax
mov eax, 0x80000000
cpuid
inc eax ; (see max_func decl)
mov [max_ext_func], eax
.no_cpuid:

pop ebx
ret

;-------------------------------------------------------------------------------
; Color conversion (SSE)
;-------------------------------------------------------------------------------

; extern "C" u32 ConvertRGBColorTo4ub(const RGBColor& color)
[section .data]
align 16
zero:
dd 0.0
twofivefive:
dd 255.0

__SECT__
align 16
global sym(sse_ConvertRGBColorTo4ub)
sym(sse_ConvertRGBColorTo4ub):
mov eax, [esp+4]

; xmm0, 1, 2 = R, G, B
movss xmm4, [zero]
movss xmm0, [eax+8]
movss xmm1, [eax+4]
movss xmm2, [eax]
movss xmm5, [twofivefive]

; C = min(255, 255*max(C, 0)) ( == clamp(255*C, 0, 255) )
maxss xmm0, xmm4
maxss xmm1, xmm4
maxss xmm2, xmm4
mulss xmm0, xmm5
mulss xmm1, xmm5
mulss xmm2, xmm5
minss xmm0, xmm5
minss xmm1, xmm5
minss xmm2, xmm5

; convert to integer and combine channels using bit logic
cvtss2si eax, xmm0
cvtss2si ecx, xmm1
cvtss2si edx, xmm2
shl eax, 16
shl ecx, 8
or eax, 0xff000000
or edx, ecx
or eax, edx

ret
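
; behavior note (added for clarity): out-of-range inputs clamp per the
; comment above - a component of 1.5 scales to 382.5 and is clamped to 255,
; while -0.25 clamps to 0 before scaling. cvtss2si rounds per MXCSR
; (round-to-nearest by default), and the alpha byte is forced to 0xff.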