; set section attributes
section .data data align=32 use32
section .bss bss align=16 use32
section .text code align=64 use32

; activate .text (needs to be separate because __SECT__ will otherwise
; complain that the above definition is redeclaring attributes)
section .text

; Usage:
; use sym(ia32_cap) instead of _ia32_cap - on relevant platforms, sym() will add
; the underlines automagically, on others it won't
%ifdef DONT_USE_UNDERLINE
%define sym(a) a
%else
%define sym(a) _ %+ a
%endif


;-------------------------------------------------------------------------------
; fast general memcpy
;-------------------------------------------------------------------------------

; drop-in replacement for libc memcpy(). only requires CPU support for
; MMX (by now universal). highly optimized for Athlon and Pentium III
; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
; for details, see accompanying article.

; if transfer size is at least this much,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD    equ 64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD    equ 256*1024

; maximum that can be copied by IC_TINY.
IC_TINY_MAX     equ 63

; size of one block prefetch chunk.
BP_SIZE         equ 8*1024


;------------------------------------------------------------------------------

; [p3] replicating this instead of jumping to it from tailN
; saves 1 clock and costs (7-2)*2 bytes code.
%macro EPILOG 0
    pop     esi
    pop     edi
    mov     eax, [esp+4]        ; return dst
    ret
%endm

align 64
tail1:
    mov     al, [esi+ecx*4]
    mov     [edi+ecx*4], al
align 4
tail0:
    EPILOG

align 8
tail3:
    ; [p3] 2 reads followed by 2 writes is better than
    ; R/W interleaved and RRR/WWW
    mov     al, [esi+ecx*4+2]
    mov     [edi+ecx*4+2], al
    ; already aligned to 8 due to above code
tail2:
    mov     al, [esi+ecx*4]
    mov     dl, [esi+ecx*4+1]
    mov     [edi+ecx*4], al
    mov     [edi+ecx*4+1], dl
    EPILOG


[section .data]
align 16
tail_table  dd  tail0, tail1, tail2, tail3
__SECT__


; 15x unrolled copy loop - transfers DWORDs backwards.
; indexed via table of 8-bit offsets.
; rationale:
; - [p3] backwards vs. forwards makes no difference.
; - MOV is faster than MOVSD.
; - index table is needed because calculating end-6*i is slower than
;   a LUT and we wouldn't want to expand entries to 8 bytes
;   (that'd increase code footprint by 30 bytes)
; - a byte index accessed via MOVZX is better due to less dcache usage.
; - only unrolling 8x and 'reentering' the loop is possible but
;   slower due to fiddling with esi/ecx.
align 64
unrolled_copy_code_start:
%assign i 15
%rep 14     ; emits uc_15..uc_2; uc_1 and the uc_0 base case follow below
    uc_ %+ i:
    mov     eax, [esi+i*4-4]
    mov     [edi+i*4-4], eax
    %assign i i-1
%endrep
; base case: no displacement needed; skip it so that code will
; be aligned to 8 bytes after this.
uc_1:
    mov     eax, [esi]
    mov     [edi], eax
uc_0:
    jmp     [tail_table+edx*4]

[section .data]
align 32
unrolled_copy_index_table:
%assign i 0
%rep 16
    db  (uc_ %+ i) - unrolled_copy_code_start
    %assign i i+1
%endrep
__SECT__


;------------------------------------------------------------------------------
; tiny copy - handles all cases smaller than IC_MOVQ's 64 byte lower limit.
; > edx = number of bytes (<= IC_TINY_MAX)
; < does not return.
; x eax, ecx, edx
%macro IC_TINY 0
    mov     ecx, edx
    shr     ecx, 2
    ; calculating this address directly isn't possible because uc_1 skips
    ; the displacement; even so, it'd require calculating -6*ecx, which is
    ; slower than the LUT.
    movzx   eax, byte [unrolled_copy_index_table+ecx]
    and     edx, byte 3
    add     eax, unrolled_copy_code_start
    jmp     eax
    ; never reached! the unrolled loop jumps into tailN, which
    ; then returns from the memcpy function.
%endm
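; worked example for the dispatch above: a 23 byte transfer yields
; ecx = 23>>2 = 5 dword copies and edx = 23&3 = 3 leftover bytes.
; unrolled_copy_index_table[5] holds the offset of uc_5 (10 skipped
; 6-byte entries => 60), so the computed jump enters the unrolled loop
; at uc_5, copies 5 dwords (20 bytes), and finally dispatches through
; tail_table[3] to tail3, which copies the remaining 3 bytes and
; returns via EPILOG.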
;------------------------------------------------------------------------------
; align destination address to multiple of 8. important for large transfers,
; but doesn't affect the tiny technique.
; > esi, edi -> buffers (updated)
; > ecx, edx = transfer size (updated)
; x eax
%macro IC_ALIGN 0
    mov     eax, edi
    and     eax, byte 7             ; eax = # misaligned bytes
    jz      already_aligned         ; early out
    lea     eax, [align_table_start+eax*2]
    jmp     eax

; [p3] this is no slower than a table of mov and much smaller/simpler
align 8
align_table_start:
%rep 8
    dec     ecx
    movsb
%endrep
    mov     edx, ecx
already_aligned:
%endm


;------------------------------------------------------------------------------
; MMX MOVQ technique. used for in-cache transfers of 64 B..64 KiB.
; must run on all CPUs, i.e. cannot use the SSE prefetchnta instruction.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffers, i.e. &last_qword+8.
; < ecx = 0
; x
%macro IC_MOVQ 0
align 16
%%loop:
    ; notes:
    ; - we can't use prefetch here - this codepath must support all CPUs.
    ;   [p3] that makes us 5..15% slower on 1KiB..4KiB transfers.
    ; - [p3] simple addressing without +ecx is 3.5% faster.
    ; - difference between RR/WW/RR/WW and R..R/W..W:
    ;   [p3] none (if simple addressing)
    ;   [axp] interleaved is better (with +ecx addressing)
    ; - enough time elapses between first and third pair of reads that we
    ;   could reuse MM0. there is no performance gain either way and
    ;   differing displacements make code compression futile anyway, so
    ;   we'll just use MM4..7 for clarity.
    movq    mm0, [esi+ecx]
    movq    mm1, [esi+ecx+8]
    movq    [edi+ecx], mm0
    movq    [edi+ecx+8], mm1
    movq    mm2, [esi+ecx+16]
    movq    mm3, [esi+ecx+24]
    movq    [edi+ecx+16], mm2
    movq    [edi+ecx+24], mm3
    movq    mm4, [esi+ecx+32]
    movq    mm5, [esi+ecx+40]
    movq    [edi+ecx+32], mm4
    movq    [edi+ecx+40], mm5
    movq    mm6, [esi+ecx+48]
    movq    mm7, [esi+ecx+56]
    movq    [edi+ecx+48], mm6
    movq    [edi+ecx+56], mm7
    add     ecx, byte 64
    jnz     %%loop
%endm


;------------------------------------------------------------------------------
; SSE MOVNTQ technique. used for transfers that do not fit in L1,
; i.e. UC_THRESHOLD (64 KiB) up to BP_THRESHOLD. requires Pentium III or
; Athlon; caller checks for this.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffers, i.e. &last_qword+8.
; < ecx = 0
; x
%macro UC_MOVNTQ 0
align 16
%%loop:
    ; notes:
    ; - the AMD optimization manual recommends prefetch distances according to
    ;   (200*BytesPerIter/ClocksPerIter+192), which comes out to ~560 here.
    ;   [p3] rounding down to 512 bytes makes for significant gains.
    ; - [p3] complex addressing with ecx is 1% faster than adding to esi/edi.
    prefetchnta [esi+ecx+512]
    movq    mm0, [esi+ecx]
    movq    mm1, [esi+ecx+8]
    movq    mm2, [esi+ecx+16]
    movq    mm3, [esi+ecx+24]
    movq    mm4, [esi+ecx+32]
    movq    mm5, [esi+ecx+40]
    movq    mm6, [esi+ecx+48]
    movq    mm7, [esi+ecx+56]
    movntq  [edi+ecx], mm0
    movntq  [edi+ecx+8], mm1
    movntq  [edi+ecx+16], mm2
    movntq  [edi+ecx+24], mm3
    movntq  [edi+ecx+32], mm4
    movntq  [edi+ecx+40], mm5
    movntq  [edi+ecx+48], mm6
    movntq  [edi+ecx+56], mm7
    add     ecx, byte 64
    jnz     %%loop
%endm
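; illustrative sketch of the register contract shared by IC_MOVQ, UC_MOVNTQ
; and UC_BP_MOVNTQ (src_end, dst_end and size are hypothetical placeholders:
; 8-byte aligned buffer ends and a size that is a multiple of 64):
;   mov     esi, src_end        ; = src + size
;   mov     edi, dst_end        ; = dst + size
;   mov     ecx, -size          ; negative byte count; reaches 0 when done
;   UC_MOVNTQ                   ; copies [esi+ecx] -> [edi+ecx], 64 B per iteration (needs SSE)
;   sfence                      ; drain write-combining buffers after MOVNTQ
;   emms                        ; restore x87 state after MMX use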
;------------------------------------------------------------------------------
; block prefetch technique. used for transfers that do not fit in L2,
; i.e. >= BP_THRESHOLD. requires Pentium III or Athlon; caller checks for this.
; for theory behind this, see article.
; > ecx = -number_of_bytes (multiple of 64, <= -BP_SIZE)
; > esi, edi point to end of the buffers, i.e. &last_qword+8.
; < ecx = -remaining_bytes (multiple of 64, > -BP_SIZE)
; < eax = 0
%macro UC_BP_MOVNTQ 0
    push    edx
align 4
%%prefetch_and_copy_chunk:
    ; pull chunk into cache by touching each cache line
    ; (in reverse order to prevent HW prefetches)
    mov     eax, BP_SIZE/128        ; # iterations
    add     esi, BP_SIZE
align 16
%%prefetch_loop:
    mov     edx, [esi+ecx-64]
    mov     edx, [esi+ecx-128]
    add     esi, byte -128
    dec     eax
    jnz     %%prefetch_loop

    ; copy chunk in 64 byte pieces
    mov     eax, BP_SIZE/64         ; # iterations (> signed 8 bit)
align 16
%%copy_loop:
    movq    mm0, [esi+ecx]
    movq    mm1, [esi+ecx+8]
    movq    mm2, [esi+ecx+16]
    movq    mm3, [esi+ecx+24]
    movq    mm4, [esi+ecx+32]
    movq    mm5, [esi+ecx+40]
    movq    mm6, [esi+ecx+48]
    movq    mm7, [esi+ecx+56]
    movntq  [edi+ecx], mm0
    movntq  [edi+ecx+8], mm1
    movntq  [edi+ecx+16], mm2
    movntq  [edi+ecx+24], mm3
    movntq  [edi+ecx+32], mm4
    movntq  [edi+ecx+40], mm5
    movntq  [edi+ecx+48], mm6
    movntq  [edi+ecx+56], mm7
    add     ecx, byte 64
    dec     eax
    jnz     %%copy_loop

    ; if enough data left, process next chunk
    cmp     ecx, -BP_SIZE
    jle     %%prefetch_and_copy_chunk
    pop     edx
%endm


;------------------------------------------------------------------------------

; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
; drop-in replacement for libc memcpy() (returns dst)
global sym(ia32_memcpy)
align 64
sym(ia32_memcpy):
    push    edi
    push    esi
    mov     ecx, [esp+8+4+8]        ; nbytes
    mov     edi, [esp+8+4+0]        ; dst
    mov     esi, [esp+8+4+4]        ; src
    mov     edx, ecx
    cmp     ecx, byte IC_TINY_MAX
    ja      choose_larger_method

ic_tiny:
    IC_TINY
    ; never reached - IC_TINY contains the memcpy function epilog code

choose_larger_method:
    IC_ALIGN

    ; setup:
    ; eax = transfer size rounded down to a multiple of 64, or 0 if the CPU
    ;   doesn't support SSE. used to choose the copy technique.
    ; ecx = -number_of_bytes, multiple of 64. we jump to ic_tiny if
    ;   there's not enough left for a single 64 byte chunk, which can
    ;   happen on unaligned 64..71 byte transfers due to IC_ALIGN.
    ; edx = number of remainder bytes after the qwords have been copied;
    ;   will be handled by IC_TINY.
    ; esi and edi point to the end of the respective buffers (more precisely,
    ;   to buffer_start-ecx). this together with the ecx convention means
    ;   we only need one loop counter (instead of having to advance
    ;   that and esi/edi).

    ; this mask is applied to the transfer size. the 2 specialized copy
    ; techniques that use SSE are jumped to if the size reaches a threshold.
    ; we simply set the requested transfer size to 0 if the CPU doesn't
    ; support SSE so that those are never reached (done by masking with this).
    extern sym(ia32_memcpy_size_mask)
    mov     eax, [sym(ia32_memcpy_size_mask)]

    and     ecx, byte ~IC_TINY_MAX
    jz      ic_tiny                 ; < 64 bytes left (due to IC_ALIGN)
    add     esi, ecx
    add     edi, ecx
    and     edx, byte IC_TINY_MAX
    and     eax, ecx
    neg     ecx
    cmp     eax, BP_THRESHOLD
    jae     near uc_bp_movntq
    cmp     eax, UC_THRESHOLD
    jae     uc_movntq

ic_movq:
    IC_MOVQ
    emms
    jmp     ic_tiny

uc_movntq:
    UC_MOVNTQ
    sfence
    emms
    jmp     ic_tiny

uc_bp_movntq:
    UC_BP_MOVNTQ
    sfence
    cmp     ecx, byte -(IC_TINY_MAX+1)
    jle     ic_movq
    emms
    jmp     ic_tiny
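;------------------------------------------------------------------------------
; ia32_memcpy_size_mask is owned and initialized by the C-side code; it is only
; referenced above. the sketch below shows one plausible way such a mask could
; be derived. it is illustrative only: example_size_mask and
; example_init_size_mask are hypothetical names, the routine is never called by
; anything in this file, and it assumes CPUID function 1 is available
; (see ia32_asm_init).

[section .data]
example_size_mask   dd  0           ; 0 => the SSE-only paths are never taken
__SECT__

example_init_size_mask:
    push    ebx                     ; CPUID clobbers ebx (callee-saved in cdecl)
    mov     eax, 1
    cpuid
    xor     eax, eax                ; assume no SSE => mask = 0
    test    edx, 0x02000000         ; CPUID.01H:EDX bit 25 = SSE
    jz      .store
    dec     eax                     ; eax = ~0 => mask passes the size through
.store:
    mov     [example_size_mask], eax
    pop     ebx
    ret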
;-------------------------------------------------------------------------------
; CPUID support
;-------------------------------------------------------------------------------

[section .data]

; these are actually max_func+1, i.e. the first invalid value.
; the idea here is to avoid a separate cpuid_available flag;
; using signed values doesn't work because ext_funcs are >= 0x80000000.
max_func        dd  0
max_ext_func    dd  0

__SECT__

; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs)
global sym(ia32_cpuid)
sym(ia32_cpuid):
    push    ebx
    push    edi
    mov     ecx, [esp+8+4+0]        ; func
    mov     edi, [esp+8+4+4]        ; -> regs

    ; compare against the max supported func and fail if above
    xor     eax, eax                ; return value on failure
    test    ecx, ecx
    mov     edx, [max_ext_func]
    js      .is_ext_func
    mov     edx, [max_func]
.is_ext_func:
    cmp     ecx, edx
    jae     .ret                    ; (see max_func decl)

    ; issue CPUID and store the result registers in the array
    mov     eax, ecx
    cpuid
    stosd
    xchg    eax, ebx
    stosd
    xchg    eax, ecx
    stosd
    xchg    eax, edx
    stosd

    ; success
    xor     eax, eax
    inc     eax
.ret:
    pop     edi
    pop     ebx
    ret


;-------------------------------------------------------------------------------
; lock-free support routines
;-------------------------------------------------------------------------------

extern sym(cpus)

; extern "C" void __cdecl atomic_add(intptr_t* location, intptr_t increment);
global sym(atomic_add)
sym(atomic_add):
    cmp     byte [sym(cpus)], 1
    mov     edx, [esp+4]            ; location
    mov     eax, [esp+8]            ; increment
    je      .no_lock
    db      0xf0                    ; LOCK prefix
.no_lock:
    add     [edx], eax
    ret

; notes:
; - this is called via the CAS macro, which silently casts its inputs for
;   convenience. mixing up the location and expected parameters would
;   go unnoticed; we therefore perform a basic sanity check on location and
;   raise a warning if it is invalid.
; - a 486 or later processor is required since we use CMPXCHG.
;   there's no feature flag we can check, and the ia32 code doesn't
;   bother detecting anything < Pentium, so this'll crash and burn if
;   run on a 386. we could fall back to simple MOVs there (since 386 CPUs
;   aren't MP-capable), but it's not worth the trouble.
; extern "C" __declspec(naked) bool __cdecl CAS_(uintptr_t* location, uintptr_t expected, uintptr_t new_value);
global sym(CAS_)
sym(CAS_):
    mov     edx, [esp+4]            ; location
    cmp     edx, 0x10000            ; .. valid pointer?
    jb      .invalid_location       ; no - raise warning
    mov     eax, [esp+8]            ; expected
    mov     ecx, [esp+12]           ; new_value
    ; the cpus comparison must be the last flag-setting instruction before JE
    ; (the pointer check above also writes the flags).
    cmp     byte [sym(cpus)], 1
    je      .no_lock
    db      0xf0                    ; LOCK prefix
.no_lock:
    cmpxchg [edx], ecx
    sete    al
    movzx   eax, al
    ret

; NOTE: nasm 0.98.39 doesn't support generating debug info for the win32
; output format. that means this code may be misattributed to other
; functions, which makes tracking it down very difficult.
; we therefore raise an "Invalid Opcode" exception, which is rather distinct.
.invalid_location:
    ud2


;-------------------------------------------------------------------------------
; misc
;-------------------------------------------------------------------------------

; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
global sym(ia32_control87)
sym(ia32_control87):
    push    eax
    fnstcw  [esp]
    pop     eax                     ; old_cw
    mov     ecx, [esp+4]            ; new_cw
    mov     edx, [esp+8]            ; mask
    and     ecx, edx                ; new_cw & mask
    not     edx                     ; ~mask
    and     eax, edx                ; old_cw & ~mask
    or      eax, ecx                ; (old_cw & ~mask) | (new_cw & mask)
    push    eax                     ; = new control word
    fldcw   [esp]
    pop     eax
    xor     eax, eax                ; return value
    ret
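; worked example (raw x87 control-word bits, not the CRT's _MCW_* abstractions):
; starting from the x87 default control word 0x037F (all exceptions masked,
; 64-bit precision, round-to-nearest), ia32_control87(0x0000, 0x0300) clears
; the precision-control field (bits 8..9), selecting 24-bit precision:
;   (0x037F & ~0x0300) | (0x0000 & 0x0300) = 0x007F
; note that the routine always returns 0, not the previous control word.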
; write the current execution state (e.g. all register values) into
; (Win32::CONTEXT*)pcontext (defined as void* to avoid a dependency).
; optimized for size; this must be straight asm because __declspec(naked)
; is compiler-specific and compiler-generated prolog code inserted before
; inline asm trashes EBP and ESP (unacceptable).
; extern "C" void ia32_get_current_context(void* pcontext)
global sym(ia32_get_current_context)
sym(ia32_get_current_context):
    pushad
    pushfd
    mov     edi, [esp+4+32+4]       ; pcontext

    ; ContextFlags
    mov     eax, 0x10007            ; segs, int, control
    stosd

    ; DRx and FloatSave
    ; rationale: we can't access the debug registers from Ring3, and
    ; the FPU save area is irrelevant, so zero them.
    xor     eax, eax
    push    byte 6+8+20
    pop     ecx
    rep     stosd

    ; CONTEXT_SEGMENTS
    mov     ax, gs
    stosd
    mov     ax, fs
    stosd
    mov     ax, es
    stosd
    mov     ax, ds
    stosd

    ; CONTEXT_INTEGER
    mov     eax, [esp+4+32-32]      ; edi
    stosd
    xchg    eax, esi
    stosd
    xchg    eax, ebx
    stosd
    xchg    eax, edx
    stosd
    mov     eax, [esp+4+32-8]       ; ecx
    stosd
    mov     eax, [esp+4+32-4]       ; eax
    stosd

    ; CONTEXT_CONTROL
    xchg    eax, ebp                ; ebp restored by POPAD
    stosd
    mov     eax, [esp+4+32]         ; return address
    sub     eax, 5                  ; skip CALL instruction -> call site.
    stosd
    xor     eax, eax
    mov     ax, cs
    stosd
    pop     eax                     ; eflags
    stosd
    lea     eax, [esp+32+4+4]       ; esp
    stosd
    xor     eax, eax
    mov     ax, ss
    stosd

    ; ExtendedRegisters
    xor     ecx, ecx
    mov     cl, 512/4
    rep     stosd

    popad
    ret


;-------------------------------------------------------------------------------
; init
;-------------------------------------------------------------------------------

; extern "C" bool __cdecl ia32_asm_init()
global sym(ia32_asm_init)
sym(ia32_asm_init):
    push    ebx

    ; check if CPUID is supported
    pushfd
    or      byte [esp+2], 32
    popfd
    pushfd
    pop     eax
    xor     edx, edx
    shr     eax, 22                 ; was the ID bit (21) settable?
    jnc     .no_cpuid

    ; determine the max supported CPUID function
    xor     eax, eax
    cpuid
    inc     eax                     ; (see max_func decl)
    mov     [max_func], eax
    mov     eax, 0x80000000
    cpuid
    inc     eax                     ; (see max_func decl)
    mov     [max_ext_func], eax
.no_cpuid:
    pop     ebx
    ret


;-------------------------------------------------------------------------------
; Color conversion (SSE)
;-------------------------------------------------------------------------------

; extern "C" u32 sse_ConvertRGBColorTo4ub(const RGBColor& color)

[section .data]
align 16
zero:           dd  0.0
twofivefive:    dd  255.0
__SECT__

align 16
global sym(sse_ConvertRGBColorTo4ub)
sym(sse_ConvertRGBColorTo4ub):
    mov     eax, [esp+4]

    ; xmm0, 1, 2 = b, g, r (RGBColor stores r at offset 0)
    movss   xmm4, [zero]
    movss   xmm0, [eax+8]
    movss   xmm1, [eax+4]
    movss   xmm2, [eax]
    movss   xmm5, [twofivefive]

    ; C = min(255, 255*max(C, 0))  ( == clamp(255*C, 0, 255) )
    maxss   xmm0, xmm4
    maxss   xmm1, xmm4
    maxss   xmm2, xmm4
    mulss   xmm0, xmm5
    mulss   xmm1, xmm5
    mulss   xmm2, xmm5
    minss   xmm0, xmm5
    minss   xmm1, xmm5
    minss   xmm2, xmm5

    ; convert to integer and combine the channels using bit logic
    cvtss2si    eax, xmm0
    cvtss2si    ecx, xmm1
    cvtss2si    edx, xmm2
    shl     eax, 16
    shl     ecx, 8
    or      eax, 0xff000000
    or      edx, ecx
    or      eax, edx
    ret
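; worked example (assuming the default MXCSR round-to-nearest mode):
; for color = { r=1.0f, g=0.5f, b=0.0f }, the clamped/scaled values are 255,
; 127.5 and 0; cvtss2si rounds 127.5 to 128, so the function returns
; 0xFF0080FF, i.e. the bytes R,G,B,A = FF,80,00,FF in memory order.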