diff --git a/source/lib/sysdep/ia32.asm b/source/lib/sysdep/ia32.asm
index 1a14a5ed44..604cbd7ce5 100644
--- a/source/lib/sysdep/ia32.asm
+++ b/source/lib/sysdep/ia32.asm
@@ -1,19 +1,44 @@
 section .text use32
 
-CACHEBLOCK equ 128
-BP_MIN_THRESHOLD_64 equ 192*1024
-MOVNTQ_MIN_THRESHOLD_64 equ 64*1024
+;-------------------------------------------------------------------------------
+; fast general memcpy
+;-------------------------------------------------------------------------------
+
+; optimized for Athlon XP: 7.3% faster (cumulative) than VC7.1's memcpy over
+; all 1..64 byte transfer lengths and misalignments. approaches maximum
+; mem bandwidth (2000 MiB/s) for transfers >= 192KiB!
+; Pentium III performance: about 3% faster in above small buffer benchmark.
+;
+; *requires* (and does not verify the presence of) SSE instructions:
+; prefetchnta and movntq. therefore, a P3+ or Athlon XP is required.
+; rationale: older processors are too slow anyway and we don't bother.
+
+; if memcpy size is greater than this,
+; .. it's too big for L1. use non-temporal instructions.
+UC_THRESHOLD equ 64*1024
+; .. it also blows L2. pull chunks into L1 ("block prefetch").
+BP_THRESHOLD equ 192*1024
+
+; maximum that can be copied by IC_MOVSD.
+; if you change this, be sure to expand the movs* table(s)!
+IC_SIZE equ 67
+
+; size of one block prefetch chunk.
+; if you change this, make sure "push byte BP_SIZE/128" doesn't overflow!
+BP_SIZE equ 8*1024
 
-
-%macro MC_UNROLLED_MOVSD 0
-    and ebx, 63
-    mov edx, ebx
-    shr edx, 2        ; dword count
-    neg edx
-    add edx, %%movsd_table_end
-    jmp edx
-
+; > ecx = size (<= IC_SIZE)
+; x eax, ecx
+;
+; determined to be fastest approach by testing. a movsd table followed by
+; rep movsb is a bit smaller but 6.9% slower; everything else is much worse.
+%macro IC_MOVSD 0
+    mov eax, ecx
+    shr ecx, 2        ; dword count
+    neg ecx
+    add ecx, %%movsd_table_end
+    jmp ecx
 align 8
     movsd
     movsd
@@ -33,12 +58,10 @@ align 8
     movsd
 %%movsd_table_end:
 
-    mov eax, ebx
-    and eax, 3
-    neg eax
-    add eax, %%movsb_table_end
-    jmp eax
-
+    and eax, 3
+    neg eax
+    add eax, %%movsb_table_end
+    jmp eax
     movsb
     movsb
     movsb
@@ -46,10 +69,12 @@
 %endm
 
-%macro MC_ALIGN 0
+; align destination address to multiple of 8.
+; not done for small transfers because it doesn't help IC_MOVSD.
+%macro IC_ALIGN 0
     mov eax, 8
     sub eax, edi
-    and eax, 7
+    and eax, byte 7
     cmp eax, ecx
     cmova eax, ecx
     sub ecx, eax
@@ -69,9 +94,14 @@ align 4
 %endm
 
-%macro MC_MOVQ 0
+; > ecx = size (> 0)
+; x edx
+%macro IC_MOVQ 0
 align 16
-%%1:
+    mov edx, 64
+%%loop:
+    cmp ecx, edx
+    jb %%done
     prefetchnta [esi + (200*64/34+192)]
     movq mm0, [esi+0]
     movq mm1, [esi+8]
     movq [edi+0], mm0
     movq [edi+8], mm1
     movq mm2, [esi+16]
     movq mm3, [esi+24]
     movq [edi+16], mm2
     movq [edi+24], mm3
     movq mm0, [esi+32]
     movq mm1, [esi+40]
     movq [edi+32], mm0
     movq [edi+40], mm1
     movq mm2, [esi+48]
@@ -89,28 +119,38 @@ align 16
     movq mm3, [esi+56]
     movq [edi+48], mm2
     movq [edi+56], mm3
-    add esi, 64
-    add edi, 64
-    dec ecx
-    jnz %%1
+    add esi, edx
+    add edi, edx
+    sub ecx, edx
+    jmp %%loop
+%%done:
 %endm
 
-; we have >= 8kb. until no more 8kb blocks
-%macro MC_BP_MOVNTQ 0
+; > ecx = size (> 8KiB)
+; x eax, edx
+;
+; somewhat optimized for size (futile attempt to avoid near jump)
+%macro UC_BP_MOVNTQ 0
 %%prefetch_and_copy_chunk:
-    mov eax, CACHEBLOCK / 2     ; block prefetch loop, unrolled 2X
-    add esi, CACHEBLOCK * 64    ; move to the top of the block
-align 16
-    ; touch each cache line in reverse order (prevents HW prefetch)
+
+    ; touch each cache line within chunk in reverse order (prevents HW prefetch)
+    push byte BP_SIZE/128       ; # iterations
+    pop eax
+    add esi, BP_SIZE
+align 8
 %%prefetch_chunk:
     mov edx, [esi-64]
     mov edx, [esi-128]
     sub esi, 128
     dec eax
     jnz %%prefetch_chunk
-    mov eax, CACHEBLOCK         ; now that it's in cache, do the copy
-align 16
+
+    ; copy 64 byte blocks
+    mov eax, BP_SIZE/64         ; # iterations (> signed 8 bit)
+    push byte 64
+    pop edx
+align 8
 %%copy_block:
     movq mm0, [esi+ 0]
     movq mm1, [esi+ 8]
@@ -120,7 +160,7 @@ align 16
     movq mm5, [esi+40]
     movq mm6, [esi+48]
     movq mm7, [esi+56]
-    add esi, 64
+    add esi, edx
     movntq [edi+ 0], mm0
     movntq [edi+ 8], mm1
     movntq [edi+16], mm2
@@ -129,24 +169,27 @@
     movntq [edi+40], mm5
     movntq [edi+48], mm6
     movntq [edi+56], mm7
-    add edi, 64
+    add edi, edx
     dec eax
     jnz %%copy_block
-    sub ecx, CACHEBLOCK         ; update the 64-byte block count
-    cmp ecx, CACHEBLOCK
-    jl %%prefetch_and_copy_chunk
+
+    sub ecx, BP_SIZE
+    cmp ecx, BP_SIZE
+    jae %%prefetch_and_copy_chunk
 %endm
 
-; we have >= 64, 64B BLOCKS
-%macro MC_MOVNTQ 0
+; > ecx = size (> 64)
+; x
+%macro UC_MOVNTQ 0
+    mov edx, 64
 align 16
 %%1:
     prefetchnta [esi + (200*64/34+192)]
     movq mm0,[esi+0]
-    add edi,64
+    add edi, edx
     movq mm1,[esi+8]
-    add esi,64
+    add esi, edx
     movq mm2,[esi-48]
     movntq [edi-64], mm0
     movq mm0,[esi-40]
@@ -160,75 +203,67 @@ align 16
     movq mm1,[esi-8]
     movntq [edi-24], mm2
     movntq [edi-16], mm0
-    dec ecx
+    sub ecx, edx
     movntq [edi-8], mm1
-    jnz %%1
+    cmp ecx, edx
+    jae %%1
 %endm
-
-
-
-
-
-
 ; void __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
-    mov ecx, [esp+4+8]    ; nbytes
-    mov esi, [esp+4+4]    ; src
-    mov edi, [esp+4+0]    ; dst
+global _ia32_memcpy
+_ia32_memcpy:
+    mov ecx, [esp+4+8]    ; nbytes
+    mov esi, [esp+4+4]    ; src
+    mov edi, [esp+4+0]    ; dst
 
-    MC_ALIGN
+    cmp ecx, byte IC_SIZE
+    ja .choose_large_method
 
-    mov ebx, ecx
-    shr ecx, 6        ; # blocks
-
-    mov eax, _bp
-    cmp ecx, BP_MIN_THRESHOLD_64
-    mov edx, _movntq
-    cmovb eax, edx
-    cmp ecx, MOVNTQ_MIN_THRESHOLD_64
-    mov edx, _mmx
-    cmovb eax, edx
-    cmp ecx, 64
-    jbe tiny
-    jmp eax
-
-tiny:
-    MC_UNROLLED_MOVSD
+.ic_movsd:
+    IC_MOVSD
     ret
 
-_mmx:
-    MC_MOVQ
-    emms
-    jmp tiny
+.choose_large_method:
+    IC_ALIGN
+    cmp ecx, UC_THRESHOLD
+    jb near .ic_movq
+    cmp ecx, BP_THRESHOLD
+    jae .uc_bp_movntq
 
-_bp:
-    MC_BP_MOVNTQ
+.uc_movntq:
+    UC_MOVNTQ
     sfence
     emms
-    ; protect routine below
-    cmp ecx, 0
-    jz tiny
-
-_movntq:
+    jmp .ic_movsd
+.uc_bp_movntq:
+    UC_BP_MOVNTQ
     sfence
+    ; fall through
+
+.ic_movq:
+    IC_MOVQ
     emms
-    jmp tiny
-
-
-
+    jmp .ic_movsd
+;-------------------------------------------------------------------------------
+; CPUID support
+;-------------------------------------------------------------------------------
 [section .data use32]
 
 cpuid_available    dd    -1
 
-; max supported CPUID functions. initialized to
-max_func        dd    0x7FFFFFFF
-max_ext_func    dd    0xFFFFFFFF
+[section .bss use32]
+
+; no init needed - cpuid_available triggers init
+max_func        resd    1
+max_ext_func    resd    1
+
 __SECT__
+
 ; extern "C" bool __cdecl ia32_cpuid(u32 func, u32* regs)
 global _ia32_cpuid
 _ia32_cpuid:
@@ -297,15 +332,9 @@ _ia32_cpuid:
     jmp .retry
 
-
-
-
-
-
-
-
-
-
+;-------------------------------------------------------------------------------
+; misc
+;-------------------------------------------------------------------------------
 
 ; extern "C" uint __cdecl ia32_control87(uint new_cw, uint mask)
 global _ia32_control87
diff --git a/source/lib/sysdep/ia32.cpp b/source/lib/sysdep/ia32.cpp
index e251b38692..647155fbba 100755
--- a/source/lib/sysdep/ia32.cpp
+++ b/source/lib/sysdep/ia32.cpp
@@ -100,12 +100,6 @@ void ia32_debug_break()
 }
 
-void ia32_memcpy(void* dst, const void* src, size_t nbytes)
-{
-    memcpy(dst, src, nbytes);
-}
-
-
 //-----------------------------------------------------------------------------
 // support code for lock-free primitives
 //-----------------------------------------------------------------------------
diff --git a/source/lib/sysdep/ia32.h b/source/lib/sysdep/ia32.h
index 8dd2c814c5..8e126cc0c8 100755
--- a/source/lib/sysdep/ia32.h
+++ b/source/lib/sysdep/ia32.h
@@ -93,6 +93,10 @@ enum IA32Regs
     ECX,
     EDX
 };
+
+// try to call the specified CPUID sub-function. returns true on success or
+// false on failure (i.e. CPUID or the specific function not supported).
+// returns eax, ebx, ecx, edx registers in above order.
 extern bool ia32_cpuid(u32 func, u32* regs);
 
 #ifdef __cplusplus
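
Note (not part of the patch): for readers who prefer C to assembly, the size-tier
dispatch performed by the new _ia32_memcpy can be pictured roughly as the sketch
below. The constants mirror the equ values in the patch (IC_SIZE = 67,
UC_THRESHOLD = 64KiB, BP_THRESHOLD = 192KiB); the helper functions and the name
memcpy_dispatch_model are hypothetical stand-ins for the IC_MOVSD / IC_MOVQ /
UC_MOVNTQ / UC_BP_MOVNTQ macros and do not reproduce their MMX/SSE instruction mix.

/* Illustrative only -- models the tier selection, not the SIMD copy loops. */
#include <stddef.h>
#include <string.h>

enum
{
    IC_SIZE      = 67,         /* max size handled entirely by the movs* tables   */
    UC_THRESHOLD = 64*1024,    /* larger than this blows L1 -> non-temporal stores */
    BP_THRESHOLD = 192*1024    /* larger than this blows L2 -> block prefetch      */
};

/* stand-in for IC_MOVSD: small copies and the tails of the large paths */
static void copy_small(void* dst, const void* src, size_t nbytes)
{
    memcpy(dst, src, nbytes);
}

/* stand-in for the 64-byte block copiers (IC_MOVQ / UC_MOVNTQ / UC_BP_MOVNTQ);
 * returns how many bytes were handled (whole 64-byte blocks only). */
static size_t copy_blocks(void* dst, const void* src, size_t nbytes)
{
    const size_t done = nbytes & ~(size_t)63;
    memcpy(dst, src, done);
    return done;
}

void memcpy_dispatch_model(void* dst, const void* src, size_t nbytes)
{
    if(nbytes <= IC_SIZE)
    {
        copy_small(dst, src, nbytes);    /* .ic_movsd */
        return;
    }

    /* .choose_large_method (IC_ALIGN would align the destination first) */
    size_t done;
    if(nbytes < UC_THRESHOLD)
        done = copy_blocks(dst, src, nbytes);    /* .ic_movq: still cache-friendly, plain movq copy   */
    else if(nbytes < BP_THRESHOLD)
        done = copy_blocks(dst, src, nbytes);    /* .uc_movntq: too big for L1, movntq bypasses cache */
    else
        done = copy_blocks(dst, src, nbytes);    /* .uc_bp_movntq: prefetch 8KiB chunks, then movntq  */

    copy_small((char*)dst + done, (const char*)src + done, nbytes - done);
}

In the actual assembly the tail handling is slightly richer: the block-prefetch
path falls through to IC_MOVQ for the sub-8KiB remainder, and every large path
finishes the last < 64 bytes via the IC_MOVSD jump tables before returning.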