- self test: rename stuff to SELF_TEST*; add provision for delayed all-at-once self tests (allows for init before the test and makes measuring elapsed time easier)

- config: add CONFIG_TRACE - display_error_impl->sys_display_error - cleaned up sysdep; add sys_ prefix everywhere and document everything - add to vfs_load dox - cursor: sys_cursor_load -> sys_cursor_create. sysdep code is no longer dependent on tex; instead of calling tex_load, the caller passes a BGRA texture in. memcpy: huge kick in the pants for accompanying paper; now even faster. - on_each_cpu -> sys_on_each_cpu (removed manager function also) - wsdl: explain PeekMessage CPU usage issue This was SVN commit r3203.
2005-12-07 03:38:39 +00:00 · 2005-12-07 03:38:39 +00:00 · e2f25f4598
commit e2f25f4598
parent a5d1968a8c
27 changed files with 640 additions and 419 deletions
--- a/source/lib/allocators.cpp
+++ b/source/lib/allocators.cpp
@ -549,7 +549,7 @@ static void self_test()
 	test_matrix();
 }

-RUN_SELF_TEST;
+SELF_TEST_RUN;

 }	// namespace test
 #endif	// #if SELF_TEST_ENABLED
--- a/source/lib/config.h
+++ b/source/lib/config.h
@ -46,6 +46,13 @@
 # define CONFIG_PARANOIA 0
 #endif

+// enable trace output for low-level code - various functions will
+// debug_printf when they are entered/exited. note that the appropriate
+// TRACEn tags must be debug_filter_add-ed for this to have any effect.
+#ifndef CONFIG_TRACE
+# define CONFIG_TRACE 0
+#endif
+
 // try to prevent any exceptions from being thrown - even by the C++
 // standard library. useful only for performance tests.
 #ifndef CONFIG_DISABLE_EXCEPTIONS
--- a/source/lib/debug.cpp
+++ b/source/lib/debug.cpp
@ -492,7 +492,7 @@ ErrorReaction display_error(const wchar_t* description, int flags,
 		text = L"(insufficient memory to display error message)";

 	debug_write_crashlog(text);
-	ErrorReaction er = display_error_impl(text, flags);
+	ErrorReaction er = sys_display_error(text, flags);

 	// note: debug_break-ing here to make sure the app doesn't continue
 	// running is no longer necessary. display_error now determines our
--- a/source/lib/debug.h
+++ b/source/lib/debug.h
@ -175,7 +175,7 @@ extern enum ErrorReaction debug_warn_err(int err, const char* file, int line,


 //-----------------------------------------------------------------------------
-// logging
+// output
 //-----------------------------------------------------------------------------

 // write a formatted string to the debug channel, subject to filtering
@ -185,6 +185,14 @@ extern void debug_printf(const char* fmt, ...);
 extern void debug_wprintf(const wchar_t* fmt, ...);


+extern ErrorReaction display_error(const wchar_t* description, int flags,
+	uint skip, void* context, const char* file, int line);
+
+// convenience version, in case the advanced parameters aren't needed.
+// done this way instead of with default values so that it also works in C.
+#define DISPLAY_ERROR(text) display_error(text, 0, 0, 0, __FILE__, __LINE__)
+
+
 //
 // filtering
 //
--- a/source/lib/lib.cpp
+++ b/source/lib/lib.cpp
@ -517,7 +517,7 @@ static void self_test()
 	test_log2();
 }

-RUN_SELF_TEST;
+SELF_TEST_RUN;

 }	// namespace test
 #endif	// #if SELF_TEST_ENABLED
--- a/source/lib/lib_errors.cpp
+++ b/source/lib/lib_errors.cpp
@ -27,6 +27,11 @@ static const char* lib_error_description(int err)
 }


+// generate textual description of an error code.
+// stores up to <max_chars> in the given buffer.
+// <err> can be one of the above error codes, POSIX ENOENT etc., or
+// an OS-specific error. if unknown, the string will be something like
+// "Unknown error (65536, 0x10000)".
 void error_description_r(int err, char* buf, size_t max_chars)
 {
 	// lib error
@ -69,11 +74,3 @@ void error_description_r(int err, char* buf, size_t max_chars)
 	if(!have_output)
 		snprintf(buf, max_chars, "Unknown error (%d, 0x%X)", err, err);
 }
-
-
-const char* error_description(int err)
-{
-	static char buf[200];
-	error_description_r(err, buf, ARRAY_SIZE(buf));
-	return buf;
-}
--- a/source/lib/lib_errors.h
+++ b/source/lib/lib_errors.h
@ -65,10 +65,15 @@ ERR(-100704, ERR_SHDR_NO_PROGRAM, "Invalid shader program reference")
 #ifndef ERRORS_H__
 #define ERRORS_H__

+// limits on the errors defined above (used by error_description_r)
 #define ERR_MIN 100000
 #define ERR_MAX 110000

-extern const char* error_description(int err);
+// generate textual description of an error code.
+// stores up to <max_chars> in the given buffer.
+// <err> can be one of the above error codes, POSIX ENOENT etc., or
+// an OS-specific error. if unknown, the string will be something like
+// "Unknown error (65536, 0x10000)".
 extern void error_description_r(int err, char* buf, size_t max_chars);

 #endif	// #ifndef ERRORS_H__
--- a/source/lib/lockfree.cpp
+++ b/source/lib/lockfree.cpp
@ -957,7 +957,7 @@ static void self_test()
 	multithreaded_torture_test();
 }

-RUN_SELF_TEST;
+SELF_TEST_RUN;

 }	// namespace test
 #endif	// #if SELF_TEST_ENABLED
--- a/source/lib/mmgr.cpp
+++ b/source/lib/mmgr.cpp
@ -1271,7 +1271,7 @@ char* mmgr_getcwd_dbg(char* buf, size_t buf_size, const char* file, int line, co
 //

 static void* new_common(size_t size, AllocType type,
-						const char* file, int line, const char* func)
+	const char* file, int line, const char* func)
 {
 	const char* allocator = types[type];

--- a/source/lib/res/file/file.cpp
+++ b/source/lib/res/file/file.cpp
@ -255,7 +255,7 @@ int file_set_root_dir(const char* argv0, const char* rel_path)
 	// get full path to executable
 	char n_path[PATH_MAX];
 	// .. first try safe, but system-dependent version
-	if(get_executable_name(n_path, PATH_MAX) < 0)
+	if(sys_get_executable_name(n_path, PATH_MAX) < 0)
 	{
 		// .. failed; use argv[0]
 		if(!realpath(argv0, n_path))
--- a/source/lib/res/file/vfs.cpp
+++ b/source/lib/res/file/vfs.cpp
@ -537,14 +537,16 @@ static ssize_t vfs_timed_io(const Handle hf, const size_t size, void** p, FileIO
 }


-// load the entire file <fn> into memory; return a handle to the memory
-// and the buffer address/size. output parameters are zeroed on failure.
-// in addition to the regular file cache, the entire buffer is kept in memory
-// if flags & FILE_CACHE.
+// load the entire file <fn> into memory.
+// returns a memory handle to the file's contents or a negative error code.
+// p and size are filled with address/size of buffer (0 on failure).
+// flags influences IO mode and is typically 0.
+//   in addition to the regular file cache, the entire buffer is
+//   kept in memory if flags & FILE_CACHE.
+// when the file contents are no longer needed, you can mem_free_h the
+// Handle, or mem_free(p).
 //
-// on failure, a debug_warn is generated and a negative error code returned.
-//
-// note: we need the Handle return value for Tex.hm - the data pointer
+// rationale: we need the Handle return value for Tex.hm - the data pointer
 // must be protected against being accidentally free-d in that case.
 Handle vfs_load(const char* v_fn, void*& p, size_t& size, uint flags /* default 0 */)
 {
--- a/source/lib/res/file/vfs.h
+++ b/source/lib/res/file/vfs.h
@ -383,10 +383,14 @@ extern ssize_t vfs_io(Handle hf, size_t size, void** p, FileIOCB cb = 0, uintptr

 // convenience functions that replace vfs_open / vfs_io / vfs_close:

-// load the entire file <fn> into memory; return a memory handle to the
-// buffer and its address/size. output parameters are zeroed on failure.
-// in addition to the regular file cache, the entire buffer is kept in memory
-// if flags & FILE_CACHE.
+// load the entire file <fn> into memory.
+// returns a memory handle to the file's contents or a negative error code.
+// p and size are filled with address/size of buffer (0 on failure).
+// flags influences IO mode and is typically 0.
+//   in addition to the regular file cache, the entire buffer is
+//   kept in memory if flags & FILE_CACHE.
+// when the file contents are no longer needed, you can mem_free_h the
+// Handle, or mem_free(p).
 extern Handle vfs_load(const char* fn, void*& p, size_t& size, uint flags = 0);

 extern ssize_t vfs_store(const char* fn, void* p, size_t size, uint flags = 0);
--- a/source/lib/res/graphics/cursor.cpp
+++ b/source/lib/res/graphics/cursor.cpp
@ -18,6 +18,42 @@
 #include "ogl_tex.h"
 #include "cursor.h"

+
+static void* load_sys_cursor(const char* filename, int hx, int hy)
+{
+#if !ALLOW_SYS_CURSOR
+	return 0;
+#else
+	Tex t;
+	if(tex_load(filename, &t) < 0)
+		return 0;
+
+	{
+	void* sys_cursor = 0;	// return value
+
+	// convert to required BGRA format.
+	const uint flags = (t.flags | TEX_BGR) & ~TEX_DXT;
+	if(tex_transform_to(&t, flags) < 0)
+		goto fail;
+	void* bgra_img = tex_get_data(&t);
+	if(!bgra_img)
+		goto fail;
+
+	if(sys_cursor_create(t.w, t.h, bgra_img, hx, hy, &sys_cursor) < 0)
+		goto fail;
+
+	(void)tex_free(&t);
+	return sys_cursor;
+	}
+
+fail:
+	debug_warn("failed");
+	(void)tex_free(&t);
+	return 0;
+#endif
+}
+
+
 // no init is necessary because this is stored in struct Cursor, which
 // is 0-initialized by h_mgr.
 class GLCursor
@ -124,12 +160,8 @@ static int Cursor_reload(Cursor* c, const char* name, Handle)

 	// load actual cursor
 	snprintf(filename, ARRAY_SIZE(filename), "art/textures/cursors/%s.dds", name);
-	// .. system cursor (2d, hardware accelerated)
-#if ALLOW_SYS_CURSOR
-	WARN_ERR(sys_cursor_load(filename, hotspotx, hotspoty, &c->sys_cursor));
-#else
-	c->sys_cursor = 0;
-#endif
+	// .. try loading as system cursor (2d, hardware accelerated)
+	c->sys_cursor = load_sys_cursor(filename, hotspotx, hotspoty);
 	// .. fall back to GLCursor (system cursor code is disabled or failed)
 	if(!c->sys_cursor)
 		RETURN_ERR(c->gl_cursor.create(filename, hotspotx, hotspoty));
--- a/source/lib/self_test.cpp
+++ b/source/lib/self_test.cpp
@ -19,17 +19,49 @@
 #include "precompiled.h"

 #include "self_test.h"
+#include "timer.h"

 // checked by debug_assert_failed; disables asserts if true (see above).
-// set/cleared by run_self_test.
+// set/cleared by self_test_run.
 bool self_test_active = false;

 // trampoline that sets self_test_active and returns a dummy value;
-// used by RUN_SELF_TEST.
-int run_self_test(void(*test_func)())
+// used by SELF_TEST_RUN.
+int self_test_run(void(*func)())
 {
 	self_test_active = true;
-	test_func();
+	func();
 	self_test_active = false;
-	return 0;
+	return 0;	// assigned to dummy at file scope
 }
+
+
+static const SelfTestRecord* registered_tests;
+
+int self_test_register(SelfTestRecord* r)
+{
+	// SELF_TEST_REGISTER has already initialized r->func.
+	r->next = registered_tests;
+	registered_tests = r;
+	return 0;	// assigned to dummy at file scope
+}
+
+
+void self_test_run_all()
+{
+	debug_printf("SELF TESTS:\n");
+	const double t0 = get_time();
+
+	// someone somewhere may want to run self-tests twice (e.g. to help
+	// track down memory corruption), so don't destroy the list while
+	// iterating over it.
+	const SelfTestRecord* r = registered_tests;
+	while(r)
+	{
+		self_test_run(r->func);
+		r = r->next;
+	}
+
+	const double dt = get_time() - t0;
+	debug_printf("-- done (elapsed time %.0f ms)\n", dt*1e3);
+}
--- a/source/lib/self_test.h
+++ b/source/lib/self_test.h
@ -42,8 +42,9 @@ What makes a good self-test?
  bad inputs ("does it reject those?"), and successes ("did it have the
  expected result?").
 - Tests should be non-intrusive (only bother user if something fails) and
-  very quick. This is because we run them automatically at startup,
-  which solves the common problem of making sure they actually run.
+  very quick. This is because they are executed every program run - which
+  is a good thing because it solves the common problem of forgetting to
+  run them after a change.

  If the test is unavoidably slow or annoying (example: wdbg_sym's
  stack trace), then best to disable it by default; see below for how.
@ -74,7 +75,7 @@ static void self_test()
 	// further test groups..
 }

-RUN_SELF_TEST;									// (4)
+SELF_TEST_RUN;									// (4)

 }	// namespace test
 #endif	// #if SELF_TEST_ENABLED
@ -117,21 +118,40 @@ For further details, see below.
 // and this is the only error reporter guaranteed to work.
 //
 // note: could also stringize condition and display that, but it'd require
-// macro magic (stringize+prepend L) and we already get file+line.
+// macro magic (stringize+prepend L) and we already display file+line.
 #define TEST(condition) STMT(\
 	if(!(condition))\
 		DISPLAY_ERROR(L"Self-test failed");\
 )


-// your source file should contain a void function "self_test" that
+// your source file should contain a function: void self_test(void) that
 // performs all tests or calls out to individual test functions.
 // this macro calls it at static init time and takes care of setting
 // self_test_active (see above).
 //
 // rationale: since compiler optimizations may mess with the dummy variable,
 // best to put this in a macro so we won't have to change each occurrence.
-#define RUN_SELF_TEST static int dummy = run_self_test(self_test)
+#define SELF_TEST_RUN\
+	static int dummy = self_test_run(self_test)
+
+// calling at static init time may not always be desirable - some
+// self-tests may require initialization beforehand. this mechanism allows
+// registering self tests automatically, which are then all run when you
+// call self_test_run_all.
+#define SELF_TEST_REGISTER\
+	static SelfTestRecord self_test_record = { self_test, 0 };\
+	static int dummy = self_test_register(&self_test_record)
+
+struct SelfTestRecord
+{
+	void(*func)();
+	const SelfTestRecord* next;
+};
+
+// call all self-tests registered thus far. rationale: see above.
+// also displays a banner+elapsed time via debug_printf.
+extern void self_test_run_all();


 //
@ -139,8 +159,10 @@ For further details, see below.
 //

 // trampoline that sets self_test_active and returns a dummy value;
-// used by RUN_SELF_TEST.
-extern int run_self_test(void(*test_func)());
+// used by SELF_TEST_RUN.
+extern int self_test_run(void(*func)());
+
+extern int self_test_register(SelfTestRecord* r);

 // checked by debug_assert_failed; disables asserts if true (see above).
 // set/cleared by run_self_test.
--- a/source/lib/string_s.cpp
+++ b/source/lib/string_s.cpp
@ -391,7 +391,7 @@ static void self_test()
 	test_concatenate();
 }

-RUN_SELF_TEST;
+SELF_TEST_RUN;

 #endif	// #if SELF_TEST_ENABLED

--- a/source/lib/sysdep/ia32.asm
+++ b/source/lib/sysdep/ia32.asm
@ -16,267 +16,365 @@ section .text use32
 ; fast general memcpy
 ;-------------------------------------------------------------------------------

-; optimized for Athlon XP: 7.3% faster (cumulative) than VC7.1's memcpy over
-; all 1..64 byte transfer lengths and misalignments. approaches maximum
-; mem bandwidth (2000 MiB/s) for transfers >= 192KiB!
-; Pentium III performance: about 3% faster in above small buffer benchmark.
-;
-; disables specialized large transfer (> 64KiB) implementations if SSE
-; isn't available; we do assume MMX support, though (quite safe).
+; drop-in replacement for libc memcpy(). only requires CPU support for
+; MMX (by now universal). highly optimized for Athlon and Pentium III
+; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
+; for details, see accompanying article.

-; if memcpy size is greater than this,
+; if transfer size is at least this much,
 ; .. it's too big for L1. use non-temporal instructions.
 UC_THRESHOLD	equ	64*1024
 ; .. it also blows L2. pull chunks into L1 ("block prefetch").
 BP_THRESHOLD	equ	192*1024

-; maximum that can be copied by IC_MOVSD.
-; if you change this, be sure to expand the movs* table(s)!
-IC_SIZE		equ	67
+; maximum that can be copied by IC_TINY.
+IC_TINY_MAX		equ	63

 ; size of one block prefetch chunk.
-; if you change this, make sure "push byte BP_SIZE/128" doesn't overflow!
 BP_SIZE		equ	8*1024


-; > ecx = size (<= IC_SIZE)
-; x eax, ecx
-;
-; determined to be fastest approach by testing. a movsd table followed by
-; rep movsb is a bit smaller but 6.9% slower; everything else is much worse.
-%macro IC_MOVSD 0
-	mov		eax, ecx
-	shr		ecx, 2						; dword count
-	neg		ecx
-	add		ecx, %%movsd_table_end
-	jmp		ecx
-align 8
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-%%movsd_table_end:
+;------------------------------------------------------------------------------

-	and		eax, 3
-	neg		eax
-	add		eax, %%movsb_table_end
-	jmp		eax
-	movsb
-	movsb
-	movsb
-%%movsb_table_end:
+; [p3] replicating this instead of jumping to it from tailN
+; saves 1 clock and costs (7-2)*2 bytes code.
+%macro EPILOG 0
+	pop		esi
+	pop		edi
+	mov		eax, [esp+4]		; return dst
+	ret
 %endm

-
-; align destination address to multiple of 8.
-; not done for small transfers because it doesn't help IC_MOVSD.
-%macro IC_ALIGN 0
-	mov		eax, 8
-	sub		eax, edi
-	and		eax, byte 7					; eax = # misaligned bytes
-	sub		ecx, eax					; reduce copy count
-	neg		eax
-	add		eax, %%align_table_end
-	jmp		eax
+align 64
+tail1:
+	mov		al, [esi+ecx*4]
+	mov		[edi+ecx*4], al
 align 4
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-%%align_table_end:
+tail0:
+	EPILOG
+
+align 8
+tail3:
+	; [p3] 2 reads followed by 2 writes is better than
+	; R/W interleaved and RRR/WWW
+	mov		al, [esi+ecx*4+2]
+	mov		[edi+ecx*4+2], al
+; already aligned to 8 due to above code
+tail2:
+	mov		al, [esi+ecx*4]
+	mov		dl, [esi+ecx*4+1]
+	mov		[edi+ecx*4], al
+	mov		[edi+ecx*4+1], dl
+	EPILOG
+
+[section .data]
+align 16
+tail_table	dd tail0, tail1, tail2, tail3
+__SECT__
+
+; 15x unrolled copy loop - transfers DWORDs backwards.
+; indexed via table of 8-bit offsets.
+; rationale:
+; - [p3] backwards vs. forwards makes no difference.
+; - MOV is faster than MOVSD.
+; - index table is needed because calculating end-6*i is slower than
+;   a LUT and we wouldn't want to expand entries to 8 bytes
+;   (that'd increase code footprint by 30 bytes)
+; - a byte index accessed via MOVZX is better due to less dcache usage.
+; - only unrolling 8x and 'reentering' the loop is possible but
+;   slower due to fiddling with esi/ecx.
+align 64
+unrolled_copy_code_start:
+%assign i 15
+%rep 14	; 15 entries, 1 base case handled below
+uc_ %+ i:
+    mov     eax, [esi+i*4-4]
+    mov     [edi+i*4-4], eax
+%assign i i-1
+%endrep
+; base case: no displacement needed; skip it so that code will
+; be aligned to 8 bytes after this.
+uc_1:
+    mov     eax, [esi]
+    mov     [edi], eax
+uc_0:
+	jmp		[tail_table+edx*4]
+
+[section .data]
+align 32
+unrolled_copy_index_table:
+%assign i 0
+%rep 16
+	db (uc_ %+ i) - unrolled_copy_code_start
+%assign i i+1
+%endrep
+__SECT__
+
+
+;------------------------------------------------------------------------------
+; tiny copy - handles all cases smaller than IC_MOVQ's 64 byte lower limit.
+; > edx = number of bytes (<= IC_TINY_MAX)
+; < does not return.
+; x eax, ecx, edx
+%macro IC_TINY 0
+	mov		ecx, edx
+	shr		ecx, 2
+	; calculating this address isn't possible due to skipping displacement on uc1;
+	; even so, it'd require calculating -6*ecx, which is slower than LUT.
+	movzx	eax, byte [unrolled_copy_index_table+ecx]
+	and		edx, byte 3
+	add		eax, unrolled_copy_code_start
+	jmp		eax
+	; never reached! the unrolled loop jumps into tailN, which
+	; then returns from the memcpy function.
 %endm


-; > ecx = size
-; x edx
+;------------------------------------------------------------------------------
+; align destination address to multiple of 8. important for large transfers,
+; but doesn't affect the tiny technique.
+; > esi, edi -> buffers (updated)
+; > ecx, edx = transfer size (updated)
+; x eax
+%macro IC_ALIGN 0
+	mov		eax, edi
+	and		eax, byte 7					; eax = # misaligned bytes
+	jz		already_aligned				; early out
+	lea		eax, [align_table_start+eax*2]
+	jmp		eax
+
+; [p3] this is no slower than a table of mov and much smaller/simpler
+align 8
+align_table_start:
+%rep 8
+	dec		ecx
+	movsb
+%endrep
+	mov		edx, ecx
+already_aligned:
+%endm
+
+
+;------------------------------------------------------------------------------
+; MMX MOVQ technique. used for in-cache transfers of 64B..64KiB.
+; must run on all CPUs, i.e. cannot use the SSE prefetchnta instruction.
+; > ecx = -number_of_bytes (multiple of 64)
+; > esi, edi point to end of the buffer, i.e. &last_qword+8.
+; < ecx = 0
+; x
 %macro IC_MOVQ 0
+
+; see notes below. TODO: if simple addressing is better on Athlons as well, prevent this from happening in setup code when not doing large transfers
+	add		esi, ecx
+	add		edi, ecx
+
 align 16
-	mov		edx, 64
 %%loop:
-	cmp		ecx, edx
-	jb		%%done
-	prefetchnta	[esi + (200*64/34+192)]
-	movq	mm0, [esi+0]
+
+	; notes:
+	; - we can't use prefetch here - this codepath must support all CPUs.
+	;   [p3] that makes us 5..15% slower on 1KiB..4KiB transfers.
+	; - [p3] simple addressing without +ecx is 3.5% faster.
+	; - [p3] there's no difference between RR/WW/RR/WW and R..R/W..W
+	;   with simple addressing and no prefetch.
+	; - enough time elapses between first and third pair of reads that we
+	;   could reuse MM0. there is no performance gain either way and
+	;   differing displacements make code compression futile, so
+	;   we'll just use MM4..7 for clarity.
+	movq	mm0, [esi]
 	movq	mm1, [esi+8]
-	movq	[edi+0], mm0
+	movq	[edi], mm0
 	movq	[edi+8], mm1
 	movq	mm2, [esi+16]
 	movq	mm3, [esi+24]
 	movq	[edi+16], mm2
 	movq	[edi+24], mm3
-	movq	mm0, [esi+32]
-	movq	mm1, [esi+40]
-	movq	[edi+32], mm0
-	movq	[edi+40], mm1
-	movq	mm2, [esi+48]
-	movq	mm3, [esi+56]
-	movq	[edi+48], mm2
-	movq	[edi+56], mm3
-	add		esi, edx
-	add		edi, edx
-	sub		ecx, edx
-	jmp		%%loop
-%%done:
-%endm
-
-
-; > ecx = size (> 64)
-; x
-%macro UC_MOVNTQ 0
-	mov		edx, 64
-align 16
-%%1:
-	prefetchnta [esi + (200*64/34+192)]
-	movq	mm0,[esi+0]
-	add		edi, edx
-	movq	mm1,[esi+8]
-	add		esi, edx
-	movq	mm2,[esi-48]
-	movntq	[edi-64], mm0
-	movq	mm0,[esi-40]
-	movntq	[edi-56], mm1
-	movq	mm1,[esi-32]
-	movntq	[edi-48], mm2
-	movq	mm2,[esi-24]
-	movntq	[edi-40], mm0
-	movq	mm0,[esi-16]
-	movntq	[edi-32], mm1
-	movq	mm1,[esi-8]
-	movntq	[edi-24], mm2
-	movntq	[edi-16], mm0
-	sub		ecx, edx
-	movntq	[edi-8], mm1
-	cmp		ecx, edx
-	jae		%%1
-%endm
-
-
-; > ecx = size (> 8KiB)
-; x eax, edx
-;
-; somewhat optimized for size (futile attempt to avoid near jump)
-%macro UC_BP_MOVNTQ 0
-%%prefetch_and_copy_chunk:
-
-	; touch each cache line within chunk in reverse order (prevents HW prefetch)
-	push	byte BP_SIZE/128			; # iterations
-	pop		eax
-	add		esi, BP_SIZE
-align 8
-%%prefetch_chunk:
-	mov		edx, [esi-64]
-	mov		edx, [esi-128]
-	sub		esi, 128
-	dec		eax
-	jnz		%%prefetch_chunk
-
-	; copy 64 byte blocks
-	mov		eax, BP_SIZE/64				; # iterations (> signed 8 bit)
-	push	byte 64
-	pop		edx
-align 8
-%%copy_block:
-	movq	mm0, [esi+ 0]
-	movq	mm1, [esi+ 8]
-	movq	mm2, [esi+16]
-	movq	mm3, [esi+24]
 	movq	mm4, [esi+32]
 	movq	mm5, [esi+40]
+	movq	[edi+32], mm4
+	movq	[edi+40], mm5
 	movq	mm6, [esi+48]
 	movq	mm7, [esi+56]
-	add		esi, edx
-	movntq	[edi+ 0], mm0
-	movntq	[edi+ 8], mm1
-	movntq	[edi+16], mm2
-	movntq	[edi+24], mm3
-	movntq	[edi+32], mm4
-	movntq	[edi+40], mm5
-	movntq	[edi+48], mm6
-	movntq	[edi+56], mm7
-	add		edi, edx
-	dec		eax
-	jnz		%%copy_block
-
-	sub		ecx, BP_SIZE
-	cmp		ecx, BP_SIZE
-	jae		%%prefetch_and_copy_chunk
+	movq	[edi+48], mm6
+	movq	[edi+56], mm7
+	add		esi, byte 64
+	add		edi, byte 64
+	add		ecx, byte 64
+	jnz		%%loop
 %endm


-[section .bss]
+;------------------------------------------------------------------------------
+; SSE MOVNTQ technique. used for transfers that do not fit in L1,
+; i.e. 64KiB..192KiB. requires Pentium III or Athlon; caller checks for this.
+; > ecx = -number_of_bytes (multiple of 64)
+; > esi, edi point to end of the buffer, i.e. &last_qword+8.
+; < ecx = 0
+; x
+%macro UC_MOVNTQ 0

-; this is somewhat "clever". the 2 specialized transfer implementations
-; that use SSE are jumped to if transfer size is greater than a threshold.
-; we simply set the requested transfer size to 0 if the CPU doesn't
-; support SSE so that those are never reached (done by masking with this).
-sse_mask		resd	1
+align 16
+%%loop:
+	; notes:
+	; - the AMD optimization manual recommends prefetch distances according to
+	;   (200*BytesPerIter/ClocksPerIter+192), which comes out to ~560 here.
+	;   [p3] rounding down to 512 bytes makes for significant gains.
+	; - [p3] complex addressing with ecx is 1% faster than adding to esi/edi.
+	prefetchnta [esi+ecx+512]
+	movq	mm0, [esi+ecx]
+	movq	mm1, [esi+ecx+8]
+	movq	mm2, [esi+ecx+16]
+	movq	mm3, [esi+ecx+24]
+	movq	mm4, [esi+ecx+32]
+	movq	mm5, [esi+ecx+40]
+	movq	mm6, [esi+ecx+48]
+	movq	mm7, [esi+ecx+56]
+	movntq	[edi+ecx], mm0
+	movntq	[edi+ecx+8], mm1
+	movntq	[edi+ecx+16], mm2
+	movntq	[edi+ecx+24], mm3
+	movntq	[edi+ecx+32], mm4
+	movntq	[edi+ecx+40], mm5
+	movntq	[edi+ecx+48], mm6
+	movntq	[edi+ecx+56], mm7
+	add		ecx, byte 64
+	jnz		%%loop
+%endm

-__SECT__
+
+;------------------------------------------------------------------------------
+; block prefetch technique. used for transfers that do not fit in L2,
+; i.e. > 192KiB. requires Pentium III or Athlon; caller checks for this.
+; for theory behind this, see article.
+; > ecx = -number_of_bytes (multiple of 64, <= -BP_SIZE)
+; > esi, edi point to end of the buffer, i.e. &last_qword+8.
+; < ecx = -remaining_bytes (multiple of 64, > -BP_SIZE)
+; < eax = 0
+%macro UC_BP_MOVNTQ 0
+	push	edx
+align 4
+%%prefetch_and_copy_chunk:
+
+	; pull chunk into cache by touching each cache line
+	; (in reverse order to prevent HW prefetches)
+	mov		eax, BP_SIZE/128			; # iterations
+	add		esi, BP_SIZE
+align 16
+%%prefetch_loop:
+	mov		edx, [esi+ecx-64]
+	mov		edx, [esi+ecx-128]
+	add		esi, byte -128
+	dec		eax
+	jnz		%%prefetch_loop
+
+	; copy chunk in 64 byte pieces
+	mov		eax, BP_SIZE/64				; # iterations (> signed 8 bit)
+align 16
+%%copy_loop:
+	movq	mm0, [esi+ecx]
+	movq	mm1, [esi+ecx+8]
+	movq	mm2, [esi+ecx+16]
+	movq	mm3, [esi+ecx+24]
+	movq	mm4, [esi+ecx+32]
+	movq	mm5, [esi+ecx+40]
+	movq	mm6, [esi+ecx+48]
+	movq	mm7, [esi+ecx+56]
+	movntq	[edi+ecx], mm0
+	movntq	[edi+ecx+8], mm1
+	movntq	[edi+ecx+16], mm2
+	movntq	[edi+ecx+24], mm3
+	movntq	[edi+ecx+32], mm4
+	movntq	[edi+ecx+40], mm5
+	movntq	[edi+ecx+48], mm6
+	movntq	[edi+ecx+56], mm7
+
+	add		ecx, byte 64
+	dec		eax
+	jnz		%%copy_loop
+
+	; if enough data left, process next chunk
+	cmp		ecx, -BP_SIZE
+	jle		%%prefetch_and_copy_chunk
+
+	pop		edx
+%endm
+
+
+;------------------------------------------------------------------------------

 ; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
-; Return dst to make ia32_memcpy usable as a standard library memcpy drop-in
+; drop-in replacement for libc memcpy() (returns dst)
 global sym(ia32_memcpy)
+align 64
 sym(ia32_memcpy):
 	push	edi
 	push	esi

+	mov		ecx, [esp+8+4+8]			; nbytes
 	mov		edi, [esp+8+4+0]			; dst
 	mov		esi, [esp+8+4+4]			; src
-	mov		ecx, [esp+8+4+8]			; nbytes

-	cmp		ecx, byte IC_SIZE
-	ja		.choose_larger_method
+	mov		edx, ecx
+	cmp		ecx, byte IC_TINY_MAX
+	ja		choose_larger_method

-.ic_movsd:
-	IC_MOVSD
-	mov		eax, [esp+8+4+0]			; return dst
-	pop		esi
-	pop		edi
-	ret
+ic_tiny:
+	IC_TINY
+	; never reached - IC_TINY contains memcpy function epilog code

-.choose_larger_method:
+choose_larger_method:
 	IC_ALIGN

-	mov		eax, [sse_mask]
-	mov		edx, ecx
-	and		edx, eax					; edx = (SSE)? remaining_bytes : 0
-	cmp		edx, BP_THRESHOLD
-	jae		near .uc_bp_movntq
-	cmp		edx, UC_THRESHOLD
-	jae		.uc_movntq
+	; setup:
+	; eax = number of 64 byte chunks, or 0 if CPU doesn't support SSE.
+	;       used to choose copy technique.
+	; ecx = -number_of_bytes, multiple of 64. we jump to ic_tiny if
+	;       there's not enough left for a single 64 byte chunk, which can
+	;       happen on unaligned 64..71 byte transfers due to IC_ALIGN.
+	; edx = number of remainder bytes after qwords have been copied;
+	;       will be handled by IC_TINY.
+	; esi and edi point to end of the respective buffers (more precisely,
+	;       to buffer_start-ecx). this together with the ecx convention means
+	;       we only need one loop counter (instead of having to advance
+	;       that and esi/edi).

-.ic_movq:
+	; this mask is applied to the transfer size. the 2 specialized copy techniques
+	; that use SSE are jumped to if size is greater than a threshold.
+	; we simply set the requested transfer size to 0 if the CPU doesn't
+	; support SSE so that those are never reached (done by masking with this).
+	extern sym(ia32_memcpy_size_mask)
+	mov		eax, [sym(ia32_memcpy_size_mask)]
+	and		ecx, byte ~IC_TINY_MAX
+	jz		ic_tiny						; < 64 bytes left (due to IC_ALIGN)
+	add		esi, ecx
+	add		edi, ecx
+	and		edx, byte IC_TINY_MAX
+	and		eax, ecx
+	neg		ecx
+
+	cmp		eax, BP_THRESHOLD
+	jae		near uc_bp_movntq
+	cmp		eax, UC_THRESHOLD
+	jae		uc_movntq
+
+ic_movq:
 	IC_MOVQ
 	emms
-	jmp		.ic_movsd
+	jmp		ic_tiny

-.uc_movntq:
+uc_movntq:
 	UC_MOVNTQ
 	sfence
 	emms
-	jmp		.ic_movsd
+	jmp		ic_tiny

-.uc_bp_movntq:
+uc_bp_movntq:
 	UC_BP_MOVNTQ
 	sfence
-	jmp		.ic_movq
-
+	cmp		ecx, byte -(IC_TINY_MAX+1)
+	jle		ic_movq
+	emms
+	jmp		ic_tiny


 ;-------------------------------------------------------------------------------
@ -487,10 +585,10 @@ rep	stosd
 ;-------------------------------------------------------------------------------
 ; init
 ;-------------------------------------------------------------------------------
-	
-; extern "C" bool __cdecl ia32_init()
-global sym(ia32_init)
-sym(ia32_init):
+
+; extern "C" bool __cdecl ia32_asm_init()
+global sym(ia32_asm_init)
+sym(ia32_asm_init):
 	push	ebx

 	; check if CPUID is supported
@ -514,17 +612,10 @@ sym(ia32_init):
 	mov		[max_ext_func], eax
 .no_cpuid:

-	; check if SSE is supported (used by memcpy code)
-extern sym(ia32_cap)
-	push	byte 32+25					; ia32.h's SSE cap (won't change)
-	call	sym(ia32_cap)
-	pop		edx							; remove stack param
-	neg		eax							; SSE? ~0 : 0
-	mov		[sse_mask], eax
-
 	pop		ebx
 	ret

+
 ;-------------------------------------------------------------------------------
 ; Color conversion (SSE)
 ;-------------------------------------------------------------------------------
--- a/source/lib/sysdep/ia32.cpp
+++ b/source/lib/sysdep/ia32.cpp
@ -25,10 +25,14 @@

 // HACK (see call to wtime_reset_impl)
 #if OS_WIN
-#include "win/wtime.h"
+#include "lib/sysdep/win/wtime.h"
 #endif

+#define NO_COLOR
+
+#ifndef NO_COLOR
 #include "graphics/Color.h"
+#endif

 #include <string.h>
 #include <stdio.h>
@ -40,6 +44,26 @@
 #error ia32.cpp needs inline assembly support!
 #endif

+#define SELF_TEST_ENABLED 1
+#include "self_test.h"
+
+// set by ia32_init, referenced by ia32_memcpy (asm)
+extern "C" u32 ia32_memcpy_size_mask = 0;
+
+void ia32_init()
+{
+	ia32_asm_init();
+
+	// memcpy init: set the mask that is applied to transfer size before
+	// choosing copy technique. this is the mechanism for disabling
+	// codepaths that aren't supported on all CPUs; see article for details.
+	// .. check for PREFETCHNTA and MOVNTQ support. these are part of the SSE
+	// instruction set, but also supported on older Athlons as part of
+	// the extended AMD MMX set.
+	if(ia32_cap(SSE) || ia32_cap(AMD_MMX_EXT))
+		ia32_memcpy_size_mask = ~0u;
+}
+

 //-----------------------------------------------------------------------------
 // fast implementations of some sysdep.h functions; see documentation there
@ -79,7 +103,7 @@ __declspec(naked) double ia32_rint(double)
 //   end up with truncate/"chop" rounding. subtracting does the trick,
 //   assuming RC is the IA-32 default round-to-nearest mode.

-static const float round_bias = 0.5f;
+static const float round_bias = 0.4999999f;

 __declspec(naked) i32 ia32_i32_from_float(float f)
 {
@ -417,7 +441,7 @@ static void get_cpu_count()
 		log_id_bits = log2(log_cpu_per_package);	// see above
 		last_phys_id = last_log_id = INVALID_ID;
 		phys_ids = log_ids = 0;
-		if(on_each_cpu(count_ids) == 0)
+		if(sys_on_each_cpu(count_ids) == 0)
 		{
 			cpus         = phys_ids;
 			cpu_ht_units = log_ids / cpu_cores;
@ -621,14 +645,16 @@ int ia32_get_call_target(void* ret_addr, void** target)

 //-----------------------------------------------------------------------------

-
+#ifndef NO_COLOR
 // Assembler-optimized function for color conversion
 extern "C" {
 u32 sse_ConvertRGBColorTo4ub(const RGBColor& src);
 }
+#endif

 void ia32_hook_capabilities()
 {
+#ifndef NO_COLOR
 	if (ia32_cap(SSE))
 	{
 		ConvertRGBColorTo4ub = sse_ConvertRGBColorTo4ub;
@ -637,6 +663,7 @@ void ia32_hook_capabilities()
 	{
 		debug_printf("No SSE available. Slow fallback routines will be used.\n");
 	}
+#endif
 }


@ -667,10 +694,10 @@ namespace test {

 	static void self_test()
 	{
-		test1();
+		test_float_int();
 	}

-	RUN_SELF_TEST;
+	SELF_TEST_RUN;

 }	// namespace test
 #endif	// #if SELF_TEST_ENABLED
--- a/source/lib/sysdep/ia32.h
+++ b/source/lib/sysdep/ia32.h
@ -96,6 +96,7 @@ enum CpuCap

 	// extended (edx) - currently only defined by AMD
 	AMD_MP        = 96+19,	// MultiProcessing capable; reserved on AMD64
+	AMD_MMX_EXT   = 96+22,
 	AMD_3DNOW_PRO = 96+30,
 	AMD_3DNOW     = 96+31
 };
@ -114,6 +115,8 @@ extern void ia32_hook_capabilities(void);
 // (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
 extern void ia32_get_current_context(void* pcontext);

+extern void ia32_asm_init();
+
 extern int ia32_get_call_target(void* ret_addr, void** target);

 // order in which registers are stored in regs array
--- a/source/lib/sysdep/sysdep.cpp
+++ b/source/lib/sysdep/sysdep.cpp
@ -66,17 +66,3 @@ i64 i64_from_double(double d)
 }

 #endif
-
-
-// not possible with POSIX calls.
-// called from ia32.cpp get_cpu_count
-int on_each_cpu(void(*cb)())
-{
-#if OS_WIN
-	return wcpu_on_each_cpu(cb);
-#else
-	// apparently not possible on non-Windows OSes because they seem to lack
-	// a CPU affinity API.
-	return ERR_NO_SYS;
-#endif
-}
--- a/source/lib/sysdep/sysdep.h
+++ b/source/lib/sysdep/sysdep.h
@ -17,10 +17,6 @@
 #include "ia32.h"
 #endif

-#ifdef __cplusplus
-extern "C" {
-#endif
-
 // pass "omit frame pointer" setting on to the compiler
 #if MSC_VERSION
 # if CONFIG_OMIT_FP
@ -43,6 +39,15 @@ extern "C" {
 #endif


+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+//-----------------------------------------------------------------------------
+// C99 / SUSv3 emulation where needed
+//-----------------------------------------------------------------------------
+
 // vsnprintf2: handles positional parameters and %lld.
 // already available on *nix, emulated on Win32.
 #if OS_WIN
@ -51,6 +56,16 @@ extern int vsnprintf2(char* buffer, size_t count, const char* format, va_list ar
 #define vsnprintf2 vsnprintf
 #endif

+#if !HAVE_C99
+extern float fminf(float a, float b);
+extern float fmaxf(float a, float b);
+#endif
+
+#if !MSC_VERSION
+#define stricmp strcasecmp
+#define strnicmp strncasecmp
+#endif
+
 // alloca: allocate on stack, automatically free, return 0 if out of mem.
 // already available on *nix, emulated on Win32.
 #if OS_WIN
@ -58,16 +73,6 @@ extern int vsnprintf2(char* buffer, size_t count, const char* format, va_list ar
 extern void* alloca(size_t size);
 #endif

-// memcpy2: hand-tuned version; works for all sizes and aligments and is
-// significantly faster. uses SSE-optimized codepath when available.
-// 10% for < 64byte transfers and up to 300% on large sizes.
-#ifdef CPU_IA32
-# define memcpy2 ia32_memcpy
-extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
-#else
-# define memcpy2 memcpy
-#endif
-
 // rint: round float to nearest integral value.
 // provided by C99, otherwise:
 #if !HAVE_C99
@ -82,20 +87,6 @@ extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
 # endif
 #endif

-// i32_from_float et al: convert float to int. much faster than _ftol2,
-// which would normally be used by (int) casts.
-// .. fast IA-32 version: only used in some cases; see macro definition.
-#if USE_IA32_FLOAT_TO_INT
-# define i32_from_float ia32_i32_from_float
-# define i32_from_double ia32_i32_from_double
-# define i64_from_double ia32_i64_from_double
-// .. portable C emulation
-#else
-  extern i32 i32_from_float(float);
-  extern i32 i32_from_double(double);
-  extern i64 i64_from_double(double);
-#endif
-
 // finite: return 0 iff the given double is infinite or NaN.
 #if OS_WIN
 # define finite _finite
@ -128,30 +119,29 @@ extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
 #endif


+//-----------------------------------------------------------------------------
+// sysdep API
+//-----------------------------------------------------------------------------

 //
 // output
 //

-enum DisplayErrorFlags
-{
-	DE_ALLOW_SUPPRESS = 1,
-	DE_NO_CONTINUE = 2,
-	DE_MANUAL_BREAK = 4
-};
-
+extern void sys_display_msg(const char* caption, const char* msg);
+extern void sys_display_msgw(const wchar_t* caption, const wchar_t* msg);
+  
 // choices offered by the shared error dialog
 enum ErrorReaction
 {
 	// ignore, continue as if nothing happened.
 	ER_CONTINUE = 1,
-		// note: don't start at 0 because that is interpreted as a
-		// DialogBoxParam failure.
+	// note: don't start at 0 because that is interpreted as a
+	// DialogBoxParam failure.

 	// ignore and do not report again.
 	// only returned if DE_ALLOW_SUPPRESS was passed.
 	ER_SUPPRESS,
-		// note: non-persistent; only applicable during this program run.
+	// note: non-persistent; only applicable during this program run.

 	// trigger breakpoint, i.e. enter debugger.
 	// only returned if DE_MANUAL_BREAK was passed; otherwise,
@ -163,29 +153,33 @@ enum ErrorReaction
 	ER_EXIT
 };

-extern ErrorReaction display_error(const wchar_t* description, int flags,
-	uint skip, void* context, const char* file, int line);
-
-// convenience version, in case the advanced parameters aren't needed.
-// done this way instead of with default values so that it also works in C.
-#define DISPLAY_ERROR(text) display_error(text, 0, 0, 0, __FILE__, __LINE__)
+enum SysDisplayErrorFlags
+{
+	DE_ALLOW_SUPPRESS = 1,
+	DE_NO_CONTINUE = 2,
+	DE_MANUAL_BREAK = 4
+};

 // internal use only (used by display_error)
-extern ErrorReaction display_error_impl(const wchar_t* text, int flags);
-
-
-
-extern void display_msg(const char* caption, const char* msg);
-extern void wdisplay_msg(const wchar_t* caption, const wchar_t* msg);
+extern ErrorReaction sys_display_error(const wchar_t* text, int flags);


 //
 // clipboard
 //

-extern int clipboard_set(const wchar_t* text);
-extern wchar_t* clipboard_get(void);
-extern int clipboard_free(wchar_t* copy);
+// "copy" text into the clipboard. replaces previous contents.
+extern int sys_clipboard_set(const wchar_t* text);
+
+// allow "pasting" from clipboard. returns the current contents if they
+// can be represented as text, otherwise 0.
+// when it is no longer needed, the returned pointer must be freed via
+// sys_clipboard_free. (NB: not necessary if zero, but doesn't hurt)
+extern wchar_t* sys_clipboard_get(void);
+
+// frees memory used by <copy>, which must have been returned by
+// sys_clipboard_get. see note above.
+extern int sys_clipboard_free(wchar_t* copy);


 //
@ -194,11 +188,17 @@ extern int clipboard_free(wchar_t* copy);

 // note: these do not warn on error; that is left to the caller.

-// creates a cursor from the given texture file.
+// creates a cursor from the given image.
+// w, h specify image dimensions [pixels]. limit is implementation-
+//   dependent; 32x32 is typical and safe.
+// bgra_img is the cursor image (BGRA format, bottom-up).
+//   it is no longer needed and can be freed after this call returns.
 // hotspot (hx,hy) is the offset from its upper-left corner to the
-// position where mouse clicks are registered.
-// the cursor must be cursor_free-ed when no longer needed.
-extern int sys_cursor_load(const char* filename,
+//   position where mouse clicks are registered.
+// return: 0 on success, or a negative error code. on success, *cursor
+//   receives a pointer to the new cursor; on failure, it is undefined.
+//   the cursor must be sys_cursor_free-ed when no longer needed.
+extern int sys_cursor_create(uint w, uint h, void* bgra_img,
 	uint hx, uint hy, void** cursor);

 // replaces the current system cursor with the one indicated. need only be
@ -210,37 +210,67 @@ extern int sys_cursor_set(void* cursor);
 extern int sys_cursor_free(void* cursor);


+//
+// misc
+//

-
+// OS-specific backend for error_description_r.
+// NB: it is expected to be rare that OS return/error codes are actually
+// seen by user code, but we still translate them for completeness.
 extern int sys_error_description_r(int err, char* buf, size_t max_chars);

-extern int get_executable_name(char* n_path, size_t buf_size);
+// determine filename of the module to which the given address belongs.
+// useful for handling exceptions in other modules.
+// <path> receives full path to module; it must hold at least MAX_PATH chars.
+// on error, it is set to L"".
+// returns <path> for convenience.
+wchar_t* sys_get_module_filename(void* addr, wchar_t* path);

-// return filename of the module which contains address <addr>,
-// or L"" on failure. path holds the string and must be >= MAX_PATH chars.
-wchar_t* get_module_filename(void* addr, wchar_t* path);
+// store full path to the current executable.
+// returns 0 or a negative error code.
+// useful for determining installation directory, e.g. for VFS.
+extern int sys_get_executable_name(char* n_path, size_t buf_size);

+// have the user specify a directory via OS dialog.
+// stores its full path in the given buffer, which must hold at least
+// PATH_MAX chars.
+// returns 0 on success or a negative error code.
+extern int sys_pick_directory(char* n_path, size_t buf_size);

-
-
-extern int pick_directory(char* n_path, size_t buf_size);
-
-
-// not possible with POSIX calls.
+// execute the specified function once on each CPU.
+// this includes logical HT units and proceeds serially (function
+// is never re-entered) in order of increasing OS CPU ID.
+// note: implemented by switching thread affinity masks and forcing
+// a reschedule, which is apparently not possible with POSIX.
+// return 0 on success or a negative error code on failure
+// (e.g. if OS is preventing us from running on some CPUs).
 // called from ia32.cpp get_cpu_count
-extern int on_each_cpu(void(*cb)());
+extern int sys_on_each_cpu(void(*cb)());


-
-
-#if !HAVE_C99
-extern float fminf(float a, float b);
-extern float fmaxf(float a, float b);
+// drop-in replacement for libc memcpy(). only requires CPU support for
+// MMX (by now universal). highly optimized for Athlon and Pentium III
+// microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
+// for details, see accompanying article.
+#ifdef CPU_IA32
+# define memcpy2 ia32_memcpy
+extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
+#else
+# define memcpy2 memcpy
 #endif

-#if !MSC_VERSION
-#define stricmp strcasecmp
-#define strnicmp strncasecmp
+// i32_from_float et al: convert float to int. much faster than _ftol2,
+// which would normally be used by (int) casts.
+// .. fast IA-32 version: only used in some cases; see macro definition.
+#if USE_IA32_FLOAT_TO_INT
+# define i32_from_float ia32_i32_from_float
+# define i32_from_double ia32_i32_from_double
+# define i64_from_double ia32_i64_from_double
+// .. portable C emulation
+#else
+extern i32 i32_from_float(float);
+extern i32 i32_from_double(double);
+extern i64 i64_from_double(double);
 #endif


@ -249,11 +279,14 @@ extern float fmaxf(float a, float b);
 #endif


-// C++ linkage
-
+//-----------------------------------------------------------------------------
 // STL_HASH_MAP, STL_HASH_MULTIMAP, STL_HASH_SET
+//-----------------------------------------------------------------------------
+
+// these containers are useful but not part of C++98. most STL vendors
+// provide them in some form; we hide their differences behind macros.
+
 #if GCC_VERSION
-// GCC
 # include <ext/hash_map>
 # include <ext/hash_set> // Probably?

@ -283,6 +316,7 @@ namespace __gnu_cxx
 }

 #else	// !__GNUC__
+
 # include <hash_map>
 # include <hash_set>
 // VC7 or above
@ -300,8 +334,7 @@ namespace __gnu_cxx
 #  define STL_HASH_MULTISET std::hash_multiset
 #  define STL_HASH_VALUE std::hash_value
 # endif	// MSC_VERSION >= 1300
+
 #endif	// !__GNUC__

-#include "debug.h"
-
 #endif	// #ifndef SYSDEP_H_INCLUDED
--- a/source/lib/sysdep/unix/unix.cpp
+++ b/source/lib/sysdep/unix/unix.cpp
@ -14,12 +14,12 @@
 // these are basic POSIX-compatible backends for the sysdep.h functions.
 // Win32 has better versions which override these.

-void display_msg(const char* caption, const char* msg)
+void sys_display_msg(const char* caption, const char* msg)
 {
 	fprintf(stderr, "%s: %s\n", caption, msg);
 }

-void wdisplay_msg(const wchar_t* caption, const wchar_t* msg)
+void sys_display_msgw(const wchar_t* caption, const wchar_t* msg)
 {
 	fwprintf(stderr, L"%ls: %ls\n", caption, msg);
 }
@ -51,7 +51,14 @@ int unix_get_cpu_info()
 	return 0;
 }

-ErrorReaction display_error_impl(const wchar_t* text, int flags)
+// apparently not possible on non-Windows OSes because they seem to lack
+// a CPU affinity API. see sysdep.h comment.
+int sys_on_each_cpu(void(*cb)())
+{
+	return ERR_NO_SYS;
+}
+
+ErrorReaction sys_display_error(const wchar_t* text, int flags)
 {
 	printf("%ls\n\n", text);

@ -112,6 +119,13 @@ ErrorReaction display_error_impl(const wchar_t* text, int flags)
 // take advantage of hardware mouse cursors instead of the (jerky when
 // loading) OpenGL cursor.

+int sys_cursor_create(uint w, uint h, void* bgra_img,
+	uint hx, uint hy, void** cursor)
+{
+	*cursor = 0;
+	return 0;
+}
+
 int sys_cursor_set(void* cursor)
 {
 	return 0;
--- a/source/lib/sysdep/win/wcpu.cpp
+++ b/source/lib/sysdep/win/wcpu.cpp
@ -30,45 +30,6 @@
 // note: int instead of unsigned because <cpus> is also signed (tri-state).
 static const int MAX_CPUS = 32;

-
-int wcpu_on_each_cpu(void(*cb)())
-{
-	const HANDLE hProcess = GetCurrentProcess();
-
-	DWORD process_affinity, system_affinity;
-	if(!GetProcessAffinityMask(hProcess, &process_affinity, &system_affinity))
-		return -1;
-
-	// our affinity != system affinity: OS is limiting the CPUs that
-	// this process can run on. fail (cannot call back for each CPU).
-	if(process_affinity != system_affinity)
-		return -1;
-
-	for(DWORD cpu_bit = 1; cpu_bit != 0 && cpu_bit <= process_affinity; cpu_bit *= 2)
-	{
-		// check if we can switch to target CPU
-		if(!(process_affinity & cpu_bit))
-			continue;
-		// .. and do so.
-		if(!SetProcessAffinityMask(hProcess, process_affinity))
-		{
-			debug_warn("SetProcessAffinityMask failed");
-			continue;
-		}
-
-		// reschedule, to make sure we switch CPUs
-		Sleep(0);
-
-		cb();
-	}
-
-	// restore to original value
-	SetProcessAffinityMask(hProcess, process_affinity);
-
-	return 0;
-}
-
-
 static void check_speedstep()
 {
 	WIN_SAVE_LAST_ERROR;
--- a/source/lib/sysdep/win/wcpu.h
+++ b/source/lib/sysdep/win/wcpu.h
@ -1 +1 @@
-extern int wcpu_on_each_cpu(void(*cb)());
+
--- a/source/lib/sysdep/win/wdbg.cpp
+++ b/source/lib/sysdep/win/wdbg.cpp
@ -91,8 +91,8 @@ void wdbg_set_thread_name(const char* name)
 	}
 	__except(EXCEPTION_EXECUTE_HANDLER)
 	{
-		// if we get here, apparently this hack is not longer supported.
-		debug_warn("TODO: find alternative thread name implementation");
+		// if we get here, the debugger didn't handle the exception.
+		debug_warn("thread name hack doesn't work under this debugger");
 	}
 }

--- a/source/lib/sysdep/win/wdbg_sym.cpp
+++ b/source/lib/sysdep/win/wdbg_sym.cpp
@ -149,7 +149,7 @@ static int sym_init()
 	const BOOL fInvadeProcess = TRUE;
 	// .. use default *symbol* search path. we don't use this to locate
 	//    our PDB file because its absolute path is stored inside the EXE.
-	const char* UserSearchPath = 0;
+	PCSTR UserSearchPath = 0;
 	BOOL ok = SymInitialize(hProcess, UserSearchPath, fInvadeProcess);
 	WARN_IF_FALSE(ok);

@ -2205,7 +2205,7 @@ static void self_test()
 	test_addrs(123, 3.1415926535897932384626, "pchar string", 0xf00d);
 }

-RUN_SELF_TEST;
+SELF_TEST_RUN;

 #pragma optimize("", on)
 }	// namespace test
--- a/source/lib/sysdep/win/wsdl.cpp
+++ b/source/lib/sysdep/win/wsdl.cpp
@ -16,15 +16,6 @@
 //   Jan.Wassenberg@stud.uni-karlsruhe.de
 //   http://www.stud.uni-karlsruhe.de/~urkt/

-
-// TODO: should use GetMessage when not active to reduce CPU load.
-// where to do this?
-// - force the app to check for SDL's activation messages, and call
-//   sdl-wait-message?
-// - do it here, just make SDL_PollEvent block until message received?
-// - have the app use another free-the-cpu method, since it controls the main loop.
-//   this is what's currently happening.
-
 #include "precompiled.h"

 #include <stdio.h>
@ -919,6 +910,12 @@ static LRESULT CALLBACK wndproc(HWND hWnd, UINT uMsg, WPARAM wParam, LPARAM lPar

 void SDL_PumpEvents(void)
 {
+	// rationale: we would like to reduce CPU usage automatically if
+	// possible. blocking here until a message arrives would accomplish
+	// that, but might freeze the app for too long.
+	// instead, the application should check the active state and call
+	// SDL_Delay etc. if our window is minimized.
+
 	MSG msg;
 	while(PeekMessageW(&msg, 0, 0, 0, PM_REMOVE))
 	{
@ -1129,7 +1126,7 @@ int SDL_KillThread(SDL_Thread* thread)

 void SDL_WM_SetCaption(const char* title, const char* icon)
 {
-	SetWindowText(hWnd, title);
+	WARN_IF_FALSE(SetWindowText(hWnd, title));

 	UNUSED2(icon);	// TODO: implement
 }