
- self test: rename identifiers to SELF_TEST_*; add provision for delayed, all-at-once self tests (allows init before the test and makes measuring elapsed time easier)

- config: add CONFIG_TRACE
- display_error_impl -> sys_display_error
- sysdep: clean up; add sys_ prefix everywhere and document everything
- vfs_load: expand docs
- cursor: sys_cursor_load -> sys_cursor_create. sysdep code is no longer
dependent on tex; instead of calling tex_load, the caller passes a BGRA
texture in.
- memcpy: huge kick in the pants for the accompanying paper; now even faster.
- on_each_cpu -> sys_on_each_cpu (also removed the wcpu_on_each_cpu manager function)
- wsdl: explain PeekMessage CPU usage issue

This was SVN commit r3203.
janwas 2005-12-07 03:38:39 +00:00
parent a5d1968a8c
commit e2f25f4598
27 changed files with 640 additions and 419 deletions

View File

@@ -549,7 +549,7 @@ static void self_test()
test_matrix();
}
RUN_SELF_TEST;
SELF_TEST_RUN;
} // namespace test
#endif // #if SELF_TEST_ENABLED

View File

@@ -46,6 +46,13 @@
# define CONFIG_PARANOIA 0
#endif
// enable trace output for low-level code - various functions will
// debug_printf when they are entered/exited. note that the appropriate
// TRACEn tags must be debug_filter_add-ed for this to have any effect.
#ifndef CONFIG_TRACE
# define CONFIG_TRACE 0
#endif
// try to prevent any exceptions from being thrown - even by the C++
// standard library. useful only for performance tests.
#ifndef CONFIG_DISABLE_EXCEPTIONS
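To make the CONFIG_TRACE mechanism concrete, a minimal sketch of an instrumented call site (the file_open function and "TRACE2" tag are made up for illustration; debug_printf and debug_filter_add are the entry points named above):

static int file_open(const char* fn)
{
#if CONFIG_TRACE
	// only emitted if debug_filter_add("TRACE2") was called beforehand
	debug_printf("TRACE2| file_open(%s)\n", fn);
#endif
	// ... actual work ...
	return 0;
}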

View File

@@ -492,7 +492,7 @@ ErrorReaction display_error(const wchar_t* description, int flags,
text = L"(insufficient memory to display error message)";
debug_write_crashlog(text);
ErrorReaction er = display_error_impl(text, flags);
ErrorReaction er = sys_display_error(text, flags);
// note: debug_break-ing here to make sure the app doesn't continue
// running is no longer necessary. display_error now determines our

View File

@@ -175,7 +175,7 @@ extern enum ErrorReaction debug_warn_err(int err, const char* file, int line,
//-----------------------------------------------------------------------------
// logging
// output
//-----------------------------------------------------------------------------
// write a formatted string to the debug channel, subject to filtering
@@ -185,6 +185,14 @@ extern void debug_printf(const char* fmt, ...);
extern void debug_wprintf(const wchar_t* fmt, ...);
extern ErrorReaction display_error(const wchar_t* description, int flags,
uint skip, void* context, const char* file, int line);
// convenience version, in case the advanced parameters aren't needed.
// done this way instead of with default values so that it also works in C.
#define DISPLAY_ERROR(text) display_error(text, 0, 0, 0, __FILE__, __LINE__)
//
// filtering
//

View File

@@ -517,7 +517,7 @@ static void self_test()
test_log2();
}
RUN_SELF_TEST;
SELF_TEST_RUN;
} // namespace test
#endif // #if SELF_TEST_ENABLED

View File

@@ -27,6 +27,11 @@ static const char* lib_error_description(int err)
}
// generate textual description of an error code.
// stores up to <max_chars> in the given buffer.
// <err> can be one of the above error codes, POSIX ENOENT etc., or
// an OS-specific error. if unknown, the string will be something like
// "Unknown error (65536, 0x10000)".
void error_description_r(int err, char* buf, size_t max_chars)
{
// lib error
@@ -69,11 +74,3 @@ void error_description_r(int err, char* buf, size_t max_chars)
if(!have_output)
snprintf(buf, max_chars, "Unknown error (%d, 0x%X)", err, err);
}
const char* error_description(int err)
{
static char buf[200];
error_description_r(err, buf, ARRAY_SIZE(buf));
return buf;
}
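With the static-buffer error_description() wrapper removed, callers now provide their own (thread-safe) buffer; a minimal usage sketch:

char buf[200];
error_description_r(err, buf, ARRAY_SIZE(buf));
debug_printf("operation failed: %s\n", buf);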

View File

@@ -65,10 +65,15 @@ ERR(-100704, ERR_SHDR_NO_PROGRAM, "Invalid shader program reference")
#ifndef ERRORS_H__
#define ERRORS_H__
// limits on the errors defined above (used by error_description_r)
#define ERR_MIN 100000
#define ERR_MAX 110000
extern const char* error_description(int err);
// generate textual description of an error code.
// stores up to <max_chars> in the given buffer.
// <err> can be one of the above error codes, POSIX ENOENT etc., or
// an OS-specific error. if unknown, the string will be something like
// "Unknown error (65536, 0x10000)".
extern void error_description_r(int err, char* buf, size_t max_chars);
#endif // #ifndef ERRORS_H__

View File

@@ -957,7 +957,7 @@ static void self_test()
multithreaded_torture_test();
}
RUN_SELF_TEST;
SELF_TEST_RUN;
} // namespace test
#endif // #if SELF_TEST_ENABLED

View File

@@ -1271,7 +1271,7 @@ char* mmgr_getcwd_dbg(char* buf, size_t buf_size, const char* file, int line, co
//
static void* new_common(size_t size, AllocType type,
const char* file, int line, const char* func)
const char* file, int line, const char* func)
{
const char* allocator = types[type];

View File

@@ -255,7 +255,7 @@ int file_set_root_dir(const char* argv0, const char* rel_path)
// get full path to executable
char n_path[PATH_MAX];
// .. first try safe, but system-dependent version
if(get_executable_name(n_path, PATH_MAX) < 0)
if(sys_get_executable_name(n_path, PATH_MAX) < 0)
{
// .. failed; use argv[0]
if(!realpath(argv0, n_path))

View File

@@ -537,14 +537,16 @@ static ssize_t vfs_timed_io(const Handle hf, const size_t size, void** p, FileIO
}
// load the entire file <fn> into memory; return a handle to the memory
// and the buffer address/size. output parameters are zeroed on failure.
// in addition to the regular file cache, the entire buffer is kept in memory
// if flags & FILE_CACHE.
// load the entire file <fn> into memory.
// returns a memory handle to the file's contents or a negative error code.
// p and size are filled with address/size of buffer (0 on failure).
// flags influences IO mode and is typically 0.
// in addition to the regular file cache, the entire buffer is
// kept in memory if flags & FILE_CACHE.
// when the file contents are no longer needed, you can mem_free_h the
// Handle, or mem_free(p).
//
// on failure, a debug_warn is generated and a negative error code returned.
//
// note: we need the Handle return value for Tex.hm - the data pointer
// rationale: we need the Handle return value for Tex.hm - the data pointer
// must be protected against being accidentally free-d in that case.
Handle vfs_load(const char* v_fn, void*& p, size_t& size, uint flags /* default 0 */)
{

View File

@@ -383,10 +383,14 @@ extern ssize_t vfs_io(Handle hf, size_t size, void** p, FileIOCB cb = 0, uintptr
// convenience functions that replace vfs_open / vfs_io / vfs_close:
// load the entire file <fn> into memory; return a memory handle to the
// buffer and its address/size. output parameters are zeroed on failure.
// in addition to the regular file cache, the entire buffer is kept in memory
// if flags & FILE_CACHE.
// load the entire file <fn> into memory.
// returns a memory handle to the file's contents or a negative error code.
// p and size are filled with address/size of buffer (0 on failure).
// flags influences IO mode and is typically 0.
// in addition to the regular file cache, the entire buffer is
// kept in memory if flags & FILE_CACHE.
// when the file contents are no longer needed, you can mem_free_h the
// Handle, or mem_free(p).
extern Handle vfs_load(const char* fn, void*& p, size_t& size, uint flags = 0);
extern ssize_t vfs_store(const char* fn, void* p, size_t size, uint flags = 0);
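A usage sketch implied by the vfs_load documentation above (the filename is only an example):

void* p; size_t size;
Handle hm = vfs_load("art/textures/cursors/arrow.dds", p, size);
if(hm > 0)	// negative values are error codes
{
	// ... use the <size> bytes at <p> ...
	(void)mem_free_h(hm);	// or, equivalently, mem_free(p)
}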

View File

@@ -18,6 +18,42 @@
#include "ogl_tex.h"
#include "cursor.h"
static void* load_sys_cursor(const char* filename, int hx, int hy)
{
#if !ALLOW_SYS_CURSOR
return 0;
#else
Tex t;
if(tex_load(filename, &t) < 0)
return 0;
{
void* sys_cursor = 0; // return value
// convert to required BGRA format.
const uint flags = (t.flags | TEX_BGR) & ~TEX_DXT;
if(tex_transform_to(&t, flags) < 0)
goto fail;
void* bgra_img = tex_get_data(&t);
if(!bgra_img)
goto fail;
if(sys_cursor_create(t.w, t.h, bgra_img, hx, hy, &sys_cursor) < 0)
goto fail;
(void)tex_free(&t);
return sys_cursor;
}
fail:
debug_warn("failed");
(void)tex_free(&t);
return 0;
#endif
}
// no init is necessary because this is stored in struct Cursor, which
// is 0-initialized by h_mgr.
class GLCursor
@@ -124,12 +160,8 @@ static int Cursor_reload(Cursor* c, const char* name, Handle)
// load actual cursor
snprintf(filename, ARRAY_SIZE(filename), "art/textures/cursors/%s.dds", name);
// .. system cursor (2d, hardware accelerated)
#if ALLOW_SYS_CURSOR
WARN_ERR(sys_cursor_load(filename, hotspotx, hotspoty, &c->sys_cursor));
#else
c->sys_cursor = 0;
#endif
// .. try loading as system cursor (2d, hardware accelerated)
c->sys_cursor = load_sys_cursor(filename, hotspotx, hotspoty);
// .. fall back to GLCursor (system cursor code is disabled or failed)
if(!c->sys_cursor)
RETURN_ERR(c->gl_cursor.create(filename, hotspotx, hotspoty));

View File

@@ -19,17 +19,49 @@
#include "precompiled.h"
#include "self_test.h"
#include "timer.h"
// checked by debug_assert_failed; disables asserts if true (see above).
// set/cleared by run_self_test.
// set/cleared by self_test_run.
bool self_test_active = false;
// trampoline that sets self_test_active and returns a dummy value;
// used by RUN_SELF_TEST.
int run_self_test(void(*test_func)())
// used by SELF_TEST_RUN.
int self_test_run(void(*func)())
{
self_test_active = true;
test_func();
func();
self_test_active = false;
return 0;
return 0; // assigned to dummy at file scope
}
static const SelfTestRecord* registered_tests;
int self_test_register(SelfTestRecord* r)
{
// SELF_TEST_REGISTER has already initialized r->func.
r->next = registered_tests;
registered_tests = r;
return 0; // assigned to dummy at file scope
}
void self_test_run_all()
{
debug_printf("SELF TESTS:\n");
const double t0 = get_time();
// someone somewhere may want to run self-tests twice (e.g. to help
// track down memory corruption), so don't destroy the list while
// iterating over it.
const SelfTestRecord* r = registered_tests;
while(r)
{
self_test_run(r->func);
r = r->next;
}
const double dt = get_time() - t0;
debug_printf("-- done (elapsed time %.0f ms)\n", dt*1e3);
}
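A sketch of how a module would use the new delayed mechanism (the module body is hypothetical; the macros and self_test_run_all are from this commit):

#define SELF_TEST_ENABLED 1
#if SELF_TEST_ENABLED
namespace test {
static void self_test()
{
	TEST(2+2 == 4);
}
SELF_TEST_REGISTER;	// deferred - runs when self_test_run_all is called
}	// namespace test
#endif

// in startup code, once prerequisite subsystems are initialized:
//   self_test_run_all();	// prints banner and elapsed time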

View File

@@ -42,8 +42,9 @@ What makes a good self-test?
bad inputs ("does it reject those?"), and successes ("did it have the
expected result?").
- Tests should be non-intrusive (only bother user if something fails) and
very quick. This is because we run them automatically at startup,
which solves the common problem of making sure they actually run.
very quick. This is because they are executed every program run - which
is a good thing because it solves the common problem of forgetting to
run them after a change.
If the test is unavoidably slow or annoying (example: wdbg_sym's
stack trace), then best to disable it by default; see below for how.
@@ -74,7 +75,7 @@ static void self_test()
// further test groups..
}
RUN_SELF_TEST; // (4)
SELF_TEST_RUN; // (4)
} // namespace test
#endif // #if SELF_TEST_ENABLED
@@ -117,21 +118,40 @@ For further details, see below.
// and this is the only error reporter guaranteed to work.
//
// note: could also stringize condition and display that, but it'd require
// macro magic (stringize+prepend L) and we already get file+line.
// macro magic (stringize+prepend L) and we already display file+line.
#define TEST(condition) STMT(\
if(!(condition))\
DISPLAY_ERROR(L"Self-test failed");\
)
// your source file should contain a void function "self_test" that
// your source file should contain a function: void self_test(void) that
// performs all tests or calls out to individual test functions.
// this macro calls it at static init time and takes care of setting
// self_test_active (see above).
//
// rationale: since compiler optimizations may mess with the dummy variable,
// best to put this in a macro so we won't have to change each occurrence.
#define RUN_SELF_TEST static int dummy = run_self_test(self_test)
#define SELF_TEST_RUN\
static int dummy = self_test_run(self_test)
// calling at static init time may not always be desirable - some
// self-tests may require initialization beforehand. this mechanism allows
// registering self tests automatically, which are then all run when you
// call self_test_run_all.
#define SELF_TEST_REGISTER\
static SelfTestRecord self_test_record = { self_test, 0 };\
static int dummy = self_test_register(&self_test_record)
struct SelfTestRecord
{
void(*func)();
const SelfTestRecord* next;
};
// call all self-tests registered thus far. rationale: see above.
// also displays a banner+elapsed time via debug_printf.
extern void self_test_run_all();
//
@@ -139,8 +159,10 @@ For further details, see below.
//
// trampoline that sets self_test_active and returns a dummy value;
// used by RUN_SELF_TEST.
extern int run_self_test(void(*test_func)());
// used by SELF_TEST_RUN.
extern int self_test_run(void(*func)());
extern int self_test_register(SelfTestRecord* r);
// checked by debug_assert_failed; disables asserts if true (see above).
// set/cleared by run_self_test.

View File

@@ -391,7 +391,7 @@ static void self_test()
test_concatenate();
}
RUN_SELF_TEST;
SELF_TEST_RUN;
#endif // #if SELF_TEST_ENABLED

View File

@@ -16,267 +16,365 @@ section .text use32
; fast general memcpy
;-------------------------------------------------------------------------------
; optimized for Athlon XP: 7.3% faster (cumulative) than VC7.1's memcpy over
; all 1..64 byte transfer lengths and misalignments. approaches maximum
; mem bandwidth (2000 MiB/s) for transfers >= 192KiB!
; Pentium III performance: about 3% faster in above small buffer benchmark.
;
; disables specialized large transfer (> 64KiB) implementations if SSE
; isn't available; we do assume MMX support, though (quite safe).
; drop-in replacement for libc memcpy(). only requires CPU support for
; MMX (by now universal). highly optimized for Athlon and Pentium III
; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
; for details, see accompanying article.
; if memcpy size is greater than this,
; if transfer size is at least this much,
; .. it's too big for L1. use non-temporal instructions.
UC_THRESHOLD equ 64*1024
; .. it also blows L2. pull chunks into L1 ("block prefetch").
BP_THRESHOLD equ 192*1024
; maximum that can be copied by IC_MOVSD.
; if you change this, be sure to expand the movs* table(s)!
IC_SIZE equ 67
; maximum that can be copied by IC_TINY.
IC_TINY_MAX equ 63
; size of one block prefetch chunk.
; if you change this, make sure "push byte BP_SIZE/128" doesn't overflow!
BP_SIZE equ 8*1024
; > ecx = size (<= IC_SIZE)
; x eax, ecx
;
; determined to be fastest approach by testing. a movsd table followed by
; rep movsb is a bit smaller but 6.9% slower; everything else is much worse.
%macro IC_MOVSD 0
mov eax, ecx
shr ecx, 2 ; dword count
neg ecx
add ecx, %%movsd_table_end
jmp ecx
align 8
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd
%%movsd_table_end:
;------------------------------------------------------------------------------
and eax, 3
neg eax
add eax, %%movsb_table_end
jmp eax
movsb
movsb
movsb
%%movsb_table_end:
; [p3] replicating this instead of jumping to it from tailN
; saves 1 clock and costs (7-2)*2 bytes of code.
%macro EPILOG 0
pop esi
pop edi
mov eax, [esp+4] ; return dst
ret
%endm
; align destination address to multiple of 8.
; not done for small transfers because it doesn't help IC_MOVSD.
%macro IC_ALIGN 0
mov eax, 8
sub eax, edi
and eax, byte 7 ; eax = # misaligned bytes
sub ecx, eax ; reduce copy count
neg eax
add eax, %%align_table_end
jmp eax
align 64
tail1:
mov al, [esi+ecx*4]
mov [edi+ecx*4], al
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
%%align_table_end:
tail0:
EPILOG
align 8
tail3:
; [p3] 2 reads followed by 2 writes is better than
; R/W interleaved and RRR/WWW
mov al, [esi+ecx*4+2]
mov [edi+ecx*4+2], al
; already aligned to 8 due to above code
tail2:
mov al, [esi+ecx*4]
mov dl, [esi+ecx*4+1]
mov [edi+ecx*4], al
mov [edi+ecx*4+1], dl
EPILOG
[section .data]
align 16
tail_table dd tail0, tail1, tail2, tail3
__SECT__
; 15x unrolled copy loop - transfers DWORDs backwards.
; indexed via table of 8-bit offsets.
; rationale:
; - [p3] backwards vs. forwards makes no difference.
; - MOV is faster than MOVSD.
; - index table is needed because calculating end-6*i is slower than
; a LUT and we wouldn't want to expand entries to 8 bytes
; (that'd increase code footprint by 30 bytes)
; - a byte index accessed via MOVZX is better due to less dcache usage.
; - only unrolling 8x and 'reentering' the loop is possible but
; slower due to fiddling with esi/ecx.
align 64
unrolled_copy_code_start:
%assign i 15
%rep 14 ; 15 entries, 1 base case handled below
uc_ %+ i:
mov eax, [esi+i*4-4]
mov [edi+i*4-4], eax
%assign i i-1
%endrep
; base case: no displacement needed; skip it so that code will
; be aligned to 8 bytes after this.
uc_1:
mov eax, [esi]
mov [edi], eax
uc_0:
jmp [tail_table+edx*4]
[section .data]
align 32
unrolled_copy_index_table:
%assign i 0
%rep 16
db (uc_ %+ i) - unrolled_copy_code_start
%assign i i+1
%endrep
__SECT__
;------------------------------------------------------------------------------
; tiny copy - handles all cases smaller than IC_MOVQ's 64 byte lower limit.
; > edx = number of bytes (<= IC_TINY_MAX)
; < does not return.
; x eax, ecx, edx
%macro IC_TINY 0
mov ecx, edx
shr ecx, 2
; calculating this address isn't possible due to skipping displacement on uc_1;
; even so, it'd require calculating -6*ecx, which is slower than LUT.
movzx eax, byte [unrolled_copy_index_table+ecx]
and edx, byte 3
add eax, unrolled_copy_code_start
jmp eax
; never reached! the unrolled loop jumps into tailN, which
; then returns from the memcpy function.
%endm
; > ecx = size
; x edx
;------------------------------------------------------------------------------
; align destination address to multiple of 8. important for large transfers,
; but doesn't affect the tiny technique.
; > esi, edi -> buffers (updated)
; > ecx, edx = transfer size (updated)
; x eax
%macro IC_ALIGN 0
mov eax, edi
and eax, byte 7 ; eax = # misaligned bytes
jz already_aligned ; early out
lea eax, [align_table_start+eax*2]
jmp eax
; [p3] this is no slower than a table of mov and much smaller/simpler
align 8
align_table_start:
%rep 8
dec ecx
movsb
%endrep
mov edx, ecx
already_aligned:
%endm
;------------------------------------------------------------------------------
; MMX MOVQ technique. used for in-cache transfers of 64B..64KiB.
; must run on all CPUs, i.e. cannot use the SSE prefetchnta instruction.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro IC_MOVQ 0
; see notes below. TODO: if simple addressing is better on Athlons as well, prevent this from happening in setup code when not doing large transfers
add esi, ecx
add edi, ecx
align 16
mov edx, 64
%%loop:
cmp ecx, edx
jb %%done
prefetchnta [esi + (200*64/34+192)]
movq mm0, [esi+0]
; notes:
; - we can't use prefetch here - this codepath must support all CPUs.
; [p3] that makes us 5..15% slower on 1KiB..4KiB transfers.
; - [p3] simple addressing without +ecx is 3.5% faster.
; - [p3] there's no difference between RR/WW/RR/WW and R..R/W..W
; with simple addressing and no prefetch.
; - enough time elapses between first and third pair of reads that we
; could reuse MM0. there is no performance gain either way and
; differing displacements make code compression futile, so
; we'll just use MM4..7 for clarity.
movq mm0, [esi]
movq mm1, [esi+8]
movq [edi+0], mm0
movq [edi], mm0
movq [edi+8], mm1
movq mm2, [esi+16]
movq mm3, [esi+24]
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, edx
add edi, edx
sub ecx, edx
jmp %%loop
%%done:
%endm
; > ecx = size (> 64)
; x
%macro UC_MOVNTQ 0
mov edx, 64
align 16
%%1:
prefetchnta [esi + (200*64/34+192)]
movq mm0,[esi+0]
add edi, edx
movq mm1,[esi+8]
add esi, edx
movq mm2,[esi-48]
movntq [edi-64], mm0
movq mm0,[esi-40]
movntq [edi-56], mm1
movq mm1,[esi-32]
movntq [edi-48], mm2
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
sub ecx, edx
movntq [edi-8], mm1
cmp ecx, edx
jae %%1
%endm
; > ecx = size (> 8KiB)
; x eax, edx
;
; somewhat optimized for size (futile attempt to avoid near jump)
%macro UC_BP_MOVNTQ 0
%%prefetch_and_copy_chunk:
; touch each cache line within chunk in reverse order (prevents HW prefetch)
push byte BP_SIZE/128 ; # iterations
pop eax
add esi, BP_SIZE
align 8
%%prefetch_chunk:
mov edx, [esi-64]
mov edx, [esi-128]
sub esi, 128
dec eax
jnz %%prefetch_chunk
; copy 64 byte blocks
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
push byte 64
pop edx
align 8
%%copy_block:
movq mm0, [esi+ 0]
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq [edi+32], mm4
movq [edi+40], mm5
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, edx
movntq [edi+ 0], mm0
movntq [edi+ 8], mm1
movntq [edi+16], mm2
movntq [edi+24], mm3
movntq [edi+32], mm4
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, edx
dec eax
jnz %%copy_block
sub ecx, BP_SIZE
cmp ecx, BP_SIZE
jae %%prefetch_and_copy_chunk
movq [edi+48], mm6
movq [edi+56], mm7
add esi, byte 64
add edi, byte 64
add ecx, byte 64
jnz %%loop
%endm
[section .bss]
;------------------------------------------------------------------------------
; SSE MOVNTQ technique. used for transfers that do not fit in L1,
; i.e. 64KiB..192KiB. requires Pentium III or Athlon; caller checks for this.
; > ecx = -number_of_bytes (multiple of 64)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = 0
; x
%macro UC_MOVNTQ 0
; this is somewhat "clever". the 2 specialized transfer implementations
; that use SSE are jumped to if transfer size is greater than a threshold.
; we simply set the requested transfer size to 0 if the CPU doesn't
; support SSE so that those are never reached (done by masking with this).
sse_mask resd 1
align 16
%%loop:
; notes:
; - the AMD optimization manual recommends prefetch distances according to
; (200*BytesPerIter/ClocksPerIter+192), which comes out to ~560 here.
; [p3] rounding down to 512 bytes makes for significant gains.
; - [p3] complex addressing with ecx is 1% faster than adding to esi/edi.
prefetchnta [esi+ecx+512]
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movntq [edi+ecx], mm0
movntq [edi+ecx+8], mm1
movntq [edi+ecx+16], mm2
movntq [edi+ecx+24], mm3
movntq [edi+ecx+32], mm4
movntq [edi+ecx+40], mm5
movntq [edi+ecx+48], mm6
movntq [edi+ecx+56], mm7
add ecx, byte 64
jnz %%loop
%endm
__SECT__
;------------------------------------------------------------------------------
; block prefetch technique. used for transfers that do not fit in L2,
; i.e. > 192KiB. requires Pentium III or Athlon; caller checks for this.
; for theory behind this, see article.
; > ecx = -number_of_bytes (multiple of 64, <= -BP_SIZE)
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
; < ecx = -remaining_bytes (multiple of 64, > -BP_SIZE)
; < eax = 0
%macro UC_BP_MOVNTQ 0
push edx
align 4
%%prefetch_and_copy_chunk:
; pull chunk into cache by touching each cache line
; (in reverse order to prevent HW prefetches)
mov eax, BP_SIZE/128 ; # iterations
add esi, BP_SIZE
align 16
%%prefetch_loop:
mov edx, [esi+ecx-64]
mov edx, [esi+ecx-128]
add esi, byte -128
dec eax
jnz %%prefetch_loop
; copy chunk in 64 byte pieces
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
align 16
%%copy_loop:
movq mm0, [esi+ecx]
movq mm1, [esi+ecx+8]
movq mm2, [esi+ecx+16]
movq mm3, [esi+ecx+24]
movq mm4, [esi+ecx+32]
movq mm5, [esi+ecx+40]
movq mm6, [esi+ecx+48]
movq mm7, [esi+ecx+56]
movntq [edi+ecx], mm0
movntq [edi+ecx+8], mm1
movntq [edi+ecx+16], mm2
movntq [edi+ecx+24], mm3
movntq [edi+ecx+32], mm4
movntq [edi+ecx+40], mm5
movntq [edi+ecx+48], mm6
movntq [edi+ecx+56], mm7
add ecx, byte 64
dec eax
jnz %%copy_loop
; if enough data left, process next chunk
cmp ecx, -BP_SIZE
jle %%prefetch_and_copy_chunk
pop edx
%endm
;------------------------------------------------------------------------------
; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
; Return dst to make ia32_memcpy usable as a standard library memcpy drop-in
; drop-in replacement for libc memcpy() (returns dst)
global sym(ia32_memcpy)
align 64
sym(ia32_memcpy):
push edi
push esi
mov ecx, [esp+8+4+8] ; nbytes
mov edi, [esp+8+4+0] ; dst
mov esi, [esp+8+4+4] ; src
mov ecx, [esp+8+4+8] ; nbytes
cmp ecx, byte IC_SIZE
ja .choose_larger_method
mov edx, ecx
cmp ecx, byte IC_TINY_MAX
ja choose_larger_method
.ic_movsd:
IC_MOVSD
mov eax, [esp+8+4+0] ; return dst
pop esi
pop edi
ret
ic_tiny:
IC_TINY
; never reached - IC_TINY contains memcpy function epilog code
.choose_larger_method:
choose_larger_method:
IC_ALIGN
mov eax, [sse_mask]
mov edx, ecx
and edx, eax ; edx = (SSE)? remaining_bytes : 0
cmp edx, BP_THRESHOLD
jae near .uc_bp_movntq
cmp edx, UC_THRESHOLD
jae .uc_movntq
; setup:
; eax = number of 64 byte chunks, or 0 if CPU doesn't support SSE.
; used to choose copy technique.
; ecx = -number_of_bytes, multiple of 64. we jump to ic_tiny if
; there's not enough left for a single 64 byte chunk, which can
; happen on unaligned 64..71 byte transfers due to IC_ALIGN.
; edx = number of remainder bytes after qwords have been copied;
; will be handled by IC_TINY.
; esi and edi point to end of the respective buffers (more precisely,
; to buffer_start-ecx). this together with the ecx convention means
; we only need one loop counter (instead of having to advance
; that and esi/edi).
.ic_movq:
; this mask is applied to the transfer size. the 2 specialized copy techniques
; that use SSE are jumped to if size is greater than a threshold.
; we simply set the requested transfer size to 0 if the CPU doesn't
; support SSE so that those are never reached (done by masking with this).
extern sym(ia32_memcpy_size_mask)
mov eax, [sym(ia32_memcpy_size_mask)]
and ecx, byte ~IC_TINY_MAX
jz ic_tiny ; < 64 bytes left (due to IC_ALIGN)
add esi, ecx
add edi, ecx
and edx, byte IC_TINY_MAX
and eax, ecx
neg ecx
cmp eax, BP_THRESHOLD
jae near uc_bp_movntq
cmp eax, UC_THRESHOLD
jae uc_movntq
ic_movq:
IC_MOVQ
emms
jmp .ic_movsd
jmp ic_tiny
.uc_movntq:
uc_movntq:
UC_MOVNTQ
sfence
emms
jmp .ic_movsd
jmp ic_tiny
.uc_bp_movntq:
uc_bp_movntq:
UC_BP_MOVNTQ
sfence
jmp .ic_movq
cmp ecx, byte -(IC_TINY_MAX+1)
jle ic_movq
emms
jmp ic_tiny
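For readers without the accompanying article: the block-prefetch idea behind UC_BP_MOVNTQ can be sketched in C++ with SSE intrinsics. This is an illustration under simplifying assumptions (SSE present, 8-byte-aligned buffers, size a multiple of BP_SIZE), not the tuned implementation above:

#include <xmmintrin.h>	// _mm_stream_pi, _mm_sfence, _mm_empty

static void bp_copy(void* dst, const void* src, size_t nbytes)
{
	const size_t BP_SIZE = 8*1024;	// one chunk (fits in L1)
	const char* s = (const char*)src;
	char* d = (char*)dst;
	volatile int sink;	// keeps the touch loop from being optimized away
	for(; nbytes != 0; nbytes -= BP_SIZE)
	{
		// pull chunk into cache by touching each line - in reverse
		// order, which defeats the HW prefetcher
		for(size_t i = BP_SIZE; i != 0; i -= 64)
			sink = *(const int*)(s+i-64);
		// stream the now-cached chunk out without polluting the cache
		for(size_t i = 0; i < BP_SIZE; i += 8)
			_mm_stream_pi((__m64*)(d+i), *(const __m64*)(s+i));
		s += BP_SIZE; d += BP_SIZE;
	}
	_mm_sfence();	// make the non-temporal stores globally visible
	_mm_empty();	// required after using MMX state (__m64)
}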
;-------------------------------------------------------------------------------
@@ -487,10 +585,10 @@ rep stosd
;-------------------------------------------------------------------------------
; init
;-------------------------------------------------------------------------------
; extern "C" bool __cdecl ia32_init()
global sym(ia32_init)
sym(ia32_init):
; extern "C" bool __cdecl ia32_asm_init()
global sym(ia32_asm_init)
sym(ia32_asm_init):
push ebx
; check if CPUID is supported
@@ -514,17 +612,10 @@ sym(ia32_init):
mov [max_ext_func], eax
.no_cpuid:
; check if SSE is supported (used by memcpy code)
extern sym(ia32_cap)
push byte 32+25 ; ia32.h's SSE cap (won't change)
call sym(ia32_cap)
pop edx ; remove stack param
neg eax ; SSE? ~0 : 0
mov [sse_mask], eax
pop ebx
ret
;-------------------------------------------------------------------------------
; Color conversion (SSE)
;-------------------------------------------------------------------------------

View File

@@ -25,10 +25,14 @@
// HACK (see call to wtime_reset_impl)
#if OS_WIN
#include "win/wtime.h"
#include "lib/sysdep/win/wtime.h"
#endif
#define NO_COLOR
#ifndef NO_COLOR
#include "graphics/Color.h"
#endif
#include <string.h>
#include <stdio.h>
@@ -40,6 +44,26 @@
#error ia32.cpp needs inline assembly support!
#endif
#define SELF_TEST_ENABLED 1
#include "self_test.h"
// set by ia32_init, referenced by ia32_memcpy (asm)
extern "C" u32 ia32_memcpy_size_mask = 0;
void ia32_init()
{
ia32_asm_init();
// memcpy init: set the mask that is applied to transfer size before
// choosing copy technique. this is the mechanism for disabling
// codepaths that aren't supported on all CPUs; see article for details.
// .. check for PREFETCHNTA and MOVNTQ support. these are part of the SSE
// instruction set, but also supported on older Athlons as part of
// the extended AMD MMX set.
if(ia32_cap(SSE) || ia32_cap(AMD_MMX_EXT))
ia32_memcpy_size_mask = ~0u;
}
//-----------------------------------------------------------------------------
// fast implementations of some sysdep.h functions; see documentation there
@@ -79,7 +103,7 @@ __declspec(naked) double ia32_rint(double)
// end up with truncate/"chop" rounding. subtracting does the trick,
// assuming RC is the IA-32 default round-to-nearest mode.
static const float round_bias = 0.5f;
static const float round_bias = 0.4999999f;
__declspec(naked) i32 ia32_i32_from_float(float f)
{
@@ -417,7 +441,7 @@ static void get_cpu_count()
log_id_bits = log2(log_cpu_per_package); // see above
last_phys_id = last_log_id = INVALID_ID;
phys_ids = log_ids = 0;
if(on_each_cpu(count_ids) == 0)
if(sys_on_each_cpu(count_ids) == 0)
{
cpus = phys_ids;
cpu_ht_units = log_ids / cpu_cores;
@@ -621,14 +645,16 @@ int ia32_get_call_target(void* ret_addr, void** target)
//-----------------------------------------------------------------------------
#ifndef NO_COLOR
// Assembler-optimized function for color conversion
extern "C" {
u32 sse_ConvertRGBColorTo4ub(const RGBColor& src);
}
#endif
void ia32_hook_capabilities()
{
#ifndef NO_COLOR
if (ia32_cap(SSE))
{
ConvertRGBColorTo4ub = sse_ConvertRGBColorTo4ub;
@@ -637,6 +663,7 @@ void ia32_hook_capabilities()
{
debug_printf("No SSE available. Slow fallback routines will be used.\n");
}
#endif
}
@@ -667,10 +694,10 @@ namespace test {
static void self_test()
{
test1();
test_float_int();
}
RUN_SELF_TEST;
SELF_TEST_RUN;
} // namespace test
#endif // #if SELF_TEST_ENABLED

View File

@@ -96,6 +96,7 @@ enum CpuCap
// extended (edx) - currently only defined by AMD
AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
AMD_MMX_EXT = 96+22,
AMD_3DNOW_PRO = 96+30,
AMD_3DNOW = 96+31
};
@@ -114,6 +115,8 @@ extern void ia32_hook_capabilities(void);
// (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
extern void ia32_get_current_context(void* pcontext);
extern void ia32_asm_init();
extern int ia32_get_call_target(void* ret_addr, void** target);
// order in which registers are stored in regs array

View File

@@ -66,17 +66,3 @@ i64 i64_from_double(double d)
}
#endif
// not possible with POSIX calls.
// called from ia32.cpp get_cpu_count
int on_each_cpu(void(*cb)())
{
#if OS_WIN
return wcpu_on_each_cpu(cb);
#else
// apparently not possible on non-Windows OSes because they seem to lack
// a CPU affinity API.
return ERR_NO_SYS;
#endif
}

View File

@@ -17,10 +17,6 @@
#include "ia32.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
// pass "omit frame pointer" setting on to the compiler
#if MSC_VERSION
# if CONFIG_OMIT_FP
@@ -43,6 +39,15 @@ extern "C" {
#endif
#ifdef __cplusplus
extern "C" {
#endif
//-----------------------------------------------------------------------------
// C99 / SUSv3 emulation where needed
//-----------------------------------------------------------------------------
// vsnprintf2: handles positional parameters and %lld.
// already available on *nix, emulated on Win32.
#if OS_WIN
@@ -51,6 +56,16 @@ extern int vsnprintf2(char* buffer, size_t count, const char* format, va_list ar
#define vsnprintf2 vsnprintf
#endif
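The point of requiring positional-parameter support: format strings can reorder arguments, e.g. for translation. A sketch (the snprintf2 wrapper is hypothetical):

#include <stdarg.h>

static int snprintf2(char* buf, size_t count, const char* fmt, ...)
{
	va_list args;
	va_start(args, fmt);
	const int ret = vsnprintf2(buf, count, fmt, args);
	va_end(args);
	return ret;
}

// "%1$", "%2$" select arguments by position:
//   snprintf2(buf, sizeof(buf), "%2$s: %1$lld bytes", (long long)size, name);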
#if !HAVE_C99
extern float fminf(float a, float b);
extern float fmaxf(float a, float b);
#endif
#if !MSC_VERSION
#define stricmp strcasecmp
#define strnicmp strncasecmp
#endif
// alloca: allocate on stack, automatically free, return 0 if out of mem.
// already available on *nix, emulated on Win32.
#if OS_WIN
@@ -58,16 +73,6 @@ extern int vsnprintf2(char* buffer, size_t count, const char* format, va_list ar
extern void* alloca(size_t size);
#endif
// memcpy2: hand-tuned version; works for all sizes and aligments and is
// significantly faster. uses SSE-optimized codepath when available.
// 10% for < 64byte transfers and up to 300% on large sizes.
#ifdef CPU_IA32
# define memcpy2 ia32_memcpy
extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
#else
# define memcpy2 memcpy
#endif
// rint: round float to nearest integral value.
// provided by C99, otherwise:
#if !HAVE_C99
@@ -82,20 +87,6 @@ extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
# endif
#endif
// i32_from_float et al: convert float to int. much faster than _ftol2,
// which would normally be used by (int) casts.
// .. fast IA-32 version: only used in some cases; see macro definition.
#if USE_IA32_FLOAT_TO_INT
# define i32_from_float ia32_i32_from_float
# define i32_from_double ia32_i32_from_double
# define i64_from_double ia32_i64_from_double
// .. portable C emulation
#else
extern i32 i32_from_float(float);
extern i32 i32_from_double(double);
extern i64 i64_from_double(double);
#endif
// finite: return 0 iff the given double is infinite or NaN.
#if OS_WIN
# define finite _finite
@@ -128,30 +119,29 @@ extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
#endif
//-----------------------------------------------------------------------------
// sysdep API
//-----------------------------------------------------------------------------
//
// output
//
enum DisplayErrorFlags
{
DE_ALLOW_SUPPRESS = 1,
DE_NO_CONTINUE = 2,
DE_MANUAL_BREAK = 4
};
extern void sys_display_msg(const char* caption, const char* msg);
extern void sys_display_msgw(const wchar_t* caption, const wchar_t* msg);
// choices offered by the shared error dialog
enum ErrorReaction
{
// ignore, continue as if nothing happened.
ER_CONTINUE = 1,
// note: don't start at 0 because that is interpreted as a
// DialogBoxParam failure.
// note: don't start at 0 because that is interpreted as a
// DialogBoxParam failure.
// ignore and do not report again.
// only returned if DE_ALLOW_SUPPRESS was passed.
ER_SUPPRESS,
// note: non-persistent; only applicable during this program run.
// note: non-persistent; only applicable during this program run.
// trigger breakpoint, i.e. enter debugger.
// only returned if DE_MANUAL_BREAK was passed; otherwise,
@@ -163,29 +153,33 @@ enum ErrorReaction
ER_EXIT
};
extern ErrorReaction display_error(const wchar_t* description, int flags,
uint skip, void* context, const char* file, int line);
// convenience version, in case the advanced parameters aren't needed.
// done this way instead of with default values so that it also works in C.
#define DISPLAY_ERROR(text) display_error(text, 0, 0, 0, __FILE__, __LINE__)
enum SysDisplayErrorFlags
{
DE_ALLOW_SUPPRESS = 1,
DE_NO_CONTINUE = 2,
DE_MANUAL_BREAK = 4
};
// internal use only (used by display_error)
extern ErrorReaction display_error_impl(const wchar_t* text, int flags);
extern void display_msg(const char* caption, const char* msg);
extern void wdisplay_msg(const wchar_t* caption, const wchar_t* msg);
extern ErrorReaction sys_display_error(const wchar_t* text, int flags);
//
// clipboard
//
extern int clipboard_set(const wchar_t* text);
extern wchar_t* clipboard_get(void);
extern int clipboard_free(wchar_t* copy);
// "copy" text into the clipboard. replaces previous contents.
extern int sys_clipboard_set(const wchar_t* text);
// allow "pasting" from clipboard. returns the current contents if they
// can be represented as text, otherwise 0.
// when it is no longer needed, the returned pointer must be freed via
// sys_clipboard_free. (NB: not necessary if zero, but doesn't hurt)
extern wchar_t* sys_clipboard_get(void);
// frees memory used by <copy>, which must have been returned by
// sys_clipboard_get. see note above.
extern int sys_clipboard_free(wchar_t* copy);
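Usage sketch for the get/free protocol documented above:

wchar_t* text = sys_clipboard_get();
if(text)	// contents were representable as text
{
	debug_wprintf(L"pasted: %ls\n", text);
	(void)sys_clipboard_free(text);
}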
//
@@ -194,11 +188,17 @@ extern int clipboard_free(wchar_t* copy);
// note: these do not warn on error; that is left to the caller.
// creates a cursor from the given texture file.
// creates a cursor from the given image.
// w, h specify image dimensions [pixels]. limit is implementation-
// dependent; 32x32 is typical and safe.
// bgra_img is the cursor image (BGRA format, bottom-up).
// it is no longer needed and can be freed after this call returns.
// hotspot (hx,hy) is the offset from its upper-left corner to the
// position where mouse clicks are registered.
// the cursor must be cursor_free-ed when no longer needed.
extern int sys_cursor_load(const char* filename,
// position where mouse clicks are registered.
// return: negative error code, or 0 on success. cursor is filled with
// a pointer and undefined on failure. it must be sys_cursor_free-ed
// when no longer needed.
extern int sys_cursor_create(uint w, uint h, void* bgra_img,
uint hx, uint hy, void** cursor);
// replaces the current system cursor with the one indicated. need only be
@@ -210,37 +210,67 @@ extern int sys_cursor_set(void* cursor);
extern int sys_cursor_free(void* cursor);
//
// misc
//
// OS-specific backend for error_description_r.
// NB: it is expected to be rare that OS return/error codes are actually
// seen by user code, but we still translate them for completeness.
extern int sys_error_description_r(int err, char* buf, size_t max_chars);
extern int get_executable_name(char* n_path, size_t buf_size);
// determine filename of the module to which the given address belongs.
// useful for handling exceptions in other modules.
// <path> receives full path to module; it must hold at least MAX_PATH chars.
// on error, it is set to L"".
// return path for convenience.
wchar_t* sys_get_module_filename(void* addr, wchar_t* path);
// return filename of the module which contains address <addr>,
// or L"" on failure. path holds the string and must be >= MAX_PATH chars.
wchar_t* get_module_filename(void* addr, wchar_t* path);
// store full path to the current executable.
// returns 0 or a negative error code.
// useful for determining installation directory, e.g. for VFS.
extern int sys_get_executable_name(char* n_path, size_t buf_size);
// have the user specify a directory via OS dialog.
// stores its full path in the given buffer, which must hold at least
// PATH_MAX chars.
// returns 0 on success or a negative error code.
extern int sys_pick_directory(char* n_path, size_t buf_size);
extern int pick_directory(char* n_path, size_t buf_size);
// not possible with POSIX calls.
// execute the specified function once on each CPU.
// this includes logical HT units and proceeds serially (function
// is never re-entered) in order of increasing OS CPU ID.
// note: implemented by switching thread affinity masks and forcing
// a reschedule, which is apparently not possible with POSIX.
// return 0 on success or a negative error code on failure
// (e.g. if OS is preventing us from running on some CPUs).
// called from ia32.cpp get_cpu_count
extern int on_each_cpu(void(*cb)());
extern int sys_on_each_cpu(void(*cb)());
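Usage mirrors get_cpu_count in ia32.cpp (the callback here is hypothetical):

static void tally_cpu()	// called once per CPU, never re-entered
{
	// e.g. query the APIC ID of the CPU we are now running on
}

// ...
if(sys_on_each_cpu(tally_cpu) < 0)
	debug_printf("per-CPU enumeration not available on this OS\n");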
#if !HAVE_C99
extern float fminf(float a, float b);
extern float fmaxf(float a, float b);
// drop-in replacement for libc memcpy(). only requires CPU support for
// MMX (by now universal). highly optimized for Athlon and Pentium III
// microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
// for details, see accompanying article.
#ifdef CPU_IA32
# define memcpy2 ia32_memcpy
extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
#else
# define memcpy2 memcpy
#endif
#if !MSC_VERSION
#define stricmp strcasecmp
#define strnicmp strncasecmp
// i32_from_float et al: convert float to int. much faster than _ftol2,
// which would normally be used by (int) casts.
// .. fast IA-32 version: only used in some cases; see macro definition.
#if USE_IA32_FLOAT_TO_INT
# define i32_from_float ia32_i32_from_float
# define i32_from_double ia32_i32_from_double
# define i64_from_double ia32_i64_from_double
// .. portable C emulation
#else
extern i32 i32_from_float(float);
extern i32 i32_from_double(double);
extern i64 i64_from_double(double);
#endif
@@ -249,11 +279,14 @@ extern float fmaxf(float a, float b);
#endif
// C++ linkage
//-----------------------------------------------------------------------------
// STL_HASH_MAP, STL_HASH_MULTIMAP, STL_HASH_SET
//-----------------------------------------------------------------------------
// these containers are useful but not part of C++98. most STL vendors
// provide them in some form; we hide their differences behind macros.
#if GCC_VERSION
// GCC
# include <ext/hash_map>
# include <ext/hash_set> // Probably?
@@ -283,6 +316,7 @@ namespace __gnu_cxx
}
#else // !__GNUC__
# include <hash_map>
# include <hash_set>
// VC7 or above
@@ -300,8 +334,7 @@ namespace __gnu_cxx
# define STL_HASH_MULTISET std::hash_multiset
# define STL_HASH_VALUE std::hash_value
# endif // MSC_VERSION >= 1300
#endif // !__GNUC__
#include "debug.h"
#endif // #ifndef SYSDEP_H_INCLUDED
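Client code is then vendor-neutral, e.g. (a trivial sketch):

STL_HASH_MAP<int, const char*> names;	// __gnu_cxx:: or std(x)::hash_map
names[42] = "answer";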

View File

@@ -14,12 +14,12 @@
// these are basic POSIX-compatible backends for the sysdep.h functions.
// Win32 has better versions which override these.
void display_msg(const char* caption, const char* msg)
void sys_display_msg(const char* caption, const char* msg)
{
fprintf(stderr, "%s: %s\n", caption, msg);
}
void wdisplay_msg(const wchar_t* caption, const wchar_t* msg)
void sys_display_msgw(const wchar_t* caption, const wchar_t* msg)
{
fwprintf(stderr, L"%ls: %ls\n", caption, msg);
}
@@ -51,7 +51,14 @@ int unix_get_cpu_info()
return 0;
}
ErrorReaction display_error_impl(const wchar_t* text, int flags)
// apparently not possible on non-Windows OSes because they seem to lack
// a CPU affinity API. see sysdep.h comment.
int sys_on_each_cpu(void(*cb)())
{
return ERR_NO_SYS;
}
ErrorReaction sys_display_error(const wchar_t* text, int flags)
{
printf("%ls\n\n", text);
@@ -112,6 +119,13 @@ ErrorReaction display_error_impl(const wchar_t* text, int flags)
// take advantage of hardware mouse cursors instead of the (jerky when
// loading) OpenGL cursor.
int sys_cursor_create(uint w, uint h, void* bgra_img,
uint hx, uint hy, void** cursor)
{
*cursor = 0;
return 0;
}
int sys_cursor_set(void* cursor)
{
return 0;

View File

@@ -30,45 +30,6 @@
// note: int instead of unsigned because <cpus> is also signed (tri-state).
static const int MAX_CPUS = 32;
int wcpu_on_each_cpu(void(*cb)())
{
const HANDLE hProcess = GetCurrentProcess();
DWORD process_affinity, system_affinity;
if(!GetProcessAffinityMask(hProcess, &process_affinity, &system_affinity))
return -1;
// our affinity != system affinity: OS is limiting the CPUs that
// this process can run on. fail (cannot call back for each CPU).
if(process_affinity != system_affinity)
return -1;
for(DWORD cpu_bit = 1; cpu_bit != 0 && cpu_bit <= process_affinity; cpu_bit *= 2)
{
// check if we can switch to target CPU
if(!(process_affinity & cpu_bit))
continue;
// .. and do so.
if(!SetProcessAffinityMask(hProcess, process_affinity))
{
debug_warn("SetProcessAffinityMask failed");
continue;
}
// reschedule, to make sure we switch CPUs
Sleep(0);
cb();
}
// restore to original value
SetProcessAffinityMask(hProcess, process_affinity);
return 0;
}
static void check_speedstep()
{
WIN_SAVE_LAST_ERROR;

View File

@@ -1 +1 @@
extern int wcpu_on_each_cpu(void(*cb)());

View File

@@ -91,8 +91,8 @@ void wdbg_set_thread_name(const char* name)
}
__except(EXCEPTION_EXECUTE_HANDLER)
{
// if we get here, apparently this hack is not longer supported.
debug_warn("TODO: find alternative thread name implementation");
// if we get here, the debugger didn't handle the exception.
debug_warn("thread name hack doesn't work under this debugger");
}
}

View File

@@ -149,7 +149,7 @@ static int sym_init()
const BOOL fInvadeProcess = TRUE;
// .. use default *symbol* search path. we don't use this to locate
// our PDB file because its absolute path is stored inside the EXE.
const char* UserSearchPath = 0;
PCSTR UserSearchPath = 0;
BOOL ok = SymInitialize(hProcess, UserSearchPath, fInvadeProcess);
WARN_IF_FALSE(ok);
@@ -2205,7 +2205,7 @@ static void self_test()
test_addrs(123, 3.1415926535897932384626, "pchar string", 0xf00d);
}
RUN_SELF_TEST;
SELF_TEST_RUN;
#pragma optimize("", on)
} // namespace test

View File

@@ -16,15 +16,6 @@
// Jan.Wassenberg@stud.uni-karlsruhe.de
// http://www.stud.uni-karlsruhe.de/~urkt/
// TODO: should use GetMessage when not active to reduce CPU load.
// where to do this?
// - force the app to check for SDL's activation messages, and call
// sdl-wait-message?
// - do it here, just make SDL_PollEvent block until message received?
// - have the app use another free-the-cpu method, since it controls the main loop.
// this is what's currently happening.
#include "precompiled.h"
#include <stdio.h>
@@ -919,6 +910,12 @@ static LRESULT CALLBACK wndproc(HWND hWnd, UINT uMsg, WPARAM wParam, LPARAM lPar
void SDL_PumpEvents(void)
{
// rationale: we would like to reduce CPU usage automatically if
// possible. blocking here until a message arrives would accomplish
// that, but might potentially freeze the app too long.
// instead, the app should check active state and call SDL_Delay etc.
// if our window is minimized.
MSG msg;
while(PeekMessageW(&msg, 0, 0, 0, PM_REMOVE))
{
@@ -1129,7 +1126,7 @@ int SDL_KillThread(SDL_Thread* thread)
void SDL_WM_SetCaption(const char* title, const char* icon)
{
SetWindowText(hWnd, title);
WARN_IF_FALSE(SetWindowText(hWnd, title));
UNUSED2(icon); // TODO: implement
}