- self test: rename stuff to SELF_TEST*; add provision for delayed all-at-once self tests (allows for init before the test and makes measuring elapsed time easier)
- config: add CONFIG_TRACE - display_error_impl->sys_display_error - cleaned up sysdep; add sys_ prefix everywhere and document everything - add to vfs_load dox - cursor: sys_cursor_load -> sys_cursor_create. sysdep code is no longer dependent on tex; instead of calling tex_load, the caller passes a BGRA texture in. memcpy: huge kick in the pants for accompanying paper; now even faster. - on_each_cpu -> sys_on_each_cpu (removed manager function also) - wsdl: explain PeekMessage CPU usage issue This was SVN commit r3203.
This commit is contained in:
parent
a5d1968a8c
commit
e2f25f4598
@ -549,7 +549,7 @@ static void self_test()
|
||||
test_matrix();
|
||||
}
|
||||
|
||||
RUN_SELF_TEST;
|
||||
SELF_TEST_RUN;
|
||||
|
||||
} // namespace test
|
||||
#endif // #if SELF_TEST_ENABLED
|
||||
|
@ -46,6 +46,13 @@
|
||||
# define CONFIG_PARANOIA 0
|
||||
#endif
|
||||
|
||||
// enable trace output for low-level code - various functions will
|
||||
// debug_printf when they are entered/exited. note that the appropriate
|
||||
// TRACEn tags must be debug_filter_add-ed for this to have any effect.
|
||||
#ifndef CONFIG_TRACE
|
||||
# define CONFIG_TRACE 0
|
||||
#endif
|
||||
|
||||
// try to prevent any exceptions from being thrown - even by the C++
|
||||
// standard library. useful only for performance tests.
|
||||
#ifndef CONFIG_DISABLE_EXCEPTIONS
|
||||
|
@ -492,7 +492,7 @@ ErrorReaction display_error(const wchar_t* description, int flags,
|
||||
text = L"(insufficient memory to display error message)";
|
||||
|
||||
debug_write_crashlog(text);
|
||||
ErrorReaction er = display_error_impl(text, flags);
|
||||
ErrorReaction er = sys_display_error(text, flags);
|
||||
|
||||
// note: debug_break-ing here to make sure the app doesn't continue
|
||||
// running is no longer necessary. display_error now determines our
|
||||
|
@ -175,7 +175,7 @@ extern enum ErrorReaction debug_warn_err(int err, const char* file, int line,
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// logging
|
||||
// output
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// write a formatted string to the debug channel, subject to filtering
|
||||
@ -185,6 +185,14 @@ extern void debug_printf(const char* fmt, ...);
|
||||
extern void debug_wprintf(const wchar_t* fmt, ...);
|
||||
|
||||
|
||||
extern ErrorReaction display_error(const wchar_t* description, int flags,
|
||||
uint skip, void* context, const char* file, int line);
|
||||
|
||||
// convenience version, in case the advanced parameters aren't needed.
|
||||
// done this way instead of with default values so that it also works in C.
|
||||
#define DISPLAY_ERROR(text) display_error(text, 0, 0, 0, __FILE__, __LINE__)
|
||||
|
||||
|
||||
//
|
||||
// filtering
|
||||
//
|
||||
|
@ -517,7 +517,7 @@ static void self_test()
|
||||
test_log2();
|
||||
}
|
||||
|
||||
RUN_SELF_TEST;
|
||||
SELF_TEST_RUN;
|
||||
|
||||
} // namespace test
|
||||
#endif // #if SELF_TEST_ENABLED
|
||||
|
@ -27,6 +27,11 @@ static const char* lib_error_description(int err)
|
||||
}
|
||||
|
||||
|
||||
// generate textual description of an error code.
|
||||
// stores up to <max_chars> in the given buffer.
|
||||
// <err> can be one of the above error codes, POSIX ENOENT etc., or
|
||||
// an OS-specific error. if unknown, the string will be something like
|
||||
// "Unknown error (65536, 0x10000)".
|
||||
void error_description_r(int err, char* buf, size_t max_chars)
|
||||
{
|
||||
// lib error
|
||||
@ -69,11 +74,3 @@ void error_description_r(int err, char* buf, size_t max_chars)
|
||||
if(!have_output)
|
||||
snprintf(buf, max_chars, "Unknown error (%d, 0x%X)", err, err);
|
||||
}
|
||||
|
||||
|
||||
const char* error_description(int err)
|
||||
{
|
||||
static char buf[200];
|
||||
error_description_r(err, buf, ARRAY_SIZE(buf));
|
||||
return buf;
|
||||
}
|
@ -65,10 +65,15 @@ ERR(-100704, ERR_SHDR_NO_PROGRAM, "Invalid shader program reference")
|
||||
#ifndef ERRORS_H__
|
||||
#define ERRORS_H__
|
||||
|
||||
// limits on the errors defined above (used by error_description_r)
|
||||
#define ERR_MIN 100000
|
||||
#define ERR_MAX 110000
|
||||
|
||||
extern const char* error_description(int err);
|
||||
// generate textual description of an error code.
|
||||
// stores up to <max_chars> in the given buffer.
|
||||
// <err> can be one of the above error codes, POSIX ENOENT etc., or
|
||||
// an OS-specific error. if unknown, the string will be something like
|
||||
// "Unknown error (65536, 0x10000)".
|
||||
extern void error_description_r(int err, char* buf, size_t max_chars);
|
||||
|
||||
#endif // #ifndef ERRORS_H__
|
@ -957,7 +957,7 @@ static void self_test()
|
||||
multithreaded_torture_test();
|
||||
}
|
||||
|
||||
RUN_SELF_TEST;
|
||||
SELF_TEST_RUN;
|
||||
|
||||
} // namespace test
|
||||
#endif // #if SELF_TEST_ENABLED
|
||||
|
@ -1271,7 +1271,7 @@ char* mmgr_getcwd_dbg(char* buf, size_t buf_size, const char* file, int line, co
|
||||
//
|
||||
|
||||
static void* new_common(size_t size, AllocType type,
|
||||
const char* file, int line, const char* func)
|
||||
const char* file, int line, const char* func)
|
||||
{
|
||||
const char* allocator = types[type];
|
||||
|
||||
|
@ -255,7 +255,7 @@ int file_set_root_dir(const char* argv0, const char* rel_path)
|
||||
// get full path to executable
|
||||
char n_path[PATH_MAX];
|
||||
// .. first try safe, but system-dependent version
|
||||
if(get_executable_name(n_path, PATH_MAX) < 0)
|
||||
if(sys_get_executable_name(n_path, PATH_MAX) < 0)
|
||||
{
|
||||
// .. failed; use argv[0]
|
||||
if(!realpath(argv0, n_path))
|
||||
|
@ -537,14 +537,16 @@ static ssize_t vfs_timed_io(const Handle hf, const size_t size, void** p, FileIO
|
||||
}
|
||||
|
||||
|
||||
// load the entire file <fn> into memory; return a handle to the memory
|
||||
// and the buffer address/size. output parameters are zeroed on failure.
|
||||
// in addition to the regular file cache, the entire buffer is kept in memory
|
||||
// if flags & FILE_CACHE.
|
||||
// load the entire file <fn> into memory.
|
||||
// returns a memory handle to the file's contents or a negative error code.
|
||||
// p and size are filled with address/size of buffer (0 on failure).
|
||||
// flags influences IO mode and is typically 0.
|
||||
// in addition to the regular file cache, the entire buffer is
|
||||
// kept in memory if flags & FILE_CACHE.
|
||||
// when the file contents are no longer needed, you can mem_free_h the
|
||||
// Handle, or mem_free(p).
|
||||
//
|
||||
// on failure, a debug_warn is generated and a negative error code returned.
|
||||
//
|
||||
// note: we need the Handle return value for Tex.hm - the data pointer
|
||||
// rationale: we need the Handle return value for Tex.hm - the data pointer
|
||||
// must be protected against being accidentally free-d in that case.
|
||||
Handle vfs_load(const char* v_fn, void*& p, size_t& size, uint flags /* default 0 */)
|
||||
{
|
||||
|
@ -383,10 +383,14 @@ extern ssize_t vfs_io(Handle hf, size_t size, void** p, FileIOCB cb = 0, uintptr
|
||||
|
||||
// convenience functions that replace vfs_open / vfs_io / vfs_close:
|
||||
|
||||
// load the entire file <fn> into memory; return a memory handle to the
|
||||
// buffer and its address/size. output parameters are zeroed on failure.
|
||||
// in addition to the regular file cache, the entire buffer is kept in memory
|
||||
// if flags & FILE_CACHE.
|
||||
// load the entire file <fn> into memory.
|
||||
// returns a memory handle to the file's contents or a negative error code.
|
||||
// p and size are filled with address/size of buffer (0 on failure).
|
||||
// flags influences IO mode and is typically 0.
|
||||
// in addition to the regular file cache, the entire buffer is
|
||||
// kept in memory if flags & FILE_CACHE.
|
||||
// when the file contents are no longer needed, you can mem_free_h the
|
||||
// Handle, or mem_free(p).
|
||||
extern Handle vfs_load(const char* fn, void*& p, size_t& size, uint flags = 0);
|
||||
|
||||
extern ssize_t vfs_store(const char* fn, void* p, size_t size, uint flags = 0);
|
||||
|
@ -18,6 +18,42 @@
|
||||
#include "ogl_tex.h"
|
||||
#include "cursor.h"
|
||||
|
||||
|
||||
// try to create a system (OS-level) cursor from the texture file <filename>.
// hx, hy are passed through to sys_cursor_create as the hotspot.
// returns the cursor created by sys_cursor_create, or 0 if system cursors
// are compiled out (!ALLOW_SYS_CURSOR) or any step fails. a debug_warn is
// raised for failures after the texture has been loaded; a failed
// tex_load just returns 0.
static void* load_sys_cursor(const char* filename, int hx, int hy)
{
#if !ALLOW_SYS_CURSOR
	return 0;
#else
	Tex t;
	if(tex_load(filename, &t) < 0)
		return 0;

	void* sys_cursor = 0;	// filled in by sys_cursor_create
	bool succeeded = false;

	// convert to required BGRA format.
	const uint flags = (t.flags | TEX_BGR) & ~TEX_DXT;
	if(tex_transform_to(&t, flags) >= 0)
	{
		void* bgra_img = tex_get_data(&t);
		if(bgra_img)
		{
			if(sys_cursor_create(t.w, t.h, bgra_img, hx, hy, &sys_cursor) >= 0)
				succeeded = true;
		}
	}

	if(!succeeded)
		debug_warn("failed");

	// the texture is freed on both the success and the failure path.
	(void)tex_free(&t);
	return succeeded? sys_cursor : 0;
#endif
}
|
||||
|
||||
|
||||
// no init is necessary because this is stored in struct Cursor, which
|
||||
// is 0-initialized by h_mgr.
|
||||
class GLCursor
|
||||
@ -124,12 +160,8 @@ static int Cursor_reload(Cursor* c, const char* name, Handle)
|
||||
|
||||
// load actual cursor
|
||||
snprintf(filename, ARRAY_SIZE(filename), "art/textures/cursors/%s.dds", name);
|
||||
// .. system cursor (2d, hardware accelerated)
|
||||
#if ALLOW_SYS_CURSOR
|
||||
WARN_ERR(sys_cursor_load(filename, hotspotx, hotspoty, &c->sys_cursor));
|
||||
#else
|
||||
c->sys_cursor = 0;
|
||||
#endif
|
||||
// .. try loading as system cursor (2d, hardware accelerated)
|
||||
c->sys_cursor = load_sys_cursor(filename, hotspotx, hotspoty);
|
||||
// .. fall back to GLCursor (system cursor code is disabled or failed)
|
||||
if(!c->sys_cursor)
|
||||
RETURN_ERR(c->gl_cursor.create(filename, hotspotx, hotspoty));
|
||||
|
@ -19,17 +19,49 @@
|
||||
#include "precompiled.h"
|
||||
|
||||
#include "self_test.h"
|
||||
#include "timer.h"
|
||||
|
||||
// checked by debug_assert_failed; disables asserts if true (see above).
|
||||
// set/cleared by run_self_test.
|
||||
// set/cleared by self_test_run.
|
||||
bool self_test_active = false;
|
||||
|
||||
// trampoline that sets self_test_active and returns a dummy value;
|
||||
// used by RUN_SELF_TEST.
|
||||
int run_self_test(void(*test_func)())
|
||||
// used by SELF_TEST_RUN.
|
||||
int self_test_run(void(*func)())
|
||||
{
|
||||
self_test_active = true;
|
||||
test_func();
|
||||
func();
|
||||
self_test_active = false;
|
||||
return 0;
|
||||
return 0; // assigned to dummy at file scope
|
||||
}
|
||||
|
||||
|
||||
static const SelfTestRecord* registered_tests;
|
||||
|
||||
// push <r> onto the front of the list of registered self-tests, which
// self_test_run_all later walks. SELF_TEST_REGISTER has already
// initialized r->func, so only the link is set here.
// always returns 0; the macro assigns that to a dummy at file scope.
int self_test_register(SelfTestRecord* r)
{
	const SelfTestRecord* const previous_head = registered_tests;
	r->next = previous_head;
	registered_tests = r;
	return 0;
}
|
||||
|
||||
|
||||
void self_test_run_all()
|
||||
{
|
||||
debug_printf("SELF TESTS:\n");
|
||||
const double t0 = get_time();
|
||||
|
||||
// someone somewhere may want to run self-tests twice (e.g. to help
|
||||
// track down memory corruption), so don't destroy the list while
|
||||
// iterating over it.
|
||||
const SelfTestRecord* r = registered_tests;
|
||||
while(r)
|
||||
{
|
||||
self_test_run(r->func);
|
||||
r = r->next;
|
||||
}
|
||||
|
||||
const double dt = get_time() - t0;
|
||||
debug_printf("-- done (elapsed time %.0f ms)\n", dt*1e3);
|
||||
}
|
@ -42,8 +42,9 @@ What makes a good self-test?
|
||||
bad inputs ("does it reject those?"), and successes ("did it have the
|
||||
expected result?").
|
||||
- Tests should be non-intrusive (only bother user if something fails) and
|
||||
very quick. This is because we run them automatically at startup,
|
||||
which solves the common problem of making sure they actually run.
|
||||
very quick. This is because they are executed every program run - which
|
||||
is a good thing because it solves the common problem of forgetting to
|
||||
run them after a change.
|
||||
|
||||
If the test is unavoidably slow or annoying (example: wdbg_sym's
|
||||
stack trace), then best to disable it by default; see below for how.
|
||||
@ -74,7 +75,7 @@ static void self_test()
|
||||
// further test groups..
|
||||
}
|
||||
|
||||
RUN_SELF_TEST; // (4)
|
||||
SELF_TEST_RUN; // (4)
|
||||
|
||||
} // namespace test
|
||||
#endif // #if SELF_TEST_ENABLED
|
||||
@ -117,21 +118,40 @@ For further details, see below.
|
||||
// and this is the only error reporter guaranteed to work.
|
||||
//
|
||||
// note: could also stringize condition and display that, but it'd require
|
||||
// macro magic (stringize+prepend L) and we already get file+line.
|
||||
// macro magic (stringize+prepend L) and we already display file+line.
|
||||
#define TEST(condition) STMT(\
|
||||
if(!(condition))\
|
||||
DISPLAY_ERROR(L"Self-test failed");\
|
||||
)
|
||||
|
||||
|
||||
// your source file should contain a void function "self_test" that
|
||||
// your source file should contain a function: void self_test(void) that
|
||||
// performs all tests or calls out to individual test functions.
|
||||
// this macro calls it at static init time and takes care of setting
|
||||
// self_test_active (see above).
|
||||
//
|
||||
// rationale: since compiler optimizations may mess with the dummy variable,
|
||||
// best to put this in a macro so we won't have to change each occurrence.
|
||||
#define RUN_SELF_TEST static int dummy = run_self_test(self_test)
|
||||
#define SELF_TEST_RUN\
|
||||
static int dummy = self_test_run(self_test)
|
||||
|
||||
// calling at static init time may not always be desirable - some
|
||||
// self-tests may require initialization beforehand. this mechanism allows
|
||||
// registering self tests automatically, which are then all run when you
|
||||
// call self_test_run_all.
|
||||
#define SELF_TEST_REGISTER\
|
||||
static SelfTestRecord self_test_record = { self_test, 0 };\
|
||||
static int dummy = self_test_register(&self_test_record)
|
||||
|
||||
// one node in the singly-linked list of registered self-tests.
// instances are created at file scope by SELF_TEST_REGISTER and linked
// together by self_test_register.
struct SelfTestRecord
|
||||
{
|
||||
// the file's self_test function; called via self_test_run when
// self_test_run_all walks the list.
void(*func)();
|
||||
// next record in the list (0 at the end); set by self_test_register.
const SelfTestRecord* next;
|
||||
};
|
||||
|
||||
// call all self-tests registered thus far. rationale: see above.
|
||||
// also displays a banner+elapsed time via debug_printf.
|
||||
extern void self_test_run_all();
|
||||
|
||||
|
||||
//
|
||||
@ -139,8 +159,10 @@ For further details, see below.
|
||||
//
|
||||
|
||||
// trampoline that sets self_test_active and returns a dummy value;
|
||||
// used by RUN_SELF_TEST.
|
||||
extern int run_self_test(void(*test_func)());
|
||||
// used by SELF_TEST_RUN.
|
||||
extern int self_test_run(void(*func)());
|
||||
|
||||
extern int self_test_register(SelfTestRecord* r);
|
||||
|
||||
// checked by debug_assert_failed; disables asserts if true (see above).
|
||||
// set/cleared by self_test_run.
|
||||
|
@ -391,7 +391,7 @@ static void self_test()
|
||||
test_concatenate();
|
||||
}
|
||||
|
||||
RUN_SELF_TEST;
|
||||
SELF_TEST_RUN;
|
||||
|
||||
#endif // #if SELF_TEST_ENABLED
|
||||
|
||||
|
@ -16,267 +16,365 @@ section .text use32
|
||||
; fast general memcpy
|
||||
;-------------------------------------------------------------------------------
|
||||
|
||||
; optimized for Athlon XP: 7.3% faster (cumulative) than VC7.1's memcpy over
|
||||
; all 1..64 byte transfer lengths and misalignments. approaches maximum
|
||||
; mem bandwidth (2000 MiB/s) for transfers >= 192KiB!
|
||||
; Pentium III performance: about 3% faster in above small buffer benchmark.
|
||||
;
|
||||
; disables specialized large transfer (> 64KiB) implementations if SSE
|
||||
; isn't available; we do assume MMX support, though (quite safe).
|
||||
; drop-in replacement for libc memcpy(). only requires CPU support for
|
||||
; MMX (by now universal). highly optimized for Athlon and Pentium III
|
||||
; microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
|
||||
; for details, see accompanying article.
|
||||
|
||||
; if memcpy size is greater than this,
|
||||
; if transfer size is at least this much,
|
||||
; .. it's too big for L1. use non-temporal instructions.
|
||||
UC_THRESHOLD equ 64*1024
|
||||
; .. it also blows L2. pull chunks into L1 ("block prefetch").
|
||||
BP_THRESHOLD equ 192*1024
|
||||
|
||||
; maximum that can be copied by IC_MOVSD.
|
||||
; if you change this, be sure to expand the movs* table(s)!
|
||||
IC_SIZE equ 67
|
||||
; maximum that can be copied by IC_TINY.
|
||||
IC_TINY_MAX equ 63
|
||||
|
||||
; size of one block prefetch chunk.
|
||||
; if you change this, make sure "push byte BP_SIZE/128" doesn't overflow!
|
||||
BP_SIZE equ 8*1024
|
||||
|
||||
|
||||
; > ecx = size (<= IC_SIZE)
|
||||
; x eax, ecx
|
||||
;
|
||||
; determined to be fastest approach by testing. a movsd table followed by
|
||||
; rep movsb is a bit smaller but 6.9% slower; everything else is much worse.
|
||||
%macro IC_MOVSD 0
|
||||
mov eax, ecx
|
||||
shr ecx, 2 ; dword count
|
||||
neg ecx
|
||||
add ecx, %%movsd_table_end
|
||||
jmp ecx
|
||||
align 8
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
%%movsd_table_end:
|
||||
;------------------------------------------------------------------------------
|
||||
|
||||
and eax, 3
|
||||
neg eax
|
||||
add eax, %%movsb_table_end
|
||||
jmp eax
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
%%movsb_table_end:
|
||||
; [p3] replicating this instead of jumping to it from tailN
|
||||
; saves 1 clock and costs (7-2)*2 bytes code.
|
||||
%macro EPILOG 0
|
||||
pop esi
|
||||
pop edi
|
||||
mov eax, [esp+4] ; return dst
|
||||
ret
|
||||
%endm
|
||||
|
||||
|
||||
; align destination address to multiple of 8.
|
||||
; not done for small transfers because it doesn't help IC_MOVSD.
|
||||
%macro IC_ALIGN 0
|
||||
mov eax, 8
|
||||
sub eax, edi
|
||||
and eax, byte 7 ; eax = # misaligned bytes
|
||||
sub ecx, eax ; reduce copy count
|
||||
neg eax
|
||||
add eax, %%align_table_end
|
||||
jmp eax
|
||||
align 64
|
||||
tail1:
|
||||
mov al, [esi+ecx*4]
|
||||
mov [edi+ecx*4], al
|
||||
align 4
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
%%align_table_end:
|
||||
tail0:
|
||||
EPILOG
|
||||
|
||||
align 8
|
||||
tail3:
|
||||
; [p3] 2 reads followed by 2 writes is better than
|
||||
; R/W interleaved and RRR/WWW
|
||||
mov al, [esi+ecx*4+2]
|
||||
mov [edi+ecx*4+2], al
|
||||
; already aligned to 8 due to above code
|
||||
tail2:
|
||||
mov al, [esi+ecx*4]
|
||||
mov dl, [esi+ecx*4+1]
|
||||
mov [edi+ecx*4], al
|
||||
mov [edi+ecx*4+1], dl
|
||||
EPILOG
|
||||
|
||||
[section .data]
|
||||
align 16
|
||||
tail_table dd tail0, tail1, tail2, tail3
|
||||
__SECT__
|
||||
|
||||
; 15x unrolled copy loop - transfers DWORDs backwards.
|
||||
; indexed via table of 8-bit offsets.
|
||||
; rationale:
|
||||
; - [p3] backwards vs. forwards makes no difference.
|
||||
; - MOV is faster than MOVSD.
|
||||
; - index table is needed because calculating end-6*i is slower than
|
||||
; a LUT and we wouldn't want to expand entries to 8 bytes
|
||||
; (that'd increase code footprint by 30 bytes)
|
||||
; - a byte index accessed via MOVZX is better due to less dcache usage.
|
||||
; - only unrolling 8x and 'reentering' the loop is possible but
|
||||
; slower due to fiddling with esi/ecx.
|
||||
align 64
|
||||
unrolled_copy_code_start:
|
||||
%assign i 15
|
||||
%rep 14 ; 15 entries, 1 base case handled below
|
||||
uc_ %+ i:
|
||||
mov eax, [esi+i*4-4]
|
||||
mov [edi+i*4-4], eax
|
||||
%assign i i-1
|
||||
%endrep
|
||||
; base case: no displacement needed; skip it so that code will
|
||||
; be aligned to 8 bytes after this.
|
||||
uc_1:
|
||||
mov eax, [esi]
|
||||
mov [edi], eax
|
||||
uc_0:
|
||||
jmp [tail_table+edx*4]
|
||||
|
||||
[section .data]
|
||||
align 32
|
||||
unrolled_copy_index_table:
|
||||
%assign i 0
|
||||
%rep 16
|
||||
db (uc_ %+ i) - unrolled_copy_code_start
|
||||
%assign i i+1
|
||||
%endrep
|
||||
__SECT__
|
||||
|
||||
|
||||
;------------------------------------------------------------------------------
|
||||
; tiny copy - handles all cases smaller than IC_MOVQ's 64 byte lower limit.
|
||||
; > edx = number of bytes (<= IC_TINY_MAX)
|
||||
; < does not return.
|
||||
; x eax, ecx, edx
|
||||
%macro IC_TINY 0
|
||||
mov ecx, edx
|
||||
shr ecx, 2
|
||||
; calculating this address isn't possible due to skipping displacement on uc1;
|
||||
; even so, it'd require calculating -6*ecx, which is slower than LUT.
|
||||
movzx eax, byte [unrolled_copy_index_table+ecx]
|
||||
and edx, byte 3
|
||||
add eax, unrolled_copy_code_start
|
||||
jmp eax
|
||||
; never reached! the unrolled loop jumps into tailN, which
|
||||
; then returns from the memcpy function.
|
||||
%endm
|
||||
|
||||
|
||||
; > ecx = size
|
||||
; x edx
|
||||
;------------------------------------------------------------------------------
|
||||
; align destination address to multiple of 8. important for large transfers,
|
||||
; but doesn't affect the tiny technique.
|
||||
; > esi, edi -> buffers (updated)
|
||||
; > ecx, edx = transfer size (updated)
|
||||
; x eax
|
||||
%macro IC_ALIGN 0
|
||||
mov eax, edi
|
||||
and eax, byte 7 ; eax = # misaligned bytes
|
||||
jz already_aligned ; early out
|
||||
lea eax, [align_table_start+eax*2]
|
||||
jmp eax
|
||||
|
||||
; [p3] this is no slower than a table of mov and much smaller/simpler
|
||||
align 8
|
||||
align_table_start:
|
||||
%rep 8
|
||||
dec ecx
|
||||
movsb
|
||||
%endrep
|
||||
mov edx, ecx
|
||||
already_aligned:
|
||||
%endm
|
||||
|
||||
|
||||
;------------------------------------------------------------------------------
|
||||
; MMX MOVQ technique. used for in-cache transfers of 64B..64KiB.
|
||||
; must run on all CPUs, i.e. cannot use the SSE prefetchnta instruction.
|
||||
; > ecx = -number_of_bytes (multiple of 64)
|
||||
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
|
||||
; < ecx = 0
|
||||
; x
|
||||
%macro IC_MOVQ 0
|
||||
|
||||
; see notes below. TODO: if simple addressing is better on Athlons as well, prevent this from happening in setup code when not doing large transfers
|
||||
add esi, ecx
|
||||
add edi, ecx
|
||||
|
||||
align 16
|
||||
mov edx, 64
|
||||
%%loop:
|
||||
cmp ecx, edx
|
||||
jb %%done
|
||||
prefetchnta [esi + (200*64/34+192)]
|
||||
movq mm0, [esi+0]
|
||||
|
||||
; notes:
|
||||
; - we can't use prefetch here - this codepath must support all CPUs.
|
||||
; [p3] that makes us 5..15% slower on 1KiB..4KiB transfers.
|
||||
; - [p3] simple addressing without +ecx is 3.5% faster.
|
||||
; - [p3] there's no difference between RR/WW/RR/WW and R..R/W..W
|
||||
; with simple addressing and no prefetch.
|
||||
; - enough time elapses between first and third pair of reads that we
|
||||
; could reuse MM0. there is no performance gain either way and
|
||||
; differing displacements make code compression futile, so
|
||||
; we'll just use MM4..7 for clarity.
|
||||
movq mm0, [esi]
|
||||
movq mm1, [esi+8]
|
||||
movq [edi+0], mm0
|
||||
movq [edi], mm0
|
||||
movq [edi+8], mm1
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
movq [edi+16], mm2
|
||||
movq [edi+24], mm3
|
||||
movq mm0, [esi+32]
|
||||
movq mm1, [esi+40]
|
||||
movq [edi+32], mm0
|
||||
movq [edi+40], mm1
|
||||
movq mm2, [esi+48]
|
||||
movq mm3, [esi+56]
|
||||
movq [edi+48], mm2
|
||||
movq [edi+56], mm3
|
||||
add esi, edx
|
||||
add edi, edx
|
||||
sub ecx, edx
|
||||
jmp %%loop
|
||||
%%done:
|
||||
%endm
|
||||
|
||||
|
||||
; > ecx = size (> 64)
|
||||
; x
|
||||
%macro UC_MOVNTQ 0
|
||||
mov edx, 64
|
||||
align 16
|
||||
%%1:
|
||||
prefetchnta [esi + (200*64/34+192)]
|
||||
movq mm0,[esi+0]
|
||||
add edi, edx
|
||||
movq mm1,[esi+8]
|
||||
add esi, edx
|
||||
movq mm2,[esi-48]
|
||||
movntq [edi-64], mm0
|
||||
movq mm0,[esi-40]
|
||||
movntq [edi-56], mm1
|
||||
movq mm1,[esi-32]
|
||||
movntq [edi-48], mm2
|
||||
movq mm2,[esi-24]
|
||||
movntq [edi-40], mm0
|
||||
movq mm0,[esi-16]
|
||||
movntq [edi-32], mm1
|
||||
movq mm1,[esi-8]
|
||||
movntq [edi-24], mm2
|
||||
movntq [edi-16], mm0
|
||||
sub ecx, edx
|
||||
movntq [edi-8], mm1
|
||||
cmp ecx, edx
|
||||
jae %%1
|
||||
%endm
|
||||
|
||||
|
||||
; > ecx = size (> 8KiB)
|
||||
; x eax, edx
|
||||
;
|
||||
; somewhat optimized for size (futile attempt to avoid near jump)
|
||||
%macro UC_BP_MOVNTQ 0
|
||||
%%prefetch_and_copy_chunk:
|
||||
|
||||
; touch each cache line within chunk in reverse order (prevents HW prefetch)
|
||||
push byte BP_SIZE/128 ; # iterations
|
||||
pop eax
|
||||
add esi, BP_SIZE
|
||||
align 8
|
||||
%%prefetch_chunk:
|
||||
mov edx, [esi-64]
|
||||
mov edx, [esi-128]
|
||||
sub esi, 128
|
||||
dec eax
|
||||
jnz %%prefetch_chunk
|
||||
|
||||
; copy 64 byte blocks
|
||||
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
|
||||
push byte 64
|
||||
pop edx
|
||||
align 8
|
||||
%%copy_block:
|
||||
movq mm0, [esi+ 0]
|
||||
movq mm1, [esi+ 8]
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
movq mm4, [esi+32]
|
||||
movq mm5, [esi+40]
|
||||
movq [edi+32], mm4
|
||||
movq [edi+40], mm5
|
||||
movq mm6, [esi+48]
|
||||
movq mm7, [esi+56]
|
||||
add esi, edx
|
||||
movntq [edi+ 0], mm0
|
||||
movntq [edi+ 8], mm1
|
||||
movntq [edi+16], mm2
|
||||
movntq [edi+24], mm3
|
||||
movntq [edi+32], mm4
|
||||
movntq [edi+40], mm5
|
||||
movntq [edi+48], mm6
|
||||
movntq [edi+56], mm7
|
||||
add edi, edx
|
||||
dec eax
|
||||
jnz %%copy_block
|
||||
|
||||
sub ecx, BP_SIZE
|
||||
cmp ecx, BP_SIZE
|
||||
jae %%prefetch_and_copy_chunk
|
||||
movq [edi+48], mm6
|
||||
movq [edi+56], mm7
|
||||
add esi, byte 64
|
||||
add edi, byte 64
|
||||
add ecx, byte 64
|
||||
jnz %%loop
|
||||
%endm
|
||||
|
||||
|
||||
[section .bss]
|
||||
;------------------------------------------------------------------------------
|
||||
; SSE MOVNTQ technique. used for transfers that do not fit in L1,
|
||||
; i.e. 64KiB..192KiB. requires Pentium III or Athlon; caller checks for this.
|
||||
; > ecx = -number_of_bytes (multiple of 64)
|
||||
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
|
||||
; < ecx = 0
|
||||
; x
|
||||
%macro UC_MOVNTQ 0
|
||||
|
||||
; this is somewhat "clever". the 2 specialized transfer implementations
|
||||
; that use SSE are jumped to if transfer size is greater than a threshold.
|
||||
; we simply set the requested transfer size to 0 if the CPU doesn't
|
||||
; support SSE so that those are never reached (done by masking with this).
|
||||
sse_mask resd 1
|
||||
align 16
|
||||
%%loop:
|
||||
; notes:
|
||||
; - the AMD optimization manual recommends prefetch distances according to
|
||||
; (200*BytesPerIter/ClocksPerIter+192), which comes out to ~560 here.
|
||||
; [p3] rounding down to 512 bytes makes for significant gains.
|
||||
; - [p3] complex addressing with ecx is 1% faster than adding to esi/edi.
|
||||
prefetchnta [esi+ecx+512]
|
||||
movq mm0, [esi+ecx]
|
||||
movq mm1, [esi+ecx+8]
|
||||
movq mm2, [esi+ecx+16]
|
||||
movq mm3, [esi+ecx+24]
|
||||
movq mm4, [esi+ecx+32]
|
||||
movq mm5, [esi+ecx+40]
|
||||
movq mm6, [esi+ecx+48]
|
||||
movq mm7, [esi+ecx+56]
|
||||
movntq [edi+ecx], mm0
|
||||
movntq [edi+ecx+8], mm1
|
||||
movntq [edi+ecx+16], mm2
|
||||
movntq [edi+ecx+24], mm3
|
||||
movntq [edi+ecx+32], mm4
|
||||
movntq [edi+ecx+40], mm5
|
||||
movntq [edi+ecx+48], mm6
|
||||
movntq [edi+ecx+56], mm7
|
||||
add ecx, byte 64
|
||||
jnz %%loop
|
||||
%endm
|
||||
|
||||
__SECT__
|
||||
|
||||
;------------------------------------------------------------------------------
|
||||
; block prefetch technique. used for transfers that do not fit in L2,
|
||||
; i.e. > 192KiB. requires Pentium III or Athlon; caller checks for this.
|
||||
; for theory behind this, see article.
|
||||
; > ecx = -number_of_bytes (multiple of 64, <= -BP_SIZE)
|
||||
; > esi, edi point to end of the buffer, i.e. &last_qword+8.
|
||||
; < ecx = -remaining_bytes (multiple of 64, > -BP_SIZE)
|
||||
; < eax = 0
|
||||
%macro UC_BP_MOVNTQ 0
|
||||
push edx
|
||||
align 4
|
||||
%%prefetch_and_copy_chunk:
|
||||
|
||||
; pull chunk into cache by touching each cache line
|
||||
; (in reverse order to prevent HW prefetches)
|
||||
mov eax, BP_SIZE/128 ; # iterations
|
||||
add esi, BP_SIZE
|
||||
align 16
|
||||
%%prefetch_loop:
|
||||
mov edx, [esi+ecx-64]
|
||||
mov edx, [esi+ecx-128]
|
||||
add esi, byte -128
|
||||
dec eax
|
||||
jnz %%prefetch_loop
|
||||
|
||||
; copy chunk in 64 byte pieces
|
||||
mov eax, BP_SIZE/64 ; # iterations (> signed 8 bit)
|
||||
align 16
|
||||
%%copy_loop:
|
||||
movq mm0, [esi+ecx]
|
||||
movq mm1, [esi+ecx+8]
|
||||
movq mm2, [esi+ecx+16]
|
||||
movq mm3, [esi+ecx+24]
|
||||
movq mm4, [esi+ecx+32]
|
||||
movq mm5, [esi+ecx+40]
|
||||
movq mm6, [esi+ecx+48]
|
||||
movq mm7, [esi+ecx+56]
|
||||
movntq [edi+ecx], mm0
|
||||
movntq [edi+ecx+8], mm1
|
||||
movntq [edi+ecx+16], mm2
|
||||
movntq [edi+ecx+24], mm3
|
||||
movntq [edi+ecx+32], mm4
|
||||
movntq [edi+ecx+40], mm5
|
||||
movntq [edi+ecx+48], mm6
|
||||
movntq [edi+ecx+56], mm7
|
||||
|
||||
add ecx, byte 64
|
||||
dec eax
|
||||
jnz %%copy_loop
|
||||
|
||||
; if enough data left, process next chunk
|
||||
cmp ecx, -BP_SIZE
|
||||
jle %%prefetch_and_copy_chunk
|
||||
|
||||
pop edx
|
||||
%endm
|
||||
|
||||
|
||||
;------------------------------------------------------------------------------
|
||||
|
||||
; void* __declspec(naked) ia32_memcpy(void* dst, const void* src, size_t nbytes)
|
||||
; Return dst to make ia32_memcpy usable as a standard library memcpy drop-in
|
||||
; drop-in replacement for libc memcpy() (returns dst)
|
||||
global sym(ia32_memcpy)
|
||||
align 64
|
||||
sym(ia32_memcpy):
|
||||
push edi
|
||||
push esi
|
||||
|
||||
mov ecx, [esp+8+4+8] ; nbytes
|
||||
mov edi, [esp+8+4+0] ; dst
|
||||
mov esi, [esp+8+4+4] ; src
|
||||
mov ecx, [esp+8+4+8] ; nbytes
|
||||
|
||||
cmp ecx, byte IC_SIZE
|
||||
ja .choose_larger_method
|
||||
mov edx, ecx
|
||||
cmp ecx, byte IC_TINY_MAX
|
||||
ja choose_larger_method
|
||||
|
||||
.ic_movsd:
|
||||
IC_MOVSD
|
||||
mov eax, [esp+8+4+0] ; return dst
|
||||
pop esi
|
||||
pop edi
|
||||
ret
|
||||
ic_tiny:
|
||||
IC_TINY
|
||||
; never reached - IC_TINY contains memcpy function epilog code
|
||||
|
||||
.choose_larger_method:
|
||||
choose_larger_method:
|
||||
IC_ALIGN
|
||||
|
||||
mov eax, [sse_mask]
|
||||
mov edx, ecx
|
||||
and edx, eax ; edx = (SSE)? remaining_bytes : 0
|
||||
cmp edx, BP_THRESHOLD
|
||||
jae near .uc_bp_movntq
|
||||
cmp edx, UC_THRESHOLD
|
||||
jae .uc_movntq
|
||||
; setup:
|
||||
; eax = number of 64 byte chunks, or 0 if CPU doesn't support SSE.
|
||||
; used to choose copy technique.
|
||||
; ecx = -number_of_bytes, multiple of 64. we jump to ic_tiny if
|
||||
; there's not enough left for a single 64 byte chunk, which can
|
||||
; happen on unaligned 64..71 byte transfers due to IC_ALIGN.
|
||||
; edx = number of remainder bytes after qwords have been copied;
|
||||
; will be handled by IC_TINY.
|
||||
; esi and edi point to end of the respective buffers (more precisely,
|
||||
; to buffer_start-ecx). this together with the ecx convention means
|
||||
; we only need one loop counter (instead of having to advance
|
||||
; that and esi/edi).
|
||||
|
||||
.ic_movq:
|
||||
; this mask is applied to the transfer size. the 2 specialized copy techniques
|
||||
; that use SSE are jumped to if size is greater than a threshold.
|
||||
; we simply set the requested transfer size to 0 if the CPU doesn't
|
||||
; support SSE so that those are never reached (done by masking with this).
|
||||
extern sym(ia32_memcpy_size_mask)
|
||||
mov eax, [sym(ia32_memcpy_size_mask)]
|
||||
and ecx, byte ~IC_TINY_MAX
|
||||
jz ic_tiny ; < 64 bytes left (due to IC_ALIGN)
|
||||
add esi, ecx
|
||||
add edi, ecx
|
||||
and edx, byte IC_TINY_MAX
|
||||
and eax, ecx
|
||||
neg ecx
|
||||
|
||||
cmp eax, BP_THRESHOLD
|
||||
jae near uc_bp_movntq
|
||||
cmp eax, UC_THRESHOLD
|
||||
jae uc_movntq
|
||||
|
||||
ic_movq:
|
||||
IC_MOVQ
|
||||
emms
|
||||
jmp .ic_movsd
|
||||
jmp ic_tiny
|
||||
|
||||
.uc_movntq:
|
||||
uc_movntq:
|
||||
UC_MOVNTQ
|
||||
sfence
|
||||
emms
|
||||
jmp .ic_movsd
|
||||
jmp ic_tiny
|
||||
|
||||
.uc_bp_movntq:
|
||||
uc_bp_movntq:
|
||||
UC_BP_MOVNTQ
|
||||
sfence
|
||||
jmp .ic_movq
|
||||
|
||||
cmp ecx, byte -(IC_TINY_MAX+1)
|
||||
jle ic_movq
|
||||
emms
|
||||
jmp ic_tiny
|
||||
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
@ -487,10 +585,10 @@ rep stosd
|
||||
;-------------------------------------------------------------------------------
|
||||
; init
|
||||
;-------------------------------------------------------------------------------
|
||||
|
||||
; extern "C" bool __cdecl ia32_init()
|
||||
global sym(ia32_init)
|
||||
sym(ia32_init):
|
||||
|
||||
; extern "C" bool __cdecl ia32_asm_init()
|
||||
global sym(ia32_asm_init)
|
||||
sym(ia32_asm_init):
|
||||
push ebx
|
||||
|
||||
; check if CPUID is supported
|
||||
@ -514,17 +612,10 @@ sym(ia32_init):
|
||||
mov [max_ext_func], eax
|
||||
.no_cpuid:
|
||||
|
||||
; check if SSE is supported (used by memcpy code)
|
||||
extern sym(ia32_cap)
|
||||
push byte 32+25 ; ia32.h's SSE cap (won't change)
|
||||
call sym(ia32_cap)
|
||||
pop edx ; remove stack param
|
||||
neg eax ; SSE? ~0 : 0
|
||||
mov [sse_mask], eax
|
||||
|
||||
pop ebx
|
||||
ret
|
||||
|
||||
|
||||
;-------------------------------------------------------------------------------
|
||||
; Color conversion (SSE)
|
||||
;-------------------------------------------------------------------------------
|
||||
|
@ -25,10 +25,14 @@
|
||||
|
||||
// HACK (see call to wtime_reset_impl)
|
||||
#if OS_WIN
|
||||
#include "win/wtime.h"
|
||||
#include "lib/sysdep/win/wtime.h"
|
||||
#endif
|
||||
|
||||
#define NO_COLOR
|
||||
|
||||
#ifndef NO_COLOR
|
||||
#include "graphics/Color.h"
|
||||
#endif
|
||||
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
@ -40,6 +44,26 @@
|
||||
#error ia32.cpp needs inline assembly support!
|
||||
#endif
|
||||
|
||||
#define SELF_TEST_ENABLED 1
|
||||
#include "self_test.h"
|
||||
|
||||
// set by ia32_init, referenced by ia32_memcpy (asm)
|
||||
extern "C" u32 ia32_memcpy_size_mask = 0;
|
||||
|
||||
void ia32_init()
|
||||
{
|
||||
ia32_asm_init();
|
||||
|
||||
// memcpy init: set the mask that is applied to transfer size before
|
||||
// choosing copy technique. this is the mechanism for disabling
|
||||
// codepaths that aren't supported on all CPUs; see article for details.
|
||||
// .. check for PREFETCHNTA and MOVNTQ support. these are part of the SSE
|
||||
// instruction set, but also supported on older Athlons as part of
|
||||
// the extended AMD MMX set.
|
||||
if(ia32_cap(SSE) || ia32_cap(AMD_MMX_EXT))
|
||||
ia32_memcpy_size_mask = ~0u;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// fast implementations of some sysdep.h functions; see documentation there
|
||||
@ -79,7 +103,7 @@ __declspec(naked) double ia32_rint(double)
|
||||
// end up with truncate/"chop" rounding. subtracting does the trick,
|
||||
// assuming RC is the IA-32 default round-to-nearest mode.
|
||||
|
||||
static const float round_bias = 0.5f;
|
||||
static const float round_bias = 0.4999999f;
|
||||
|
||||
__declspec(naked) i32 ia32_i32_from_float(float f)
|
||||
{
|
||||
@ -417,7 +441,7 @@ static void get_cpu_count()
|
||||
log_id_bits = log2(log_cpu_per_package); // see above
|
||||
last_phys_id = last_log_id = INVALID_ID;
|
||||
phys_ids = log_ids = 0;
|
||||
if(on_each_cpu(count_ids) == 0)
|
||||
if(sys_on_each_cpu(count_ids) == 0)
|
||||
{
|
||||
cpus = phys_ids;
|
||||
cpu_ht_units = log_ids / cpu_cores;
|
||||
@ -621,14 +645,16 @@ int ia32_get_call_target(void* ret_addr, void** target)
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
|
||||
#ifndef NO_COLOR
|
||||
// Assembler-optimized function for color conversion
|
||||
extern "C" {
|
||||
u32 sse_ConvertRGBColorTo4ub(const RGBColor& src);
|
||||
}
|
||||
#endif
|
||||
|
||||
void ia32_hook_capabilities()
|
||||
{
|
||||
#ifndef NO_COLOR
|
||||
if (ia32_cap(SSE))
|
||||
{
|
||||
ConvertRGBColorTo4ub = sse_ConvertRGBColorTo4ub;
|
||||
@ -637,6 +663,7 @@ void ia32_hook_capabilities()
|
||||
{
|
||||
debug_printf("No SSE available. Slow fallback routines will be used.\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -667,10 +694,10 @@ namespace test {
|
||||
|
||||
static void self_test()
|
||||
{
|
||||
test1();
|
||||
test_float_int();
|
||||
}
|
||||
|
||||
RUN_SELF_TEST;
|
||||
SELF_TEST_RUN;
|
||||
|
||||
} // namespace test
|
||||
#endif // #if SELF_TEST_ENABLED
|
||||
|
@ -96,6 +96,7 @@ enum CpuCap
|
||||
|
||||
// extended (edx) - currently only defined by AMD
|
||||
AMD_MP = 96+19, // MultiProcessing capable; reserved on AMD64
|
||||
AMD_MMX_EXT = 96+22,
|
||||
AMD_3DNOW_PRO = 96+30,
|
||||
AMD_3DNOW = 96+31
|
||||
};
|
||||
@ -114,6 +115,8 @@ extern void ia32_hook_capabilities(void);
|
||||
// (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
|
||||
extern void ia32_get_current_context(void* pcontext);
|
||||
|
||||
extern void ia32_asm_init();
|
||||
|
||||
extern int ia32_get_call_target(void* ret_addr, void** target);
|
||||
|
||||
// order in which registers are stored in regs array
|
||||
|
@ -66,17 +66,3 @@ i64 i64_from_double(double d)
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// not possible with POSIX calls.
|
||||
// called from ia32.cpp get_cpu_count
|
||||
int on_each_cpu(void(*cb)())
|
||||
{
|
||||
#if OS_WIN
|
||||
return wcpu_on_each_cpu(cb);
|
||||
#else
|
||||
// apparently not possible on non-Windows OSes because they seem to lack
|
||||
// a CPU affinity API.
|
||||
return ERR_NO_SYS;
|
||||
#endif
|
||||
}
|
||||
|
@ -17,10 +17,6 @@
|
||||
#include "ia32.h"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// pass "omit frame pointer" setting on to the compiler
|
||||
#if MSC_VERSION
|
||||
# if CONFIG_OMIT_FP
|
||||
@ -43,6 +39,15 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// C99 / SUSv3 emulation where needed
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// vsnprintf2: handles positional parameters and %lld.
|
||||
// already available on *nix, emulated on Win32.
|
||||
#if OS_WIN
|
||||
@ -51,6 +56,16 @@ extern int vsnprintf2(char* buffer, size_t count, const char* format, va_list ar
|
||||
#define vsnprintf2 vsnprintf
|
||||
#endif
|
||||
|
||||
#if !HAVE_C99
|
||||
extern float fminf(float a, float b);
|
||||
extern float fmaxf(float a, float b);
|
||||
#endif
|
||||
|
||||
#if !MSC_VERSION
|
||||
#define stricmp strcasecmp
|
||||
#define strnicmp strncasecmp
|
||||
#endif
|
||||
|
||||
// alloca: allocate on stack, automatically free, return 0 if out of mem.
|
||||
// already available on *nix, emulated on Win32.
|
||||
#if OS_WIN
|
||||
@ -58,16 +73,6 @@ extern int vsnprintf2(char* buffer, size_t count, const char* format, va_list ar
|
||||
extern void* alloca(size_t size);
|
||||
#endif
|
||||
|
||||
// memcpy2: hand-tuned version; works for all sizes and alignments and is
|
||||
// significantly faster. uses SSE-optimized codepath when available.
|
||||
// 10% for < 64byte transfers and up to 300% on large sizes.
|
||||
#ifdef CPU_IA32
|
||||
# define memcpy2 ia32_memcpy
|
||||
extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
|
||||
#else
|
||||
# define memcpy2 memcpy
|
||||
#endif
|
||||
|
||||
// rint: round float to nearest integral value.
|
||||
// provided by C99, otherwise:
|
||||
#if !HAVE_C99
|
||||
@ -82,20 +87,6 @@ extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
|
||||
# endif
|
||||
#endif
|
||||
|
||||
// i32_from_float et al: convert float to int. much faster than _ftol2,
|
||||
// which would normally be used by (int) casts.
|
||||
// .. fast IA-32 version: only used in some cases; see macro definition.
|
||||
#if USE_IA32_FLOAT_TO_INT
|
||||
# define i32_from_float ia32_i32_from_float
|
||||
# define i32_from_double ia32_i32_from_double
|
||||
# define i64_from_double ia32_i64_from_double
|
||||
// .. portable C emulation
|
||||
#else
|
||||
extern i32 i32_from_float(float);
|
||||
extern i32 i32_from_double(double);
|
||||
extern i64 i64_from_double(double);
|
||||
#endif
|
||||
|
||||
// finite: return 0 iff the given double is infinite or NaN.
|
||||
#if OS_WIN
|
||||
# define finite _finite
|
||||
@ -128,30 +119,29 @@ extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
|
||||
#endif
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// sysdep API
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
//
|
||||
// output
|
||||
//
|
||||
|
||||
enum DisplayErrorFlags
|
||||
{
|
||||
DE_ALLOW_SUPPRESS = 1,
|
||||
DE_NO_CONTINUE = 2,
|
||||
DE_MANUAL_BREAK = 4
|
||||
};
|
||||
|
||||
extern void sys_display_msg(const char* caption, const char* msg);
|
||||
extern void sys_display_msgw(const wchar_t* caption, const wchar_t* msg);
|
||||
|
||||
// choices offered by the shared error dialog
|
||||
enum ErrorReaction
|
||||
{
|
||||
// ignore, continue as if nothing happened.
|
||||
ER_CONTINUE = 1,
|
||||
// note: don't start at 0 because that is interpreted as a
|
||||
// DialogBoxParam failure.
|
||||
// note: don't start at 0 because that is interpreted as a
|
||||
// DialogBoxParam failure.
|
||||
|
||||
// ignore and do not report again.
|
||||
// only returned if DE_ALLOW_SUPPRESS was passed.
|
||||
ER_SUPPRESS,
|
||||
// note: non-persistent; only applicable during this program run.
|
||||
// note: non-persistent; only applicable during this program run.
|
||||
|
||||
// trigger breakpoint, i.e. enter debugger.
|
||||
// only returned if DE_MANUAL_BREAK was passed; otherwise,
|
||||
@ -163,29 +153,33 @@ enum ErrorReaction
|
||||
ER_EXIT
|
||||
};
|
||||
|
||||
extern ErrorReaction display_error(const wchar_t* description, int flags,
|
||||
uint skip, void* context, const char* file, int line);
|
||||
|
||||
// convenience version, in case the advanced parameters aren't needed.
|
||||
// done this way instead of with default values so that it also works in C.
|
||||
#define DISPLAY_ERROR(text) display_error(text, 0, 0, 0, __FILE__, __LINE__)
|
||||
enum SysDisplayErrorFlags
|
||||
{
|
||||
DE_ALLOW_SUPPRESS = 1,
|
||||
DE_NO_CONTINUE = 2,
|
||||
DE_MANUAL_BREAK = 4
|
||||
};
|
||||
|
||||
// internal use only (used by display_error)
|
||||
extern ErrorReaction display_error_impl(const wchar_t* text, int flags);
|
||||
|
||||
|
||||
|
||||
extern void display_msg(const char* caption, const char* msg);
|
||||
extern void wdisplay_msg(const wchar_t* caption, const wchar_t* msg);
|
||||
extern ErrorReaction sys_display_error(const wchar_t* text, int flags);
|
||||
|
||||
|
||||
//
|
||||
// clipboard
|
||||
//
|
||||
|
||||
extern int clipboard_set(const wchar_t* text);
|
||||
extern wchar_t* clipboard_get(void);
|
||||
extern int clipboard_free(wchar_t* copy);
|
||||
// "copy" text into the clipboard. replaces previous contents.
|
||||
extern int sys_clipboard_set(const wchar_t* text);
|
||||
|
||||
// allow "pasting" from clipboard. returns the current contents if they
|
||||
// can be represented as text, otherwise 0.
|
||||
// when it is no longer needed, the returned pointer must be freed via
|
||||
// sys_clipboard_free. (NB: not necessary if zero, but doesn't hurt)
|
||||
extern wchar_t* sys_clipboard_get(void);
|
||||
|
||||
// frees memory used by <copy>, which must have been returned by
|
||||
// sys_clipboard_get. see note above.
|
||||
extern int sys_clipboard_free(wchar_t* copy);
|
||||
|
||||
|
||||
//
|
||||
@ -194,11 +188,17 @@ extern int clipboard_free(wchar_t* copy);
|
||||
|
||||
// note: these do not warn on error; that is left to the caller.
|
||||
|
||||
// creates a cursor from the given texture file.
|
||||
// creates a cursor from the given image.
|
||||
// w, h specify image dimensions [pixels]. limit is implementation-
|
||||
// dependent; 32x32 is typical and safe.
|
||||
// bgra_img is the cursor image (BGRA format, bottom-up).
|
||||
// it is no longer needed and can be freed after this call returns.
|
||||
// hotspot (hx,hy) is the offset from its upper-left corner to the
|
||||
// position where mouse clicks are registered.
|
||||
// the cursor must be cursor_free-ed when no longer needed.
|
||||
extern int sys_cursor_load(const char* filename,
|
||||
// position where mouse clicks are registered.
|
||||
// return: negative error code, or 0 on success. cursor is filled with
|
||||
// a pointer and undefined on failure. it must be sys_cursor_free-ed
|
||||
// when no longer needed.
|
||||
extern int sys_cursor_create(uint w, uint h, void* bgra_img,
|
||||
uint hx, uint hy, void** cursor);
|
||||
|
||||
// replaces the current system cursor with the one indicated. need only be
|
||||
@ -210,37 +210,67 @@ extern int sys_cursor_set(void* cursor);
|
||||
extern int sys_cursor_free(void* cursor);
|
||||
|
||||
|
||||
//
|
||||
// misc
|
||||
//
|
||||
|
||||
|
||||
// OS-specific backend for error_description_r.
|
||||
// NB: it is expected to be rare that OS return/error codes are actually
|
||||
// seen by user code, but we still translate them for completeness.
|
||||
extern int sys_error_description_r(int err, char* buf, size_t max_chars);
|
||||
|
||||
extern int get_executable_name(char* n_path, size_t buf_size);
|
||||
// determine filename of the module to whom the given address belongs.
|
||||
// useful for handling exceptions in other modules.
|
||||
// <path> receives full path to module; it must hold at least MAX_PATH chars.
|
||||
// on error, it is set to L"".
|
||||
// return path for convenience.
|
||||
wchar_t* sys_get_module_filename(void* addr, wchar_t* path);
|
||||
|
||||
// return filename of the module which contains address <addr>,
|
||||
// or L"" on failure. path holds the string and must be >= MAX_PATH chars.
|
||||
wchar_t* get_module_filename(void* addr, wchar_t* path);
|
||||
// store full path to the current executable.
|
||||
// returns 0 or a negative error code.
|
||||
// useful for determining installation directory, e.g. for VFS.
|
||||
extern int sys_get_executable_name(char* n_path, size_t buf_size);
|
||||
|
||||
// have the user specify a directory via OS dialog.
|
||||
// stores its full path in the given buffer, which must hold at least
|
||||
// PATH_MAX chars.
|
||||
// returns 0 on success or a negative error code.
|
||||
extern int sys_pick_directory(char* n_path, size_t buf_size);
|
||||
|
||||
|
||||
|
||||
extern int pick_directory(char* n_path, size_t buf_size);
|
||||
|
||||
|
||||
// not possible with POSIX calls.
|
||||
// execute the specified function once on each CPU.
|
||||
// this includes logical HT units and proceeds serially (function
|
||||
// is never re-entered) in order of increasing OS CPU ID.
|
||||
// note: implemented by switching thread affinity masks and forcing
|
||||
// a reschedule, which is apparently not possible with POSIX.
|
||||
// return 0 on success or a negative error code on failure
|
||||
// (e.g. if OS is preventing us from running on some CPUs).
|
||||
// called from ia32.cpp get_cpu_count
|
||||
extern int on_each_cpu(void(*cb)());
|
||||
extern int sys_on_each_cpu(void(*cb)());
|
||||
|
||||
|
||||
|
||||
|
||||
#if !HAVE_C99
|
||||
extern float fminf(float a, float b);
|
||||
extern float fmaxf(float a, float b);
|
||||
// drop-in replacement for libc memcpy(). only requires CPU support for
|
||||
// MMX (by now universal). highly optimized for Athlon and Pentium III
|
||||
// microarchitectures; significantly outperforms VC7.1 memcpy and memcpy_amd.
|
||||
// for details, see accompanying article.
|
||||
#ifdef CPU_IA32
|
||||
# define memcpy2 ia32_memcpy
|
||||
extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes);
|
||||
#else
|
||||
# define memcpy2 memcpy
|
||||
#endif
|
||||
|
||||
#if !MSC_VERSION
|
||||
#define stricmp strcasecmp
|
||||
#define strnicmp strncasecmp
|
||||
// i32_from_float et al: convert float to int. much faster than _ftol2,
|
||||
// which would normally be used by (int) casts.
|
||||
// .. fast IA-32 version: only used in some cases; see macro definition.
|
||||
#if USE_IA32_FLOAT_TO_INT
|
||||
# define i32_from_float ia32_i32_from_float
|
||||
# define i32_from_double ia32_i32_from_double
|
||||
# define i64_from_double ia32_i64_from_double
|
||||
// .. portable C emulation
|
||||
#else
|
||||
extern i32 i32_from_float(float);
|
||||
extern i32 i32_from_double(double);
|
||||
extern i64 i64_from_double(double);
|
||||
#endif
|
||||
|
||||
|
||||
@ -249,11 +279,14 @@ extern float fmaxf(float a, float b);
|
||||
#endif
|
||||
|
||||
|
||||
// C++ linkage
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// STL_HASH_MAP, STL_HASH_MULTIMAP, STL_HASH_SET
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// these containers are useful but not part of C++98. most STL vendors
|
||||
// provide them in some form; we hide their differences behind macros.
|
||||
|
||||
#if GCC_VERSION
|
||||
// GCC
|
||||
# include <ext/hash_map>
|
||||
# include <ext/hash_set> // Probably?
|
||||
|
||||
@ -283,6 +316,7 @@ namespace __gnu_cxx
|
||||
}
|
||||
|
||||
#else // !__GNUC__
|
||||
|
||||
# include <hash_map>
|
||||
# include <hash_set>
|
||||
// VC7 or above
|
||||
@ -300,8 +334,7 @@ namespace __gnu_cxx
|
||||
# define STL_HASH_MULTISET std::hash_multiset
|
||||
# define STL_HASH_VALUE std::hash_value
|
||||
# endif // MSC_VERSION >= 1300
|
||||
|
||||
#endif // !__GNUC__
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#endif // #ifndef SYSDEP_H_INCLUDED
|
||||
|
@ -14,12 +14,12 @@
|
||||
// these are basic POSIX-compatible backends for the sysdep.h functions.
|
||||
// Win32 has better versions which override these.
|
||||
|
||||
void display_msg(const char* caption, const char* msg)
|
||||
void sys_display_msg(const char* caption, const char* msg)
|
||||
{
|
||||
fprintf(stderr, "%s: %s\n", caption, msg);
|
||||
}
|
||||
|
||||
void wdisplay_msg(const wchar_t* caption, const wchar_t* msg)
|
||||
void sys_display_msgw(const wchar_t* caption, const wchar_t* msg)
|
||||
{
|
||||
fwprintf(stderr, L"%ls: %ls\n", caption, msg);
|
||||
}
|
||||
@ -51,7 +51,14 @@ int unix_get_cpu_info()
|
||||
return 0;
|
||||
}
|
||||
|
||||
ErrorReaction display_error_impl(const wchar_t* text, int flags)
|
||||
// apparently not possible on non-Windows OSes because they seem to lack
|
||||
// a CPU affinity API. see sysdep.h comment.
|
||||
int sys_on_each_cpu(void(*cb)())
|
||||
{
|
||||
return ERR_NO_SYS;
|
||||
}
|
||||
|
||||
ErrorReaction sys_display_error(const wchar_t* text, int flags)
|
||||
{
|
||||
printf("%ls\n\n", text);
|
||||
|
||||
@ -112,6 +119,13 @@ ErrorReaction display_error_impl(const wchar_t* text, int flags)
|
||||
// take advantage of hardware mouse cursors instead of the (jerky when
|
||||
// loading) OpenGL cursor.
|
||||
|
||||
// POSIX backend stub: no cursor is created. *cursor is set to 0 and
// 0 (success) is returned; the image, size and hotspot are ignored.
int sys_cursor_create(uint w, uint h, void* bgra_img,
	uint hx, uint hy, void** cursor)
{
	(void)w; (void)h; (void)bgra_img;
	(void)hx; (void)hy;

	void* const no_cursor = 0;
	*cursor = no_cursor;
	return 0;
}
|
||||
|
||||
int sys_cursor_set(void* cursor)
|
||||
{
|
||||
return 0;
|
||||
|
@ -30,45 +30,6 @@
|
||||
// note: int instead of unsigned because <cpus> is also signed (tri-state).
|
||||
static const int MAX_CPUS = 32;
|
||||
|
||||
|
||||
int wcpu_on_each_cpu(void(*cb)())
|
||||
{
|
||||
const HANDLE hProcess = GetCurrentProcess();
|
||||
|
||||
DWORD process_affinity, system_affinity;
|
||||
if(!GetProcessAffinityMask(hProcess, &process_affinity, &system_affinity))
|
||||
return -1;
|
||||
|
||||
// our affinity != system affinity: OS is limiting the CPUs that
|
||||
// this process can run on. fail (cannot call back for each CPU).
|
||||
if(process_affinity != system_affinity)
|
||||
return -1;
|
||||
|
||||
for(DWORD cpu_bit = 1; cpu_bit != 0 && cpu_bit <= process_affinity; cpu_bit *= 2)
|
||||
{
|
||||
// check if we can switch to target CPU
|
||||
if(!(process_affinity & cpu_bit))
|
||||
continue;
|
||||
// .. and do so.
|
||||
if(!SetProcessAffinityMask(hProcess, process_affinity))
|
||||
{
|
||||
debug_warn("SetProcessAffinityMask failed");
|
||||
continue;
|
||||
}
|
||||
|
||||
// reschedule, to make sure we switch CPUs
|
||||
Sleep(0);
|
||||
|
||||
cb();
|
||||
}
|
||||
|
||||
// restore to original value
|
||||
SetProcessAffinityMask(hProcess, process_affinity);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static void check_speedstep()
|
||||
{
|
||||
WIN_SAVE_LAST_ERROR;
|
||||
|
@ -1 +1 @@
|
||||
extern int wcpu_on_each_cpu(void(*cb)());
|
||||
|
||||
|
@ -91,8 +91,8 @@ void wdbg_set_thread_name(const char* name)
|
||||
}
|
||||
__except(EXCEPTION_EXECUTE_HANDLER)
|
||||
{
|
||||
// if we get here, apparently this hack is no longer supported.
|
||||
debug_warn("TODO: find alternative thread name implementation");
|
||||
// if we get here, the debugger didn't handle the exception.
|
||||
debug_warn("thread name hack doesn't work under this debugger");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -149,7 +149,7 @@ static int sym_init()
|
||||
const BOOL fInvadeProcess = TRUE;
|
||||
// .. use default *symbol* search path. we don't use this to locate
|
||||
// our PDB file because its absolute path is stored inside the EXE.
|
||||
const char* UserSearchPath = 0;
|
||||
PCSTR UserSearchPath = 0;
|
||||
BOOL ok = SymInitialize(hProcess, UserSearchPath, fInvadeProcess);
|
||||
WARN_IF_FALSE(ok);
|
||||
|
||||
@ -2205,7 +2205,7 @@ static void self_test()
|
||||
test_addrs(123, 3.1415926535897932384626, "pchar string", 0xf00d);
|
||||
}
|
||||
|
||||
RUN_SELF_TEST;
|
||||
SELF_TEST_RUN;
|
||||
|
||||
#pragma optimize("", on)
|
||||
} // namespace test
|
||||
|
@ -16,15 +16,6 @@
|
||||
// Jan.Wassenberg@stud.uni-karlsruhe.de
|
||||
// http://www.stud.uni-karlsruhe.de/~urkt/
|
||||
|
||||
|
||||
// TODO: should use GetMessage when not active to reduce CPU load.
|
||||
// where to do this?
|
||||
// - force the app to check for SDL's activation messages, and call
|
||||
// sdl-wait-message?
|
||||
// - do it here, just make SDL_PollEvent block until message received?
|
||||
// - have the app use another free-the-cpu method, since it controls the main loop.
|
||||
// this is what's currently happening.
|
||||
|
||||
#include "precompiled.h"
|
||||
|
||||
#include <stdio.h>
|
||||
@ -919,6 +910,12 @@ static LRESULT CALLBACK wndproc(HWND hWnd, UINT uMsg, WPARAM wParam, LPARAM lPar
|
||||
|
||||
void SDL_PumpEvents(void)
|
||||
{
|
||||
// rationale: we would like to reduce CPU usage automatically if
|
||||
// possible. blocking here until a message arrives would accomplish
|
||||
// that, but might potentially freeze the app too long.
|
||||
// instead, the app should check its active state and call SDL_Delay etc.
|
||||
// if our window is minimized.
|
||||
|
||||
MSG msg;
|
||||
while(PeekMessageW(&msg, 0, 0, 0, PM_REMOVE))
|
||||
{
|
||||
@ -1129,7 +1126,7 @@ int SDL_KillThread(SDL_Thread* thread)
|
||||
|
||||
void SDL_WM_SetCaption(const char* title, const char* icon)
|
||||
{
|
||||
SetWindowText(hWnd, title);
|
||||
WARN_IF_FALSE(SetWindowText(hWnd, title));
|
||||
|
||||
UNUSED2(icon); // TODO: implement
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user