diff --git a/source/lib/detect.cpp b/source/lib/detect.cpp index 15a1d11f2a..5956c3c7d4 100755 --- a/source/lib/detect.cpp +++ b/source/lib/detect.cpp @@ -157,20 +157,20 @@ void cpu_init() // we need full precision when calculating the time. // if there's a spot where we want to speed up divides|sqrts, // we can temporarily change precision there. - //_control87(_PC_24, _MCW_PC); + //ia32_control87(IA32_PC_24, IA32_MCW_PC); // to help catch bugs, enable as many floating-point exceptions as // possible. that means only zero-divide, because the JS engine is // triggering the rest. // note: passing a flag *disables* that exception. - _control87(_EM_INVALID|_EM_DENORMAL|_EM_OVERFLOW|_EM_UNDERFLOW|_EM_INEXACT, _MCW_EM); + ia32_control87(IA32_EM_INVALID|IA32_EM_DENORMAL|IA32_EM_OVERFLOW|IA32_EM_UNDERFLOW|IA32_EM_INEXACT, IA32_MCW_EM); // no longer round toward zero (truncate). changing this setting // resulted in much faster float->int casts, because the compiler // could be told (via /QIfist) to use FISTP while still truncating // the result as required by ANSI C. however, FPU calculation // results were changed significantly, so it had to be disabled. - //_control87(_RC_CHOP, _MCW_RC); + //ia32_control87(IA32_RC_CHOP, IA32_MCW_RC); // If possible, hook up capability-sensitive assembler routines ia32_hook_capabilities(); diff --git a/source/lib/sysdep/ia32.cpp b/source/lib/sysdep/ia32.cpp index 434d275423..8eb9ea4855 100755 --- a/source/lib/sysdep/ia32.cpp +++ b/source/lib/sysdep/ia32.cpp @@ -40,26 +40,13 @@ #error ia32.cpp needs inline assembly support! #endif + +//----------------------------------------------------------------------------- +// fast implementations of some sysdep.h functions; see documentation there +//----------------------------------------------------------------------------- + #if HAVE_MS_ASM -// replace pathetic MS libc implementation. -// not needed on non-Win32, so don't bother converting from MS inline asm. 
-double _ceil(double f) -{ - UNUSED2(f); // avoid bogus warning - const float _49 = 0.499999f; - double r; -__asm -{ - fld [f] - fadd [_49] - frndint - fstp [r] -} - return r; -} - - // note: declspec naked is significantly faster: it avoids redundant // store/load, even though it prevents inlining. @@ -135,7 +122,7 @@ __asm{ #endif // USE_IA32_FLOAT_TO_INT - +//----------------------------------------------------------------------------- // rationale: this function should return its output (instead of setting // out params) to simplify its callers. it is written in inline asm diff --git a/source/lib/sysdep/ia32.h b/source/lib/sysdep/ia32.h index a0eb3a3e64..de5dec1408 100755 --- a/source/lib/sysdep/ia32.h +++ b/source/lib/sysdep/ia32.h @@ -35,60 +35,46 @@ extern "C" { extern void ia32_init(); -extern double _ceil(double); +// +// fast implementations of some sysdep.h functions; see documentation there +// extern float ia32_rintf(float f); extern double ia32_rint(double f); +extern i32 ia32_i32_from_float(float f); +extern i32 ia32_i32_from_double(double d); +extern i64 ia32_i64_from_double(double d); -extern u64 rdtsc(void); +extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes); // asm -// these may have been defined by system headers; we redefine them to -// the real IA-32 values for use with ia32_control87. +// FPU control word // .. Precision Control: -#undef _MCW_PC -#define _MCW_PC 0x0300 -#undef _PC_24 -#define _PC_24 0x0000 +#define IA32_MCW_PC 0x0300 +#define IA32_PC_24 0x0000 // .. Rounding Control: -#undef _MCW_RC -#define _MCW_RC 0x0C00 -#undef _RC_NEAR -#define _RC_NEAR 0x0000 -#undef _RC_DOWN -#define _RC_DOWN 0x0400 -#undef _RC_UP -#define _RC_UP 0x0800 -#undef _RC_CHOP -#define _RC_CHOP 0x0C00 +#define IA32_MCW_RC 0x0C00 +#define IA32_RC_NEAR 0x0000 +#define IA32_RC_DOWN 0x0400 +#define IA32_RC_UP 0x0800 +#define IA32_RC_CHOP 0x0C00 // .. 
Exception Mask: -#undef _MCW_EM -#define _MCW_EM 0x003f -#undef _EM_INVALID -#define _EM_INVALID BIT(0) -#undef _EM_DENORMAL -#define _EM_DENORMAL BIT(1) -#undef _EM_ZERODIVIDE -#define _EM_ZERODIVIDE BIT(2) -#undef _EM_OVERFLOW -#define _EM_OVERFLOW BIT(3) -#undef _EM_UNDERFLOW -#define _EM_UNDERFLOW BIT(4) -#undef _EM_INEXACT -#define _EM_INEXACT BIT(5) +#define IA32_MCW_EM 0x003f +#define IA32_EM_INVALID BIT(0) +#define IA32_EM_DENORMAL BIT(1) +#define IA32_EM_ZERODIVIDE BIT(2) +#define IA32_EM_OVERFLOW BIT(3) +#define IA32_EM_UNDERFLOW BIT(4) +#define IA32_EM_INEXACT BIT(5) -#define _control87 ia32_control87 extern uint ia32_control87(uint new_val, uint mask); // asm +extern u64 rdtsc(void); + extern void ia32_debug_break(void); -extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes); - -// write the current execution state (e.g. all register values) into -// (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency). -extern void ia32_get_current_context(void* pcontext); // CPU caps (128 bits) // do not change the order! @@ -121,8 +107,13 @@ extern void ia32_get_cpu_info(void); extern void ia32_hook_capabilities(void); +//----------------------------------------------------------------------------- // internal use only +// write the current execution state (e.g. all register values) into +// (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency). +extern void ia32_get_current_context(void* pcontext); + extern int ia32_get_call_target(void* ret_addr, void** target); // order in which registers are stored in regs array diff --git a/source/lib/sysdep/sysdep.cpp b/source/lib/sysdep/sysdep.cpp index 43d2ff32ae..59bb9043b2 100755 --- a/source/lib/sysdep/sysdep.cpp +++ b/source/lib/sysdep/sysdep.cpp @@ -13,15 +13,6 @@ #include #include -#if MSC_VERSION - -double round(double x) -{ - return (long)(x + 0.5); -} - -#endif // MSC_VERSION - #if !HAVE_C99 @@ -35,8 +26,12 @@ float fmaxf(float a, float b) return (a > b)? 
a : b; } +#endif -#ifndef rint + +// no C99, and not running on IA-32 (where this is defined to ia32_rint) +// => need to implement our fallback version. +#if !HAVE_C99 && !defined(rint) inline float rintf(float f) { @@ -50,9 +45,9 @@ inline double rint(double d) #endif -#endif // !HAVE_C99 - +// float->int conversion: not using the ia32 version; just implement as a +// cast. (see USE_IA32_FLOAT_TO_INT definition for details) #if !USE_IA32_FLOAT_TO_INT i32 i32_from_float(float f) diff --git a/source/lib/sysdep/sysdep.h b/source/lib/sysdep/sysdep.h index 6f3f6c23d1..8ed176d000 100755 --- a/source/lib/sysdep/sysdep.h +++ b/source/lib/sysdep/sysdep.h @@ -3,11 +3,19 @@ #include "config.h" +// some functions among the sysdep API are implemented as macros +// that redirect to the platform-dependent version. this is done where +// the cost of a trampoline function would be too great; VC7 does not +// always inline them. +// we therefore need to include those headers. #if OS_WIN # include "win/win.h" #elif OS_UNIX # include "unix/unix.h" #endif +#if CPU_IA32 +#include "ia32.h" +#endif #ifdef __cplusplus extern "C" { @@ -50,6 +58,9 @@ extern int vsnprintf2(char* buffer, size_t count, const char* format, va_list ar extern void* alloca(size_t size); #endif +// memcpy2: hand-tuned version; works for all sizes and alignments and is +// significantly faster. uses SSE-optimized codepath when available. +// 10% for < 64byte transfers and up to 300% on large sizes. #ifdef CPU_IA32 # define memcpy2 ia32_memcpy extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes); @@ -57,30 +68,33 @@ extern void* ia32_memcpy(void* dst, const void* src, size_t nbytes); # define memcpy2 memcpy #endif -// rint: round float to nearest integer. +// rint: round float to nearest integral value. // provided by C99, otherwise: #if !HAVE_C99 -// .. implemented on IA-32; define as macro to avoid jmp overhead +// .. 
fast IA-32 version # if CPU_IA32 # define rintf ia32_rintf # define rint ia32_rint +// .. portable C emulation +# else + extern float rintf(float f); + extern double rint(double d); # endif -// .. forward-declare either the IA-32 version or portable C emulation. -extern float rintf(float f); -extern double rint(double d); #endif -// fast float->int conversion; does not specify rounding mode, -// so do not use them if exact values are needed. +// i32_from_float et al: convert float to int. much faster than _ftol2, +// which would normally be used by (int) casts. +// .. fast IA-32 version: only used in some cases; see macro definition. #if USE_IA32_FLOAT_TO_INT # define i32_from_float ia32_i32_from_float # define i32_from_double ia32_i32_from_double # define i64_from_double ia32_i64_from_double +// .. portable C emulation +#else + extern i32 i32_from_float(float); + extern i32 i32_from_double(double); + extern i64 i64_from_double(double); #endif -// .. forward-declare either the IA-32 version or portable C emulation. -extern i32 i32_from_float(float); -extern i32 i32_from_double(double); -extern i64 i64_from_double(double); // finite: return 0 iff the given double is infinite or NaN. #if OS_WIN @@ -216,9 +230,6 @@ extern int on_each_cpu(void(*cb)()); -#if MSC_VERSION -extern double round(double); -#endif #if !HAVE_C99 extern float fminf(float a, float b); diff --git a/source/lib/sysdep/win/wsdl.cpp b/source/lib/sysdep/win/wsdl.cpp index 344f9c3eda..4129c66fa5 100755 --- a/source/lib/sysdep/win/wsdl.cpp +++ b/source/lib/sysdep/win/wsdl.cpp @@ -16,7 +16,7 @@ // Jan.Wassenberg@stud.uni-karlsruhe.de // http://www.stud.uni-karlsruhe.de/~urkt/ - + // TODO: should use GetMessage when not active to reduce CPU load. // where to do this? 
// - force the app to check for SDL's activation messages, and call diff --git a/source/lib/sysdep/win/wsdl.h b/source/lib/sysdep/win/wsdl.h index de5271afed..4cf1c454c4 100755 --- a/source/lib/sysdep/win/wsdl.h +++ b/source/lib/sysdep/win/wsdl.h @@ -46,7 +46,10 @@ extern int SDL_Init(Uint32 flags); extern void SDL_Quit(void); -extern Uint8 SDL_GetAppState(); + +// +// video +// typedef enum { @@ -57,14 +60,12 @@ SDL_GLattr; extern int SDL_GL_SetAttribute(SDL_GLattr attr, int value); - // SDL_SetVideoMode() flags #define SDL_OPENGL 0 #define SDL_FULLSCREEN 1 extern int SDL_SetVideoMode(int w, int h, int bpp, unsigned long flags); - typedef struct { int w, h; @@ -73,7 +74,6 @@ SDL_Surface; extern SDL_Surface* SDL_GetVideoSurface(void); - typedef struct { int video_mem; @@ -82,6 +82,10 @@ SDL_VideoInfo; extern SDL_VideoInfo* SDL_GetVideoInfo(void); +extern void* SDL_GL_GetProcAddress(const char*); + +extern void SDL_GL_SwapBuffers(void); + // // threads / sync @@ -90,10 +94,6 @@ extern SDL_VideoInfo* SDL_GetVideoInfo(void); typedef void SDL_sem; typedef void SDL_Thread; -extern void* SDL_GL_GetProcAddress(const char*); - -extern void SDL_GL_SwapBuffers(void); - extern u32 SDL_GetTicks(void); extern void SDL_Delay(u32 ms); @@ -105,6 +105,7 @@ extern int SDL_SemWait(SDL_sem* sem); extern SDL_Thread* SDL_CreateThread(int(*)(void*), void*); extern int SDL_KillThread(SDL_Thread*); + extern void SDL_WarpMouse(int, int); enum ShowCursorToggle @@ -118,14 +119,10 @@ extern int SDL_ShowCursor(int toggle); extern int SDL_SetGamma(float r, float g, float b); -// macros -#define SDL_GRAB_ON 0 -#define SDL_WM_GrabInput(a) -#define SDL_GetError() "" - - -////////////////////////////////////////////////////////////////////////////// +// +// byte swapping +// #ifdef linux @@ -285,13 +282,28 @@ extern int SDL_WaitEvent(SDL_Event*); extern int SDL_PollEvent(SDL_Event* ev); extern int SDL_PushEvent(SDL_Event* ev); + +// +// misc +// + +#define SDL_GRAB_ON 0 +#define SDL_WM_GrabInput(a) 
+ +#define SDL_GetError() "" + +// from real SDL, but they're ignored anyway +#define SDL_DEFAULT_REPEAT_DELAY 500 +#define SDL_DEFAULT_REPEAT_INTERVAL 30 +#define SDL_EnableKeyRepeat(delay, interval) + + extern void SDL_WM_SetCaption(const char *title, const char *icon); extern Uint8* SDL_GetKeyState(int* num_keys); extern Uint8 SDL_GetMouseState(int* x, int* y); -//( SDLMod and KMOD_* already defined by SDL_keysym.h) -extern SDLMod SDL_GetModState(void); +extern Uint8 SDL_GetAppState(); #ifdef __cplusplus diff --git a/source/ps/GameSetup/GameSetup.cpp b/source/ps/GameSetup/GameSetup.cpp index 4d3c99f018..f56ecf44d2 100644 --- a/source/ps/GameSetup/GameSetup.cpp +++ b/source/ps/GameSetup/GameSetup.cpp @@ -567,9 +567,7 @@ static void InitPs(bool setup_gui) static void InitInput() { -#if !OS_WIN SDL_EnableKeyRepeat(SDL_DEFAULT_REPEAT_DELAY, SDL_DEFAULT_REPEAT_INTERVAL); -#endif // register input handlers // This stack is constructed so the first added, will be the last