fixes/improvements made at work:

- fix lots of 64-bit warnings
- round_up/down now templates
- avoid warning with definition of byte swap functions; remove duplication of that in wsdl
- codec_zlib.cpp: avoid domination warning
- vfs/file_cache: VFS is now responsible for handling zero-length files (no longer considered an error; just returns zero pointer and size=0)
- cpu: export all functions (thus obviating cpu_memcpy_thunk). this required renaming asm functions and adding thunk functions that call them
- winit: fix segment merge statement, reinstate /include (otherwise init functions are stripped by linker)
- wstartup: VC CRT init section definitions have changed in amd64 build; match their definition
- wsysdep.cpp: more descriptive text for osError = 0

This was SVN commit r5899.
2008-05-01 15:41:42 +00:00 · 2008-05-01 15:41:42 +00:00 · cc243f67eb
commit cc243f67eb
parent 3892f03e2e
37 changed files with 293 additions and 245 deletions
--- a/source/lib/allocators/allocators.cpp
+++ b/source/lib/allocators/allocators.cpp
@ -141,7 +141,7 @@ void single_free(void* storage, volatile uintptr_t* in_use_flag, void* p)

 void* static_calloc(StaticStorage* ss, size_t size)
 {
-	void* p = (void*)round_up((uintptr_t)ss->pos, 16);
+	void* p = (void*)round_up((uintptr_t)ss->pos, (uintptr_t)16u);
 	ss->pos = (u8*)p+size;
 	debug_assert(ss->pos <= ss->end);
 	return p;
--- a/source/lib/allocators/headerless.cpp
+++ b/source/lib/allocators/headerless.cpp
@ -50,7 +50,7 @@ public:
 		// clear all fields to prevent accidental reuse
 		prev = next = 0;
 		m_id = 0;
-		m_size = ~0u;
+		m_size = ~(size_t)0u;
 		m_magic = 0;
 	}

--- a/source/lib/allocators/pool.h
+++ b/source/lib/allocators/pool.h
@ -43,7 +43,7 @@ struct Pool
 * pass as pool_create's <el_size> param to indicate variable-sized allocs
 * are required (see below).
 **/
-const size_t POOL_VARIABLE_ALLOCS = ~0u;
+const size_t POOL_VARIABLE_ALLOCS = ~(size_t)0u;

 /**
 * Ready Pool for use.
--- a/source/lib/bits.cpp
+++ b/source/lib/bits.cpp
@ -62,22 +62,3 @@ uint round_up_to_pow2(uint x)

 	return x+1;
 }
-
-
-// multiple must be a power of two.
-uintptr_t round_up(const uintptr_t n, const uintptr_t multiple)
-{
-	debug_assert(is_pow2((long)multiple));
-	const uintptr_t result = (n + multiple-1) & ~(multiple-1);
-	debug_assert(n <= result && result < n+multiple);
-	return result;
-}
-
-// multiple must be a power of two.
-uintptr_t round_down(const uintptr_t n, const uintptr_t multiple)
-{
-	debug_assert(is_pow2((long)multiple));
-	const uintptr_t result = n & ~(multiple-1);
-	debug_assert(result <= n && n < result+multiple);
-	return result;
-}
--- a/source/lib/bits.h
+++ b/source/lib/bits.h
@ -129,14 +129,29 @@ extern uint round_up_to_pow2(uint x);
 *
 * @param multiple: must be a power of two.
 **/
-extern uintptr_t round_up  (uintptr_t n, uintptr_t multiple);
-extern uintptr_t round_down(uintptr_t n, uintptr_t multiple);
+template<typename T>
+T round_up(T n, T multiple)
+{
+	debug_assert(is_pow2((uint)multiple));
+	const T result = (n + multiple-1) & ~(multiple-1);
+	debug_assert(n <= result && result < n+multiple);
+	return result;
+}
+
+template<typename T>
+T round_down(T n, T multiple)
+{
+	debug_assert(is_pow2((uint)multiple));
+	const T result = n & ~(multiple-1);
+	debug_assert(result <= n && n < result+multiple);
+	return result;
+}
+

 template<typename T>
 bool IsAligned(T t, uintptr_t multiple)
 {
 	return ((uintptr_t)t % multiple) == 0;
-
 }

 #endif	// #ifndef INCLUDED_BITS
--- a/source/lib/byte_order.h
+++ b/source/lib/byte_order.h
@ -101,23 +101,39 @@ extern i64 movsx_le64(const u8* p, size_t size);
 extern i64 movsx_be64(const u8* p, size_t size);


-// Debug-mode ICC doesn't like the intrinsics, so only use them
-// for MSVC and non-debug ICC.
-#if MSC_VERSION && !( defined(__INTEL_COMPILER) && !defined(NDEBUG) )
+#if MSC_VERSION
 extern unsigned short _byteswap_ushort(unsigned short);
 extern unsigned long _byteswap_ulong(unsigned long);
 extern unsigned __int64 _byteswap_uint64(unsigned __int64);
-#pragma intrinsic(_byteswap_ushort)
-#pragma intrinsic(_byteswap_ulong)
-#pragma intrinsic(_byteswap_uint64)
+# if !ICC_VERSION	// ICC doesn't need (and warns about) the pragmas
+#  pragma intrinsic(_byteswap_ushort)
+#  pragma intrinsic(_byteswap_ulong)
+#  pragma intrinsic(_byteswap_uint64)
+# endif
 # define swap16 _byteswap_ushort
 # define swap32 _byteswap_ulong
 # define swap64 _byteswap_uint64
-#else
-extern u16 swap16(const u16 x);
-extern u32 swap32(const u32 x);
-extern u64 swap64(const u64 x);
-#endif	// no swap intrinsics
+#elif defined(linux)
+# include <asm/byteorder.h>
+# ifdef __arch__swab16
+#  define swap16 __arch__swab16
+# endif
+# ifdef __arch__swab32
+#  define swap32 __arch__swab32
+# endif
+# ifdef __arch__swab64
+#  define swap64 __arch__swab64
+# endif
+#endif

+#ifndef swap16
+extern u16 swap16(const u16 x);
+#endif
+#ifndef swap32
+extern u32 swap32(const u32 x);
+#endif
+#ifndef swap64
+extern u64 swap64(const u64 x);
+#endif

 #endif	// #ifndef INCLUDED_BYTE_ORDER
--- a/source/lib/code_annotation.h
+++ b/source/lib/code_annotation.h
@ -71,7 +71,7 @@ checking, but does not cause any compiler warnings.
 //    a) normal implementation: includes "abort", which is declared with
 //       noreturn attribute and therefore avoids GCC's "execution reaches
 //       end of non-void function" warning.
-# if !MSC_VERSION || CONFIG_PARANOIA
+# if !MSC_VERSION || ICC_VERSION || CONFIG_PARANOIA
 #  define UNREACHABLE\
 	STMT(\
 		debug_assert(0);	/* hit supposedly unreachable code */\
--- a/source/lib/debug.cpp
+++ b/source/lib/debug.cpp
@ -307,13 +307,12 @@ fail:

 	// append OS error (just in case it happens to be relevant -
 	// it's usually still set from unrelated operations)
-	char description_buf[100] = { '?' };
+	char description_buf[100] = "?";
 	LibError errno_equiv = LibError_from_errno(false);
 	if(errno_equiv != ERR::FAIL)	// meaningful translation
 		error_description_r(errno_equiv, description_buf, ARRAY_SIZE(description_buf));
-	char os_error[100];
-	if(sys_error_description_r(0, os_error, ARRAY_SIZE(os_error)) != INFO::OK)
-		strcpy_s(os_error, ARRAY_SIZE(os_error), "?");
+	char os_error[100] = "?";
+	sys_error_description_r(0, os_error, ARRAY_SIZE(os_error));
 	len = swprintf(pos, chars_left,
 		L"\r\n"
 		L"errno = %d (%hs)\r\n"
--- a/source/lib/file/archive/codec_zlib.cpp
+++ b/source/lib/file/archive/codec_zlib.cpp
@ -116,7 +116,7 @@ protected:

 	typedef int ZEXPORT (*ZLibFunc)(z_streamp strm, int flush);

-	LibError Process(ZLibFunc func, int flush, const u8* in, const size_t inSize, u8* out, const size_t outSize, size_t& inConsumed, size_t& outProduced)
+	LibError CallStreamFunc(ZLibFunc func, int flush, const u8* in, const size_t inSize, u8* out, const size_t outSize, size_t& inConsumed, size_t& outProduced)
 	{
 		m_zs.next_in  = (Byte*)in;
 		m_zs.avail_in = (uInt)inSize;
@ -196,7 +196,7 @@ public:
 	virtual LibError Process(const u8* in, size_t inSize, u8* out, size_t outSize, size_t& inConsumed, size_t& outProduced)
 	{
 		m_checksum = UpdateChecksum(m_checksum, in, inSize);
-		return CodecZLibStream::Process(deflate, 0, in, inSize, out, outSize, inConsumed, outProduced);
+		return CodecZLibStream::CallStreamFunc(deflate, 0, in, inSize, out, outSize, inConsumed, outProduced);
 	}

 	virtual LibError Finish(u32& checksum)
@ -250,7 +250,7 @@ public:

 	virtual LibError Process(const u8* in, size_t inSize, u8* out, size_t outSize, size_t& inConsumed, size_t& outProduced)
 	{
-		const LibError ret = CodecZLibStream::Process(inflate, Z_SYNC_FLUSH, in, inSize, out, outSize, inConsumed, outProduced);
+		const LibError ret = CodecZLibStream::CallStreamFunc(inflate, Z_SYNC_FLUSH, in, inSize, out, outSize, inConsumed, outProduced);
 		m_checksum = UpdateChecksum(m_checksum, out, outProduced);
 		return ret;
 	}
--- a/source/lib/file/io/io_align.cpp
+++ b/source/lib/file/io/io_align.cpp
@ -9,15 +9,15 @@ bool IsAligned_Offset(off_t ofs)

 off_t AlignedOffset(off_t ofs)
 {
-	return round_down(ofs, BLOCK_SIZE);
+	return round_down(ofs, (off_t)BLOCK_SIZE);
 }

 off_t AlignedSize(off_t size)
 {
-	return round_up(size, BLOCK_SIZE);
+	return round_up(size, (off_t)BLOCK_SIZE);
 }

 off_t PaddedSize(off_t size, off_t ofs)
 {
-	return round_up(size + ofs - AlignedOffset(ofs), BLOCK_SIZE);
+	return round_up(size + ofs - AlignedOffset(ofs), (off_t)BLOCK_SIZE);
 }
--- a/source/lib/file/vfs/file_cache.cpp
+++ b/source/lib/file/vfs/file_cache.cpp
@ -146,8 +146,7 @@ public:

 	shared_ptr<u8> Reserve(size_t size)
 	{
-		// (this probably indicates a bug; caching 0-length files would
-		// have no benefit, anyway)
+		// (should never happen because the VFS ensures size != 0.)
 		debug_assert(size != 0);

 		// (300 iterations have been observed when reserving several MB
--- a/source/lib/file/vfs/vfs.cpp
+++ b/source/lib/file/vfs/vfs.cpp
@ -11,6 +11,7 @@
 #include "precompiled.h"
 #include "vfs.h"

+#include "lib/allocators/shared_ptr.h"
 #include "lib/path_util.h"
 #include "lib/file/common/file_stats.h"
 #include "lib/file/common/trace.h"
@ -102,7 +103,10 @@ public:
 			CHECK_ERR(vfs_Lookup(pathname, &m_rootDirectory, directory, &file));

 			size = file->Size();
-			if(size > ChooseCacheSize())
+			// safely handle zero-length files
+			if(!size)
+				fileContents = DummySharedPtr((u8*)0);
+			else if(size > ChooseCacheSize())
 			{
 				fileContents = io_Allocate(size);
 				RETURN_ERR(file->Load(fileContents));
--- a/source/lib/module_init.cpp
+++ b/source/lib/module_init.cpp
@ -24,7 +24,7 @@ static const ModuleInitState MODULE_UNINITIALIZED = 0u;

 // (1..N = reference count)

-static const ModuleInitState MODULE_ERROR = ~1u;
+static const ModuleInitState MODULE_ERROR = ~(uintptr_t)1u;


 bool ModuleShouldInitialize(volatile ModuleInitState* pInitState)
--- a/source/lib/ogl.cpp
+++ b/source/lib/ogl.cpp
@ -35,8 +35,8 @@
 // define extension function pointers
 extern "C"
 {
-#define FUNC(ret, name, params) ret (CALL_CONV *p##name) params;
-#define FUNC2(ret, nameARB, nameCore, version, params) ret (CALL_CONV *p##nameARB) params;
+#define FUNC(ret, name, params) ret (GL_CALL_CONV *p##name) params;
+#define FUNC2(ret, nameARB, nameCore, version, params) ret (GL_CALL_CONV *p##nameARB) params;
 #include "glext_funcs.h"
 #undef FUNC2
 #undef FUNC
--- a/source/lib/ogl.h
+++ b/source/lib/ogl.h
@ -110,16 +110,16 @@ extern const char* ogl_ExtensionString(void);

 // declare extension function pointers
 #if OS_WIN
-# define CALL_CONV __stdcall
+# define GL_CALL_CONV __stdcall
 #else
-# define CALL_CONV
+# define GL_CALL_CONV
 #endif
-#define FUNC(ret, name, params) EXTERN_C ret (CALL_CONV *p##name) params;
-#define FUNC2(ret, nameARB, nameCore, version, params) EXTERN_C ret (CALL_CONV *p##nameARB) params;
+#define FUNC(ret, name, params) EXTERN_C ret (GL_CALL_CONV *p##name) params;
+#define FUNC2(ret, nameARB, nameCore, version, params) EXTERN_C ret (GL_CALL_CONV *p##nameARB) params;
 #include "glext_funcs.h"
 #undef FUNC2
 #undef FUNC
-// leave CALL_CONV defined for ogl.cpp
+// leave GL_CALL_CONV defined for ogl.cpp


 //-----------------------------------------------------------------------------
--- a/source/lib/precompiled.h
+++ b/source/lib/precompiled.h
@ -33,6 +33,7 @@
 # endif
 # if ICC_VERSION
 #  pragma warning(disable:1786)	// function is deprecated (disabling 4996 isn't sufficient)
+#  pragma warning(disable:1684)	// conversion from pointer to same-sized integral type
 # endif
 #endif

--- a/source/lib/sysdep/compiler.h
+++ b/source/lib/sysdep/compiler.h
@ -178,4 +178,10 @@
 # define INLINE inline
 #endif

+#if MSC_VERSION
+# define CALL_CONV __cdecl
+#else
+# define CALL_CONV
+#endif
+
 #endif	// #ifndef INCLUDED_COMPILER
--- a/source/lib/sysdep/cpu.cpp
+++ b/source/lib/sysdep/cpu.cpp
@ -15,8 +15,3 @@ ERROR_ASSOCIATE(ERR::CPU_FEATURE_MISSING, "This CPU doesn't support a required f
 ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_OPCODE, "Disassembly failed", -1);
 ERROR_ASSOCIATE(ERR::CPU_UNKNOWN_VENDOR, "CPU vendor unknown", -1);
 ERROR_ASSOCIATE(ERR::CPU_RESTRICTED_AFFINITY, "Cannot set desired CPU affinity", -1);
-
-void* cpu_memcpy_thunk(void* RESTRICT dst, const void* RESTRICT src, size_t size)
-{
-	return cpu_memcpy(dst, src, size);
-}
--- a/source/lib/sysdep/cpu.h
+++ b/source/lib/sysdep/cpu.h
@ -32,7 +32,7 @@ extern "C" {
 * @return string identifying the CPU (usually a cleaned-up version of the
 * brand string)
 **/
-extern const char* cpu_IdentifierString();
+LIB_API const char* cpu_IdentifierString();

 /**
 * @return a rough estimate of the CPU clock frequency.
@ -42,7 +42,7 @@ extern const char* cpu_IdentifierString();
 * continual recalibration anyway, which makes the initial accuracy moot.
 * querying frequency via OS is also much faster than ia32's measurement loop.
 **/
-extern double cpu_ClockFrequency();
+LIB_API double cpu_ClockFrequency();

 /**
 * @return the number of what the OS deems "processors" or -1 on failure.
@ -54,30 +54,30 @@ extern double cpu_ClockFrequency();
 * note: this function is necessary because POSIX sysconf _SC_NPROCESSORS_CONF
 * is not supported on MacOSX, else we would use that.
 **/
-extern uint cpu_NumProcessors();
+LIB_API uint cpu_NumProcessors();

 /**
 * @return number of *enabled* CPU packages / sockets.
 **/
-extern uint cpu_NumPackages();
+LIB_API uint cpu_NumPackages();

 /**
 * @return number of *enabled* CPU cores per package.
 * (2 on dual-core systems)
 **/
-extern uint cpu_CoresPerPackage();
+LIB_API uint cpu_CoresPerPackage();

 /**
 * @return number of *enabled* hyperthreading units per core.
 * (2 on P4 EE)
 **/
-extern uint cpu_LogicalPerCore();
+LIB_API uint cpu_LogicalPerCore();

 /**
 * @return the size [bytes] of a MMU page.
 * (4096 on most IA-32 systems)
 **/
-extern size_t cpu_PageSize();
+LIB_API size_t cpu_PageSize();

 enum CpuMemoryIndicators
 {
@ -88,7 +88,7 @@ enum CpuMemoryIndicators
 /**
 * @return the amount [bytes] of available or total physical memory.
 **/
-extern size_t cpu_MemorySize(CpuMemoryIndicators mem_type);
+LIB_API size_t cpu_MemorySize(CpuMemoryIndicators mem_type);


 //-----------------------------------------------------------------------------
@ -103,37 +103,32 @@ extern size_t cpu_MemorySize(CpuMemoryIndicators mem_type);
 * @return false if the target word doesn't match the expected value,
 * otherwise true (also overwriting the contents of location)
 **/
-extern bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t newValue);
+LIB_API bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t newValue);

 /**
 * add a signed value to a variable without the possibility of interference
 * from other threads/CPUs.
 **/
-extern void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
+LIB_API void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);

 /**
 * enforce strict instruction ordering in the CPU pipeline.
 **/
-extern void cpu_Serialize();
+LIB_API void cpu_Serialize();

 /**
 * enforce strong memory ordering.
 **/
-extern void cpu_MemoryFence();
+LIB_API void cpu_MemoryFence();


 //-----------------------------------------------------------------------------
 // misc

 /**
- * drop-in replacement for libc memcpy(). highly optimized for Athlon and
- * Pentium III microarchitectures; significantly outperforms VC7.1 memcpy and
- * memcpy_amd. for details, see accompanying article.
+ * drop-in replacement for POSIX memcpy().
 **/
-extern void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size);
-LIB_API void* cpu_memcpy_thunk(void* RESTRICT dst, const void* RESTRICT src, size_t size);
-
-LIB_API void* cpu_memcpy_thunk(void* RESTRICT dst, const void* RESTRICT src, size_t size);
+LIB_API void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size);

 /**
 * execute the specified function once on each CPU.
@ -145,12 +140,12 @@ LIB_API void* cpu_memcpy_thunk(void* RESTRICT dst, const void* RESTRICT src, siz
 * may fail if e.g. OS is preventing us from running on some CPUs.
 **/
 typedef void (*CpuCallback)(void* param);
-extern LibError cpu_CallByEachCPU(CpuCallback cb, void* param);
+LIB_API LibError cpu_CallByEachCPU(CpuCallback cb, void* param);

 /**
 * set the FPU control word to "desirable" values (see implementation)
 **/
-extern void cpu_ConfigureFloatingPoint();
+LIB_API void cpu_ConfigureFloatingPoint();

 // convert float to int much faster than _ftol2, which would normally be
 // used by (int) casts.
--- a/source/lib/sysdep/ia32/ia32.cpp
+++ b/source/lib/sysdep/ia32/ia32.cpp
@ -21,6 +21,7 @@
 #include "lib/bits.h"
 #include "lib/timer.h"
 #include "lib/sysdep/cpu.h"
+#include "ia32_memcpy.h"

 #if !MSC_VERSION && !GCC_VERSION
 #error ia32.cpp needs inline assembly support!
@ -767,3 +768,27 @@ void cpu_ConfigureFloatingPoint()
 	// results were changed significantly, so it had to be disabled.
 	//ia32_asm_control87(IA32_RC_CHOP, IA32_MCW_RC);
 }
+
+
+//-----------------------------------------------------------------------------
+// thunk functions for ia32_asm to allow DLL export
+
+void cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment)
+{
+	ia32_asm_AtomicAdd(location, increment);
+}
+
+bool cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value)
+{
+	return ia32_asm_CAS(location, expected, new_value);
+}
+
+void cpu_Serialize()
+{
+	return ia32_asm_Serialize();
+}
+
+void* cpu_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size)
+{
+	return ia32_memcpy(dst, src, size);
+}
--- a/source/lib/sysdep/ia32/ia32_asm.asm
+++ b/source/lib/sysdep/ia32/ia32_asm.asm
@ -80,9 +80,9 @@ sym(ia32_asm_cpuid):
 ; lock-free support routines
 ;-------------------------------------------------------------------------------

-; extern "C" void __cdecl cpu_AtomicAdd(volatile intptr_t* location, intptr_t increment);
-global sym(cpu_AtomicAdd)
-sym(cpu_AtomicAdd):
+; extern "C" void __cdecl ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
+global sym(ia32_asm_AtomicAdd)
+sym(ia32_asm_AtomicAdd):
 	mov		edx, [esp+4]				; location
 	mov		eax, [esp+8]				; increment
 db		0xf0							; LOCK prefix
@ -99,9 +99,9 @@ db		0xf0							; LOCK prefix
 ; - nor do we bother skipping the LOCK prefix on single-processor systems.
 ;   the branch may be well-predicted, but difference in performance still
 ;   isn't expected to be enough to justify the effort.
-; extern "C" bool __cdecl cpu_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
-global sym(cpu_CAS)
-sym(cpu_CAS):
+; extern "C" bool __cdecl ia32_asm_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
+global sym(ia32_asm_CAS)
+sym(ia32_asm_CAS):
 	mov		edx, [esp+4]				; location
 	mov		eax, [esp+8]				; expected
 	mov		ecx, [esp+12]				; new_value
@ -112,9 +112,9 @@ db		0xf0							; LOCK prefix
 	ret


-; extern "C" bool __cdecl cpu_Serialize();
-global sym(cpu_Serialize)
-sym(cpu_Serialize):
+; extern "C" void __cdecl ia32_asm_Serialize();
+global sym(ia32_asm_Serialize)
+sym(ia32_asm_Serialize):
 	cpuid
 	ret

@ -175,14 +175,14 @@ sym(ia32_asm_fpclassifyf):
 	ret


-; extern "C" float __cdecl cpu_rintf(float)
+; extern "C" float __cdecl ia32_asm_rintf(float);
 global sym(ia32_asm_rintf)
 sym(ia32_asm_rintf):
 	fld		dword [esp+4]
 	frndint
 	ret

-; extern "C" double __cdecl ia32_asm_rint(double)
+; extern "C" double __cdecl ia32_asm_rint(double);
 global sym(ia32_asm_rint)
 sym(ia32_asm_rint):
 	fld		qword [esp+4]
@ -190,7 +190,7 @@ sym(ia32_asm_rint):
 	ret


-; extern "C" float __cdecl ia32_asm_fminf(float, float)
+; extern "C" float __cdecl ia32_asm_fminf(float, float);
 global sym(ia32_asm_fminf)
 sym(ia32_asm_fminf):
 	fld		dword [esp+4]
@ -201,7 +201,7 @@ sym(ia32_asm_fminf):
 	fstp	st0
 	ret

-; extern "C" float __cdecl ia32_asm_fmaxf(float, float)
+; extern "C" float __cdecl ia32_asm_fmaxf(float, float);
 global sym(ia32_asm_fmaxf)
 sym(ia32_asm_fmaxf):
 	fld		dword [esp+4]
@ -213,9 +213,9 @@ sym(ia32_asm_fmaxf):
 	ret


-; extern "C" i32 __cdecl cpu_i32FromFloat(float f)
-global sym(cpu_i32FromFloat)
-sym(cpu_i32FromFloat):
+; extern "C" i32 __cdecl ia32_asm_i32FromFloat(float f);
+global sym(ia32_asm_i32FromFloat)
+sym(ia32_asm_i32FromFloat):
 	push		eax
 	fld			dword [esp+8]
 	fsub		dword [round_bias]
@ -223,9 +223,9 @@ sym(cpu_i32FromFloat):
 	pop			eax
 	ret

-; extern "C" i32 __cdecl cpu_i32FromDouble(double d)
-global sym(cpu_i32FromDouble)
-sym(cpu_i32FromDouble):
+; extern "C" i32 __cdecl ia32_asm_i32FromDouble(double d);
+global sym(ia32_asm_i32FromDouble)
+sym(ia32_asm_i32FromDouble):
 	push		eax
 	fld			qword [esp+8]
 	fsub		dword [round_bias]
@ -233,9 +233,9 @@ sym(cpu_i32FromDouble):
 	pop			eax
 	ret

-; extern "C" i64 __cdecl cpu_i64FromDouble(double d)
-global sym(cpu_i64FromDouble)
-sym(cpu_i64FromDouble):
+; extern "C" i64 __cdecl ia32_asm_i64FromDouble(double d);
+global sym(ia32_asm_i64FromDouble)
+sym(ia32_asm_i64FromDouble):
 	push		edx
 	push		eax
 	fld			qword [esp+12]
@ -259,7 +259,7 @@ sym(cpu_i64FromDouble):
 ; xcode complains about CPUID clobbering ebx, so we use external asm
 ; where possible (IA-32 CPUs).
 ;
-; extern "C" u64 ia32_asm_rdtsc_edx_eax()
+; extern "C" u64 ia32_asm_rdtsc_edx_eax();
 global sym(ia32_asm_rdtsc_edx_eax)
 sym(ia32_asm_rdtsc_edx_eax):
 	push	ebx
@ -269,7 +269,7 @@ sym(ia32_asm_rdtsc_edx_eax):
 	ret


-; extern "C" int ia32_asm_log2_of_pow2(uint n)
+; extern "C" int ia32_asm_log2_of_pow2(uint n);
 global sym(ia32_asm_log2_of_pow2)
 sym(ia32_asm_log2_of_pow2):
 	mov		ecx, [esp+4]	; n
@ -289,7 +289,7 @@ sym(ia32_asm_log2_of_pow2):
 ; optimized for size; this must be straight asm because ; extern "C"
 ; is compiler-specific and compiler-generated prolog code inserted before
 ; inline asm trashes EBP and ESP (unacceptable).
-; extern "C" void ia32_asm_GetCurrentContext(void* pcontext)
+; extern "C" void ia32_asm_GetCurrentContext(void* pcontext);
 global sym(ia32_asm_GetCurrentContext)
 sym(ia32_asm_GetCurrentContext):
 	pushad
--- a/source/lib/sysdep/ia32/ia32_asm.h
+++ b/source/lib/sysdep/ia32/ia32_asm.h
@ -15,6 +15,7 @@
 extern "C" {
 #endif

+
 /**
 * order in which ia32_asm_cpuid stores register values
 **/
@ -32,48 +33,54 @@ enum IA32Regs
 * fills register array according to IA32Regs.
 * @return true on success or false if the sub-function isn't supported.
 **/
-extern bool ia32_asm_cpuid(u32 func, u32* regs);
+extern bool CALL_CONV ia32_asm_cpuid(u32 func, u32* regs);
+
+extern void CALL_CONV ia32_asm_AtomicAdd(volatile intptr_t* location, intptr_t increment);
+extern bool CALL_CONV ia32_asm_CAS(volatile uintptr_t* location, uintptr_t expected, uintptr_t new_value);
+extern void CALL_CONV ia32_asm_Serialize();
+

 /**
 * for all 1-bits in mask, update the corresponding FPU control word bits
 * with the bit values in new_val.
 * @return 0 to indicate success.
 **/
-extern uint ia32_asm_control87(uint new_val, uint mask);
+extern uint CALL_CONV ia32_asm_control87(uint new_val, uint mask);
+
+/// see POSIX fpclassify
+extern uint CALL_CONV ia32_asm_fpclassifyd(double d);
+extern uint CALL_CONV ia32_asm_fpclassifyf(float f);
+
+/// see POSIX rintf
+extern float CALL_CONV ia32_asm_rintf(float);
+extern double CALL_CONV ia32_asm_rint(double);
+
+/// see POSIX fminf
+extern float CALL_CONV ia32_asm_fminf(float, float);
+extern float CALL_CONV ia32_asm_fmaxf(float, float);
+
+extern i32 CALL_CONV ia32_asm_i32FromFloat(float f);
+extern i32 CALL_CONV ia32_asm_i32FromDouble(double d);
+extern i64 CALL_CONV ia32_asm_i64FromDouble(double d);
+

 /**
 * @return the current value of the TimeStampCounter in edx:eax
 * (interpretable as a u64 when using the standard Win32 calling convention)
 **/
-extern u64 ia32_asm_rdtsc_edx_eax(void);
-
-/**
- * write the current execution state (e.g. all register values) into
- * (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
- **/
-extern void ia32_asm_GetCurrentContext(void* pcontext);
-
-
-// implementations of POSIX/SUS functions
-
-/// see fpclassify
-extern uint ia32_asm_fpclassifyd(double d);
-extern uint ia32_asm_fpclassifyf(float f);
-
-/// see rintf
-extern float ia32_asm_rintf(float);
-extern double ia32_asm_rint(double);
-extern float ia32_asm_fminf(float, float);
-extern float ia32_asm_fmaxf(float, float);
-
-
-// misc
+extern u64 CALL_CONV ia32_asm_rdtsc_edx_eax(void);

 /**
 * @return the (integral) base 2 logarithm, or -1 if the number
 * is not a power-of-two.
 **/
-extern int ia32_asm_log2_of_pow2(uint n);
+extern int CALL_CONV ia32_asm_log2_of_pow2(uint n);
+
+/**
+ * write the current execution state (e.g. all register values) into
+ * (Win32::CONTEXT*)pcontext (defined as void* to avoid dependency).
+ **/
+extern void CALL_CONV ia32_asm_GetCurrentContext(void* pcontext);

 #ifdef __cplusplus
 }
--- a/source/lib/sysdep/ia32/ia32_memcpy.asm
+++ b/source/lib/sysdep/ia32/ia32_memcpy.asm
@ -297,10 +297,10 @@ align 16
 ;------------------------------------------------------------------------------

 ; drop-in replacement for libc memcpy() (returns dst)
-; void* __declspec(naked) cpu_memcpy(void* dst, const void* src, size_t nbytes)
-global sym(cpu_memcpy)
+; extern void* ia32_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size);
+global sym(ia32_memcpy)
 align 64
-sym(cpu_memcpy):
+sym(ia32_memcpy):
 	push	edi
 	push	esi

--- a/source/lib/sysdep/ia32/ia32_memcpy.h
+++ b/source/lib/sysdep/ia32/ia32_memcpy.h
@ -0,0 +1,30 @@
+/**
+ * =========================================================================
+ * File        : ia32_memcpy.h
+ * Project     : 0 A.D.
+ * Description : interface to the IA-32 optimized memcpy (written in asm)
+ * =========================================================================
+ */
+
+// license: GPL; see lib/license.txt
+
+#ifndef INCLUDED_IA32_MEMCPY
+#define INCLUDED_IA32_MEMCPY
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * drop-in replacement for POSIX memcpy.
+ * highly optimized for Athlon and Pentium III microarchitectures;
+ * significantly outperforms VC7.1 memcpy and memcpy_amd.
+ * for details, see "Speeding Up Memory Copy".
+ **/
+extern void* ia32_memcpy(void* RESTRICT dst, const void* RESTRICT src, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	// #ifndef INCLUDED_IA32_MEMCPY
--- a/source/lib/sysdep/win/mahaf.cpp
+++ b/source/lib/sysdep/win/mahaf.cpp
@ -125,7 +125,7 @@ volatile void* mahaf_MapPhysicalMemory(uintptr_t physicalAddress, size_t numByte
 	}

 	debug_assert(bytesReturned == sizeof(out));
-	volatile void* virtualAddress = (volatile void*)out.virtualAddress;
+	volatile void* virtualAddress = (volatile void*)(uintptr_t)out.virtualAddress;
 	return virtualAddress;
 }

--- a/source/lib/sysdep/win/wdbg_heap.cpp
+++ b/source/lib/sysdep/win/wdbg_heap.cpp
@ -429,6 +429,7 @@ static uint EncodedLength(uintptr_t quantizedOffset)
 	}

 	wdbg_assert(0);	// unreachable
+	return 0;
 }


@ -873,6 +874,7 @@ static int __cdecl ReportHook(int reportType, char* message, int* out)
 	}

 	wdbg_assert(0);	// unreachable
+	return 0;
 }

 //-----------------------------------------------------------------------------
--- a/source/lib/sysdep/win/wdbg_sym.cpp
+++ b/source/lib/sysdep/win/wdbg_sym.cpp
@ -42,7 +42,7 @@ WINIT_REGISTER_CRITICAL_INIT(wdbg_sym_Init);
 // generate a stack trace. if that changes, we need to init a local copy
 // of these in dump_sym_cb and pass them to all subsequent dump_*.
 static HANDLE hProcess;
-static ULONG64 mod_base;
+static uintptr_t mod_base;

 // for StackWalk64; taken from PE header by wdbg_init.
 static WORD machine;
@ -80,7 +80,7 @@ static LibError sym_init()
 	WARN_IF_FALSE(ok);

 	mod_base = SymGetModuleBase64(hProcess, (u64)&sym_init);
-	IMAGE_NT_HEADERS* header = ImageNtHeader((void*)mod_base);
+	IMAGE_NT_HEADERS* header = ImageNtHeader((void*)(uintptr_t)mod_base);
 	machine = header->FileHeader.Machine;

 	return INFO::OK;
@ -231,9 +231,9 @@ func2:
 static LibError ia32_walk_stack(_tagSTACKFRAME64* sf)
 {
 	// read previous values from _tagSTACKFRAME64
-	void* prev_fp  = (void*)sf->AddrFrame .Offset;
-	void* prev_ip  = (void*)sf->AddrPC    .Offset;
-	void* prev_ret = (void*)sf->AddrReturn.Offset;
+	void* prev_fp  = (void*)(uintptr_t)sf->AddrFrame .Offset;
+	void* prev_ip  = (void*)(uintptr_t)sf->AddrPC    .Offset;
+	void* prev_ret = (void*)(uintptr_t)sf->AddrReturn.Offset;
 	if(!debug_is_stack_ptr(prev_fp))
 		WARN_RETURN(ERR::_11);
 	if(prev_ip && !debug_is_code_ptr(prev_ip))
@ -409,7 +409,7 @@ static LibError nth_caller_cb(const _tagSTACKFRAME64* sf, uintptr_t cbData)
 	void** pfunc = (void**)cbData;

 	// return its address
-	*pfunc = (void*)sf->AddrPC.Offset;
+	*pfunc = (void*)(uintptr_t)sf->AddrPC.Offset;
 	return INFO::OK;
 }

@ -876,7 +876,7 @@ SymGetTypeInfo(hProcess, mod_base, id, TI_GET_OFFSET, &ofs2);


 	// get address
-	ULONG64 addr = sym->Address;
+	uintptr_t addr = sym->Address;
 	// .. relative to a register
 	//    note: we only have the FP (not SP)
 	if(sym->Flags & SYMFLAG_REGREL)
@ -908,7 +908,7 @@ in_register:
 		return ERR::SYM_UNRETRIEVABLE_REG;	// NOWARN
 	}

-	*pp = (const u8*)addr;
+	*pp = (const u8*)(uintptr_t)addr;

 debug_printf("SYM| %ws at %p  flags=%X dk=%d sym->addr=%I64X addrofs=%X addr2=%I64X ofs2=%X\n", sym->Name, *pp, sym->Flags, data_kind, sym->Address, addrofs, addr2, ofs2);

@ -1270,7 +1270,7 @@ static LibError dump_sym_pointer(DWORD type_id, const u8* p, DumpState state)
 	const size_t size = (size_t)size_;

 	// read+output pointer's value.
-	p = (const u8*)movzx_le64(p, size);
+	p = (const u8*)(uintptr_t)movzx_le64(p, size);
 	out(L"0x%p", p);

 	// bail if it's obvious the pointer is bogus
@ -1732,7 +1732,7 @@ static BOOL CALLBACK dump_sym_cb(SYMBOL_INFO* sym, ULONG UNUSED(size), void* UNU
 {
 	out_latch_pos();	// see decl
 	mod_base = sym->ModBase;
-	const u8* p = (const u8*)sym->Address;
+	const u8* p = (const u8*)(uintptr_t)sym->Address;
 	DumpState state;

 	INDENT;
@ -1750,7 +1750,7 @@ static BOOL CALLBACK dump_sym_cb(SYMBOL_INFO* sym, ULONG UNUSED(size), void* UNU
 static LibError dump_frame_cb(const _tagSTACKFRAME64* sf, uintptr_t UNUSED(cbData))
 {
 	current_stackframe64 = sf;
-	void* func = (void*)sf->AddrPC.Offset;
+	void* func = (void*)(uintptr_t)sf->AddrPC.Offset;

 	char func_name[DBG_SYMBOL_LEN]; char file[DBG_FILE_LEN]; int line;
 	LibError ret = debug_resolve_symbol_lk(func, func_name, file, &line);
--- a/source/lib/sysdep/win/wdll_delay_load.cpp
+++ b/source/lib/sysdep/win/wdll_delay_load.cpp
@ -140,8 +140,9 @@ EXTERN_C PfnDliHook __pfnDliFailureHook2 = 0;



-
+#if !ICC_VERSION
 #pragma intrinsic(strlen,memcmp,memcpy)
+#endif

 // utility function for calculating the index of the current import
 // for all the tables (INT, BIAT, UIAT, and IAT).
--- a/source/lib/sysdep/win/whrt/counter.cpp
+++ b/source/lib/sysdep/win/whrt/counter.cpp
@ -85,7 +85,7 @@ ICounter* CreateCounter(uint id)

 	static const size_t memSize = 200;
 	static u8 mem[memSize];
-	u8* alignedMem = (u8*)round_up((uintptr_t)mem, 16);
+	u8* alignedMem = (u8*)round_up((uintptr_t)mem, (uintptr_t)16u);
 	const size_t bytesLeft = mem+memSize - alignedMem;
 	ICounter* counter = ConstructCounterAt(id, alignedMem, bytesLeft);

--- a/source/lib/sysdep/win/whrt/tsc.cpp
+++ b/source/lib/sysdep/win/whrt/tsc.cpp
@ -17,7 +17,9 @@

 #if MSC_VERSION
 # include <intrin.h>
+# if !ICC_VERSION
 #  pragma intrinsic(__rdtsc)
+# endif
 #endif
 #if ARCH_IA32
 # include "lib/sysdep/ia32/ia32.h"	// ia32_rdtsc
--- a/source/lib/sysdep/win/winit.h
+++ b/source/lib/sysdep/win/winit.h
@ -88,7 +88,7 @@ Several methods of module init are possible: (see Large Scale C++ Design)
 #pragma section(".WINIT$S7", read)
 #pragma section(".WINIT$S8", read)
 #pragma section(".WINIT$SZ", read)
-#pragma comment(linker, "/merge:WINIT=.rdata")
+#pragma comment(linker, "/merge:.WINIT=.rdata")


 //-----------------------------------------------------------------------------
@ -117,39 +117,29 @@ Several methods of module init are possible: (see Large Scale C++ Design)

 // very early init; must not fail, since error handling code *crashes*
 // if called before these have completed.
-#define WINIT_REGISTER_CRITICAL_INIT(func)   static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$I0")) LibError (*p##func)(void) = func;
-//__pragma(comment(linker, "/include:_p"#func))
+#define WINIT_REGISTER_CRITICAL_INIT(func)   static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$I0")) LibError (*p##func)(void) = func; __pragma(comment(linker, "/include:_p"#func))

 // meant for modules with dependents but whose init is complicated and may
 // raise error/warning messages (=> can't go in WINIT_REGISTER_CRITICAL_INIT)
-#define WINIT_REGISTER_EARLY_INIT(func)      static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$I1")) LibError (*p##func)(void) = func;
-//__pragma(comment(linker, "/include:_p"#func))
+#define WINIT_REGISTER_EARLY_INIT(func)      static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$I1")) LibError (*p##func)(void) = func; __pragma(comment(linker, "/include:_p"#func))

 // available for dependents of WINIT_REGISTER_EARLY_INIT-modules that
 // must still come before WINIT_REGISTER_MAIN_INIT.
-#define WINIT_REGISTER_EARLY_INIT2(func)     static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$I2")) LibError (*p##func)(void) = func;
-//__pragma(comment(linker, "/include:_p"#func))
+#define WINIT_REGISTER_EARLY_INIT2(func)     static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$I2")) LibError (*p##func)(void) = func; __pragma(comment(linker, "/include:_p"#func))

 // most modules will go here unless they are often used or
 // have many dependents.
-#define WINIT_REGISTER_MAIN_INIT(func)       static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$I6")) LibError (*p##func)(void) = func;
-//__pragma(comment(linker, "/include:_p"#func))
+#define WINIT_REGISTER_MAIN_INIT(func)       static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$I6")) LibError (*p##func)(void) = func; __pragma(comment(linker, "/include:_p"#func))

 // available for any modules that may need to come after
 // WINIT_REGISTER_MAIN_INIT (unlikely)
-#define WINIT_REGISTER_LATE_INIT(func)       static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$I7")) LibError (*p##func)(void) = func;
-//__pragma(comment(linker, "/include:_p"#func))
+#define WINIT_REGISTER_LATE_INIT(func)       static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$I7")) LibError (*p##func)(void) = func; __pragma(comment(linker, "/include:_p"#func))

-#define WINIT_REGISTER_EARLY_SHUTDOWN(func)  static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$S0")) LibError (*p##func)(void) = func;
-//__pragma(comment(linker, "/include:_p"#func))
-#define WINIT_REGISTER_EARLY_SHUTDOWN2(func) static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$S1")) LibError (*p##func)(void) = func;
-//__pragma(comment(linker, "/include:_p"#func))
-#define WINIT_REGISTER_MAIN_SHUTDOWN(func)   static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$S6")) LibError (*p##func)(void) = func;
-//__pragma(comment(linker, "/include:_p"#func))
-#define WINIT_REGISTER_LATE_SHUTDOWN(func)   static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$S7")) LibError (*p##func)(void) = func;
-//__pragma(comment(linker, "/include:_p"#func))
-#define WINIT_REGISTER_LATE_SHUTDOWN2(func)  static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$S8")) LibError (*p##func)(void) = func;
-//__pragma(comment(linker, "/include:_p"#func))
+#define WINIT_REGISTER_EARLY_SHUTDOWN(func)  static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$S0")) LibError (*p##func)(void) = func; __pragma(comment(linker, "/include:_p"#func))
+#define WINIT_REGISTER_EARLY_SHUTDOWN2(func) static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$S1")) LibError (*p##func)(void) = func; __pragma(comment(linker, "/include:_p"#func))
+#define WINIT_REGISTER_MAIN_SHUTDOWN(func)   static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$S6")) LibError (*p##func)(void) = func; __pragma(comment(linker, "/include:_p"#func))
+#define WINIT_REGISTER_LATE_SHUTDOWN(func)   static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$S7")) LibError (*p##func)(void) = func; __pragma(comment(linker, "/include:_p"#func))
+#define WINIT_REGISTER_LATE_SHUTDOWN2(func)  static LibError func(void); EXTERN_C __declspec(allocate(".WINIT$S8")) LibError (*p##func)(void) = func; __pragma(comment(linker, "/include:_p"#func))


 //-----------------------------------------------------------------------------
--- a/source/lib/sysdep/win/wmi.cpp
+++ b/source/lib/sysdep/win/wmi.cpp
@ -88,7 +88,7 @@ LibError wmi_GetClass(const char* className, WmiMap& wmiMap)
 	{
 		IWbemClassObjectPtr pObj = 0;
 		ULONG numReturned = 0;
-		hr = pEnum->Next(WBEM_INFINITE, 1, &pObj, &numReturned);
+		hr = pEnum->Next((LONG)WBEM_INFINITE, 1, &pObj, &numReturned);
 		if(FAILED(hr))
 			WARN_RETURN(ERR::FAIL);
 		if(numReturned == 0)
--- a/source/lib/sysdep/win/wsdl.cpp
+++ b/source/lib/sysdep/win/wsdl.cpp
@ -214,7 +214,7 @@ static HWND wsdl_CreateWindow(int w, int h)
 	}

 	// note: you can override the hardcoded window name via SDL_WM_SetCaption.
-	return CreateWindowEx(WS_EX_APPWINDOW, (LPCSTR)class_atom, "wsdl", windowStyle, 0, 0, w, h, 0, 0, hInst, 0);
+	return CreateWindowEx(WS_EX_APPWINDOW, (LPCSTR)(uintptr_t)class_atom, "wsdl", windowStyle, 0, 0, w, h, 0, 0, hInst, 0);
 }


--- a/source/lib/sysdep/win/wsdl.h
+++ b/source/lib/sysdep/win/wsdl.h
@ -11,6 +11,7 @@
 #ifndef INCLUDED_WSDL
 #define INCLUDED_WSDL

+#include "lib/byte_order.h"
 #include "SDL/SDL_keysym.h"

 typedef u8  Uint8;
@ -106,48 +107,16 @@ extern int SDL_SetGamma(float r, float g, float b);
 // byte swapping
 //

-
-#ifdef linux
-# include <asm/byteorder.h>
-# ifdef __arch__swab16
-#  define SDL_Swap16  __arch__swab16
-# endif
-# ifdef __arch__swab32
-#  define SDL_Swap32  __arch__swab32
-# endif
-#endif
-
-// Debug-mode ICC doesn't like the intrinsics, so only use them
-// for MSVC and non-debug ICC.
-#if MSC_VERSION && !( defined(__INTEL_COMPILER) && !defined(NDEBUG) )
-extern unsigned short _byteswap_ushort(unsigned short);
-extern unsigned long _byteswap_ulong(unsigned long);
-extern unsigned __int64 _byteswap_uint64(unsigned __int64);
-#pragma intrinsic(_byteswap_ushort)
-#pragma intrinsic(_byteswap_ulong)
-#pragma intrinsic(_byteswap_uint64)
-# define SDL_Swap16 _byteswap_ushort
-# define SDL_Swap32 _byteswap_ulong
-# define SDL_Swap64 _byteswap_uint64
-#endif
-
-#ifndef SDL_Swap16
-extern u16 SDL_Swap16(u16);
-#endif
-
-#ifndef SDL_Swap32
-extern u32 SDL_Swap32(u32);
-#endif
-
-#ifndef SDL_Swap64
-extern u64 SDL_Swap64(u64);
-#endif
-
 #define SDL_LIL_ENDIAN 1234
 #define SDL_BIG_ENDIAN 4321

 #define SDL_BYTEORDER SDL_LIL_ENDIAN

+#define SDL_Swap16 swap16
+#define SDL_Swap32 swap32
+#define SDL_Swap64 swap64
+
+
 //////////////////////////////////////////////////////////////////////////////
 //
 // events
--- a/source/lib/sysdep/win/wstartup.cpp
+++ b/source/lib/sysdep/win/wstartup.cpp
@ -94,8 +94,14 @@ EXTERN_C int wstartup_InitAndRegisterShutdown()
 	return 0;
 }

-#pragma section(".CRT$XIV", long,read)
-#pragma data_seg(".CRT$XIV")	// after C init, after XIU ("User") block
-EXTERN_C int(*wstartup_pInitAndRegisterShutdown)() = wstartup_InitAndRegisterShutdown;
-#pragma data_seg()
-//#pragma comment(linker, "/include:_wstartup_pInitAndRegisterShutdown")
+
+// insert our initialization function after _cinit and XIU ("User") block
+#if ARCH_AMD64
+# define SECTION_ATTRIBUTES read
+#else
+# define SECTION_ATTRIBUTES read,write
+#endif
+#pragma section(".CRT$XIV", long,SECTION_ATTRIBUTES)
+#undef SECTION_ATTRIBUTES
+EXTERN_C __declspec(allocate(".CRT$XIV")) int(*wstartup_pInitAndRegisterShutdown)() = wstartup_InitAndRegisterShutdown;
+#pragma comment(linker, "/include:_wstartup_pInitAndRegisterShutdown")
--- a/source/lib/sysdep/win/wsysdep.cpp
+++ b/source/lib/sysdep/win/wsysdep.cpp
@ -279,19 +279,24 @@ ErrorReaction sys_display_error(const wchar_t* text, uint flags)

 LibError sys_error_description_r(int user_err, char* buf, size_t max_chars)
 {
-	DWORD err = (DWORD)user_err;
-	// not in our range (Win32 error numbers are positive)
+	// validate user_err - Win32 doesn't have negative error numbers
 	if(user_err < 0)
 		return ERR::FAIL;	// NOWARN
-	// user doesn't know error code; get current error state
-	if(!user_err)
-		err = GetLastError();
+
+	const DWORD err = user_err? (DWORD)user_err : GetLastError();
+
+	// no one likes to see "The operation completed successfully" in
+	// error messages, so return more descriptive text instead.
+	if(err == 0)
+	{
+		strcpy_s(buf, max_chars, "0 (no error code was set)");
+		return INFO::OK;
+	}

 	const LPCVOID source = 0;	// ignored (we're not using FROM_HMODULE etc.)
 	const DWORD lang_id = 0;	// look for neutral, then current locale
 	va_list* args = 0;			// we don't care about "inserts"
-	DWORD chars_output = FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, source, err,
-		lang_id, buf, (DWORD)max_chars, args);
+	const DWORD chars_output = FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, source, err, lang_id, buf, (DWORD)max_chars, args);
 	if(!chars_output)
 		WARN_RETURN(ERR::FAIL);
 	debug_assert(chars_output < max_chars);
--- a/source/lib/tex/tex_dds.cpp
+++ b/source/lib/tex/tex_dds.cpp
@ -229,8 +229,8 @@ static void s3tc_decompress_level(uint UNUSED(level), uint level_w, uint level_h
 	// note: 1x1 images are legitimate (e.g. in mipmaps). they report their
 	// width as such for glTexImage, but the S3TC data is padded to
 	// 4x4 pixel block boundaries.
-	const uint blocks_w = (uint)round_up(level_w, 4) / 4;
-	const uint blocks_h = (uint)round_up(level_h, 4) / 4;
+	const uint blocks_w = round_up(level_w, 4u) / 4u;
+	const uint blocks_h = round_up(level_h, 4u) / 4u;
 	const u8* s3tc_data = level_data;
 	debug_assert(level_data_size % s3tc_block_size == 0);

@ -404,15 +404,15 @@ static LibError decode_pf(const DDPIXELFORMAT* pf, uint* bpp_, uint* flags_)
 		WARN_RETURN(ERR::TEX_INVALID_SIZE);

 	// determine type
-	const u32 pf_flags = read_le32(&pf->dwFlags);
+	const size_t pf_flags = (size_t)read_le32(&pf->dwFlags);
 	// .. uncompressed
 	if(pf_flags & DDPF_RGB)
 	{
-		const u32 pf_bpp    = read_le32(&pf->dwRGBBitCount);
-		const u32 pf_r_mask = read_le32(&pf->dwRBitMask);
-		const u32 pf_g_mask = read_le32(&pf->dwGBitMask);
-		const u32 pf_b_mask = read_le32(&pf->dwBBitMask);
-		const u32 pf_a_mask = read_le32(&pf->dwRGBAlphaBitMask);
+		const size_t pf_bpp    = (size_t)read_le32(&pf->dwRGBBitCount);
+		const size_t pf_r_mask = (size_t)read_le32(&pf->dwRBitMask);
+		const size_t pf_g_mask = (size_t)read_le32(&pf->dwGBitMask);
+		const size_t pf_b_mask = (size_t)read_le32(&pf->dwBBitMask);
+		const size_t pf_a_mask = (size_t)read_le32(&pf->dwRGBAlphaBitMask);

 		// (checked below; must be set in case below warning is to be
 		// skipped)
@ -493,16 +493,16 @@ static LibError decode_sd(const DDSURFACEDESC2* sd, uint* w_, uint* h_,
 		WARN_RETURN(ERR::CORRUPTED);

 	// flags (indicate which fields are valid)
-	const u32 sd_flags = read_le32(&sd->dwFlags);
+	const size_t sd_flags = (size_t)read_le32(&sd->dwFlags);
 	// .. not all required fields are present
 	// note: we can't guess dimensions - the image may not be square.
-	const u32 sd_req_flags = DDSD_CAPS|DDSD_HEIGHT|DDSD_WIDTH|DDSD_PIXELFORMAT;
+	const size_t sd_req_flags = DDSD_CAPS|DDSD_HEIGHT|DDSD_WIDTH|DDSD_PIXELFORMAT;
 	if((sd_flags & sd_req_flags) != sd_req_flags)
 		WARN_RETURN(ERR::TEX_INCOMPLETE_HEADER);

 	// image dimensions
-	const u32 h = read_le32(&sd->dwHeight);
-	const u32 w = read_le32(&sd->dwWidth);
+	const size_t h = (size_t)read_le32(&sd->dwHeight);
+	const size_t w = (size_t)read_le32(&sd->dwWidth);

 	// pixel format
 	uint bpp, flags;
@ -511,11 +511,11 @@ static LibError decode_sd(const DDSURFACEDESC2* sd, uint* w_, uint* h_,
 	// if the image is not aligned with the S3TC block size, it is stored
 	// with extra pixels on the bottom left to fill up the space, so we need
 	// to account for those when calculating how big it should be
-	u32 stored_h, stored_w;
+	size_t stored_h, stored_w;
 	if(flags & TEX_DXT)
 	{
-		stored_h = round_up(h, 4);
-		stored_w = round_up(w, 4);
+		stored_h = round_up(h, 4u);
+		stored_w = round_up(w, 4u);
 	}
 	else
 	{
@ -525,10 +525,10 @@ static LibError decode_sd(const DDSURFACEDESC2* sd, uint* w_, uint* h_,

 	// verify pitch or linear size, if given
 	const size_t pitch = stored_w*bpp/8;
-	const u32 sd_pitch_or_size = read_le32(&sd->dwPitchOrLinearSize);
+	const size_t sd_pitch_or_size = (size_t)read_le32(&sd->dwPitchOrLinearSize);
 	if(sd_flags & DDSD_PITCH)
 	{
-		if(sd_pitch_or_size != round_up(pitch, 4))
+		if(sd_pitch_or_size != round_up(pitch, 4u))
 			WARN_RETURN(ERR::CORRUPTED);
 	}
 	if(sd_flags & DDSD_LINEARSIZE)
@ -542,7 +542,7 @@ static LibError decode_sd(const DDSURFACEDESC2* sd, uint* w_, uint* h_,
 	// mipmaps
 	if(sd_flags & DDSD_MIPMAPCOUNT)
 	{
-		const u32 mipmap_count = read_le32(&sd->dwMipMapCount);
+		const size_t mipmap_count = (size_t)read_le32(&sd->dwMipMapCount);
 		if(mipmap_count)
 		{
 			// mipmap chain is incomplete
@ -556,7 +556,7 @@ static LibError decode_sd(const DDSURFACEDESC2* sd, uint* w_, uint* h_,
 	// check for volume textures
 	if(sd_flags & DDSD_DEPTH)
 	{
-		const u32 depth = read_le32(&sd->dwDepth);
+		const size_t depth = (size_t)read_le32(&sd->dwDepth);
 		if(depth)
 			WARN_RETURN(ERR::NOT_IMPLEMENTED);
 	}