Optimise fixed multiplies on MSVC x86

On MSVC, casting i32 to i64 then multiplying gets compiled into a call to __allmul, which is slow. Use the __emul intrinsic instead. GCC 4.6+ appears to optimise this case automatically and doesn't need any special handling. This reduces the cost of ComputeShortPath by about 50% (testing AI vs AI on Oasis 01). This was SVN commit r13873.
2013-09-20 20:17:54 +00:00 · 2013-09-20 20:17:54 +00:00 · d252e245ef
commit d252e245ef
parent 5a8cfb738b
4 changed files with 55 additions and 48 deletions
--- a/source/maths/Fixed.h
+++ b/source/maths/Fixed.h
@ -1,4 +1,4 @@
-/* Copyright (C) 2010 Wildfire Games.
+/* Copyright (C) 2013 Wildfire Games.
 * This file is part of 0 A.D.
 *
 * 0 A.D. is free software: you can redistribute it and/or modify
@ -26,7 +26,15 @@ class CStrW;

 #ifndef NDEBUG
 #define USE_FIXED_OVERFLOW_CHECKS
-#endif // NDEBUG
+#endif
+
+#if MSC_VERSION
+// i32*i32 -> i64 multiply: MSVC x86 doesn't optimise i64 multiplies automatically, so use the intrinsic
+#include <intrin.h>
+#define FIXED_MUL_I64_I32_I32(a, b) (__emul((a), (b)))
+#else
+#define FIXED_MUL_I64_I32_I32(a, b) ((i64)(a) * (i64)(b))
+#endif

 //define overflow macros
 #ifndef USE_FIXED_OVERFLOW_CHECKS
@ -281,7 +289,7 @@ public:
 	 */
 	CFixed Multiply(CFixed n) const
 	{
-		i64 t = (i64)value * (i64)n.value;
+		i64 t = FIXED_MUL_I64_I32_I32(value, n.value);
 		t >>= fract_bits;

 		CheckCastOverflow(t, T, L"Overflow in CFixed::Multiply(CFixed n)", L"Underflow in CFixed::Multiply(CFixed n)")
@ -301,7 +309,7 @@ public:
 	 */
 	CFixed MulDiv(CFixed m, CFixed d) const
 	{
-		i64 t = ((i64)value * (i64)m.value) / (i64)d.value;
+		i64 t = FIXED_MUL_I64_I32_I32(value, m.value) / (i64)d.value;
 		CheckCastOverflow(t, T, L"Overflow in CFixed::Multiply(CFixed n)", L"Underflow in CFixed::Multiply(CFixed n)")
 		return CFixed((T)t);
 	}
--- a/source/maths/FixedVector2D.h
+++ b/source/maths/FixedVector2D.h
@ -1,4 +1,4 @@
-/* Copyright (C) 2010 Wildfire Games.
+/* Copyright (C) 2013 Wildfire Games.
 * This file is part of 0 A.D.
 *
 * 0 A.D. is free software: you can redistribute it and/or modify
@ -95,10 +95,10 @@ public:
 	fixed Length() const
 	{
 		// Do intermediate calculations with 64-bit ints to avoid overflows
-		i64 x = (i64)X.GetInternalValue();
-		i64 y = (i64)Y.GetInternalValue();
-		u64 xx = (u64)(x * x);
-		u64 yy = (u64)(y * y);
+		i32 x = X.GetInternalValue();
+		i32 y = Y.GetInternalValue();
+		u64 xx = (u64)FIXED_MUL_I64_I32_I32(x, x);
+		u64 yy = (u64)FIXED_MUL_I64_I32_I32(y, y);
 		u64 d2 = xx + yy;
 		CheckUnsignedAdditionOverflow(d2, xx, L"Overflow in CFixedVector2D::Length() part 1")

@ -117,14 +117,14 @@ public:
 	 */
 	int CompareLength(fixed cmp) const
 	{
-		i64 x = (i64)X.GetInternalValue(); // abs(x) <= 2^31
-		i64 y = (i64)Y.GetInternalValue();
-		u64 xx = (u64)(x * x); // xx <= 2^62
-		u64 yy = (u64)(y * y);
+		i32 x = X.GetInternalValue(); // abs(x) <= 2^31
+		i32 y = Y.GetInternalValue();
+		u64 xx = (u64)FIXED_MUL_I64_I32_I32(x, x); // xx <= 2^62
+		u64 yy = (u64)FIXED_MUL_I64_I32_I32(y, y);
 		u64 d2 = xx + yy; // d2 <= 2^63 (no overflow)

-		i64 c = (i64)cmp.GetInternalValue();
-		u64 c2 = (u64)(c * c);
+		i32 c = cmp.GetInternalValue();
+		u64 c2 = (u64)FIXED_MUL_I64_I32_I32(c, c);
 		if (d2 < c2)
 			return -1;
 		else if (d2 > c2)
@ -140,13 +140,13 @@ public:
 	 */
 	int CompareLength(const CFixedVector2D& other) const
 	{
-		i64 x = (i64)X.GetInternalValue();
-		i64 y = (i64)Y.GetInternalValue();
-		u64 d2 = (u64)(x * x) + (u64)(y * y);
+		i32 x = X.GetInternalValue();
+		i32 y = Y.GetInternalValue();
+		u64 d2 = (u64)FIXED_MUL_I64_I32_I32(x, x) + (u64)FIXED_MUL_I64_I32_I32(y, y);

-		i64 ox = (i64)other.X.GetInternalValue();
-		i64 oy = (i64)other.Y.GetInternalValue();
-		u64 od2 = (u64)(ox * ox) + (u64)(oy * oy);
+		i32 ox = other.X.GetInternalValue();
+		i32 oy = other.Y.GetInternalValue();
+		u64 od2 = (u64)FIXED_MUL_I64_I32_I32(ox, ox) + (u64)FIXED_MUL_I64_I32_I32(oy, oy);

 		if (d2 < od2)
 			return -1;
@ -194,8 +194,8 @@ public:
 	 */
 	fixed Dot(const CFixedVector2D& v)
 	{
-		i64 x = (i64)X.GetInternalValue() * (i64)v.X.GetInternalValue();
-		i64 y = (i64)Y.GetInternalValue() * (i64)v.Y.GetInternalValue();
+		i64 x = FIXED_MUL_I64_I32_I32(X.GetInternalValue(), v.X.GetInternalValue());
+		i64 y = FIXED_MUL_I64_I32_I32(Y.GetInternalValue(), v.Y.GetInternalValue());
 		CheckSignedAdditionOverflow(i64, x, y, L"Overflow in CFixedVector2D::Dot() part 1", L"Underflow in CFixedVector2D::Dot() part 1")
 		i64 sum = x + y;
 		sum >>= fixed::fract_bits;
--- a/source/maths/FixedVector3D.h
+++ b/source/maths/FixedVector3D.h
@ -1,4 +1,4 @@
-/* Copyright (C) 2010 Wildfire Games.
+/* Copyright (C) 2013 Wildfire Games.
 * This file is part of 0 A.D.
 *
 * 0 A.D. is free software: you can redistribute it and/or modify
@ -82,12 +82,12 @@ public:
 	fixed Length() const
 	{
 		// Do intermediate calculations with 64-bit ints to avoid overflows
-		i64 x = (i64)X.GetInternalValue();
-		i64 y = (i64)Y.GetInternalValue();
-		i64 z = (i64)Z.GetInternalValue();
-		u64 xx = (u64)(x * x);
-		u64 yy = (u64)(y * y);
-		u64 zz = (u64)(z * z);
+		i32 x = X.GetInternalValue();
+		i32 y = Y.GetInternalValue();
+		i32 z = Z.GetInternalValue();
+		u64 xx = (u64)FIXED_MUL_I64_I32_I32(x, x);
+		u64 yy = (u64)FIXED_MUL_I64_I32_I32(y, y);
+		u64 zz = (u64)FIXED_MUL_I64_I32_I32(z, z);
 		u64 t = xx + yy;
 		CheckUnsignedAdditionOverflow(t, xx, L"Overflow in CFixedVector3D::Length() part 1")

@ -137,20 +137,20 @@ public:
 	 */
 	CFixedVector3D Cross(const CFixedVector3D& v)
 	{
-		i64 y_vz = (i64)Y.GetInternalValue() * (i64)v.Z.GetInternalValue();
-		i64 z_vy = (i64)Z.GetInternalValue() * (i64)v.Y.GetInternalValue();
+		i64 y_vz = FIXED_MUL_I64_I32_I32(Y.GetInternalValue(), v.Z.GetInternalValue());
+		i64 z_vy = FIXED_MUL_I64_I32_I32(Z.GetInternalValue(), v.Y.GetInternalValue());
 		CheckSignedSubtractionOverflow(i64, y_vz, z_vy, L"Overflow in CFixedVector3D::Cross() part 1", L"Underflow in CFixedVector3D::Cross() part 1")
 		i64 x = y_vz - z_vy;
 		x >>= fixed::fract_bits;

-		i64 z_vx = (i64)Z.GetInternalValue() * (i64)v.X.GetInternalValue();
-		i64 x_vz = (i64)X.GetInternalValue() * (i64)v.Z.GetInternalValue();
+		i64 z_vx = FIXED_MUL_I64_I32_I32(Z.GetInternalValue(), v.X.GetInternalValue());
+		i64 x_vz = FIXED_MUL_I64_I32_I32(X.GetInternalValue(), v.Z.GetInternalValue());
 		CheckSignedSubtractionOverflow(i64, z_vx, x_vz, L"Overflow in CFixedVector3D::Cross() part 2", L"Underflow in CFixedVector3D::Cross() part 2")
 		i64 y = z_vx - x_vz;
 		y >>= fixed::fract_bits;

-		i64 x_vy = (i64)X.GetInternalValue() * (i64)v.Y.GetInternalValue();
-		i64 y_vx = (i64)Y.GetInternalValue() * (i64)v.X.GetInternalValue();
+		i64 x_vy = FIXED_MUL_I64_I32_I32(X.GetInternalValue(), v.Y.GetInternalValue());
+		i64 y_vx = FIXED_MUL_I64_I32_I32(Y.GetInternalValue(), v.X.GetInternalValue());
 		CheckSignedSubtractionOverflow(i64, x_vy, y_vx, L"Overflow in CFixedVector3D::Cross() part 3", L"Underflow in CFixedVector3D::Cross() part 3")
 		i64 z = x_vy - y_vx;
 		z >>= fixed::fract_bits;
@ -170,9 +170,9 @@ public:
 	 */
 	fixed Dot(const CFixedVector3D& v)
 	{
-		i64 x = (i64)X.GetInternalValue() * (i64)v.X.GetInternalValue();
-		i64 y = (i64)Y.GetInternalValue() * (i64)v.Y.GetInternalValue();
-		i64 z = (i64)Z.GetInternalValue() * (i64)v.Z.GetInternalValue();
+		i64 x = FIXED_MUL_I64_I32_I32(X.GetInternalValue(), v.X.GetInternalValue());
+		i64 y = FIXED_MUL_I64_I32_I32(Y.GetInternalValue(), v.Y.GetInternalValue());
+		i64 z = FIXED_MUL_I64_I32_I32(Z.GetInternalValue(), v.Z.GetInternalValue());
 		CheckSignedAdditionOverflow(i64, x, y, L"Overflow in CFixedVector3D::Dot() part 1", L"Underflow in CFixedVector3D::Dot() part 1")
 		i64 t = x + y;

--- a/source/simulation2/components/CCmpRangeManager.cpp
+++ b/source/simulation2/components/CCmpRangeManager.cpp
@ -103,18 +103,17 @@ static u32 CalcSharedLosMask(std::vector<player_id_t> players)
 */
 static bool InParabolicRange(CFixedVector3D v, fixed range) 
 {
-	i64 x = (i64)v.X.GetInternalValue(); // abs(x) <= 2^31
-	i64 z = (i64)v.Z.GetInternalValue();
-	i64 xx = (x * x); // xx <= 2^62
-	i64 zz = (z * z);
+	i32 x = v.X.GetInternalValue(); // abs(x) <= 2^31
+	i32 z = v.Z.GetInternalValue();
+	u64 xx = (u64)FIXED_MUL_I64_I32_I32(x, x); // xx <= 2^62
+	u64 zz = (u64)FIXED_MUL_I64_I32_I32(z, z);
 	i64 d2 = (xx + zz) >> 1; // d2 <= 2^62 (no overflow)
 	
-	i64 y = (i64)v.Y.GetInternalValue();
+	i32 y = v.Y.GetInternalValue();
+	i32 c = range.GetInternalValue();
+	i32 c_2 = c >> 1;

-	i64 c = (i64)range.GetInternalValue();
-	i64 c_2 = c >> 1; 
-
-	i64 c2 = (c_2-y)*c;
+	i64 c2 = FIXED_MUL_I64_I32_I32(c_2 - y, c);

 	if (d2 <= c2)
 		return true;