Optimise fixed multiplies on MSVC x86

On MSVC, casting i32 to i64 then multiplying gets compiled into a call
to __allmul, which is slow. Use the __emul intrinsic instead.

GCC 4.6+ appears to optimise this case automatically and doesn't need any
special handling.

This reduces the cost of ComputeShortPath by about 50% (testing AI vs AI
on Oasis 01).

This was SVN commit r13873.
This commit is contained in:
Ykkrosh 2013-09-20 20:17:54 +00:00
parent 5a8cfb738b
commit d252e245ef
4 changed files with 55 additions and 48 deletions

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2010 Wildfire Games.
/* Copyright (C) 2013 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -26,7 +26,15 @@ class CStrW;
#ifndef NDEBUG
#define USE_FIXED_OVERFLOW_CHECKS
#endif // NDEBUG
#endif
#if MSC_VERSION
// i32*i32 -> i64 multiply: MSVC x86 doesn't optimise i64 multiplies automatically, so use the intrinsic
#include <intrin.h>
#define FIXED_MUL_I64_I32_I32(a, b) (__emul((a), (b)))
#else
#define FIXED_MUL_I64_I32_I32(a, b) ((i64)(a) * (i64)(b))
#endif
//define overflow macros
#ifndef USE_FIXED_OVERFLOW_CHECKS
@ -281,7 +289,7 @@ public:
*/
CFixed Multiply(CFixed n) const
{
i64 t = (i64)value * (i64)n.value;
i64 t = FIXED_MUL_I64_I32_I32(value, n.value);
t >>= fract_bits;
CheckCastOverflow(t, T, L"Overflow in CFixed::Multiply(CFixed n)", L"Underflow in CFixed::Multiply(CFixed n)")
@ -301,7 +309,7 @@ public:
*/
CFixed MulDiv(CFixed m, CFixed d) const
{
i64 t = ((i64)value * (i64)m.value) / (i64)d.value;
i64 t = FIXED_MUL_I64_I32_I32(value, m.value) / (i64)d.value;
CheckCastOverflow(t, T, L"Overflow in CFixed::Multiply(CFixed n)", L"Underflow in CFixed::Multiply(CFixed n)")
return CFixed((T)t);
}

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2010 Wildfire Games.
/* Copyright (C) 2013 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -95,10 +95,10 @@ public:
fixed Length() const
{
// Do intermediate calculations with 64-bit ints to avoid overflows
i64 x = (i64)X.GetInternalValue();
i64 y = (i64)Y.GetInternalValue();
u64 xx = (u64)(x * x);
u64 yy = (u64)(y * y);
i32 x = X.GetInternalValue();
i32 y = Y.GetInternalValue();
u64 xx = (u64)FIXED_MUL_I64_I32_I32(x, x);
u64 yy = (u64)FIXED_MUL_I64_I32_I32(y, y);
u64 d2 = xx + yy;
CheckUnsignedAdditionOverflow(d2, xx, L"Overflow in CFixedVector2D::Length() part 1")
@ -117,14 +117,14 @@ public:
*/
int CompareLength(fixed cmp) const
{
i64 x = (i64)X.GetInternalValue(); // abs(x) <= 2^31
i64 y = (i64)Y.GetInternalValue();
u64 xx = (u64)(x * x); // xx <= 2^62
u64 yy = (u64)(y * y);
i32 x = X.GetInternalValue(); // abs(x) <= 2^31
i32 y = Y.GetInternalValue();
u64 xx = (u64)FIXED_MUL_I64_I32_I32(x, x); // xx <= 2^62
u64 yy = (u64)FIXED_MUL_I64_I32_I32(y, y);
u64 d2 = xx + yy; // d2 <= 2^63 (no overflow)
i64 c = (i64)cmp.GetInternalValue();
u64 c2 = (u64)(c * c);
i32 c = cmp.GetInternalValue();
u64 c2 = (u64)FIXED_MUL_I64_I32_I32(c, c);
if (d2 < c2)
return -1;
else if (d2 > c2)
@ -140,13 +140,13 @@ public:
*/
int CompareLength(const CFixedVector2D& other) const
{
i64 x = (i64)X.GetInternalValue();
i64 y = (i64)Y.GetInternalValue();
u64 d2 = (u64)(x * x) + (u64)(y * y);
i32 x = X.GetInternalValue();
i32 y = Y.GetInternalValue();
u64 d2 = (u64)FIXED_MUL_I64_I32_I32(x, x) + (u64)FIXED_MUL_I64_I32_I32(y, y);
i64 ox = (i64)other.X.GetInternalValue();
i64 oy = (i64)other.Y.GetInternalValue();
u64 od2 = (u64)(ox * ox) + (u64)(oy * oy);
i32 ox = other.X.GetInternalValue();
i32 oy = other.Y.GetInternalValue();
u64 od2 = (u64)FIXED_MUL_I64_I32_I32(ox, ox) + (u64)FIXED_MUL_I64_I32_I32(oy, oy);
if (d2 < od2)
return -1;
@ -194,8 +194,8 @@ public:
*/
fixed Dot(const CFixedVector2D& v)
{
i64 x = (i64)X.GetInternalValue() * (i64)v.X.GetInternalValue();
i64 y = (i64)Y.GetInternalValue() * (i64)v.Y.GetInternalValue();
i64 x = FIXED_MUL_I64_I32_I32(X.GetInternalValue(), v.X.GetInternalValue());
i64 y = FIXED_MUL_I64_I32_I32(Y.GetInternalValue(), v.Y.GetInternalValue());
CheckSignedAdditionOverflow(i64, x, y, L"Overflow in CFixedVector2D::Dot() part 1", L"Underflow in CFixedVector2D::Dot() part 1")
i64 sum = x + y;
sum >>= fixed::fract_bits;

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2010 Wildfire Games.
/* Copyright (C) 2013 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -82,12 +82,12 @@ public:
fixed Length() const
{
// Do intermediate calculations with 64-bit ints to avoid overflows
i64 x = (i64)X.GetInternalValue();
i64 y = (i64)Y.GetInternalValue();
i64 z = (i64)Z.GetInternalValue();
u64 xx = (u64)(x * x);
u64 yy = (u64)(y * y);
u64 zz = (u64)(z * z);
i32 x = X.GetInternalValue();
i32 y = Y.GetInternalValue();
i32 z = Z.GetInternalValue();
u64 xx = (u64)FIXED_MUL_I64_I32_I32(x, x);
u64 yy = (u64)FIXED_MUL_I64_I32_I32(y, y);
u64 zz = (u64)FIXED_MUL_I64_I32_I32(z, z);
u64 t = xx + yy;
CheckUnsignedAdditionOverflow(t, xx, L"Overflow in CFixedVector3D::Length() part 1")
@ -137,20 +137,20 @@ public:
*/
CFixedVector3D Cross(const CFixedVector3D& v)
{
i64 y_vz = (i64)Y.GetInternalValue() * (i64)v.Z.GetInternalValue();
i64 z_vy = (i64)Z.GetInternalValue() * (i64)v.Y.GetInternalValue();
i64 y_vz = FIXED_MUL_I64_I32_I32(Y.GetInternalValue(), v.Z.GetInternalValue());
i64 z_vy = FIXED_MUL_I64_I32_I32(Z.GetInternalValue(), v.Y.GetInternalValue());
CheckSignedSubtractionOverflow(i64, y_vz, z_vy, L"Overflow in CFixedVector3D::Cross() part 1", L"Underflow in CFixedVector3D::Cross() part 1")
i64 x = y_vz - z_vy;
x >>= fixed::fract_bits;
i64 z_vx = (i64)Z.GetInternalValue() * (i64)v.X.GetInternalValue();
i64 x_vz = (i64)X.GetInternalValue() * (i64)v.Z.GetInternalValue();
i64 z_vx = FIXED_MUL_I64_I32_I32(Z.GetInternalValue(), v.X.GetInternalValue());
i64 x_vz = FIXED_MUL_I64_I32_I32(X.GetInternalValue(), v.Z.GetInternalValue());
CheckSignedSubtractionOverflow(i64, z_vx, x_vz, L"Overflow in CFixedVector3D::Cross() part 2", L"Underflow in CFixedVector3D::Cross() part 2")
i64 y = z_vx - x_vz;
y >>= fixed::fract_bits;
i64 x_vy = (i64)X.GetInternalValue() * (i64)v.Y.GetInternalValue();
i64 y_vx = (i64)Y.GetInternalValue() * (i64)v.X.GetInternalValue();
i64 x_vy = FIXED_MUL_I64_I32_I32(X.GetInternalValue(), v.Y.GetInternalValue());
i64 y_vx = FIXED_MUL_I64_I32_I32(Y.GetInternalValue(), v.X.GetInternalValue());
CheckSignedSubtractionOverflow(i64, x_vy, y_vx, L"Overflow in CFixedVector3D::Cross() part 3", L"Underflow in CFixedVector3D::Cross() part 3")
i64 z = x_vy - y_vx;
z >>= fixed::fract_bits;
@ -170,9 +170,9 @@ public:
*/
fixed Dot(const CFixedVector3D& v)
{
i64 x = (i64)X.GetInternalValue() * (i64)v.X.GetInternalValue();
i64 y = (i64)Y.GetInternalValue() * (i64)v.Y.GetInternalValue();
i64 z = (i64)Z.GetInternalValue() * (i64)v.Z.GetInternalValue();
i64 x = FIXED_MUL_I64_I32_I32(X.GetInternalValue(), v.X.GetInternalValue());
i64 y = FIXED_MUL_I64_I32_I32(Y.GetInternalValue(), v.Y.GetInternalValue());
i64 z = FIXED_MUL_I64_I32_I32(Z.GetInternalValue(), v.Z.GetInternalValue());
CheckSignedAdditionOverflow(i64, x, y, L"Overflow in CFixedVector3D::Dot() part 1", L"Underflow in CFixedVector3D::Dot() part 1")
i64 t = x + y;

View File

@ -103,18 +103,17 @@ static u32 CalcSharedLosMask(std::vector<player_id_t> players)
*/
static bool InParabolicRange(CFixedVector3D v, fixed range)
{
i64 x = (i64)v.X.GetInternalValue(); // abs(x) <= 2^31
i64 z = (i64)v.Z.GetInternalValue();
i64 xx = (x * x); // xx <= 2^62
i64 zz = (z * z);
i32 x = v.X.GetInternalValue(); // abs(x) <= 2^31
i32 z = v.Z.GetInternalValue();
u64 xx = (u64)FIXED_MUL_I64_I32_I32(x, x); // xx <= 2^62
u64 zz = (u64)FIXED_MUL_I64_I32_I32(z, z);
i64 d2 = (xx + zz) >> 1; // d2 <= 2^62 (no overflow)
i64 y = (i64)v.Y.GetInternalValue();
i32 y = v.Y.GetInternalValue();
i32 c = range.GetInternalValue();
i32 c_2 = c >> 1;
i64 c = (i64)range.GetInternalValue();
i64 c_2 = c >> 1;
i64 c2 = (c_2-y)*c;
i64 c2 = FIXED_MUL_I64_I32_I32(c_2 - y, c);
if (d2 <= c2)
return true;