Optimise vertex skinning code with SSE, based on patch by gruby.

Fixes #905.

This was SVN commit r10499.
Ykkrosh 2011-11-09 23:11:28 +00:00
parent 9545f783a1
commit 3916c25b84
12 changed files with 276 additions and 53 deletions

View File

@ -32,6 +32,7 @@
#include "ObjectEntry.h"
#include "lib/res/graphics/ogl_tex.h"
#include "lib/res/h_mgr.h"
#include "lib/sysdep/rtl.h"
#include "ps/Profile.h"
#include "ps/CLogger.h"
@ -57,7 +58,7 @@ CModel::~CModel()
// ReleaseData: delete anything allocated by the model
void CModel::ReleaseData()
{
delete[] m_BoneMatrices;
rtl_FreeAligned(m_BoneMatrices);
delete[] m_InverseBindBoneMatrices;
for (size_t i = 0; i < m_Props.size(); ++i)
@ -84,7 +85,7 @@ bool CModel::InitModel(const CModelDefPtr& modeldef)
size_t numBlends = modeldef->GetNumBlends();
// allocate matrices for bone transformations
m_BoneMatrices = new CMatrix3D[numBones + numBlends];
m_BoneMatrices = (CMatrix3D*)rtl_AllocateAligned(sizeof(CMatrix3D) * (numBones + numBlends), 16);
for (size_t i = 0; i < numBones + numBlends; ++i)
{
m_BoneMatrices[i].SetIdentity();

View File

@ -154,11 +154,12 @@ public:
*/
bool IsSkinned() { return (m_BoneMatrices != NULL); }
// return the model's bone matrices
// return the model's bone matrices; 16-byte aligned for SSE reads
const CMatrix3D* GetAnimatedBoneMatrices() {
ENSURE(m_PositionValid);
return m_BoneMatrices;
}
const CMatrix3D* GetInverseBindBoneMatrices() {
return m_InverseBindBoneMatrices;
}

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2010 Wildfire Games.
/* Copyright (C) 2011 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -26,6 +26,10 @@
#include "ps/FileIo.h"
#include "maths/Vector4D.h"
#if ARCH_X86_X64
# include <xmmintrin.h>
#endif
CVector3D CModelDef::SkinPoint(const SModelVertex& vtx,
const CMatrix3D newPoseMatrices[])
{
@ -91,12 +95,18 @@ void CModelDef::SkinPointsAndNormals(
const size_t* blendIndices,
const CMatrix3D newPoseMatrices[])
{
// To avoid some performance overhead, get the raw vertex array pointers
char* PositionData = Position.GetData();
size_t PositionStride = Position.GetStride();
char* NormalData = Normal.GetData();
size_t NormalStride = Normal.GetStride();
for (size_t j = 0; j < numVertices; ++j)
{
const SModelVertex& vtx = vertices[j];
Position[j] = newPoseMatrices[blendIndices[j]].Transform(vtx.m_Coords);
Normal[j] = newPoseMatrices[blendIndices[j]].Rotate(vtx.m_Norm);
CVector3D pos = newPoseMatrices[blendIndices[j]].Transform(vtx.m_Coords);
CVector3D norm = newPoseMatrices[blendIndices[j]].Rotate(vtx.m_Norm);
// If there was more than one influence, the result is probably not going
// to be of unit length (since it's a weighted sum of several independent
@ -104,10 +114,95 @@ void CModelDef::SkinPointsAndNormals(
// (It's fairly common to only have one influence, so it seems sensible to
// optimise that case a bit.)
if (vtx.m_Blend.m_Bone[1] != 0xff) // if more than one influence
Normal[j].Normalize();
norm.Normalize();
memcpy(PositionData + PositionStride*j, &pos.X, 3*sizeof(float));
memcpy(NormalData + NormalStride*j, &norm.X, 3*sizeof(float));
}
}
#if ARCH_X86_X64
void CModelDef::SkinPointsAndNormals_SSE(
size_t numVertices,
const VertexArrayIterator<CVector3D>& Position,
const VertexArrayIterator<CVector3D>& Normal,
const SModelVertex* vertices,
const size_t* blendIndices,
const CMatrix3D newPoseMatrices[])
{
// To avoid some performance overhead, get the raw vertex array pointers
char* PositionData = Position.GetData();
size_t PositionStride = Position.GetStride();
char* NormalData = Normal.GetData();
size_t NormalStride = Normal.GetStride();
// Must be aligned correctly for SSE
ASSERT((intptr_t)newPoseMatrices % 16 == 0);
ASSERT((intptr_t)PositionData % 16 == 0);
ASSERT((intptr_t)PositionStride % 16 == 0);
ASSERT((intptr_t)NormalData % 16 == 0);
ASSERT((intptr_t)NormalStride % 16 == 0);
__m128 col0, col1, col2, col3, vec0, vec1, vec2;
for (size_t j = 0; j < numVertices; ++j)
{
const SModelVertex& vtx = vertices[j];
const CMatrix3D& mtx = newPoseMatrices[blendIndices[j]];
// Loads the four matrix columns into XMM registers.
col0 = _mm_load_ps(mtx._data);
col1 = _mm_load_ps(mtx._data + 4);
col2 = _mm_load_ps(mtx._data + 8);
col3 = _mm_load_ps(mtx._data + 12);
// Loads the vertex position and transforms it, including the translation in col3.
vec0 = _mm_load1_ps(&vtx.m_Coords.X);
vec0 = _mm_mul_ps(col0, vec0);
vec1 = _mm_load1_ps(&vtx.m_Coords.Y);
vec1 = _mm_mul_ps(col1, vec1);
vec0 = _mm_add_ps(vec0, vec1);
vec1 = _mm_load1_ps(&vtx.m_Coords.Z);
vec1 = _mm_mul_ps(col2, vec1);
vec1 = _mm_add_ps(vec1, col3);
vec0 = _mm_add_ps(vec0, vec1);
_mm_store_ps((float*)(PositionData + PositionStride*j), vec0);
// Loads the vertex normal and rotates it (no translation applied).
vec0 = _mm_load1_ps(&vtx.m_Norm.X);
vec0 = _mm_mul_ps(col0, vec0);
vec1 = _mm_load1_ps(&vtx.m_Norm.Y);
vec1 = _mm_mul_ps(col1, vec1);
vec0 = _mm_add_ps(vec0, vec1);
vec1 = _mm_load1_ps(&vtx.m_Norm.Z);
vec1 = _mm_mul_ps(col2, vec1);
vec0 = _mm_add_ps(vec0, vec1);
// If there was more than one influence, the result is probably not going
// to be of unit length (since it's a weighted sum of several independent
// unit vectors), so we need to normalise it.
// (It's fairly common to only have one influence, so it seems sensible to
// optimise that case a bit.)
if (vtx.m_Blend.m_Bone[1] != 0xff) // if more than one influence
{
// Normalization.
// vec1 = [x*x, y*y, z*z, ?*?]
vec1 = _mm_mul_ps(vec0, vec0);
// vec2 = [y*y, z*z, x*x, y*y]
vec2 = _mm_shuffle_ps(vec1, vec1, _MM_SHUFFLE(1, 0, 2, 1));
vec1 = _mm_add_ps(vec1, vec2);
// vec2 = [z*z, x*x, y*y, z*z]
vec2 = _mm_shuffle_ps(vec2, vec2, _MM_SHUFFLE(1, 0, 2, 1));
vec1 = _mm_add_ps(vec1, vec2);
// rsqrt(a) approximates 1 / sqrt(a); the reduced precision is good enough for normals
vec1 = _mm_rsqrt_ps(vec1);
vec0 = _mm_mul_ps(vec0, vec1);
}
_mm_store_ps((float*)(NormalData + NormalStride*j), vec0);
}
}
#endif
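
For reference, here is a scalar model of the shuffle-and-add sequence above, with SSE lanes listed low to high. It is an illustrative sketch only (the variable names and test values are not from the engine): it shows that after the two rotations every X/Y/Z lane holds x*x + y*y + z*z, so a single _mm_rsqrt_ps + _mm_mul_ps normalises all three components at once.

#include <cstdio>

int main()
{
    float x = 1.0f, y = 2.0f, z = 3.0f, w = 0.0f;   // arbitrary test vector
    float v1[4] = { x*x, y*y, z*z, w*w };           // vec1 = vec0 * vec0
    float v2[4] = { v1[1], v1[2], v1[0], v1[1] };   // first shuffle:  [y*y, z*z, x*x, y*y]
    for (int i = 0; i < 4; ++i) v1[i] += v2[i];
    float v3[4] = { v2[1], v2[2], v2[0], v2[1] };   // second shuffle: [z*z, x*x, y*y, z*z]
    for (int i = 0; i < 4; ++i) v1[i] += v3[i];     // lanes 0..2 now hold x*x + y*y + z*z
    printf("%g %g %g %g\n", v1[0], v1[1], v1[2], v1[3]);   // prints: 14 14 14 13
    return 0;
}
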
void CModelDef::BlendBoneMatrices(
CMatrix3D boneMatrices[])
{

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2010 Wildfire Games.
/* Copyright (C) 2011 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -179,6 +179,19 @@ public:
const size_t* blendIndices,
const CMatrix3D newPoseMatrices[]);
#if ARCH_X86_X64
/**
* SSE-optimised version of SkinPointsAndNormals.
*/
static void SkinPointsAndNormals_SSE(
size_t numVertices,
const VertexArrayIterator<CVector3D>& Position,
const VertexArrayIterator<CVector3D>& Normal,
const SModelVertex* vertices,
const size_t* blendIndices,
const CMatrix3D newPoseMatrices[]);
#endif
/**
* Blend bone matrices together to fill bone palette.
*/

View File

@ -62,6 +62,7 @@
#include "renderer/Renderer.h"
#include "renderer/VertexBufferManager.h"
#include "renderer/ModelRenderer.h"
#include "maths/MathUtil.h"
@ -591,6 +592,7 @@ static void InitRenderer()
g_Renderer.SetViewport(vp);
ColorActivateFastImpl();
ModelRenderer::Init();
}
static void InitSDL()

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2009 Wildfire Games.
/* Copyright (C) 2011 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -21,7 +21,9 @@
#include "precompiled.h"
#include "lib/bits.h"
#include "lib/ogl.h"
#include "lib/sysdep/rtl.h"
#include "maths/Vector3D.h"
#include "maths/Vector4D.h"
@ -97,8 +99,13 @@ struct FFModel
struct FixedFunctionModelRendererInternals
{
/// Transformed vertex normals - required for recalculating lighting on skinned models
std::vector<CVector3D> normals;
/**
* Scratch space for normal vector calculation.
* Space is reserved so we don't have to do frequent reallocations.
* Allocated with rtl_AllocateAligned(normalsNumVertices*16, 16) for SSE writes.
*/
char* normals;
size_t normalsNumVertices;
/// Previously prepared modeldef
FFModelDef* ffmodeldef;
@ -110,10 +117,14 @@ FixedFunctionModelRenderer::FixedFunctionModelRenderer()
{
m = new FixedFunctionModelRendererInternals;
m->ffmodeldef = 0;
m->normals = 0;
m->normalsNumVertices = 0;
}
FixedFunctionModelRenderer::~FixedFunctionModelRenderer()
{
rtl_FreeAligned(m->normals);
delete m;
}
@ -133,17 +144,26 @@ void* FixedFunctionModelRenderer::CreateModelData(CModel* model)
// Build the per-model data
FFModel* ffmodel = new FFModel;
ffmodel->m_Position.type = GL_FLOAT;
ffmodel->m_Position.elems = 3;
ffmodel->m_Array.AddAttribute(&ffmodel->m_Position);
// Positions must be 16-byte aligned for SSE writes.
// We can pack the color after the position; it will be corrupted by
// BuildPositionAndNormals, but that's okay since we'll recompute the
// colors afterwards.
ffmodel->m_Color.type = GL_UNSIGNED_BYTE;
ffmodel->m_Color.elems = 4;
ffmodel->m_Array.AddAttribute(&ffmodel->m_Color);
ffmodel->m_Position.type = GL_FLOAT;
ffmodel->m_Position.elems = 3;
ffmodel->m_Array.AddAttribute(&ffmodel->m_Position);
ffmodel->m_Array.SetNumVertices(mdef->GetNumVertices());
ffmodel->m_Array.Layout();
// Verify alignment
ENSURE(ffmodel->m_Position.offset % 16 == 0);
ENSURE(ffmodel->m_Array.GetStride() % 16 == 0);
return ffmodel;
}
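
To make the two ENSUREs above concrete: with a 3-float position and a 4-byte colour packed into a 16-byte stride, each interleaved vertex effectively looks like the struct below. This is purely an illustration of the layout VertexArray::Layout() is expected to produce here, not a type used by the engine.

struct FFVertexSketch
{
    float x, y, z;           // m_Position: 12 bytes, starting at a 16-byte-aligned offset
    unsigned char rgba[4];   // m_Color: 4 bytes packed after the position; the 16-byte
                             // SSE position store clobbers it, and it is recomputed later
};
// sizeof(FFVertexSketch) == 16, matching the 16-byte stride the SSE path requires
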
@ -159,11 +179,19 @@ void FixedFunctionModelRenderer::UpdateModelData(CModel* model, void* data, int
size_t numVertices = mdef->GetNumVertices();
// build vertices
if (m->normals.size() < numVertices)
m->normals.resize(numVertices);
// allocate working space for computing normals
if (numVertices > m->normalsNumVertices)
{
rtl_FreeAligned(m->normals);
size_t newSize = round_up_to_pow2(numVertices);
m->normals = (char*)rtl_AllocateAligned(newSize*16, 16);
m->normalsNumVertices = newSize;
}
VertexArrayIterator<CVector3D> Position = ffmodel->m_Position.GetIterator<CVector3D>();
VertexArrayIterator<CVector3D> Normal = VertexArrayIterator<CVector3D>((char*)&m->normals[0], sizeof(CVector3D));
VertexArrayIterator<CVector3D> Normal = VertexArrayIterator<CVector3D>(m->normals, 16);
ModelRenderer::BuildPositionAndNormals(model, Position, Normal);

View File

@ -37,13 +37,35 @@ struct ShaderModelDef : public CModelDefRPrivate
/// Indices are the same for all models, so share them
VertexIndexArray m_IndexArray;
/// Static per-CModelDef vertex array
VertexArray m_Array;
/// UV coordinates are stored in the static array
VertexArray::Attribute m_UV;
ShaderModelDef(const CModelDefPtr& mdef);
};
ShaderModelDef::ShaderModelDef(const CModelDefPtr& mdef)
: m_IndexArray(GL_STATIC_DRAW)
: m_IndexArray(GL_STATIC_DRAW), m_Array(GL_STATIC_DRAW)
{
size_t numVertices = mdef->GetNumVertices();
m_UV.type = GL_FLOAT;
m_UV.elems = 2;
m_Array.AddAttribute(&m_UV);
m_Array.SetNumVertices(numVertices);
m_Array.Layout();
VertexArrayIterator<float[2]> UVit = m_UV.GetIterator<float[2]>();
ModelRenderer::BuildUV(mdef, UVit);
m_Array.Upload();
m_Array.FreeBackingStore();
m_IndexArray.SetNumVertices(mdef->GetNumFaces()*3);
m_IndexArray.Layout();
ModelRenderer::BuildIndices(mdef, m_IndexArray.GetIterator());
@ -61,9 +83,6 @@ struct ShaderModel
VertexArray::Attribute m_Position;
VertexArray::Attribute m_Normal;
/// UV is stored per-CModel in order to avoid space wastage due to alignment
VertexArray::Attribute m_UV;
ShaderModel() : m_Array(GL_DYNAMIC_DRAW) { }
};
@ -103,25 +122,23 @@ void* ShaderModelRenderer::CreateModelData(CModel* model)
// Build the per-model data
ShaderModel* shadermodel = new ShaderModel;
// Positions and normals must be 16-byte aligned for SSE writes.
shadermodel->m_Position.type = GL_FLOAT;
shadermodel->m_Position.elems = 3;
shadermodel->m_Position.elems = 4;
shadermodel->m_Array.AddAttribute(&shadermodel->m_Position);
shadermodel->m_UV.type = GL_FLOAT;
shadermodel->m_UV.elems = 2;
shadermodel->m_Array.AddAttribute(&shadermodel->m_UV);
shadermodel->m_Normal.type = GL_FLOAT;
shadermodel->m_Normal.elems = 3;
shadermodel->m_Normal.elems = 4;
shadermodel->m_Array.AddAttribute(&shadermodel->m_Normal);
shadermodel->m_Array.SetNumVertices(mdef->GetNumVertices());
shadermodel->m_Array.Layout();
// Fill in static UV coordinates
VertexArrayIterator<float[2]> UVit = shadermodel->m_UV.GetIterator<float[2]>();
ModelRenderer::BuildUV(mdef, UVit);
// Verify alignment
ENSURE(shadermodel->m_Position.offset % 16 == 0);
ENSURE(shadermodel->m_Normal.offset % 16 == 0);
ENSURE(shadermodel->m_Array.GetStride() % 16 == 0);
return shadermodel;
}
@ -188,11 +205,19 @@ void ShaderModelRenderer::EndPass(int streamflags)
// Prepare UV coordinates for this modeldef
void ShaderModelRenderer::PrepareModelDef(int UNUSED(streamflags), const CModelDefPtr& def)
void ShaderModelRenderer::PrepareModelDef(int streamflags, const CModelDefPtr& def)
{
m->shadermodeldef = (ShaderModelDef*)def->GetRenderData(m);
ENSURE(m->shadermodeldef);
if (streamflags & STREAM_UV0)
{
u8* base = m->shadermodeldef->m_Array.Bind();
GLsizei stride = (GLsizei)m->shadermodeldef->m_Array.GetStride();
glTexCoordPointer(2, GL_FLOAT, stride, base + m->shadermodeldef->m_UV.offset);
}
}
@ -213,9 +238,6 @@ void ShaderModelRenderer::RenderModel(int streamflags, CModel* model, void* data
if (streamflags & STREAM_NORMAL)
glNormalPointer(GL_FLOAT, stride, base + shadermodel->m_Normal.offset);
if (streamflags & STREAM_UV0)
glTexCoordPointer(2, GL_FLOAT, stride, base + shadermodel->m_UV.offset);
// render the lot
size_t numFaces = mdldef->GetNumFaces();

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2009 Wildfire Games.
/* Copyright (C) 2011 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -41,10 +41,23 @@
#include <boost/weak_ptr.hpp>
#if ARCH_X86_X64
# include "lib/sysdep/arch/x86_x64/x86_x64.h"
#endif
///////////////////////////////////////////////////////////////////////////////////////////////
// ModelRenderer implementation
static bool g_EnableSSE = false;
void ModelRenderer::Init()
{
#if ARCH_X86_X64
if (x86_x64_cap(X86_X64_CAP_SSE))
g_EnableSSE = true;
#endif
}
// Helper function to copy object-space position and normal vectors into arrays.
void ModelRenderer::CopyPositionAndNormals(
const CModelDefPtr& mdef,
@ -84,8 +97,10 @@ void ModelRenderer::BuildPositionAndNormals(
return;
}
CModelDef::SkinPointsAndNormals(numVertices, Position, Normal, vertices, mdef->GetBlendIndices(), model->GetAnimatedBoneMatrices());
if (g_EnableSSE)
CModelDef::SkinPointsAndNormals_SSE(numVertices, Position, Normal, vertices, mdef->GetBlendIndices(), model->GetAnimatedBoneMatrices());
else
CModelDef::SkinPointsAndNormals(numVertices, Position, Normal, vertices, mdef->GetBlendIndices(), model->GetAnimatedBoneMatrices());
}
else
{

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2009 Wildfire Games.
/* Copyright (C) 2011 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -123,6 +123,12 @@ public:
ModelRenderer() { }
virtual ~ModelRenderer() { }
/**
* Initialise global settings.
* Should be called before using the class.
*/
static void Init();
/**
* Submit: Submit a model for rendering this frame.
*
@ -212,7 +218,8 @@ public:
* @param Position Points to the array that will receive
* transformed position vectors. The array behind the iterator
* must be large enough to hold model->GetModelDef()->GetNumVertices()
* vertices.
* vertices. It must allow 16 bytes to be written to each element
* (i.e. provide 4 bytes of padding after each CVector3D).
* @param Normal Points to the array that will receive transformed
* normal vectors. The array behind the iterator must be as large as
* the Position array.
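
In practice this means a caller with its own scratch storage must give BuildPositionAndNormals an iterator with a 16-byte stride over 16-byte-aligned memory. A minimal sketch of that pattern, mirroring the renderer changes elsewhere in this commit (model and psmdl are assumed caller context, not part of this header):

CModelDefPtr mdef = model->GetModelDef();
size_t numVertices = mdef->GetNumVertices();

// 16 bytes per normal (a CVector3D plus 4 bytes of padding), 16-byte aligned
char* normals = (char*)rtl_AllocateAligned(numVertices * 16, 16);

VertexArrayIterator<CVector3D> Position = psmdl->m_Position.GetIterator<CVector3D>();
VertexArrayIterator<CVector3D> Normal(normals, 16);   // stride 16, not sizeof(CVector3D)

ModelRenderer::BuildPositionAndNormals(model, Position, Normal);

// ... use the transformed positions and normals ...
rtl_FreeAligned(normals);
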

View File

@ -1,4 +1,4 @@
/* Copyright (C) 2009 Wildfire Games.
/* Copyright (C) 2011 Wildfire Games.
* This file is part of 0 A.D.
*
* 0 A.D. is free software: you can redistribute it and/or modify
@ -25,7 +25,9 @@
#include <algorithm>
#include <vector>
#include "lib/bits.h"
#include "lib/ogl.h"
#include "lib/sysdep/rtl.h"
#include "maths/MathUtil.h"
#include "maths/Vector3D.h"
#include "maths/Vector4D.h"
@ -117,17 +119,26 @@ PSModel::PSModel(CModel* model)
{
CModelDefPtr mdef = m_Model->GetModelDef();
m_Position.type = GL_FLOAT;
m_Position.elems = 3;
m_Array.AddAttribute(&m_Position);
// Positions and normals must be 16-byte aligned for SSE writes.
// We can pack the color after the position; it will be corrupted by
// BuildPositionAndNormals, but that's okay since we'll recompute the
// colors afterwards.
m_Color.type = GL_UNSIGNED_BYTE;
m_Color.elems = 4;
m_Array.AddAttribute(&m_Color);
m_Position.type = GL_FLOAT;
m_Position.elems = 3;
m_Array.AddAttribute(&m_Position);
m_Array.SetNumVertices(mdef->GetNumVertices());
m_Array.Layout();
// Verify alignment
ENSURE(m_Position.offset % 16 == 0);
ENSURE(m_Array.GetStride() % 16 == 0);
m_Indices = new u16[mdef->GetNumFaces()*3];
}
@ -194,8 +205,13 @@ float PSModel::BackToFrontIndexSort(const CMatrix3D& worldToCam)
*/
struct PolygonSortModelRendererInternals
{
/// Scratch space for normal vector calculation
std::vector<CVector3D> normals;
/**
* Scratch space for normal vector calculation.
* Space is reserved so we don't have to do frequent reallocations.
* Allocated with rtl_AllocateAligned(normalsNumVertices*16, 16) for SSE writes.
*/
char* normals;
size_t normalsNumVertices;
};
@ -203,10 +219,14 @@ struct PolygonSortModelRendererInternals
PolygonSortModelRenderer::PolygonSortModelRenderer()
{
m = new PolygonSortModelRendererInternals;
m->normals = 0;
m->normalsNumVertices = 0;
}
PolygonSortModelRenderer::~PolygonSortModelRenderer()
{
rtl_FreeAligned(m->normals);
delete m;
}
@ -237,11 +257,19 @@ void PolygonSortModelRenderer::UpdateModelData(CModel* model, void* data, int up
size_t numVertices = mdef->GetNumVertices();
// build vertices
if (m->normals.size() < numVertices)
m->normals.resize(numVertices);
// allocate working space for computing normals
if (numVertices > m->normalsNumVertices)
{
rtl_FreeAligned(m->normals);
size_t newSize = round_up_to_pow2(numVertices);
m->normals = (char*)rtl_AllocateAligned(newSize*16, 16);
m->normalsNumVertices = newSize;
}
VertexArrayIterator<CVector3D> Position = psmdl->m_Position.GetIterator<CVector3D>();
VertexArrayIterator<CVector3D> Normal = VertexArrayIterator<CVector3D>((char*)&m->normals[0], sizeof(CVector3D));
VertexArrayIterator<CVector3D> Normal = VertexArrayIterator<CVector3D>(m->normals, 16);
ModelRenderer::BuildPositionAndNormals(model, Position, Normal);

View File

@ -19,6 +19,7 @@
#include "lib/alignment.h"
#include "lib/ogl.h"
#include "lib/sysdep/rtl.h"
#include "maths/Vector3D.h"
#include "maths/Vector4D.h"
#include "graphics/SColor.h"
@ -47,7 +48,7 @@ VertexArray::~VertexArray()
// Free all resources on destruction or when a layout parameter changes
void VertexArray::Free()
{
delete[] m_BackingStore;
rtl_FreeAligned(m_BackingStore);
m_BackingStore = 0;
if (m_VB)
@ -214,7 +215,7 @@ void VertexArray::Layout()
//debug_printf(L"Stride: %u\n", m_Stride);
if (m_Stride)
m_BackingStore = new char[m_Stride * m_NumVertices];
m_BackingStore = (char*)rtl_AllocateAligned(m_Stride * m_NumVertices, 16);
}
@ -249,7 +250,7 @@ u8* VertexArray::Bind()
// Free the backing store to save some memory
void VertexArray::FreeBackingStore()
{
delete[] m_BackingStore;
rtl_FreeAligned(m_BackingStore);
m_BackingStore = 0;
}

View File

@ -103,6 +103,16 @@ public:
return tmp;
}
// Accessors for raw buffer data, for performance-critical code
char* GetData() const
{
return m_Data;
}
size_t GetStride() const
{
return m_Stride;
}
private:
char* m_Data;
size_t m_Stride;
@ -187,7 +197,7 @@ private:
CVertexBuffer::VBChunk* m_VB;
size_t m_Stride;
char* m_BackingStore;
char* m_BackingStore; // 16-byte aligned, to allow fast SSE access
};
/**