From 3916c25b840ccc66071dfa95b5a52d460a3ceed3 Mon Sep 17 00:00:00 2001
From: Ykkrosh <philip@wildfiregames.com>
Date: Wed, 9 Nov 2011 23:11:28 +0000
Subject: [PATCH] Optimise vertex skinning code with SSE, based on patch by
 gruby.

Fixes #905.

This was SVN commit r10499.
---
 source/graphics/Model.cpp                     |   5 +-
 source/graphics/Model.h                       |   3 +-
 source/graphics/ModelDef.cpp                  | 103 +++++++++++++++++-
 source/graphics/ModelDef.h                    |  15 ++-
 source/ps/GameSetup/GameSetup.cpp             |   2 +
 .../renderer/FixedFunctionModelRenderer.cpp   |  46 ++++++--
 source/renderer/HWLightingModelRenderer.cpp   |  58 +++++++---
 source/renderer/ModelRenderer.cpp             |  21 +++-
 source/renderer/ModelRenderer.h               |  11 +-
 source/renderer/TransparencyRenderer.cpp      |  46 ++++++--
 source/renderer/VertexArray.cpp               |   7 +-
 source/renderer/VertexArray.h                 |  12 +-
 12 files changed, 276 insertions(+), 53 deletions(-)

diff --git a/source/graphics/Model.cpp b/source/graphics/Model.cpp
index 704e2b27ba..aa24a19bb3 100644
--- a/source/graphics/Model.cpp
+++ b/source/graphics/Model.cpp
@@ -32,6 +32,7 @@
 #include "ObjectEntry.h"
 #include "lib/res/graphics/ogl_tex.h"
 #include "lib/res/h_mgr.h"
+#include "lib/sysdep/rtl.h"
 #include "ps/Profile.h"
 
 #include "ps/CLogger.h"
@@ -57,7 +58,7 @@ CModel::~CModel()
 // ReleaseData: delete anything allocated by the model
 void CModel::ReleaseData()
 {
-	delete[] m_BoneMatrices;
+	rtl_FreeAligned(m_BoneMatrices);
 	delete[] m_InverseBindBoneMatrices;
 
 	for (size_t i = 0; i < m_Props.size(); ++i)
@@ -84,7 +85,7 @@ bool CModel::InitModel(const CModelDefPtr& modeldef)
 		size_t numBlends = modeldef->GetNumBlends();
 
 		// allocate matrices for bone transformations
-		m_BoneMatrices = new CMatrix3D[numBones + numBlends];
+		m_BoneMatrices = (CMatrix3D*)rtl_AllocateAligned(sizeof(CMatrix3D) * (numBones + numBlends), 16);
 		for (size_t i = 0; i < numBones + numBlends; ++i)
 		{
 			m_BoneMatrices[i].SetIdentity();
diff --git a/source/graphics/Model.h b/source/graphics/Model.h
index 84efeb65cb..202378f04d 100644
--- a/source/graphics/Model.h
+++ b/source/graphics/Model.h
@@ -154,11 +154,12 @@ public:
 	 */
 	bool IsSkinned() { return (m_BoneMatrices != NULL); }
 
-	// return the models bone matrices
+	// return the model's bone matrices; 16-byte aligned for SSE reads
 	const CMatrix3D* GetAnimatedBoneMatrices() { 
 		ENSURE(m_PositionValid);
 		return m_BoneMatrices;
 	}
+
 	const CMatrix3D* GetInverseBindBoneMatrices() { 
 		return m_InverseBindBoneMatrices;
 	}
diff --git a/source/graphics/ModelDef.cpp b/source/graphics/ModelDef.cpp
index 2afd09cc8c..d528695170 100644
--- a/source/graphics/ModelDef.cpp
+++ b/source/graphics/ModelDef.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010 Wildfire Games.
+/* Copyright (C) 2011 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -26,6 +26,10 @@
 #include "ps/FileIo.h"
 #include "maths/Vector4D.h"
 
+#if ARCH_X86_X64
+# include <xmmintrin.h>
+#endif
+
 CVector3D CModelDef::SkinPoint(const SModelVertex& vtx,
 							   const CMatrix3D newPoseMatrices[])
 {
@@ -91,12 +95,18 @@ void CModelDef::SkinPointsAndNormals(
 		const size_t* blendIndices,
 		const CMatrix3D newPoseMatrices[])
 {
+	// To avoid some performance overhead, get the raw vertex array pointers
+	char* PositionData = Position.GetData();
+	size_t PositionStride = Position.GetStride();
+	char* NormalData = Normal.GetData();
+	size_t NormalStride = Normal.GetStride();
+
 	for (size_t j = 0; j < numVertices; ++j)
 	{
 		const SModelVertex& vtx = vertices[j];
 
-		Position[j] = newPoseMatrices[blendIndices[j]].Transform(vtx.m_Coords);
-		Normal[j] = newPoseMatrices[blendIndices[j]].Rotate(vtx.m_Norm);
+		CVector3D pos = newPoseMatrices[blendIndices[j]].Transform(vtx.m_Coords);
+		CVector3D norm = newPoseMatrices[blendIndices[j]].Rotate(vtx.m_Norm);
 
 		// If there was more than one influence, the result is probably not going
 		// to be of unit length (since it's a weighted sum of several independent
@@ -104,10 +114,95 @@ void CModelDef::SkinPointsAndNormals(
 		// (It's fairly common to only have one influence, so it seems sensible to
 		// optimise that case a bit.)
 		if (vtx.m_Blend.m_Bone[1] != 0xff) // if more than one influence
-			Normal[j].Normalize();
+			norm.Normalize();
+
+		memcpy(PositionData + PositionStride*j, &pos.X, 3*sizeof(float));
+		memcpy(NormalData + NormalStride*j, &norm.X, 3*sizeof(float));
 	}
 }
 
+#if ARCH_X86_X64
+void CModelDef::SkinPointsAndNormals_SSE(
+		size_t numVertices,
+		const VertexArrayIterator<CVector3D>& Position,
+		const VertexArrayIterator<CVector3D>& Normal,
+		const SModelVertex* vertices,
+		const size_t* blendIndices,
+		const CMatrix3D newPoseMatrices[])
+{
+	// Skins every vertex with its blended bone matrix; raw pointers avoid iterator overhead
+	char* PositionData = Position.GetData();
+	size_t PositionStride = Position.GetStride();
+	char* NormalData = Normal.GetData();
+	size_t NormalStride = Normal.GetStride();
+
+	// _mm_load_ps/_mm_store_ps below require 16-byte-aligned addresses
+	ASSERT((intptr_t)newPoseMatrices % 16 == 0);
+	ASSERT((intptr_t)PositionData % 16 == 0);
+	ASSERT((intptr_t)PositionStride % 16 == 0);
+	ASSERT((intptr_t)NormalData % 16 == 0);
+	ASSERT((intptr_t)NormalStride % 16 == 0);
+
+	__m128 col0, col1, col2, col3, vec0, vec1, vec2;
+
+	for (size_t j = 0; j < numVertices; ++j)
+	{
+		const SModelVertex& vtx = vertices[j];
+		const CMatrix3D& mtx = newPoseMatrices[blendIndices[j]];
+
+		// Loads the four groups of 4 floats of the matrix into xmm registers.
+		col0 = _mm_load_ps(mtx._data);
+		col1 = _mm_load_ps(mtx._data + 4);
+		col2 = _mm_load_ps(mtx._data + 8);
+		col3 = _mm_load_ps(mtx._data + 12);
+
+		// Position = col0*X + col1*Y + col2*Z + col3 (all 4 lanes are stored).
+		vec0 = _mm_load1_ps(&vtx.m_Coords.X);
+		vec0 = _mm_mul_ps(col0, vec0);
+		vec1 = _mm_load1_ps(&vtx.m_Coords.Y);
+		vec1 = _mm_mul_ps(col1, vec1);
+		vec0 = _mm_add_ps(vec0, vec1);
+		vec1 = _mm_load1_ps(&vtx.m_Coords.Z);
+		vec1 = _mm_mul_ps(col2, vec1);
+		vec1 = _mm_add_ps(vec1, col3);
+		vec0 = _mm_add_ps(vec0, vec1);
+		_mm_store_ps((float*)(PositionData + PositionStride*j), vec0);
+
+		// Normal = col0*X + col1*Y + col2*Z (col3 is not added for normals).
+		vec0 = _mm_load1_ps(&vtx.m_Norm.X);
+		vec0 = _mm_mul_ps(col0, vec0);
+		vec1 = _mm_load1_ps(&vtx.m_Norm.Y);
+		vec1 = _mm_mul_ps(col1, vec1);
+		vec0 = _mm_add_ps(vec0, vec1);
+		vec1 = _mm_load1_ps(&vtx.m_Norm.Z);
+		vec1 = _mm_mul_ps(col2, vec1);
+		vec0 = _mm_add_ps(vec0, vec1);
+
+		// If there was more than one influence, the result is probably not going
+		// to be of unit length (since it's a weighted sum of several independent
+		// unit vectors), so we need to normalise it.
+		// (It's fairly common to only have one influence, so it seems sensible to
+		// optimise that case a bit.)
+		if (vtx.m_Blend.m_Bone[1] != 0xff) // if more than one influence
+		{
+			// Normalization: lanes 0-2 must each end up holding x*x + y*y + z*z.
+			// vec1 = [x*x, y*y, z*z, ?*?]
+			vec1 = _mm_mul_ps(vec0, vec0);
+			// vec2 = [y*y, z*z, x*x, y*y]  (_MM_SHUFFLE lists lane 3 first, lane 0 last)
+			vec2 = _mm_shuffle_ps(vec1, vec1, _MM_SHUFFLE(1, 0, 2, 1));
+			vec1 = _mm_add_ps(vec1, vec2);
+			// vec2 = [z*z, x*x, y*y, z*z]
+			vec2 = _mm_shuffle_ps(vec2, vec2, _MM_SHUFFLE(1, 0, 2, 1));
+			vec1 = _mm_add_ps(vec1, vec2);
+			// rsqrt(a) ~= 1 / sqrt(a) (hardware approximation, not exact)
+			vec1 = _mm_rsqrt_ps(vec1);
+			vec0 = _mm_mul_ps(vec0, vec1);
+		}
+		_mm_store_ps((float*)(NormalData + NormalStride*j), vec0);
+	}
+}
+#endif
+
 void CModelDef::BlendBoneMatrices(
 		CMatrix3D boneMatrices[])
 {
diff --git a/source/graphics/ModelDef.h b/source/graphics/ModelDef.h
index e7ce494e83..9634277555 100644
--- a/source/graphics/ModelDef.h
+++ b/source/graphics/ModelDef.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010 Wildfire Games.
+/* Copyright (C) 2011 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -179,6 +179,19 @@ public:
 		const size_t* blendIndices,
 		const CMatrix3D newPoseMatrices[]);
 
+#if ARCH_X86_X64
+	/**
+	 * SSE-optimised version of SkinPointsAndNormals.
+	 */
+	static void SkinPointsAndNormals_SSE(
+		size_t numVertices,
+		const VertexArrayIterator<CVector3D>& Position,
+		const VertexArrayIterator<CVector3D>& Normal,
+		const SModelVertex* vertices,
+		const size_t* blendIndices,
+		const CMatrix3D newPoseMatrices[]);
+#endif
+
 	/**
 	 * Blend bone matrices together to fill bone palette.
 	 */
diff --git a/source/ps/GameSetup/GameSetup.cpp b/source/ps/GameSetup/GameSetup.cpp
index 282158f018..e21f82a873 100644
--- a/source/ps/GameSetup/GameSetup.cpp
+++ b/source/ps/GameSetup/GameSetup.cpp
@@ -62,6 +62,7 @@
 
 #include "renderer/Renderer.h"
 #include "renderer/VertexBufferManager.h"
+#include "renderer/ModelRenderer.h"
 
 #include "maths/MathUtil.h"
 
@@ -591,6 +592,7 @@ static void InitRenderer()
 	g_Renderer.SetViewport(vp);
 
 	ColorActivateFastImpl();
+	ModelRenderer::Init();
 }
 
 static void InitSDL()
diff --git a/source/renderer/FixedFunctionModelRenderer.cpp b/source/renderer/FixedFunctionModelRenderer.cpp
index 557b61c37d..f653cf672f 100644
--- a/source/renderer/FixedFunctionModelRenderer.cpp
+++ b/source/renderer/FixedFunctionModelRenderer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009 Wildfire Games.
+/* Copyright (C) 2011 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -21,7 +21,9 @@
 
 #include "precompiled.h"
 
+#include "lib/bits.h"
 #include "lib/ogl.h"
+#include "lib/sysdep/rtl.h"
 #include "maths/Vector3D.h"
 #include "maths/Vector4D.h"
 
@@ -97,8 +99,13 @@ struct FFModel
 
 struct FixedFunctionModelRendererInternals
 {
-	/// Transformed vertex normals - required for recalculating lighting on skinned models
-	std::vector<CVector3D> normals;
+	/**
+	 * Scratch space for normal vector calculation.
+	 * Space is reserved so we don't have to do frequent reallocations.
+	 * Allocated with rtl_AllocateAligned(normalsNumVertices*16, 16) for SSE writes.
+	 */
+	char* normals;
+	size_t normalsNumVertices;
 
 	/// Previously prepared modeldef
 	FFModelDef* ffmodeldef;
@@ -110,10 +117,14 @@ FixedFunctionModelRenderer::FixedFunctionModelRenderer()
 {
 	m = new FixedFunctionModelRendererInternals;
 	m->ffmodeldef = 0;
+	m->normals = 0;
+	m->normalsNumVertices = 0;
 }
 
 FixedFunctionModelRenderer::~FixedFunctionModelRenderer()
 {
+	rtl_FreeAligned(m->normals);
+
 	delete m;
 }
 
@@ -133,17 +144,26 @@ void* FixedFunctionModelRenderer::CreateModelData(CModel* model)
 	// Build the per-model data
 	FFModel* ffmodel = new FFModel;
 
-	ffmodel->m_Position.type = GL_FLOAT;
-	ffmodel->m_Position.elems = 3;
-	ffmodel->m_Array.AddAttribute(&ffmodel->m_Position);
+	// Positions must be 16-byte aligned for SSE writes.
+	// We can pack the color after the position; it will be corrupted by
+	// BuildPositionAndNormals, but that's okay since we'll recompute the
+	// colors afterwards.
 
 	ffmodel->m_Color.type = GL_UNSIGNED_BYTE;
 	ffmodel->m_Color.elems = 4;
 	ffmodel->m_Array.AddAttribute(&ffmodel->m_Color);
 
+	ffmodel->m_Position.type = GL_FLOAT;
+	ffmodel->m_Position.elems = 3;
+	ffmodel->m_Array.AddAttribute(&ffmodel->m_Position);
+
 	ffmodel->m_Array.SetNumVertices(mdef->GetNumVertices());
 	ffmodel->m_Array.Layout();
 
+	// Verify alignment
+	ENSURE(ffmodel->m_Position.offset % 16 == 0);
+	ENSURE(ffmodel->m_Array.GetStride() % 16 == 0);
+
 	return ffmodel;
 }
 
@@ -159,11 +179,19 @@ void FixedFunctionModelRenderer::UpdateModelData(CModel* model, void* data, int
 		size_t numVertices = mdef->GetNumVertices();
 
 		// build vertices
-		if (m->normals.size() < numVertices)
-			m->normals.resize(numVertices);
+
+		// allocate working space for computing normals
+		if (numVertices > m->normalsNumVertices)
+		{
+			rtl_FreeAligned(m->normals);
+
+			size_t newSize = round_up_to_pow2(numVertices);
+			m->normals = (char*)rtl_AllocateAligned(newSize*16, 16);
+			m->normalsNumVertices = newSize;
+		}
 
 		VertexArrayIterator<CVector3D> Position = ffmodel->m_Position.GetIterator<CVector3D>();
-		VertexArrayIterator<CVector3D> Normal = VertexArrayIterator<CVector3D>((char*)&m->normals[0], sizeof(CVector3D));
+		VertexArrayIterator<CVector3D> Normal = VertexArrayIterator<CVector3D>(m->normals, 16);
 
 		ModelRenderer::BuildPositionAndNormals(model, Position, Normal);
 
diff --git a/source/renderer/HWLightingModelRenderer.cpp b/source/renderer/HWLightingModelRenderer.cpp
index 31ab164202..bd3d1adb76 100644
--- a/source/renderer/HWLightingModelRenderer.cpp
+++ b/source/renderer/HWLightingModelRenderer.cpp
@@ -37,13 +37,35 @@ struct ShaderModelDef : public CModelDefRPrivate
 	/// Indices are the same for all models, so share them
 	VertexIndexArray m_IndexArray;
 
+	/// Static per-CModelDef vertex array
+	VertexArray m_Array;
+
+	/// UV coordinates are stored in the static array
+	VertexArray::Attribute m_UV;
+
 	ShaderModelDef(const CModelDefPtr& mdef);
 };
 
 
 ShaderModelDef::ShaderModelDef(const CModelDefPtr& mdef)
-	: m_IndexArray(GL_STATIC_DRAW)
+	: m_IndexArray(GL_STATIC_DRAW), m_Array(GL_STATIC_DRAW)
 {
+	size_t numVertices = mdef->GetNumVertices();
+
+	m_UV.type = GL_FLOAT;
+	m_UV.elems = 2;
+	m_Array.AddAttribute(&m_UV);
+
+	m_Array.SetNumVertices(numVertices);
+	m_Array.Layout();
+
+	VertexArrayIterator<float[2]> UVit = m_UV.GetIterator<float[2]>();
+
+	ModelRenderer::BuildUV(mdef, UVit);
+
+	m_Array.Upload();
+	m_Array.FreeBackingStore();
+
 	m_IndexArray.SetNumVertices(mdef->GetNumFaces()*3);
 	m_IndexArray.Layout();
 	ModelRenderer::BuildIndices(mdef, m_IndexArray.GetIterator());
@@ -61,9 +83,6 @@ struct ShaderModel
 	VertexArray::Attribute m_Position;
 	VertexArray::Attribute m_Normal;
 
-	/// UV is stored per-CModel in order to avoid space wastage due to alignment
-	VertexArray::Attribute m_UV;
-
 	ShaderModel() : m_Array(GL_DYNAMIC_DRAW) { }
 };
 
@@ -103,25 +122,23 @@ void* ShaderModelRenderer::CreateModelData(CModel* model)
 	// Build the per-model data
 	ShaderModel* shadermodel = new ShaderModel;
 
+	// Positions and normals must be 16-byte aligned for SSE writes.
+
 	shadermodel->m_Position.type = GL_FLOAT;
-	shadermodel->m_Position.elems = 3;
+	shadermodel->m_Position.elems = 4;
 	shadermodel->m_Array.AddAttribute(&shadermodel->m_Position);
 
-	shadermodel->m_UV.type = GL_FLOAT;
-	shadermodel->m_UV.elems = 2;
-	shadermodel->m_Array.AddAttribute(&shadermodel->m_UV);
-
 	shadermodel->m_Normal.type = GL_FLOAT;
-	shadermodel->m_Normal.elems = 3;
+	shadermodel->m_Normal.elems = 4;
 	shadermodel->m_Array.AddAttribute(&shadermodel->m_Normal);
 
 	shadermodel->m_Array.SetNumVertices(mdef->GetNumVertices());
 	shadermodel->m_Array.Layout();
 
-	// Fill in static UV coordinates
-	VertexArrayIterator<float[2]> UVit = shadermodel->m_UV.GetIterator<float[2]>();
-
-	ModelRenderer::BuildUV(mdef, UVit);
+	// Verify alignment
+	ENSURE(shadermodel->m_Position.offset % 16 == 0);
+	ENSURE(shadermodel->m_Normal.offset % 16 == 0);
+	ENSURE(shadermodel->m_Array.GetStride() % 16 == 0);
 
 	return shadermodel;
 }
@@ -188,11 +205,19 @@ void ShaderModelRenderer::EndPass(int streamflags)
 
 
 // Prepare UV coordinates for this modeldef
-void ShaderModelRenderer::PrepareModelDef(int UNUSED(streamflags), const CModelDefPtr& def)
+void ShaderModelRenderer::PrepareModelDef(int streamflags, const CModelDefPtr& def)
 {
 	m->shadermodeldef = (ShaderModelDef*)def->GetRenderData(m);
 
 	ENSURE(m->shadermodeldef);
+
+	if (streamflags & STREAM_UV0)
+	{
+		u8* base = m->shadermodeldef->m_Array.Bind();
+		GLsizei stride = (GLsizei)m->shadermodeldef->m_Array.GetStride();
+
+		glTexCoordPointer(2, GL_FLOAT, stride, base + m->shadermodeldef->m_UV.offset);
+	}
 }
 
 
@@ -213,9 +238,6 @@ void ShaderModelRenderer::RenderModel(int streamflags, CModel* model, void* data
 	if (streamflags & STREAM_NORMAL)
 		glNormalPointer(GL_FLOAT, stride, base + shadermodel->m_Normal.offset);
 
-	if (streamflags & STREAM_UV0)
-		glTexCoordPointer(2, GL_FLOAT, stride, base + shadermodel->m_UV.offset);
-
 	// render the lot
 	size_t numFaces = mdldef->GetNumFaces();
 
diff --git a/source/renderer/ModelRenderer.cpp b/source/renderer/ModelRenderer.cpp
index 0b048040a3..9ee9a4371d 100644
--- a/source/renderer/ModelRenderer.cpp
+++ b/source/renderer/ModelRenderer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009 Wildfire Games.
+/* Copyright (C) 2011 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -41,10 +41,23 @@
 
 #include <boost/weak_ptr.hpp>
 
+#if ARCH_X86_X64
+# include "lib/sysdep/arch/x86_x64/x86_x64.h"
+#endif
 
 ///////////////////////////////////////////////////////////////////////////////////////////////
 // ModelRenderer implementation
 
+static bool g_EnableSSE = false;
+
+void ModelRenderer::Init()
+{
+#if ARCH_X86_X64
+	if (x86_x64_cap(X86_X64_CAP_SSE)) // presumably reports whether the CPU supports SSE - confirm in x86_x64.h
+		g_EnableSSE = true; // BuildPositionAndNormals reads this flag to pick SkinPointsAndNormals_SSE
+#endif // when ARCH_X86_X64 is 0 nothing is assigned, so g_EnableSSE keeps its initial value (false)
+}
+
 // Helper function to copy object-space position and normal vectors into arrays.
 void ModelRenderer::CopyPositionAndNormals(
 		const CModelDefPtr& mdef,
@@ -84,8 +97,10 @@ void ModelRenderer::BuildPositionAndNormals(
 			return;
 		}
 
-		CModelDef::SkinPointsAndNormals(numVertices, Position, Normal, vertices, mdef->GetBlendIndices(), model->GetAnimatedBoneMatrices());
-
+		if (!g_EnableSSE) CModelDef::SkinPointsAndNormals(numVertices, Position, Normal, vertices, mdef->GetBlendIndices(), model->GetAnimatedBoneMatrices());
+#if ARCH_X86_X64 // SkinPointsAndNormals_SSE is only declared and defined when ARCH_X86_X64 is set
+		else CModelDef::SkinPointsAndNormals_SSE(numVertices, Position, Normal, vertices, mdef->GetBlendIndices(), model->GetAnimatedBoneMatrices());
+#endif
 	}
 	else
 	{
diff --git a/source/renderer/ModelRenderer.h b/source/renderer/ModelRenderer.h
index 76786edda0..692546b993 100644
--- a/source/renderer/ModelRenderer.h
+++ b/source/renderer/ModelRenderer.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009 Wildfire Games.
+/* Copyright (C) 2011 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -123,6 +123,12 @@ public:
 	ModelRenderer() { }
 	virtual ~ModelRenderer() { }
 
+	/**
+	 * Initialise global settings.
+	 * Should be called before using the class.
+	 */
+	static void Init();
+	
 	/**
 	 * Submit: Submit a model for rendering this frame.
 	 *
@@ -212,7 +218,8 @@ public:
 	 * @param Position Points to the array that will receive
 	 * transformed position vectors. The array behind the iterator
 	 * must be large enough to hold model->GetModelDef()->GetNumVertices()
-	 * vertices.
+	 * vertices. It must allow 16 bytes to be written to each element
+	 * (i.e. provide 4 bytes of padding after each CVector3D).
 	 * @param Normal Points to the array that will receive transformed
 	 * normal vectors. The array behind the iterator must be as large as
 	 * the Position array.
diff --git a/source/renderer/TransparencyRenderer.cpp b/source/renderer/TransparencyRenderer.cpp
index aa0af878ad..7c5c119336 100644
--- a/source/renderer/TransparencyRenderer.cpp
+++ b/source/renderer/TransparencyRenderer.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009 Wildfire Games.
+/* Copyright (C) 2011 Wildfire Games.
  * This file is part of 0 A.D.
  *
  * 0 A.D. is free software: you can redistribute it and/or modify
@@ -25,7 +25,9 @@
 #include <algorithm>
 #include <vector>
 
+#include "lib/bits.h"
 #include "lib/ogl.h"
+#include "lib/sysdep/rtl.h"
 #include "maths/MathUtil.h"
 #include "maths/Vector3D.h"
 #include "maths/Vector4D.h"
@@ -117,17 +119,26 @@ PSModel::PSModel(CModel* model)
 {
 	CModelDefPtr mdef = m_Model->GetModelDef();
 
-	m_Position.type = GL_FLOAT;
-	m_Position.elems = 3;
-	m_Array.AddAttribute(&m_Position);
+	// Positions and normals must be 16-byte aligned for SSE writes.
+	// We can pack the color after the position; it will be corrupted by
+	// BuildPositionAndNormals, but that's okay since we'll recompute the
+	// colors afterwards.
 
 	m_Color.type = GL_UNSIGNED_BYTE;
 	m_Color.elems = 4;
 	m_Array.AddAttribute(&m_Color);
 
+	m_Position.type = GL_FLOAT;
+	m_Position.elems = 3;
+	m_Array.AddAttribute(&m_Position);
+
 	m_Array.SetNumVertices(mdef->GetNumVertices());
 	m_Array.Layout();
 
+	// Verify alignment
+	ENSURE(m_Position.offset % 16 == 0);
+	ENSURE(m_Array.GetStride() % 16 == 0);
+
 	m_Indices = new u16[mdef->GetNumFaces()*3];
 }
 
@@ -194,8 +205,13 @@ float PSModel::BackToFrontIndexSort(const CMatrix3D& worldToCam)
  */
 struct PolygonSortModelRendererInternals
 {
-	/// Scratch space for normal vector calculation
-	std::vector<CVector3D> normals;
+	/**
+	 * Scratch space for normal vector calculation.
+	 * Space is reserved so we don't have to do frequent reallocations.
+	 * Allocated with rtl_AllocateAligned(normalsNumVertices*16, 16) for SSE writes.
+	 */
+	char* normals;
+	size_t normalsNumVertices;
 };
 
 
@@ -203,10 +219,14 @@ struct PolygonSortModelRendererInternals
 PolygonSortModelRenderer::PolygonSortModelRenderer()
 {
 	m = new PolygonSortModelRendererInternals;
+	m->normals = 0;
+	m->normalsNumVertices = 0;
 }
 
 PolygonSortModelRenderer::~PolygonSortModelRenderer()
 {
+	rtl_FreeAligned(m->normals);
+
 	delete m;
 }
 
@@ -237,11 +257,19 @@ void PolygonSortModelRenderer::UpdateModelData(CModel* model, void* data, int up
 		size_t numVertices = mdef->GetNumVertices();
 
 		// build vertices
-		if (m->normals.size() < numVertices)
-			m->normals.resize(numVertices);
+
+		// allocate working space for computing normals
+		if (numVertices > m->normalsNumVertices)
+		{
+			rtl_FreeAligned(m->normals);
+
+			size_t newSize = round_up_to_pow2(numVertices);
+			m->normals = (char*)rtl_AllocateAligned(newSize*16, 16);
+			m->normalsNumVertices = newSize;
+		}
 
 		VertexArrayIterator<CVector3D> Position = psmdl->m_Position.GetIterator<CVector3D>();
-		VertexArrayIterator<CVector3D> Normal = VertexArrayIterator<CVector3D>((char*)&m->normals[0], sizeof(CVector3D));
+		VertexArrayIterator<CVector3D> Normal = VertexArrayIterator<CVector3D>(m->normals, 16);
 
 		ModelRenderer::BuildPositionAndNormals(model, Position, Normal);
 
diff --git a/source/renderer/VertexArray.cpp b/source/renderer/VertexArray.cpp
index bfbf018679..69893dbbc0 100644
--- a/source/renderer/VertexArray.cpp
+++ b/source/renderer/VertexArray.cpp
@@ -19,6 +19,7 @@
 
 #include "lib/alignment.h"
 #include "lib/ogl.h"
+#include "lib/sysdep/rtl.h"
 #include "maths/Vector3D.h"
 #include "maths/Vector4D.h"
 #include "graphics/SColor.h"
@@ -47,7 +48,7 @@ VertexArray::~VertexArray()
 // Free all resources on destruction or when a layout parameter changes
 void VertexArray::Free()
 {
-	delete[] m_BackingStore;
+	rtl_FreeAligned(m_BackingStore);
 	m_BackingStore = 0;
 	
 	if (m_VB)
@@ -214,7 +215,7 @@ void VertexArray::Layout()
 	//debug_printf(L"Stride: %u\n", m_Stride);
 	
 	if (m_Stride)
-		m_BackingStore = new char[m_Stride * m_NumVertices];
+		m_BackingStore = (char*)rtl_AllocateAligned(m_Stride * m_NumVertices, 16);
 }
 
 
@@ -249,7 +250,7 @@ u8* VertexArray::Bind()
 // Free the backing store to save some memory
 void VertexArray::FreeBackingStore()
 {
-	delete[] m_BackingStore;
+	rtl_FreeAligned(m_BackingStore);
 	m_BackingStore = 0;
 }
 
diff --git a/source/renderer/VertexArray.h b/source/renderer/VertexArray.h
index e66496d2d4..7cf6cfaadd 100644
--- a/source/renderer/VertexArray.h
+++ b/source/renderer/VertexArray.h
@@ -103,6 +103,16 @@ public:
 		return tmp;
 	}
 
+	// Accessors for raw buffer data, for performance-critical code
+	char* GetData() const // returns the iterator's m_Data pointer unchanged; no bounds information is attached
+	{
+		return m_Data;
+	}
+	size_t GetStride() const // returns m_Stride; skinning code uses it as the byte step: GetData() + GetStride()*index
+	{
+		return m_Stride;
+	}
+
 private:
 	char* m_Data;
 	size_t m_Stride;
@@ -187,7 +197,7 @@ private:
 
 	CVertexBuffer::VBChunk* m_VB;
 	size_t m_Stride;
-	char* m_BackingStore;
+	char* m_BackingStore; // 16-byte aligned, to allow fast SSE access
 };
 
 /**