//	GLMatrix.h - A GLfloat matrix used to set up translations etc.
//  ----------------------------------------------------------------------------
//	This file is part of 'NiallsAVLib', base code for any kind of audiovisual
//	apps.
//	Copyright (C) 2012  Niall Moody
//	
//	This program is free software: you can redistribute it and/or modify
//	it under the terms of the GNU General Public License as published by
//	the Free Software Foundation, either version 3 of the License, or
//	(at your option) any later version.
//
//	This program is distributed in the hope that it will be useful,
//	but WITHOUT ANY WARRANTY; without even the implied warranty of
//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//	GNU General Public License for more details.
//
//	You should have received a copy of the GNU General Public License
//	along with this program.  If not, see <http://www.gnu.org/licenses/>.
//	----------------------------------------------------------------------------

#ifndef GLMATRIX_H_
#define GLMATRIX_H_

#include "c0xHeaders.h"
#include "GLHeaders.h"

#include <stack>

///	Flat storage for a single matrix: 16 GLfloat elements.
/*!
	GLMatrix treats these 16 elements as a 4x4 matrix (its identity has ones
	at indices 0, 5, 10 and 15).

	NOTE(review): the unqualified 'array' is presumably brought into scope by
	c0xHeaders.h - confirm there.
 */
typedef array<GLfloat, 16> MatrixArray;

///	A GLfloat matrix used to set up transformations (translate, scale, rotate).
/*!
	Designed to be a more-or-less straight replacement for the fixed function
	matrix methods, so acts as a stack.

	Most of the code for the translate, scale, rotate operations is taken from:
	http://iphonedevelopment.blogspot.co.uk/2009/06/opengl-es-from-ground-up-part-7_04.html
 */
class GLMatrix
{
  public:
	///	Constructor (sets up the identity matrix).
	GLMatrix();
	///	Destructor.
	~GLMatrix();

	///	Applies a translation of (x, y, z) to the matrix.
	void translate(float x, float y, float z);
	///	Applies a scale of (x, y, z) to the matrix.
	void scale(float x, float y, float z);
	///	Applies a rotation to the matrix; angle is given in radians.
	/*!
		NOTE(review): (x, y, z) is presumably the rotation axis - the
		implementation is not in this header, so confirm there.
	 */
	void rotate(float angle, float x, float y, float z);

	///	Pushes the current matrix to the stack.
	void push();
	///	Pops the current matrix from the stack.
	void pop();
	///	Clears the stack.
	void clear();

	///	Returns a read-only pointer to the current matrix's elements.
	/*!
		The pointer refers to this object's own storage (matrix.data()).
	 */
	const GLfloat *getMatrix() const
	{
		return matrix.data();
	}

	///	Generates an orthographic projection matrix.
	void generateOrthoProjection(float left,
								 float right,
								 float bottom,
								 float top);

	///	Helper method. Resets the current matrix to the identity.
	/*!
		Elements 0, 5, 10 and 15 (every fifth index, i.e. the diagonal of a
		4x4 matrix) become one; every other element becomes zero.
	 */
	void identity()
	{
		for(int i=0;i<16;++i)
			matrix[i] = ((i%5) == 0) ? 1.0f : 0.0f;
	}
  private:
	///	Helper method. Multiplies the main matrix by tempMatrix, in place.
	/*!
		The current matrix is first swapped into tempIn, then every element of
		matrix is rebuilt from tempIn and tempMatrix.

		Reading index [(c*4)+r] as row r of column c (column-major), this
		computes matrix = matrix * tempMatrix.

		An SSE (_mm_hadd_ps based) variant was tried here previously; according
		to the original author's note it ran at exactly the same speed as this
		plain scalar version, so the scalar version is the one that is kept.
	 */
	void multiplyWithTemp()
	{
		//The old matrix becomes the left-hand input; matrix is then free to be
		//overwritten with the result.
		matrix.swap(tempIn);

		for(int col=0;col<4;++col)
		{
			const int base = col*4;

			for(int row=0;row<4;++row)
			{
				//One output element: row 'row' of tempIn against column
				//'col' of tempMatrix. Kept as a single expression so the
				//order of the floating point operations does not change.
				matrix[base+row] = tempIn[row] * tempMatrix[base] + tempIn[4+row] * tempMatrix[base+1] + tempIn[8+row] * tempMatrix[base+2] + tempIn[12+row] * tempMatrix[base+3];
			}
		}
	}

	///	The current matrix.
	MatrixArray matrix;
	///	Scratch matrix holding the right-hand operand for multiplyWithTemp().
	MatrixArray tempMatrix;
	///	Scratch matrix holding the previous contents of matrix inside
	///	multiplyWithTemp().
	MatrixArray tempIn;

	///	Our stack of saved matrices.
	/*!
		std::stack apparently sits on top of std::deque by default, which
		allocates its memory in chunks, so the number of allocations should
		stay small.
	 */
	std::stack<MatrixArray> stack;
};

#endif
