//  BlockyFroggy
//  Copyright © 2017 John Ryland.
//  All rights reserved.
#include "Maths.h"


namespace details
{

  inline float e(const float *m, int j, int i)
  {
    return m[(j % 4) * 4 + (i % 4)];
  }

  inline float invf(int i, int j, const float *m)
  {
    int o = 2 + (j - i);
    i += 4 + o;
    j += 4 - o;
    float inv =
      + e(m, j-1, i+1) * e(m, j+0, i+0) * e(m, j+1, i-1)
      + e(m, j+1, i+1) * e(m, j-1, i+0) * e(m, j+0, i-1)
      + e(m, j-1, i-1) * e(m, j+0, i+1) * e(m, j+1, i+0)
      - e(m, j-1, i-1) * e(m, j+0, i+0) * e(m, j+1, i+1)
      - e(m, j+1, i-1) * e(m, j-1, i+0) * e(m, j+0, i+1)
      - e(m, j-1, i+1) * e(m, j+0, i-1) * e(m, j+1, i+0);
    return (o % 2) ? inv : -inv;
  }

  bool inverseMatrix4x4(float *restrict out, const float *restrict m)
  {
    float inv[16];
    for (int i = 0; i < 4; i++)
      for (int j = 0; j < 4; j++)
        inv[j*4+i] = invf(i,j,m);
    double determinant = 0.0;
    for (int k = 0; k < 4; k++)
      determinant += m[k] * inv[k*4];
    if (determinant == 0.0)
      return false;
    float invdet = float(1.0 / determinant);
    for (int i = 0; i < 16; i++)
      out[i] = inv[i] * invdet;
    return true;
  }

  bool matrix4x4ToNormalMatrix3x3(float *restrict out, const float *restrict m)
  {
    double determinant = 
             e(m,0,0) * ( e(m,1,1)*e(m,2,2) - e(m,2,1)*e(m,1,2) )
           - e(m,0,1) * ( e(m,1,0)*e(m,2,2) - e(m,1,2)*e(m,2,0) )
           + e(m,0,2) * ( e(m,1,0)*e(m,2,1) - e(m,1,1)*e(m,2,0) );
    if (determinant == 0)
      return false;
    float invdet = float(1.0 / determinant);
    out[0] =  (e(m,1,1)*e(m,2,2)-e(m,2,1)*e(m,1,2))*invdet;
    out[1] = -(e(m,0,1)*e(m,2,2)-e(m,0,2)*e(m,2,1))*invdet;
    out[2] =  (e(m,0,1)*e(m,1,2)-e(m,0,2)*e(m,1,1))*invdet;
    out[3] = -(e(m,1,0)*e(m,2,2)-e(m,1,2)*e(m,2,0))*invdet;
    out[4] =  (e(m,0,0)*e(m,2,2)-e(m,0,2)*e(m,2,0))*invdet;
    out[5] = -(e(m,0,0)*e(m,1,2)-e(m,1,0)*e(m,0,2))*invdet;
    out[6] =  (e(m,1,0)*e(m,2,1)-e(m,2,0)*e(m,1,1))*invdet;
    out[7] = -(e(m,0,0)*e(m,2,1)-e(m,2,0)*e(m,0,1))*invdet;
    out[8] =  (e(m,0,0)*e(m,1,1)-e(m,1,0)*e(m,0,1))*invdet;
    return true;
  }

  void setMatrix4x4(float* out, 
      float m11, float m12, float m13, float m14,
      float m21, float m22, float m23, float m24,
      float m31, float m32, float m33, float m34,
      float m41, float m42, float m43, float m44)
  {
    out[0]  = m11; out[1]  = m12; out[2]  = m13; out[3]  = m14;
    out[4]  = m21; out[5]  = m22; out[6]  = m23; out[7]  = m24;
    out[8]  = m31; out[9]  = m32; out[10] = m33; out[11] = m34;
    out[12] = m41; out[13] = m42; out[14] = m43; out[15] = m44;
  }

}


namespace Math
{

  float dotProduct(const float *v1, const float *v2, int stride1, int stride2, int count)
  {
    float result = 0;
    for (int i = 0; i < count; i++)
      result += v1[i * stride1] * v2[i * stride2];
    return result;
  }

  void crossProduct(float *restrict out, const float *restrict v1, const float *restrict v2)
  {
    for (int x = 0; x < 3; x++)
      out[x] = v1[(x+1)%3] * v2[(x+2)%3] - v1[(x+2)%3] * v2[(x+1)%3];
  }

  void quatProduct(float *restrict out, const float *restrict v1, const float *restrict v2)
  {
    // assuming quat of form  a*i + b*j + c*k + d
    out[0] = v1[3]*v2[0] + v1[1]*v2[2] - v1[2]*v2[1] + v1[0]*v2[3];
    out[1] = v1[3]*v2[1] + v1[1]*v2[3] + v1[2]*v2[0] - v1[0]*v2[2];
    out[2] = v1[3]*v2[2] - v1[1]*v2[0] + v1[2]*v2[3] + v1[0]*v2[1];
    out[3] = v1[3]*v2[3] - v1[1]*v2[1] - v1[2]*v2[2] - v1[0]*v2[0];
  }

  void quatProductB(float *restrict out, const float *restrict v1, const float *restrict v2)
  {
    // assuming quat of form  a + bi + cj + dk  where  order is  a,b,c,d   not   b,c,d, a
    out[0] = v1[0]*v2[0] - v1[1]*v2[1] - v1[2]*v2[2] - v1[3]*v2[3];
    out[1] = v1[0]*v2[1] + v1[1]*v2[0] + v1[2]*v2[3] - v1[3]*v2[2];
    out[2] = v1[0]*v2[2] - v1[1]*v2[3] + v1[2]*v2[0] + v1[3]*v2[1];
    out[3] = v1[0]*v2[3] + v1[1]*v2[2] - v1[2]*v2[1] + v1[3]*v2[0];
  }

  void transformVector(float *restrict vout, const float *restrict m, const float *restrict vin)
  {
    for (int i = 0; i < 4; i++)
      vout[i] = dotProduct(m+i, vin, 4);
  }

  void normalizeVector(float *vInOut, int count)
  {
    float invMag = 1.0f / sqrt(dotProduct(vInOut, vInOut, 1, 1, 3));
    for (int i = 0; i < count; i++)
      vInOut[i] *= invMag;
  }

  void makeIdentityMatrix4x4(float* a_out)
  {
    for (int j = 0; j < 16; j++)
      a_out[j] = 0;
    a_out[0] = a_out[5] = a_out[10] = a_out[15] = 1;
  }

  void copyMatrix4x4(float *restrict a_out, const float *restrict m)
  {
    for (int j = 0; j < 16; j++)
      a_out[j] = m[j];
  }

  void makePerspectiveMatrix4x4(float* a_out, float a_fov, float a_aspect, float a_near, float a_far)
  {
    float ctan = 1.0f / tanf(a_fov / 2.0f);
    float inlf = 1.0f / (a_near - a_far);
    details::setMatrix4x4(a_out, ctan / a_aspect, 0.0f, 0.0f, 0.0f, 0.0f, ctan, 0.0f, 0.0f,
        0.0f, 0.0f, (a_far + a_near) * inlf, -1.0f, 0.0f, 0.0f, (2.0f * a_far * a_near) * inlf, 0.0f);
  }

  void makeOrthographicMatrix4x4(float* a_out, float a_left, float a_right, float a_bottom, float a_top, float a_near, float a_far)
  {
    float ral = a_right + a_left;
    float tab = a_top + a_bottom;
    float fan = a_far + a_near;
    float rsl = 1.0f / (a_right - a_left);
    float tsb = 1.0f / (a_top - a_bottom);
    float fsn = 1.0f / (a_far - a_near);
    details::setMatrix4x4(a_out, 2.0f * rsl, 0.0f, 0.0f, 0.0f, 0.0f, 2.0f * tsb, 0.0f, 0.0f,
        0.0f, 0.0f, -2.0f * fsn, 0.0f, -ral * rsl, -tab * tsb, -fan * fsn, 1.0f);
  }

  void eulerToQuat(float *restrict out, float heading, float attitude, float bank)
  {
		// Assuming the angles are in radians.
		float c1 = cos(heading/2);
		float s1 = sin(heading/2);
		float c2 = cos(attitude/2);
		float s2 = sin(attitude/2);
		float c3 = cos(bank/2);
		float s3 = sin(bank/2);
		float c1c2 = c1*c2;
		float s1s2 = s1*s2;
		float c1s2 = c1*s2;
		float s1c2 = s1*c2;
    // assuming quat of form  a*i + b*j + c*k + d
		out[0] = c1c2*s3 + s1s2*c3; // x / a
		out[1] = s1c2*c3 + c1s2*s3; // y / b
		out[2] = c1s2*c3 - s1c2*s3; // z / c
		out[3] = c1c2*c3 - s1s2*s3; // w / d
	}

  void quatToMatrix4x4(float *restrict out, const float *restrict q)
  {
    float x[10] = { q[0]*q[0], q[1]*q[1], q[2]*q[2], q[3]*q[3], q[0]*q[1],
                    q[2]*q[3], q[0]*q[2], q[1]*q[3], q[1]*q[2], q[0]*q[3] };
    float invs = 1.0f / (x[0] + x[1] + x[2] + x[3]); // inverse square length only required if quaternion not normalised
    details::setMatrix4x4(out,
      (x[0] - x[1] - x[2] + x[3]) * invs, 2.0f * (x[4] - x[5]) * invs, 2.0f * (x[6] + x[7]) * invs, 0.0f,
      2.0f * (x[4] + x[5]) * invs, (-x[0] + x[1] - x[2] + x[3]) * invs, 2.0f * (x[8] - x[9]) * invs, 0.0f,
      2.0f * (x[6] - x[7]) * invs, 2.0f * (x[8] + x[9]) * invs, (-x[0] - x[1] + x[2] + x[3]) * invs, 0.0f,
      0.0f, 0.0f, 0.0f, 1.0f);
  }

  void translationRotationScaleToMatrix4x4(float *restrict out, const float *restrict translate, const float *restrict rot, float uniformScale)
  {
    makeIdentityMatrix4x4(out);
    translateMatrix4x4(out, translate);
    rotateMatrix4x4(out, rot);
    scaleMatrix4x4(out, uniformScale, uniformScale, uniformScale);
  }

  void translationQuatScaleToMatrix4x4(float *restrict out, const float *restrict translate, const float *restrict quat, float uniformScale)
  {
    float tmp[16];
    float rot[16];
    quatToMatrix4x4(rot, quat);
    makeIdentityMatrix4x4(tmp);
    translateMatrix4x4(tmp, translate);
    multiplyMatrix4x4(out, tmp, rot);
    scaleMatrix4x4(out, uniformScale, uniformScale, uniformScale);
  }

  void translateMatrix4x4(float *restrict a_inOut, const float *restrict vec3)
  {
    // similarities to transformVector
    for (int i = 0; i < 4; i++)
      a_inOut[12+i] += dotProduct(a_inOut+i, vec3, 4, 1, 3);
  }

  void translateMatrix4x4(float* a_inOut, float x, float y, float z)
  {
    float xyz[3] = { x, y, z };
    translateMatrix4x4(a_inOut, xyz);
  }

  void scaleMatrix4x4(float *restrict a_inOut, const float *restrict vec3)
  {
    for (int j = 0; j < 3; j++)
      for (int i = 0; i < 4; i++)
        a_inOut[j*4+i] *= vec3[j];
  }

  void scaleMatrix4x4(float* a_inOut, float x, float y, float z)
  {
    float xyz[3] = { x, y, z };
    scaleMatrix4x4(a_inOut, xyz);
  }

  void rotateMatrix4x4(float *restrict inOut, const float *restrict vec3)
  {
    // TODO: for efficiency, perhaps convert vec3 angles to quat, and then convert quat to matrix
    rotateAxisMatrix4x4(inOut, degreesToRadians(vec3[0]), 1.0f, 0.0f, 0.0f);
    rotateAxisMatrix4x4(inOut, degreesToRadians(vec3[1]), 0.0f, 1.0f, 0.0f);
    rotateAxisMatrix4x4(inOut, degreesToRadians(vec3[2]), 0.0f, 0.0f, 1.0f);
  }

  void rotateMatrix4x4(float* inOut, float x, float y, float z)
  {
    float xyz[3] = { x, y, z };
    rotateMatrix4x4(inOut, xyz);
  }

  void rotateAxisMatrix4x4(float *restrict a_inOut, float radians, const float *restrict vec3)
  {
    float v[3] = { vec3[0], vec3[1], vec3[2] };
    normalizeVector(v, 3);
    float cos = cosf(radians);
    float cosp = 1.0f - cos;
    float sin = sinf(radians);
    float rotm[16];
    details::setMatrix4x4(rotm,
       cos + cosp * v[0] * v[0],         cosp * v[0] * v[1] + v[2] * sin,   cosp * v[0] * v[2] - v[1] * sin,    0.0f,
       cosp * v[0] * v[1] - v[2] * sin,  cos + cosp * v[1] * v[1],          cosp * v[1] * v[2] + v[0] * sin,    0.0f,
       cosp * v[0] * v[2] + v[1] * sin,  cosp * v[1] * v[2] - v[0] * sin,   cos + cosp * v[2] * v[2],           0.0f,
       0.0f, 0.0f, 0.0f, 1.0f);
    float m[16];
    copyMatrix4x4(m, a_inOut);
    multiplyMatrix4x4(a_inOut, m, rotm); // is it possible to do the multiply in-place?
  }

  void rotateAxisMatrix4x4(float* a_inOut, float radians, float x, float y, float z)
  {
    float xyz[3] = { x, y, z };
    rotateAxisMatrix4x4(a_inOut, radians, xyz);
  }

  void quatRotateMatrix4x4(float *restrict a_inOut, const float *restrict a_quat)
  {
    float rotm[16];
    float m[16];
    quatToMatrix4x4(rotm, a_quat);
    copyMatrix4x4(m, a_inOut);
    multiplyMatrix4x4(a_inOut, m, rotm);
  }

  // Helpers
  void multiplyMatrix4x4(float *restrict out, const float *restrict m1, const float *restrict m2)
  {
    for (int j = 0; j < 4; j++)
      for (int k = 0; k < 4; k++)
        out[j+k*4] = dotProduct(m1+j, m2+k*4, 4);
  }

  bool matrix4x4ToNormalMatrix3x3(float *restrict out, const float *restrict m)
  {
    return details::matrix4x4ToNormalMatrix3x3(out, m);
  }

  bool inverseMatrix4x4(float *restrict out, const float *restrict m)
  {
    return details::inverseMatrix4x4(out, m);
  }

  void transposeMatrix4x4(float *restrict out, const float *restrict m)
  {
    details::setMatrix4x4(out, m[15], m[14], m[13], m[12], m[11], m[10],
       m[9],  m[8], m[7],  m[6],  m[5],  m[4], m[3],  m[2],  m[1],  m[0]);
  }
  
  float degreesToRadians(float degrees)
  {
    const float deg2rad = 0.01745329251f; // pi / 180
    return degrees * deg2rad;
  }
  
  float radiansToDegrees(float radians)
  {
    const float rad2deg = 57.2957795131f; // 180 / pi
    return radians * rad2deg;
  }
  
  float phi()
  {
    return (1.0f + sqrt(5.0f)) * 0.5f;
  }

  void makeCameraMatrix(float *restrict out, float fov, float aspect, const float restrict translate[3], const float restrict rotate[3], float scale, bool ortho)
  {
    float projectionMatrix[16];
    float modelViewMatrix[16];
    if (ortho)
      makeOrthographicMatrix4x4(projectionMatrix, -2.0f, 2.0f, -2.0f, 2.0f, 0.0f, 10.0f);
    else
      makePerspectiveMatrix4x4(projectionMatrix, degreesToRadians(fov), aspect, 0.1f, 100.0f);
    translationRotationScaleToMatrix4x4(modelViewMatrix, translate, rotate, scale);
    multiplyMatrix4x4(out, projectionMatrix, modelViewMatrix);
  }

  // This is cheap way to make the clip planes from the mvp without needing to
  // do expensive matrix inversions and other maths. This quite directly does the job.
  // References:
  //      http://gamedevs.org/uploads/fast-extraction-viewing-frustum-planes-from-world-view-projection-matrix.pdf
  //      http://www.lighthouse3d.com/tutorials/view-frustum-culling/clip-space-approach-implementation-details/
  void extractClipPlanes(float restrict clipPlanes[6][4], const float *restrict m)
  {
    for (int i = 0; i < 4; i++)
    {
      clipPlanes[0][i] = m[3+i*4] + m[0+i*4]; // Left clipping plane
      clipPlanes[1][i] = m[3+i*4] - m[0+i*4]; // Right clipping plane
      clipPlanes[2][i] = m[3+i*4] - m[1+i*4]; // Top clipping plane
      clipPlanes[3][i] = m[3+i*4] + m[1+i*4]; // Bottom clipping plane
      clipPlanes[4][i] = m[3+i*4] + m[2+i*4]; // Near clipping plane
      clipPlanes[5][i] = m[3+i*4] - m[2+i*4]; // Far clipping plane
    }
    // Normalize the plane equations, if required
    for (int i = 0; i < 6; i++)
      normalizeVector(clipPlanes[i]);
  }

  // The args need commenting - This comes originally from BlockyFroggy where there are bounding boxes
  // in the form of a x,y,z position, with a w,h,d dimensions (being width, height, depth) of the
  // bounding box.
  bool isClippedBlockyFroggy(const float clipPlanes[6][4], float x, float y, float z, float w, float h, float d)
  {
    //if (x <= w || x >= (1000-w)) // This is blocky froggy specific, it checks if the x value is within the
    //  return true;               // play area (from 0-1000) and takes in to account the width of the object
    // Test specific
    // Find the largest dimension of the object (the width or height or depth -
    // instead could be a bounding sphere with just one value being the diameter of the sphere)
    float maxDist = (w < h) ? h : ((w < d) ? d : w); // could pre-compute this once
    maxDist *= 1.3; // Hack to fix things getting clipped too early on the edges of the screen
    float pnt[4] = { x, y, z, 1.0 }; // Probably should pass this parameter in as a vector
    for (int i = 0; i < 6; i++)
      if (-maxDist > dotProduct(pnt, clipPlanes[i]))
        return true;
    return false;
  }

  // vec4 is in world coordinates, make sure w / vec4[3] is set to 1.0f.
  // a_radius is size of object
  // clipPlanes are planes of the frustum (use extractClipPlanes to find this)
  bool isClipped(const float clipPlanes[6][4], const float *restrict vec4, float a_radius)
  {
    for (int i = 0; i < 6; i++)
        if (-a_radius > dotProduct(vec4, clipPlanes[i]))
            return true;
    return false;
  }

  // this is a helper function for being able to draw the frustum by getting the points to
  // be able to draw lines between them. Probably only useful for debug purposes. For clipping
  // you shouldn't need this, instead use extractClipPlanes and call isClipped to see if a
  // point / object is clipped or not
  void calcFrustum(float restrict frustumPts[8][4], const float *restrict m)
  {
    // Kind of expensive with the matrix inverse, so better if just use for debugging
    // I wonder if an alternative approach could be to extract the clip planes and find
    // the points where combinations of 3 planes intersect which might be a bit cheaper
    float inv[16];
    inverseMatrix4x4(inv, m);
    for (int i = 0; i < 8; i++)
    {
      float w[4] = { (i&1)?1.0f:-1.0f, (i&2)?1.0f:-1.0f, (i&4)?0.95f:-0.94f, 1.0f };
      for (int j = 0; j < 3; j++)
        w[j] *= 1.01f;
      transformVector(frustumPts[i], inv, w);
      for (int j = 0; j < 3; j++)
        frustumPts[i][j] /= frustumPts[i][3];
    }
  }

  bool compactInvertMatrix(const float m[16], float invOut[16])
  {
    float inv[16], det;
    const int idx[16][18] = {
      {  5, 10, 15, 9, 7, 14, 13, 6, 11, 5, 11, 14, 9, 6, 15, 13, 7, 10 },
      {  1, 11, 14, 9, 2, 15, 13, 3, 10, 1, 10, 15, 9, 3, 14, 13, 2, 11 },
      {  1, 6, 15, 5, 3, 14, 13, 2, 7, 1, 7, 14, 5, 2, 15, 13, 3, 6 },
      {  1, 7, 10, 5, 2, 11, 9, 3, 6, 1, 6, 11, 5, 3, 10, 9, 2, 7 },
      {  4, 11, 14, 8, 6, 15, 12, 7, 10, 4, 10, 15, 8, 7, 14, 12, 6, 11 },
      {  0, 10, 15, 8, 3, 14, 12, 2, 11, 0, 11, 14, 8, 2, 15, 12, 3, 10 },
      {  0, 7, 14, 4, 2, 15, 12, 3, 6, 0, 6, 15, 4, 3, 14, 12, 2, 7 },
      {  0, 6, 11, 4, 3, 10, 8, 2, 7, 0, 7, 10, 4, 2, 11, 8, 3, 6 },
      {  4, 9, 15, 8, 7, 13, 12, 5, 11, 4, 11, 13, 8, 5, 15, 12, 7, 9 },
      {  0, 11, 13, 8, 1, 15, 12, 3, 9, 0, 9, 15, 8, 3, 13, 12, 1, 11 },
      {  0, 5, 15, 4, 3, 13, 12, 1, 7, 0, 7, 13, 4, 1, 15, 12, 3, 5 },
      {  0, 7, 9, 4, 1, 11, 8, 3, 5, 0, 5, 11, 4, 3, 9, 8, 1, 7 },
      {  4, 10, 13, 8, 5, 14, 12, 6, 9, 4, 9, 14, 8, 6, 13, 12, 5, 10 },
      {  0, 9, 14, 8, 2, 13, 12, 1, 10, 0, 10, 13, 8, 1, 14, 12, 2, 9 },
      {  0, 6, 13, 4, 1, 14, 12, 2, 5, 0, 5, 14, 4, 2, 13, 12, 1, 6 },
      {  0, 5, 10, 4, 2, 9, 8, 1, 6, 0, 6, 9, 4, 1, 10, 8, 2, 5 }
    };
    for (int i = 0; i < 16; i++)
    {
      inv[i]  = m[idx[i][0*3+0]] * m[idx[i][0*3+1]] * m[idx[i][0*3+2]];
      inv[i] -= m[idx[i][3*3+0]] * m[idx[i][3*3+1]] * m[idx[i][3*3+2]];
      inv[i] += m[idx[i][1*3+0]] * m[idx[i][1*3+1]] * m[idx[i][1*3+2]];
      inv[i] -= m[idx[i][4*3+0]] * m[idx[i][4*3+1]] * m[idx[i][4*3+2]];
      inv[i] += m[idx[i][2*3+0]] * m[idx[i][2*3+1]] * m[idx[i][2*3+2]];
      inv[i] -= m[idx[i][5*3+0]] * m[idx[i][5*3+1]] * m[idx[i][5*3+2]];
    }
    det = m[0] * inv[0] + m[1] * inv[4] + m[2] * inv[8] + m[3] * inv[12];
    if (det == 0)
      return false;
    det = 1.0 / det;
    for (int i = 0; i < 16; i++)
      invOut[i] = inv[i] * det;
    return true;
  }

}


/*

  //
  // Might be worth comparing with optimizations enabled how inverse compares to this if need more speed from it:
  //
  bool gluInvertMatrix(const float m[16], float invOut[16])
  {
    float inv[16], det;
    int i;

    inv[0] = m[5]  * m[10] * m[15] - 
      m[5]  * m[11] * m[14] - 
      m[9]  * m[6]  * m[15] + 
      m[9]  * m[7]  * m[14] +
      m[13] * m[6]  * m[11] - 
      m[13] * m[7]  * m[10];

    inv[4] = -m[4]  * m[10] * m[15] + 
      m[4]  * m[11] * m[14] + 
      m[8]  * m[6]  * m[15] - 
      m[8]  * m[7]  * m[14] - 
      m[12] * m[6]  * m[11] + 
      m[12] * m[7]  * m[10];

    inv[8] = m[4]  * m[9] * m[15] - 
      m[4]  * m[11] * m[13] - 
      m[8]  * m[5] * m[15] + 
      m[8]  * m[7] * m[13] + 
      m[12] * m[5] * m[11] - 
      m[12] * m[7] * m[9];

    inv[12] = -m[4]  * m[9] * m[14] + 
      m[4]  * m[10] * m[13] +
      m[8]  * m[5] * m[14] - 
      m[8]  * m[6] * m[13] - 
      m[12] * m[5] * m[10] + 
      m[12] * m[6] * m[9];

    inv[1] = -m[1]  * m[10] * m[15] + 
      m[1]  * m[11] * m[14] + 
      m[9]  * m[2] * m[15] - 
      m[9]  * m[3] * m[14] - 
      m[13] * m[2] * m[11] + 
      m[13] * m[3] * m[10];

    inv[5] = m[0]  * m[10] * m[15] - 
      m[0]  * m[11] * m[14] - 
      m[8]  * m[2] * m[15] + 
      m[8]  * m[3] * m[14] + 
      m[12] * m[2] * m[11] - 
      m[12] * m[3] * m[10];

    inv[9] = -m[0]  * m[9] * m[15] + 
      m[0]  * m[11] * m[13] + 
      m[8]  * m[1] * m[15] - 
      m[8]  * m[3] * m[13] - 
      m[12] * m[1] * m[11] + 
      m[12] * m[3] * m[9];

    inv[13] = m[0]  * m[9] * m[14] - 
      m[0]  * m[10] * m[13] - 
      m[8]  * m[1] * m[14] + 
      m[8]  * m[2] * m[13] + 
      m[12] * m[1] * m[10] - 
      m[12] * m[2] * m[9];

    inv[2] = m[1]  * m[6] * m[15] - 
      m[1]  * m[7] * m[14] - 
      m[5]  * m[2] * m[15] + 
      m[5]  * m[3] * m[14] + 
      m[13] * m[2] * m[7] - 
      m[13] * m[3] * m[6];

    inv[6] = -m[0]  * m[6] * m[15] + 
      m[0]  * m[7] * m[14] + 
      m[4]  * m[2] * m[15] - 
      m[4]  * m[3] * m[14] - 
      m[12] * m[2] * m[7] + 
      m[12] * m[3] * m[6];

    inv[10] = m[0]  * m[5] * m[15] - 
      m[0]  * m[7] * m[13] - 
      m[4]  * m[1] * m[15] + 
      m[4]  * m[3] * m[13] + 
      m[12] * m[1] * m[7] - 
      m[12] * m[3] * m[5];

    inv[14] = -m[0]  * m[5] * m[14] + 
      m[0]  * m[6] * m[13] + 
      m[4]  * m[1] * m[14] - 
      m[4]  * m[2] * m[13] - 
      m[12] * m[1] * m[6] + 
      m[12] * m[2] * m[5];

    inv[3] = -m[1] * m[6] * m[11] + 
      m[1] * m[7] * m[10] + 
      m[5] * m[2] * m[11] - 
      m[5] * m[3] * m[10] - 
      m[9] * m[2] * m[7] + 
      m[9] * m[3] * m[6];

    inv[7] = m[0] * m[6] * m[11] - 
      m[0] * m[7] * m[10] - 
      m[4] * m[2] * m[11] + 
      m[4] * m[3] * m[10] + 
      m[8] * m[2] * m[7] - 
      m[8] * m[3] * m[6];

    inv[11] = -m[0] * m[5] * m[11] + 
      m[0] * m[7] * m[9] + 
      m[4] * m[1] * m[11] - 
      m[4] * m[3] * m[9] - 
      m[8] * m[1] * m[7] + 
      m[8] * m[3] * m[5];

    inv[15] = m[0] * m[5] * m[10] - 
      m[0] * m[6] * m[9] - 
      m[4] * m[1] * m[10] + 
      m[4] * m[2] * m[9] + 
      m[8] * m[1] * m[6] - 
      m[8] * m[2] * m[5];

    det = m[0] * inv[0] + m[1] * inv[4] + m[2] * inv[8] + m[3] * inv[12];

    if (det == 0)
      return false;

    det = 1.0 / det;

    for (i = 0; i < 16; i++)
      invOut[i] = inv[i] * det;

    return true;
  }
*/