/*---------------------------------------------------------------------------*
  Project: matrix vector Library
  File:    mtx.c

  Copyright 1998-2011 Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.     They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

 *---------------------------------------------------------------------------*/

#include <math.h>
#include <stdio.h>
#include <cafe/mtx.h>
#include "mtxAssert.h"

/*---------------------------------------------------------------------*
    Constants
 *---------------------------------------------------------------------*/
// Paired-single constant pairs, written as { lane 0, lane 1 }.
// The name suffix spells the two lane values (e.g. c01 = {0, 1}).
// Within this part of the file, c00/c01/c10 are stored directly by
// PSMTXIdentity, c00 supplies the zero lane in PSMTXTranspose, and c01 is
// the multiplier PSMTXConcat uses to add the translation term to lane 1 only.
static const f32x2 c00 = {0.0F, 0.0F};
static const f32x2 c01 = {0.0F, 1.0F};
static const f32x2 c10 = {1.0F, 0.0F};
static const f32x2 c11 = {1.0F, 1.0F};
//static const f32x2 c22 = {2.0F, 2.0F};
static const f32x2 c33 = {3.0F, 3.0F};
static const f32x2 c0505 = {0.5F, 0.5F};

/*---------------------------------------------------------------------*


                            GENERAL SECTION


*---------------------------------------------------------------------*/


/*---------------------------------------------------------------------*

Name:           MTXIdentity

Description:    sets a matrix to identity

Arguments:      m :  matrix to be set

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Fills the 3x4 matrix m with the identity: ones on the main diagonal,
// zeros in every other element (including the fourth column).
void C_MTXIdentity ( Mtx m )
{
    u32 row;
    u32 col;

    ASSERTMSG( (m != 0), MTX_IDENTITY_1 );

    for ( row = 0 ; row < 3 ; row++ )
    {
        for ( col = 0 ; col < 4 ; col++ )
        {
            m[row][col] = ( row == col ) ? 1.0f : 0.0f;
        }
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Writes the identity into m using six paired stores (two floats each),
// grouped here by matrix row.  No argument checking is done.
void PSMTXIdentity( Mtx m )
{
    // row 0 : [ 1 0 | 0 0 ]
    __PSQ_STX(m, 0, c10, 0, 0);      // m[0][0], m[0][1]
    __PSQ_STX(m, 8, c00, 0, 0);      // m[0][2], m[0][3]

    // row 1 : [ 0 1 | 0 0 ]
    __PSQ_STX(m, 16, c01, 0, 0);     // m[1][0], m[1][1]
    __PSQ_STX(m, 24, c00, 0, 0);     // m[1][2], m[1][3]

    // row 2 : [ 0 0 | 1 0 ]
    __PSQ_STX(m, 32, c00, 0, 0);     // m[2][0], m[2][1]
    __PSQ_STX(m, 40, c10, 0, 0);     // m[2][2], m[2][3]
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXCopy

Description:    copies the contents of one matrix into another

Arguments:      src        source matrix for copy
                dst        destination matrix for copy

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Copies all twelve elements of src into dst.
// Nothing is written when src and dst are the same matrix.
void C_MTXCopy ( MTX_CONST Mtx src, Mtx dst )
{
    u32 row;
    u32 col;

    ASSERTMSG( (src != 0) , MTX_COPY_1 );
    ASSERTMSG( (dst != 0) , MTX_COPY_2 );

    if( src != dst )
    {
        for ( row = 0 ; row < 3 ; row++ )
        {
            for ( col = 0 ; col < 4 ; col++ )
            {
                dst[row][col] = src[row][col];
            }
        }
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Copies src to dst two floats at a time: each pair is loaded from src and
// immediately stored at the same byte offset in dst.
// No argument checking is done.
void PSMTXCopy(MTX_CONST Mtx src, Mtx dst )
{
    f32x2 pair;

    // [0][0], [0][1]
    pair = __PSQ_L(src, 0, 0);
    __PSQ_ST(dst, pair, 0, 0);

    // [0][2], [0][3]
    pair = __PSQ_LX(src, 8, 0, 0);
    __PSQ_STX(dst, 8, pair, 0, 0);

    // [1][0], [1][1]
    pair = __PSQ_LX(src, 16, 0, 0);
    __PSQ_STX(dst, 16, pair, 0, 0);

    // [1][2], [1][3]
    pair = __PSQ_LX(src, 24, 0, 0);
    __PSQ_STX(dst, 24, pair, 0, 0);

    // [2][0], [2][1]
    pair = __PSQ_LX(src, 32, 0, 0);
    __PSQ_STX(dst, 32, pair, 0, 0);

    // [2][2], [2][3]
    pair = __PSQ_LX(src, 40, 0, 0);
    __PSQ_STX(dst, 40, pair, 0, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXConcat

Description:    concatenates two matrices.
                order of operation is A x B = AB.
                ok for any of ab == a == b.

                saves a MTXCopy operation if ab is neither a nor b.

Arguments:      a        first matrix for concat.
                b        second matrix for concat.
                ab       resultant matrix from concat.

Return:         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Computes ab = a x b for 3x4 matrices (implied fourth row 0,0,0,1).
// The result is built in a local scratch matrix whenever ab is the same
// pointer as a or b, and copied to ab afterwards, so in-place use is safe.
void C_MTXConcat ( MTX_CONST Mtx a, MTX_CONST Mtx b, Mtx ab )
{
    Mtx    scratch;
    MtxPtr out;
    u32    row;

    ASSERTMSG( (a  != 0), MTX_CONCAT_1 );
    ASSERTMSG( (b  != 0), MTX_CONCAT_2 );
    ASSERTMSG( (ab != 0), MTX_CONCAT_3 );

    // write straight into ab unless it aliases one of the inputs
    out = ( (ab != a) && (ab != b) ) ? ab : scratch;

    // each output row is row 'row' of a times the 3x3 part of b;
    // the last column also picks up a's own translation term
    for ( row = 0 ; row < 3 ; row++ )
    {
        out[row][0] = a[row][0]*b[0][0] + a[row][1]*b[1][0] + a[row][2]*b[2][0];
        out[row][1] = a[row][0]*b[0][1] + a[row][1]*b[1][1] + a[row][2]*b[2][1];
        out[row][2] = a[row][0]*b[0][2] + a[row][1]*b[1][2] + a[row][2]*b[2][2];
        out[row][3] = a[row][0]*b[0][3] + a[row][1]*b[1][3] + a[row][2]*b[2][3] + a[row][3];
    }

    // move the scratch result into place when it was needed
    if( out == scratch )
    {
        C_MTXCopy( *((MTX_CONST Mtx *)&scratch), ab );
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single implementation of ab = a x b.
//
// Each f32x2 holds two horizontally adjacent matrix elements; the variable
// names spell out which ones (A00_A01 = a[0][0], a[0][1]; Dxx = result).
// As the per-line comments show, every result pair is built up as
//     D[r][c..c+1] = a[r][0]*B0 + a[r][1]*B1 + a[r][2]*B2
// where B0/B1/B2 are the matching column pairs of rows 0..2 of b, and the
// right-hand pairs (columns 2,3) get one more multiply-add with c01 = {0, 1}
// so that a[r][3] is added to column 3 only.
//
// Loads of a and b are interleaved with the arithmetic, and all of them are
// issued before the first store to ab.  No argument checking is done.
void PSMTXConcat ( MTX_CONST Mtx a, MTX_CONST Mtx b, Mtx ab )
{
    f32x2 A00_A01 = __PSQ_L(a, 0, 0);
    f32x2 A02_A03;
    f32x2 A10_A11;
    f32x2 A12_A13;
    f32x2 A20_A21;
    f32x2 A22_A23;
    f32x2 B00_B01 = __PSQ_L(b, 0, 0);
    f32x2 B02_B03 = __PSQ_LX(b,  8, 0, 0);
    f32x2 B10_B11 = __PSQ_LX(b, 16, 0, 0);
    f32x2 B12_B13;
    f32x2 B20_B21;
    f32x2 B22_B23;

    f32x2 D00_D01;
    f32x2 D02_D03;
    f32x2 D10_D11;
    f32x2 D12_D13;
    f32x2 D20_D21;
    f32x2 D22_D23;

    // D00_D01 = b00a00 , b01a00
    D00_D01 = __PS_MULS0( B00_B01, A00_A01);
    A10_A11 = __PSQ_LX(a, 16, 0, 0);

    // D02_D03 = b02a00 , b03a00
    D02_D03 = __PS_MULS0( B02_B03, A00_A01);

    // D10_D11 = a10b00 , a10b01
    D10_D11 = __PS_MULS0( B00_B01, A10_A11);
    B12_B13 = __PSQ_LX(b, 24, 0, 0);

    // D12_D13 = a10b02 , a10b03
    D12_D13 = __PS_MULS0( B02_B03, A10_A11);
    A02_A03 = __PSQ_LX(a,  8, 0, 0);

    // D00_D01 = b10a01 + b00a00 , b11a01 + b01a00
    D00_D01 = __PS_MADDS1( B10_B11, A00_A01, D00_D01);
    A12_A13 = __PSQ_LX(a, 24, 0, 0);

    // D10_D11 = a10b00 + a11b10 , a10b01 + a11b11
    D10_D11 =  __PS_MADDS1( B10_B11, A10_A11, D10_D11);
    B20_B21 = __PSQ_LX(b, 32, 0, 0);

    // D02_D03 = b12a01 + b02a00 , b13a01 + b03a00
    D02_D03 =  __PS_MADDS1( B12_B13, A00_A01, D02_D03);
    B22_B23 = __PSQ_LX(b, 40, 0, 0);

    // D12_D13 = a10b02 + a11b12, a10b03+a11b13
    D12_D13 =  __PS_MADDS1( B12_B13, A10_A11, D12_D13);

    // last two pairs of a (row 2); every input is now in a local
    A20_A21 = __PSQ_LX(a, 32, 0, 0);
    A22_A23 = __PSQ_LX(a, 40, 0, 0);

    // D00_D01 = b20a02 + b10a01 + b00a00 , b21a02 + b11a01 + b01a00
    D00_D01 =  __PS_MADDS0( B20_B21, A02_A03, D00_D01); // m00, m01 computed

    // D02_D03 = b12a01 + b02a00 + b22a02 , b13a01 + b03a00 + b23a02
    D02_D03 =  __PS_MADDS0( B22_B23, A02_A03, D02_D03);

    // D10_D11 = a10b00 + a11b10 +a12b20, a10b01 + a11b11 + a12b21
    D10_D11 =  __PS_MADDS0( B20_B21, A12_A13, D10_D11); // m10, m11 computed

    // D12_D13 = a10b02 + a11b12 + a12b22, a10b03+a11b13 + a12b23
    // (a13 is still missing here; it is added further down)
    D12_D13 =  __PS_MADDS0( B22_B23, A12_A13, D12_D13);

    // store m00m01
    __PSQ_ST(ab, D00_D01, 0, 0);

    // D20_D21 = a20b00, a20b01
    D20_D21 = __PS_MULS0( B00_B01, A20_A21);

    // add a03 (lane 1 of A02_A03) to lane 1 of D02_D03; c01 = {0, 1}
    // leaves lane 0 unchanged
    D02_D03 =  __PS_MADDS1( c01, A02_A03, D02_D03); // m02, m03 computed

    // D22_D23 = a20b02, a20b03
    D22_D23 = __PS_MULS0( B02_B03, A20_A21);

    // store m10m11
    __PSQ_STX(ab, 16, D10_D11, 0, 0);

    // add a13 (lane 1 of A12_A13) to lane 1 of D12_D13
    D12_D13 =  __PS_MADDS1( c01, A12_A13, D12_D13); // m12, m13 computed

    // store m02m03
    __PSQ_STX(ab, 8, D02_D03, 0, 0);

    // D20_D21 = a20b00 + a21b10, a20b01 + a21b11
    D20_D21 =  __PS_MADDS1( B10_B11, A20_A21, D20_D21);

    // D22_D23 = a20b02 + a21b12, a20b03 + a21b13
    D22_D23 =  __PS_MADDS1( B12_B13, A20_A21, D22_D23);

    // D20_D21 = a20b00 + a21b10 + a22b20, a20b01 + a21b11 + a22b21
    D20_D21 =  __PS_MADDS0( B20_B21, A22_A23, D20_D21);

    // store m12m13
    __PSQ_STX(ab, 24, D12_D13, 0, 0);

    // D22_D23 = a20b02 + a21b12 + a22b22, a20b03 + a21b13 + a22b23
    // (a23 is added by the final multiply-add below)
    D22_D23 =  __PS_MADDS0( B22_B23, A22_A23, D22_D23);

    // store m20m21

    __PSQ_STX(ab, 32, D20_D21, 0, 0);

    // add a23 (lane 1 of A22_A23) to lane 1 of D22_D23
    D22_D23 =  __PS_MADDS1( c01, A22_A23, D22_D23);

    // store m22m23
    __PSQ_STX(ab, 40, D22_D23, 0, 0);

}
#endif

/*---------------------------------------------------------------------*

Name:           MTXConcatArray

Description:    concatenates a matrix to an array of matrices.
                order of operation is A x B(array) = AB(array).

Arguments:      a        first matrix for concat.
                srcBase  array base of second matrix for concat.
                dstBase  array base of resultant matrix from concat.
                count    number of matrices in srcBase, dstBase arrays.

                note:      cannot check for array overflow

Return:         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Concatenates a with each of the count matrices in srcBase, writing the
// i-th product (a x srcBase[i]) to dstBase[i].
// The array lengths cannot be verified here.
void C_MTXConcatArray ( MTX_CONST Mtx a, MTX_CONST Mtx* srcBase, Mtx* dstBase, u32 count )
{
    u32 idx;

    ASSERTMSG( (a       != 0), "MTXConcatArray(): NULL MtxPtr 'a' " );
    ASSERTMSG( (srcBase != 0), "MTXConcatArray(): NULL MtxPtr 'srcBase' " );
    ASSERTMSG( (dstBase != 0), "MTXConcatArray(): NULL MtxPtr 'dstBase' " );
    ASSERTMSG( (count > 1),    "MTXConcatArray(): count must be greater than 1." );

    for ( idx = 0 ; idx < count ; idx++ )
    {
        C_MTXConcat( a, srcBase[idx], dstBase[idx] );
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Concatenates a with each of the count matrices starting at srcBase and
// writes the products to the matching slots starting at dstBase, using
// PSMTXConcat for every element.  No argument checking is done and the
// array lengths cannot be verified.
//
// The loop counter is u32, matching both the type of count and the C
// version, so the comparison is not a mixed signed/unsigned one and the
// counter cannot overflow for large counts.
void PSMTXConcatArray (
    MTX_CONST Mtx  a,
    MTX_CONST Mtx* srcBase,
    Mtx* dstBase,
    u32  count )
{
    u32 i;

    for ( i = 0 ; i < count ; i++ )
    {
        PSMTXConcat(a, *srcBase, *dstBase);

        srcBase++;
        dstBase++;
    }
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXTranspose

Description:    computes the transpose of a matrix.
                As matrices are 3x4, fourth column (translation component) is
                lost and becomes (0,0,0).

                This function is intended for use in computing an
                inverse-transpose matrix to transform normals for lighting.
                In this case, lost translation component doesn't matter.

Arguments:      src       source matrix.
                xPose     destination (transposed) matrix.
                          ok if src == xPose.

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Writes the transpose of the upper 3x3 part of src to xPose and clears the
// fourth column.  When src and xPose are the same matrix the result is built
// in a local scratch matrix and copied back.
void C_MTXTranspose ( MTX_CONST Mtx src, Mtx xPose )
{
    Mtx    scratch;
    MtxPtr out;
    u32    row;

    ASSERTMSG( (src   != 0), MTX_TRANSPOSE_1  );
    ASSERTMSG( (xPose != 0), MTX_TRANSPOSE_2  );

    out = ( src == xPose ) ? scratch : xPose;

    // output row 'row' is column 'row' of src, followed by a zero
    for ( row = 0 ; row < 3 ; row++ )
    {
        out[row][0] = src[0][row];
        out[row][1] = src[1][row];
        out[row][2] = src[2][row];
        out[row][3] = 0.0f;
    }

    // in-place case: move the scratch result into xPose
    if( out == scratch )
    {
        C_MTXCopy( *((MTX_CONST Mtx *)&scratch), xPose );
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single transpose of the upper 3x3 part of src into xPose; the
// fourth column of xPose is set to zero.  No argument checking is done.
//
// All loads from src are issued before the store that would overwrite the
// same location, so src == xPose is supported.
//
// Fix: the last store writes a single float ([2][2]) only, so xPose[2][3]
// used to be left untouched.  It is now cleared explicitly so that the
// fourth column is (0,0,0), as documented and as C_MTXTranspose produces.
void PSMTXTranspose ( MTX_CONST Mtx src, Mtx xPose )
{
    f32x2 row0a, row1a, row0b, row1b;
    f32x2 trns0, trns1, trns2;

    //psq_l       row0a, 0(src),  0, 0    // [0][0], [0][1]
    row0a = __PSQ_L(src, 0, 0);

    //psq_l       row1a, 16(src), 0, 0    // [1][0], [1][1]
    row1a = __PSQ_LX(src, 16, 0, 0);

    //ps_merge00  trns0, row0a, row1a     // [0][0], [1][0]
    trns0 = __PS_MERGE00(row0a, row1a);

    //psq_l       row0b, 8(src),  1, 0    // [0][2], 1
    row0b = __PSQ_LX(src, 8, 1, 0);

    //ps_merge11  trns1, row0a, row1a     // [0][1], [1][1]
    trns1 = __PS_MERGE11(row0a, row1a);

    //psq_l       row1b, 24(src), 1, 0    // [1][2], 1
    row1b = __PSQ_LX(src, 24, 1, 0);

    //psq_st      trns0, 0(xPose),  0, 0  // [0][0], [1][0] -> [0][0], [0][1]
    __PSQ_ST(xPose, trns0, 0, 0);

    //psq_l       row0a, 32(src), 0, 0    // [2][0], [2][1]
    row0a = __PSQ_LX(src, 32, 0, 0);

    //ps_merge00  trns2, row0b, row1b     // [0][2], [1][2]
    trns2 = __PS_MERGE00(row0b, row1b);

    //psq_st      trns1, 16(xPose), 0, 0  // [0][1], [1][1] -> [1][0], [1][1]
    __PSQ_STX(xPose, 16, trns1, 0, 0);

    //ps_merge00  trns0, row0a, c00       // [2][0], 0
    trns0 = __PS_MERGE00(row0a, c00);

    //psq_st      trns2, 32(xPose), 0, 0  // [0][2], [1][2] -> [2][0], [2][1]
    __PSQ_STX(xPose, 32, trns2, 0, 0);

    //ps_merge10  trns1, row0a, c00       // [2][1], 0
    trns1 = __PS_MERGE10(row0a, c00);

    //psq_st      trns0, 8(xPose),  0, 0  // [2][0], 0 -> [0][2], [0][3]
    __PSQ_STX(xPose, 8, trns0, 0, 0);

    //lfs         row0b, 40(src)          // [2][2]
    row0b = __PSQ_LX(src, 40, 1, 0);

    //psq_st      trns1, 24(xPose), 0, 0  // [2][1], 0 -> [1][2], [1][3]
    __PSQ_STX(xPose, 24, trns1, 0, 0);

    //stfs        row0b, 40(xPose)        // [2][2] -> [2][2]
    __PSQ_STX(xPose, 40, row0b, 1, 0);

    // single-float store of 0 -> [2][3]; done last, after every load from
    // src, so the in-place case is unaffected
    __PSQ_STX(xPose, 44, c00, 1, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXInverse

Description:    computes a fast inverse of a matrix.
                this algorithm works for matrices with a fourth row of
                (0,0,0,1).

                for a matrix
                M =  |     A      C      |  where A is the upper 3x3 submatrix,
                     |     0      1      |        C is a 3x1 column vector

                INV(M)     =    |  inv(A)      (inv(A))*(-C)    |
                                |     0               1         |

Arguments:      src       source matrix.
                inv       destination (inverse) matrix.
                          ok if src == inv.

Return:         0 if src is not invertible.
                1 on success.

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Inverts a 3x4 matrix whose implied fourth row is (0,0,0,1).
// The upper 3x3 part is inverted via the transposed cofactor matrix scaled
// by 1/det; the fourth column becomes inv(A) * (-C).
// Returns 0 (leaving inv untouched) when the determinant is exactly zero,
// 1 otherwise.  src == inv is supported through a local scratch matrix.
u32 C_MTXInverse ( MTX_CONST Mtx src, Mtx inv )
{
    Mtx    scratch;
    MtxPtr out;
    f32    det;
    f32    rdet;
    u32    row;

    ASSERTMSG( (src != 0), MTX_INVERSE_1 );
    ASSERTMSG( (inv != 0), MTX_INVERSE_2 );

    out = ( src == inv ) ? scratch : inv;

    // determinant of the upper 3x3 submatrix
    det =   src[0][0]*src[1][1]*src[2][2] + src[0][1]*src[1][2]*src[2][0] + src[0][2]*src[1][0]*src[2][1]
          - src[2][0]*src[1][1]*src[0][2] - src[1][0]*src[0][1]*src[2][2] - src[0][0]*src[2][1]*src[1][2];

    // singular matrix: nothing to invert
    if( det == 0.0f )
    {
        return 0;
    }

    rdet = 1.0f / det;

    // transposed cofactors of the upper submatrix, scaled by 1/det
    out[0][0] =  (src[1][1]*src[2][2] - src[2][1]*src[1][2]) * rdet;
    out[0][1] = -(src[0][1]*src[2][2] - src[2][1]*src[0][2]) * rdet;
    out[0][2] =  (src[0][1]*src[1][2] - src[1][1]*src[0][2]) * rdet;

    out[1][0] = -(src[1][0]*src[2][2] - src[2][0]*src[1][2]) * rdet;
    out[1][1] =  (src[0][0]*src[2][2] - src[2][0]*src[0][2]) * rdet;
    out[1][2] = -(src[0][0]*src[1][2] - src[1][0]*src[0][2]) * rdet;

    out[2][0] =  (src[1][0]*src[2][1] - src[2][0]*src[1][1]) * rdet;
    out[2][1] = -(src[0][0]*src[2][1] - src[2][0]*src[0][1]) * rdet;
    out[2][2] =  (src[0][0]*src[1][1] - src[1][0]*src[0][1]) * rdet;

    // fourth column: (invA) * (-C), one row at a time
    for ( row = 0 ; row < 3 ; row++ )
    {
        out[row][3] = -out[row][0]*src[0][3] - out[row][1]*src[1][3] - out[row][2]*src[2][3];
    }

    // in-place case: move the scratch result into inv
    if( out == scratch )
    {
        C_MTXCopy( *((MTX_CONST Mtx *)&scratch),inv );
    }

    return 1;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
            Results may be a little bit different from the C version
            because it doesn't perform exactly same calculation.
 *---------------------------------------------------------------------*/
// Paired-single inverse of a 3x4 matrix with implied fourth row (0,0,0,1).
//
// The cofactors and the determinant are computed in register pairs, 1/det
// is obtained from a reciprocal estimate refined with one Newton step, and
// the fourth column is computed as -(inv(A) * C).  All reads of src happen
// before the first store to inv.  No argument checking is done.
//
// Returns 0 without writing inv when the determinant is exactly zero,
// 1 otherwise.
//
// Fix: the determinant lives in lane 0 of fp7 only; lane 1 holds an
// unrelated by-product (marked N/A below).  The singular test used to
// require BOTH lanes to be zero, so a singular matrix with a non-zero
// lane 1 was "inverted" with a zero determinant and reported as success.
// The test now looks at lane 0 alone, matching C_MTXInverse.
u32 PSMTXInverse ( MTX_CONST Mtx src, Mtx inv )
{

    f32x2 fp0;
    f32x2 fp1;
    f32x2 fp2;
    f32x2 fp3;
    f32x2 fp4;
    f32x2 fp5;

    f32x2 fp6;
    f32x2 fp7;
    f32x2 fp8;
    f32x2 fp9;
    f32x2 fp10;
    f32x2 fp11;
    f32x2 fp12;
    f32x2 fp13;

    // fp0 [ 00 ][ 1.0F ] : Load
    fp0 = __PSQ_LX(src, 0, 1, 0);

    // fp1 [ 01 ][ 02 ]   : Load
    fp1 = __PSQ_LX(src, 4, 0, 0);

    // fp2 [ 10 ][ 1.0F ] : Load
    fp2 = __PSQ_LX(src, 16, 1, 0);

    // fp6 [ 02 ][ 00 ]
    fp6 = __PS_MERGE10(fp1, fp0);

    // fp3 [ 11 ][ 12 ]   : Load
    fp3 = __PSQ_LX(src, 20, 0, 0);

    // fp4 [ 20 ][ 1.0F ] : Load
    fp4 = __PSQ_LX(src, 32, 1, 0);

    // fp7 [ 12 ][ 10 ]
    fp7 = __PS_MERGE10(fp3, fp2);

    // fp5 [ 21 ][ 22 ]   : Load
    fp5 = __PSQ_LX(src, 36, 0, 0);

    // fp11[ 11*02 ][ 00*12 ]
    fp11 = __PS_MUL(fp3, fp6);

    // fp8 [ 22 ][ 20 ]
    fp8 = __PS_MERGE10(fp5, fp4);

    // fp13[ 21*12 ][ 10*22 ]
    fp13 = __PS_MUL(fp5, fp7);

    // fp11[ 01*12 - 11*02 ][ 10*02 - 00*12 ]
    fp11 = __PS_MSUB(fp1, fp7, fp11);

    // fp12[ 01*22 ][ 20*02 ]
    fp12 = __PS_MUL(fp1, fp8);

    // fp13[ 11*22 - 21*12 ][ 20*12 - 10*22 ]
    fp13 = __PS_MSUB(fp3, fp8, fp13);

    // fp10[ 20*11 ][ N/A ]
    fp10 = __PS_MUL(fp3, fp4);

    // fp12[ 21*02 - 01*22 ][ 00*22 - 20*02 ]
    fp12 = __PS_MSUB(fp5, fp6, fp12);

    // fp7 [ 00*(11*22-21*12) ][ N/A ]
    fp7  = __PS_MUL(fp0, fp13);

    // fp9 [ 00*21 ][ N/A ]
    fp9  = __PS_MUL(fp0, fp5);

    // fp8 [ 10*01 ][ N/A ]
    fp8  = __PS_MUL(fp1, fp2);

    // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) ][ N/A ]
    fp7 = __PS_MADD(fp2, fp12, fp7);

    // fp6 [ 0.0F ][ 0.0F ]
    fp6 = __PS_SUB(fp6, fp6);

    // fp10[ 10*21 - 20*11 ][ N/A ]
    fp10 = __PS_MSUB(fp2, fp5, fp10);

    // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) + 20*(01*12-11*02) ][ N/A ] : det
    fp7 = __PS_MADD(fp4, fp11, fp7);

    // fp9 [ 20*01 - 00*21 ][ N/A ]
    fp9 = __PS_MSUB(fp1, fp4, fp9);

    // fp8 [ 00*11 - 10*01 ][ N/A ]
    fp8 = __PS_MSUB(fp0, fp3, fp8);

    // check if matrix is singular: only lane 0 of fp7 is the determinant
    if( fp7[0] == 0.0f )
    {
        return 0;
    }

    // compute the inverse of the upper submatrix:

    // find the transposed matrix of cofactors of the upper submatrix
    // and multiply by (1/det)

    // fp0 [ 1/det ][ N/A ]
    fp0 = __PS_RES(fp7);

    // Newton's approximation
    // Refinement : ( E = est. of 1/K ) -> ( E' = ( 2 - K * E ) * E )
    fp6 = __PS_ADD(fp0, fp0);
    fp5 = __PS_MUL(fp7, fp0);
    fp0 = __PS_NMSUB(fp0, fp5, fp6);

    // fp1 [ 03 ][ 03 ] : Load
    fp1[0] = src[0][3];
    fp1[1] = src[0][3];

    // fp13[ ( 11*22 - 21*12 ) * rdet ][ ( 20*12 - 10*22 ) * rdet ] : i[0][0], i[1][0]
    fp13 = __PS_MULS0(fp13, fp0);

    // fp2 [ 13 ][ 13 ] : Load
    fp2[0] = src[1][3];
    fp2[1] = src[1][3];

    // fp12[ ( 21*02 - 01*22 ) * rdet ][ ( 00*22 - 20*02 ) * rdet ] : i[0][1], i[1][1]
    fp12 = __PS_MULS0(fp12, fp0);

    // fp3 [ 23 ][ 23 ] : Load
    fp3[0] = src[2][3];
    fp3[1] = src[2][3];

    // fp11[ ( 01*12 - 11*02 ) * rdet ][ ( 10*02 - 00*12 ) * rdet ] : i[0][2], i[1][2]
    fp11 = __PS_MULS0(fp11, fp0);

    // fp5 [ i00 ][ i01 ]
    fp5 = __PS_MERGE00(fp13, fp12);

    // fp4 [ i10 ][ i11 ]
    fp4 = __PS_MERGE11(fp13, fp12);

    // fp6 [ i00*03 ][ i10*03 ]
    fp6 = __PS_MUL(fp13, fp1);

    // [ i00 ][ i01 ] : Store fp5   -> free(fp5[ i00 ][ i01 ])
    //inv[0][0] = fp5[0];
    //inv[0][1] = fp5[1];
    __PSQ_STX(inv, 0, fp5, 0, 0);

    // [ i10 ][ i11 ] : Store fp4   -> free(fp4[ i10 ][ i11 ])
    //inv[1][0] = fp4[0];
    //inv[1][1] = fp4[1];
    __PSQ_STX(inv, 16, fp4, 0, 0);

    // fp10[ ( 10*21 - 20*11 ) * rdet ] : i[2][0]
    fp10  = __PS_MULS0(fp10, fp0);

    // fp9 [ ( 20*01 - 00*21 ) * rdet ] : i[2][1]
    fp9  = __PS_MULS0(fp9,  fp0);

    // fp6 [ i00*03+i01*13 ][ i10*03+i11*13 ]
    fp6 = __PS_MADD(fp12, fp2, fp6);

    // [ i20 ] : Store fp10
    //inv[2][0] = fp10[0];
    __PSQ_STX(inv, 32, fp10, 1, 0);

    // fp8 [ ( 00*11 - 10*01 ) * rdet ] : i[2][2]
    fp8 = __PS_MULS0(fp8,  fp0);

    // fp6 [ -i00*03-i01*13-i02*23 ][ -i10*03-i11*13-i12*23 ] : i[0][3], i[1][3]
    fp6 = __PS_NMADD(fp11, fp3, fp6);

    // [ i21 ] : Store fp9
    //inv[2][1] = fp9[0];
    __PSQ_STX(inv, 36, fp9, 1, 0);

    // fp7 [ i20*03 ][ N/A ]
    fp7 = __PS_MUL(fp10, fp1);

    // fp5 [ i02 ][ i03 ]
    fp5 = __PS_MERGE00(fp11, fp6);

    // [ i22 ] : Store fp8
    //inv[2][2] = fp8[0];
    __PSQ_STX(inv, 40, fp8, 1, 0);

    // fp7 [ i20*03+i21*13 ][ N/A ]
    fp7  = __PS_MADD(fp9,  fp2, fp7);

    // fp4 [ i12 ][ i13 ]
    fp4  = __PS_MERGE11(fp11, fp6);

    // [ i02 ][ i03 ] : Store fp5
    //inv[0][2] = fp5[0];
    //inv[0][3] = fp5[1];
    __PSQ_STX(inv, 8, fp5, 0, 0);

    // fp7 [ -i20*03-i21*13-i22*23 ][ N/A ] : i[2][3]
    fp7 = __PS_NMADD(fp8,  fp3, fp7);

    // [ i12 ][ i13 ] : Store fp4
    //inv[1][2] = fp4[0];
    //inv[1][3] = fp4[1];
    __PSQ_STX(inv, 24, fp4, 0, 0);

    // [ i23 ] : Store fp7
    //inv[2][3] = fp7[0];
    __PSQ_STX(inv, 44, fp7, 1, 0);

    return 1;
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXInvXpose

Description:    computes a fast inverse-transpose of a matrix.
                this algorithm works for matrices with a fourth row of
                (0,0,0,1). Commonly used for calculating normal transform
                matrices.

                This function is equivalent to the combination of
                two functions MTXInverse + MTXTranspose.

Arguments:      src       source matrix.
                invX      destination (inverse-transpose) matrix.
                          ok if src == invx.

Return:         0 if src is not invertible.
                1 on success.

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Computes the inverse-transpose of the upper 3x3 part of src (implied
// fourth row 0,0,0,1) and clears the fourth column of the result.
// Returns 0 (leaving invX untouched) when the determinant is exactly zero,
// 1 otherwise.  src == invX is supported through a local scratch matrix.
u32 C_MTXInvXpose ( MTX_CONST Mtx src, Mtx invX )
{
    Mtx    scratch;
    MtxPtr out;
    f32    det;
    f32    rdet;
    u32    row;

    ASSERTMSG( (src != 0), MTX_INVXPOSE_1 );
    ASSERTMSG( (invX != 0), MTX_INVXPOSE_2 );

    out = ( src == invX ) ? scratch : invX;

    // determinant of the upper 3x3 submatrix
    det =   src[0][0]*src[1][1]*src[2][2] + src[0][1]*src[1][2]*src[2][0] + src[0][2]*src[1][0]*src[2][1]
          - src[2][0]*src[1][1]*src[0][2] - src[1][0]*src[0][1]*src[2][2] - src[0][0]*src[2][1]*src[1][2];

    // singular matrix: nothing to invert
    if( det == 0.0f )
    {
        return 0;
    }

    rdet = 1.0f / det;

    // cofactors of the upper submatrix scaled by 1/det
    // (the inverse, already transposed)
    out[0][0] =  (src[1][1]*src[2][2] - src[2][1]*src[1][2]) * rdet;
    out[0][1] = -(src[1][0]*src[2][2] - src[2][0]*src[1][2]) * rdet;
    out[0][2] =  (src[1][0]*src[2][1] - src[2][0]*src[1][1]) * rdet;

    out[1][0] = -(src[0][1]*src[2][2] - src[2][1]*src[0][2]) * rdet;
    out[1][1] =  (src[0][0]*src[2][2] - src[2][0]*src[0][2]) * rdet;
    out[1][2] = -(src[0][0]*src[2][1] - src[2][0]*src[0][1]) * rdet;

    out[2][0] =  (src[0][1]*src[1][2] - src[1][1]*src[0][2]) * rdet;
    out[2][1] = -(src[0][0]*src[1][2] - src[1][0]*src[0][2]) * rdet;
    out[2][2] =  (src[0][0]*src[1][1] - src[1][0]*src[0][1]) * rdet;

    // no translation in the result: zero the fourth column
    for ( row = 0 ; row < 3 ; row++ )
    {
        out[row][3] = 0.0F;
    }

    // in-place case: move the scratch result into invX
    if( out == scratch )
    {
        C_MTXCopy( *((MTX_CONST Mtx *)&scratch),invX );
    }

    return 1;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
            Results may be a little bit different from the C version
            because it doesn't perform exactly same calculation.
 *---------------------------------------------------------------------*/
// Paired-single inverse-transpose of the upper 3x3 part of src (implied
// fourth row 0,0,0,1); the fourth column of invX is set to zero.
//
// The cofactors and the determinant are computed in register pairs and
// 1/det is obtained from a reciprocal estimate refined with one Newton
// step.  All reads of src happen before the first write to invX.
// No argument checking is done.
//
// Returns 0 without writing invX when the determinant is exactly zero,
// 1 otherwise.
//
// Fix: the determinant lives in lane 0 of fp7 only; lane 1 holds an
// unrelated by-product (marked N/A below).  The singular test used to
// require BOTH lanes to be zero, so a singular matrix with a non-zero
// lane 1 was processed with a zero determinant and reported as success.
// The test now looks at lane 0 alone, matching C_MTXInvXpose.
u32 PSMTXInvXpose ( MTX_CONST Mtx src, Mtx invX )
{
    f32x2 fp0;
    f32x2 fp1;
    f32x2 fp2;
    f32x2 fp3;
    f32x2 fp4;
    f32x2 fp5;

    f32x2 fp6;
    f32x2 fp7;
    f32x2 fp8;
    f32x2 fp9;
    f32x2 fp10;
    f32x2 fp11;
    f32x2 fp12;
    f32x2 fp13;

    // fp0 [ 00 ][ 1.0F ] : Load
    //fp0[0] = src[0][0];
    //fp0[1] = 1.0F;
    fp0 = __PSQ_LX(src, 0, 1, 0);

    // fp1 [ 01 ][ 02 ]   : Load
    //fp1[0] = src[0][1];
    //fp1[1] = src[0][2];
    fp1 = __PSQ_LX(src, 4, 0, 0);

    // fp2 [ 10 ][ 1.0F ] : Load
    //fp2[0] = src[1][0];
    //fp2[1] = 1.0F;
    fp2 = __PSQ_LX(src, 16, 1, 0);

    // fp6 [ 02 ][ 00 ]
    fp6 = __PS_MERGE10(fp1, fp0);

    // fp3 [ 11 ][ 12 ]   : Load
    //fp3[0] = src[1][1];
    //fp3[1] = src[1][2];
    fp3 = __PSQ_LX(src, 20, 0, 0);

    // fp4 [ 20 ][ 1.0F ] : Load
    //fp4[0] = src[2][0];
    //fp4[1] = 1.0F;
    fp4 = __PSQ_LX(src, 32, 1, 0);

    // fp7 [ 12 ][ 10 ]
    fp7 = __PS_MERGE10(fp3, fp2);

    // fp5 [ 21 ][ 22 ]   : Load
    //fp5[0] = src[2][1];
    //fp5[1] = src[2][2];
    fp5 = __PSQ_LX(src, 36, 0, 0);

    // fp11[ 11*02 ][ 00*12 ]
    fp11 = __PS_MUL(fp3, fp6);

    // fp8 [ 22 ][ 20 ]
    fp8 = __PS_MERGE10(fp5, fp4);

    // fp13[ 21*12 ][ 10*22 ]
    fp13 = __PS_MUL(fp5, fp7);

    // fp11[ 01*12 - 11*02 ][ 10*02 - 00*12 ]
    fp11 = __PS_MSUB(fp1, fp7, fp11);

    // fp12[ 01*22 ][ 20*02 ]
    fp12 = __PS_MUL(fp1, fp8);

    // fp13[ 11*22 - 21*12 ][ 20*12 - 10*22 ]
    fp13 = __PS_MSUB(fp3, fp8, fp13);

    // fp10[ 20*11 ][ N/A ]
    fp10 = __PS_MUL(fp3, fp4);

    // fp12[ 21*02 - 01*22 ][ 00*22 - 20*02 ]
    fp12 = __PS_MSUB(fp5, fp6, fp12);

    // fp7 [ 00*(11*22-21*12) ][ N/A ]
    fp7  = __PS_MUL(fp0, fp13);

    // fp9 [ 00*21 ][ N/A ]
    fp9  = __PS_MUL(fp0, fp5);

    // fp8 [ 10*01 ][ N/A ]
    fp8  = __PS_MUL(fp1, fp2);

    // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) ][ N/A ]
    fp7 = __PS_MADD(fp2, fp12, fp7);

    // fp6 [ 0.0F ][ 0.0F ]
    fp6 = __PS_SUB(fp6, fp6);

    // fp10[ 10*21 - 20*11 ][ N/A ]
    fp10 = __PS_MSUB(fp2, fp5, fp10);

    // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) + 20*(01*12-11*02) ][ N/A ] : det
    fp7 = __PS_MADD(fp4, fp11, fp7);

    // fp9 [ 20*01 - 00*21 ][ N/A ]
    fp9 = __PS_MSUB(fp1, fp4, fp9);

    // fp8 [ 00*11 - 10*01 ][ N/A ]
    fp8 = __PS_MSUB(fp0, fp3, fp8);

    // check if matrix is singular: only lane 0 of fp7 is the determinant
    if( fp7[0] == 0.0f )
    {
        return 0;
    }

    // compute the inverse-transpose of the upper submatrix:

    // find the transposed matrix of cofactors of the upper submatrix
    // and multiply by (1/det)

    // fp0 [ 1/det ][ N/A ]
    fp0 = __PS_RES(fp7);

    // [ ix03 ] : Store fp6 (zero)
    invX[0][3] = fp6[0];

    // Newton's approximation
    // Refinement : ( E = est. of 1/K ) -> ( E' = ( 2 - K * E ) * E )
    fp4 = __PS_ADD(fp0, fp0);
    fp5 = __PS_MUL(fp7, fp0);

    // [ ix13 ] : Store fp6 (zero)
    //invX[1][3] = fp6[0];
    __PSQ_STX(invX, 28, fp6, 1, 0);

    fp0 = __PS_NMSUB(fp0, fp5, fp4);

    // [ ix23 ] : Store fp6 (zero)
    //invX[2][3] = fp6[0];
    __PSQ_STX(invX, 44, fp6, 1, 0);

    // fp13[ ( 11*22 - 21*12 ) * rdet ][ ( 20*12 - 10*22 ) * rdet ] : ix[0][0], ix[0][1]
    fp13 = __PS_MULS0(fp13, fp0);

    // fp12[ ( 21*02 - 01*22 ) * rdet ][ ( 00*22 - 20*02 ) * rdet ] : ix[1][0], ix[1][1]
    fp12 = __PS_MULS0(fp12, fp0);

    // [ ix00 ][ ix01 ] : Store fp13
    //invX[0][0] = fp13[0];
    //invX[0][1] = fp13[1];
    __PSQ_STX(invX, 0, fp13, 0, 0);

    // fp11[ ( 01*12 - 11*02 ) * rdet ][ ( 10*02 - 00*12 ) * rdet ] : ix[2][0], ix[2][1]
    fp11 = __PS_MULS0(fp11, fp0);

    // [ ix10 ][ ix11 ] : Store fp12
    //invX[1][0] = fp12[0];
    //invX[1][1] = fp12[1];
    __PSQ_STX(invX, 16, fp12, 0, 0);

    // fp10[ ( 10*21 - 20*11 ) * rdet ] : ix[0][2]
    fp10 = __PS_MULS0(fp10, fp0);

    // [ ix20 ][ ix21 ] : Store fp11
    //invX[2][0] = fp11[0];
    //invX[2][1] = fp11[1];
    __PSQ_STX(invX, 32, fp11, 0, 0);

    // fp9 [ ( 20*01 - 00*21 ) * rdet ] : ix[1][2]
    fp9 = __PS_MULS0(fp9, fp0);

    // [ ix02 ]         : Store fp10
    //invX[0][2] = fp10[0];
    __PSQ_STX(invX, 8, fp10, 1, 0);

    // fp8 [ ( 00*11 - 10*01 ) * rdet ] : ix[2][2]
    fp8 = __PS_MULS0(fp8, fp0);

    // [ ix12 ]         : Store fp9
    //invX[1][2] = fp9[0];
    __PSQ_STX(invX, 24, fp9, 1, 0);

    // [ ix22 ]         : Store fp8
    //invX[2][2] = fp8[0];
    __PSQ_STX(invX, 40, fp8, 1, 0);

    return 1;
}
#endif

/*---------------------------------------------------------------------*


                             MODEL SECTION


*---------------------------------------------------------------------*/

/*---------------------------------------------------------------------*

Name:           MTXRotDeg

Description:    sets a rotation matrix about one of the X, Y or Z axes

Arguments:      m       matrix to be set

                axis    major axis about which to rotate.
                        axis is passed in as a character.
                        it must be one of 'X', 'x', 'Y', 'y', 'Z', 'z'

                deg     rotation angle in degrees.

                        note:  counter-clockwise rotation is positive.

Return:         none

*---------------------------------------------------------------------*/

/*---------------------------------------------------------------------*

Name:           MTXRotRad

Description:    sets a rotation matrix about one of the X, Y or Z axes

Arguments:      m       matrix to be set

                axis    major axis about which to rotate.
                        axis is passed in as a character.
                        it must be one of 'X', 'x', 'Y', 'y', 'Z', 'z'

                rad     rotation angle in radians.

                        note:  counter-clockwise rotation is positive.

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Builds an axis rotation matrix from an angle in radians: evaluates the
// sine and cosine and hands them to C_MTXRotTrig, which is also where the
// "axis" character gets validated.
void C_MTXRotRad ( Mtx m, char axis, f32 rad )
{
    ASSERTMSG( (m != 0), MTX_ROTRAD_1 );

    C_MTXRotTrig( m, axis, sinf(rad), cosf(rad) );
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single entry point: converts the angle (radians) to its sine and
// cosine, then lets PSMTXRotTrig fill in the matrix.  m is not checked here.
void PSMTXRotRad ( Mtx m, char axis, f32 rad )
{
    const f32 sine   = sinf(rad);
    const f32 cosine = cosf(rad);

    PSMTXRotTrig( m, axis, sine, cosine );
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXRotTrig

Description:    sets a rotation matrix about one of the X, Y or Z axes
                from specified trig ratios

Arguments:      m       matrix to be set

                axis    major axis about which to rotate.
                        axis is passed in as a character.
                        It must be one of 'X', 'x', 'Y', 'y', 'Z', 'z'

                sinA    sine of rotation angle.

                cosA    cosine of rotation angle.

                        note:  counter-clockwise rotation is positive.

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Fills m with the rotation about the X, Y or Z axis described by the given
// sine/cosine pair.  The translation column (m[r][3]) is always cleared.
// Any other axis character trips MTX_ROTTRIG_2 and leaves m untouched.
void C_MTXRotTrig ( Mtx m, char axis, f32 sinA, f32 cosA )
{
    ASSERTMSG( (m != 0), MTX_ROTTRIG_1 );

    if ( axis == 'x' || axis == 'X' )
    {
        // rotation in the YZ plane
        m[0][0] =  1.0f;
        m[0][1] =  0.0f;
        m[0][2] =  0.0f;
        m[0][3] =  0.0f;

        m[1][0] =  0.0f;
        m[1][1] =  cosA;
        m[1][2] = -sinA;
        m[1][3] =  0.0f;

        m[2][0] =  0.0f;
        m[2][1] =  sinA;
        m[2][2] =  cosA;
        m[2][3] =  0.0f;
    }
    else if ( axis == 'y' || axis == 'Y' )
    {
        // rotation in the ZX plane
        m[0][0] =  cosA;
        m[0][1] =  0.0f;
        m[0][2] =  sinA;
        m[0][3] =  0.0f;

        m[1][0] =  0.0f;
        m[1][1] =  1.0f;
        m[1][2] =  0.0f;
        m[1][3] =  0.0f;

        m[2][0] = -sinA;
        m[2][1] =  0.0f;
        m[2][2] =  cosA;
        m[2][3] =  0.0f;
    }
    else if ( axis == 'z' || axis == 'Z' )
    {
        // rotation in the XY plane
        m[0][0] =  cosA;
        m[0][1] = -sinA;
        m[0][2] =  0.0f;
        m[0][3] =  0.0f;

        m[1][0] =  sinA;
        m[1][1] =  cosA;
        m[1][2] =  0.0f;
        m[1][3] =  0.0f;

        m[2][0] =  0.0f;
        m[2][1] =  0.0f;
        m[2][2] =  1.0f;
        m[2][3] =  0.0f;
    }
    else
    {
        ASSERTMSG( 0, MTX_ROTTRIG_2 );
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single implementation of MTXRotTrig.
//
// Writes into m the same 3x4 rotation matrix that C_MTXRotTrig produces for
// the given axis character and sine/cosine pair.  Elements are stored as
// pairs (or single floats) at byte offsets from m; offset 4*(4*r + c)
// corresponds to m[r][c].
//
// m is not checked for NULL.  An unrecognized axis character only triggers
// ASSERTMSG( 0, MTX_ROTTRIG_2 ) and leaves m unmodified.
void PSMTXRotTrig ( Mtx m, char axis, f32 sinA, f32 cosA )
{
    f32x2 nsinA;
    f32x2 fw0, fw1, fw2, fw3;
    // The scalars are placed in lane 0; only lane 0 is consumed by the
    // merges below.
    f32x2 sinA10 = {sinA, 0.0f};
    f32x2 cosA10 = {cosA, 0.0f};

    //ps_neg      nsinA, sinA
    // nsinA lane 0 = -sinA
    nsinA = __PS_NEG(sinA10);

    switch(axis)
    {
    case 'x':
    case 'X':
        //psq_st      fc1,  0(m), 1, 0
        // single store: m[0][0] = 1
        __PSQ_ST(m, c11, 1, 0);

        //psq_st      fc0,  4(m), 0, 0
        // m[0][1] = m[0][2] = 0
        __PSQ_STX(m, 4, c00, 0, 0);

        //ps_merge00  fw0, sinA, cosA
        // fw0 = [sinA][cosA]
        fw0 = __PS_MERGE00(sinA10, cosA10);

        //psq_st      fc0, 12(m), 0, 0
        // m[0][3] = m[1][0] = 0
        __PSQ_STX(m, 12, c00, 0, 0);

        //ps_merge00  fw1, cosA, nsinA
        // fw1 = [cosA][-sinA]
        fw1 = __PS_MERGE00(cosA10, nsinA);

        //psq_st      fc0, 28(m), 0, 0
        // m[1][3] = m[2][0] = 0
        __PSQ_STX(m, 28, c00, 0, 0);

        //psq_st      fc0, 44(m), 1, 0
        // single store: m[2][3] = 0
        __PSQ_STX(m, 44, c00, 1, 0);

        //psq_st      fw0, 36(m), 0, 0
        // m[2][1] = sinA, m[2][2] = cosA
        __PSQ_STX(m, 36, fw0, 0, 0);

        //psq_st      fw1, 20(m), 0, 0
        // m[1][1] = cosA, m[1][2] = -sinA
        __PSQ_STX(m, 20, fw1, 0, 0);

        break;

    case 'y':
    case 'Y':
        //ps_merge00  fw0, cosA, fc0
        // fw0 = [cosA][0]
        fw0 = __PS_MERGE00(cosA10, c00);

        //ps_merge00  fw1, fc0, fc1
        // fw1 = [0][1]
        fw1 = __PS_MERGE00(c00, c11);

        //psq_st      fc0, 24(m), 0, 0
        // m[1][2] = m[1][3] = 0
        __PSQ_STX(m, 24, c00, 0, 0);

        //psq_st      fw0,  0(m), 0, 0
        // m[0][0] = cosA, m[0][1] = 0
        __PSQ_ST(m, fw0, 0, 0);

        //ps_merge00  fw2, nsinA, fc0
        // fw2 = [-sinA][0]
        fw2 = __PS_MERGE00(nsinA, c00);

        //ps_merge00  fw3, sinA, fc0
        // fw3 = [sinA][0]
        fw3 = __PS_MERGE00(sinA10, c00);

        //psq_st      fw0, 40(m), 0, 0;
        // m[2][2] = cosA, m[2][3] = 0
        __PSQ_STX(m, 40, fw0, 0, 0);

        //psq_st      fw1, 16(m), 0, 0;
        // m[1][0] = 0, m[1][1] = 1
        __PSQ_STX(m, 16, fw1, 0, 0);

        //psq_st      fw3,  8(m), 0, 0;
        // m[0][2] = sinA, m[0][3] = 0
        __PSQ_STX(m, 8, fw3, 0, 0);

        //psq_st      fw2, 32(m), 0, 0;
        // m[2][0] = -sinA, m[2][1] = 0
        __PSQ_STX(m, 32, fw2, 0, 0);

        break;

    case 'z':
    case 'Z':

        //psq_st      fc0,  8(m), 0, 0
        // m[0][2] = m[0][3] = 0
        __PSQ_STX(m, 8, c00, 0, 0);

        //ps_merge00  fw0, sinA, cosA
        // fw0 = [sinA][cosA]
        fw0 = __PS_MERGE00(sinA10, cosA10);

        //ps_merge00  fw2, cosA, nsinA
        // fw2 = [cosA][-sinA]
        fw2 = __PS_MERGE00(cosA10, nsinA);

        //psq_st      fc0, 24(m), 0, 0
        // m[1][2] = m[1][3] = 0
        __PSQ_STX(m, 24, c00, 0, 0);

        //psq_st      fc0, 32(m), 0, 0
        // m[2][0] = m[2][1] = 0
        __PSQ_STX(m, 32, c00, 0, 0);

        //ps_merge00  fw1, fc1, fc0
        // fw1 = [1][0]
        fw1 = __PS_MERGE00(c11, c00);

        //psq_st      fw0, 16(m), 0, 0
        // m[1][0] = sinA, m[1][1] = cosA
        __PSQ_STX(m, 16, fw0, 0, 0);

        //psq_st      fw2,  0(m), 0, 0
        // m[0][0] = cosA, m[0][1] = -sinA
        __PSQ_ST(m, fw2, 0, 0);

        //psq_st      fw1, 40(m), 0, 0
        // m[2][2] = 1, m[2][3] = 0
        __PSQ_STX(m, 40, fw1, 0, 0);

        break;

    default:
        // Unknown axis character: nothing is written to m.
        ASSERTMSG( 0, MTX_ROTTRIG_2 );
        break;
    }
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXRotAxisRad

Description:    sets a rotation matrix about an arbitrary axis


Arguments:      m       matrix to be set

                axis    ptr to a vector containing the x,y,z axis
                        components.
                        axis does not have to be a unit vector.

                rad     rotation angle in radians.

                        note:  counter-clockwise rotation is positive.

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Sets m to the rotation of "rad" radians about an arbitrary axis.
// The axis is normalized first (via C_VECNormalize), so callers may pass a
// non-unit vector.  The translation column of m is cleared.
void C_MTXRotAxisRad( Mtx m, const Vec *axis, f32 rad )
{
    Vec unitAxis;                   // normalized copy of *axis
    f32 sinT, cosT;                 // sine / cosine of the angle
    f32 versT;                      // 1 - cosine
    f32 ux, uy, uz;                 // components of unitAxis
    f32 uxx, uyy, uzz;              // squared components

    ASSERTMSG( (m    != 0), MTX_ROTAXIS_1  );
    ASSERTMSG( (axis != 0), MTX_ROTAXIS_2  );

    sinT  = sinf(rad);
    cosT  = cosf(rad);
    versT = 1.0f - cosT;

    C_VECNormalize( axis, &unitAxis );

    ux = unitAxis.x;
    uy = unitAxis.y;
    uz = unitAxis.z;

    uxx = ux * ux;
    uyy = uy * uy;
    uzz = uz * uz;

    // row 0
    m[0][0] = ( versT * uxx )     + ( cosT );
    m[0][1] = ( versT * ux * uy ) - ( sinT * uz );
    m[0][2] = ( versT * ux * uz ) + ( sinT * uy );
    m[0][3] = 0.0f;

    // row 1
    m[1][0] = ( versT * ux * uy ) + ( sinT * uz );
    m[1][1] = ( versT * uyy )     + ( cosT );
    m[1][2] = ( versT * uy * uz ) - ( sinT * ux );
    m[1][3] = 0.0f;

    // row 2
    m[2][0] = ( versT * ux * uz ) - ( sinT * uy );
    m[2][1] = ( versT * uy * uz ) + ( sinT * ux );
    m[2][2] = ( versT * uzz )     + ( cosT );
    m[2][3] = 0.0f;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single core of PSMTXRotAxisRad.
//
// Given the axis vector and the already-evaluated sine (sT) and cosine (cT)
// of the rotation angle, writes the 3x4 axis-angle rotation matrix into m
// (same element formulas as C_MTXRotAxisRad; column 3 is set to zero).
// The axis is normalized here using a reciprocal-square-root estimate
// refined by one Newton-Raphson step, so it need not be unit length.
// No pointer checks are performed.  Arithmetic and stores are interleaved;
// the lane contents after each step are noted as [lane0][lane1].
static void _PSMTXRotAxisRadInternal(
    Mtx    m,
    const Vec *axis,
    f32    sT,
    f32    cT )
{
    f32x2    tT, sT2, cT2;
    f32x2    tmp0, tmp1, tmp2, tmp3, tmp4;
    f32x2    tmp5, tmp6, tmp7, tmp9, tmp8;

    // tmp0 = [x][y] : LOAD
    //psq_l       tmp0, 0(axis), 0, 0
    //tmp0[0] = axis->x;
    //tmp0[1] = axis->y;
    tmp0 = __PSQ_L(axis, 0, 0);

    // tmp1 = [z][z] : LOAD
    tmp1[0] = axis->z;
    tmp1[1] = axis->z;

    // tmp2 = [x*x][y*y]
    tmp2 = __PS_MUL(tmp0, tmp0);

    // tmp3 = [x*x+z*z][y*y+z*z]
    tmp3 = __PS_MADD(tmp1, tmp1, tmp2);

    // tmp4 = [S = x*x+y*y+z*z][z]
    tmp4 = __PS_SUM0(tmp3, tmp1, tmp2);

    // tT = [t][t], t = 1.0F - cT
    tT[0] = tT[1] = 1.0f - cT;

    // tmp5 = [1.0/sqrt(S)] :estimation[E] (duplicated into both lanes)
    tmp5[0] = tmp5[1] = __FRSQRTE(tmp4[0]);

    // Newton-Raphson refinement step
    // E' = E/2(3.0 - E*E*S)
    // Only lane 0 of the refined value is meaningful (lane 1 of tmp4 holds
    // z, not S); the __PS_MULS0 calls below consume lane 0 only.
    tmp2 = __PS_MUL(tmp5, tmp5);            // E*E
    tmp3 = __PS_MUL(tmp5, c0505);            // E/2
    tmp2 = __PS_NMSUB(tmp2, tmp4, c33);    // (3-E*E*S)
    tmp5 = __PS_MUL(tmp2, tmp3);            // (E/2)(3-E*E*S)

    // cT2 = [c][c]
    cT2[0] = cT2[1] = cT;

    // sT2 = [s][s]
    sT2[0] = sT2[1] = sT;

    // tmp0 = [nx = x/sqrt(S)][ny = y/sqrt(S)]
    tmp0 = __PS_MULS0(tmp0, tmp5);

    // tmp1 = [nz = z/sqrt(S)][nz = z/sqrt(S)]
    tmp1 = __PS_MULS0(tmp1, tmp5);

    // tmp4 = [t*nx][t*ny]
    tmp4 = __PS_MULS0(tmp0, tT);

    // tmp9 = [s*nx][s*ny]
    tmp9 = __PS_MULS0(tmp0, sT2);

    // tmp5 = [t*nz][t*nz]
    tmp5  = __PS_MULS0(tmp1, tT);

    // tmp3 = [t*nx*ny][t*ny*ny]
    tmp3  = __PS_MULS1(tmp4, tmp0);

    // tmp2 = [t*nx*nx][t*ny*nx]
    tmp2 = __PS_MULS0(tmp4, tmp0);

    // tmp4 = [t*nx*nz][t*ny*nz]
    tmp4 = __PS_MULS0(tmp4, tmp1);

    // tmp6 = [t*nx*nx-s*nz][t*nx*ny-s*nz]  (lane 1 becomes m01 below)
    tmp6 = __PS_NMSUB(tmp1, sT2, tmp2);

    // tmp7 = [t*nx*ny+s*nz][t*ny*ny+s*nz]  (lane 0 becomes m10 below)
    tmp7 = __PS_MADD(tmp1, sT2, tmp3);

    // tmp0 = [-s*nx][-s*ny]
    tmp0 = __PS_NEG(tmp9);

    // tmp8 = [t*nx*nz+s*ny][0] == [m02][m03]
    tmp8 = __PS_SUM0(tmp4, c00, tmp9);

    // tmp2 = [t*nx*nx+c][t*nx*ny-s*nz] == [m00][m01]
    tmp2  = __PS_SUM0(tmp2, tmp6, cT2);

    // tmp3 = [t*nx*ny+s*nz][t*ny*ny+c] == [m10][m11]
    tmp3 = __PS_SUM1(cT2, tmp7, tmp3);

    // tmp6 = [t*ny*nz-s*nx][0] == [m12][m13]
    tmp6 = __PS_SUM0(tmp0, c00 ,tmp4);

    // tmp8 [m02][m03] : STORE
    //psq_st      tmp8, 8(m), 0, 0
    //m[0][2] = tmp8[0];
    //m[0][3] = tmp8[1];
    __PSQ_STX(m, 8, tmp8, 0, 0);

    // tmp0 = [t*nx*nz-s*ny][t*ny*nz]
    tmp0 = __PS_SUM0(tmp4, tmp4, tmp0);

    // tmp2 [m00][m01] : STORE
    //psq_st      tmp2, 0(m), 0, 0
    //m[0][0] = tmp2[0];
    //m[0][1] = tmp2[1];
    __PSQ_STX(m, 0, tmp2, 0, 0);

    // tmp5 = [t*nz*nz][t*nz*nz]
    tmp5 = __PS_MULS0(tmp5, tmp1);

    // tmp3 [m10][m11] : STORE
    //psq_st      tmp3, 16(m), 0, 0
    //m[1][0] = tmp3[0];
    //m[1][1] = tmp3[1];
    __PSQ_STX(m, 16, tmp3, 0, 0);

    // tmp4 = [t*nx*nz-s*ny][t*ny*nz+s*nx] == [m20][m21]
    tmp4 = __PS_SUM1(tmp9, tmp0, tmp4);

    // tmp6 [m12][m13] : STORE
    //psq_st      tmp6, 24(m), 0, 0
    //m[1][2] = tmp6[0];
    //m[1][3] = tmp6[1];
    __PSQ_STX(m, 24, tmp6, 0, 0);

    // tmp5 = [t*nz*nz+c][0]   == [m22][m23]
    tmp5  = __PS_SUM0(tmp5, c00, cT2);

    // tmp4 [m20][m21] : STORE
    //psq_st      tmp4, 32(m), 0, 0
    //m[2][0] = tmp4[0];
    //m[2][1] = tmp4[1];
    __PSQ_STX(m, 32, tmp4, 0, 0);

    // tmp5 [m22][m23] : STORE
    //psq_st      tmp5, 40(m), 0, 0
    //m[2][2] = tmp5[0];
    //m[2][3] = tmp5[1];
    __PSQ_STX(m, 40, tmp5, 0, 0);
}

/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Public paired-single entry point for axis/angle rotation: evaluates the
// sine and cosine of "rad" and forwards everything to the internal worker.
// Neither m nor axis is checked here.
void PSMTXRotAxisRad(
    Mtx             m,
    const Vec      *axis,
    f32             rad )
{
    const f32 sine   = sinf(rad);
    const f32 cosine = cosf(rad);

    _PSMTXRotAxisRadInternal(m, axis, sine, cosine);
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXTrans

Description:    sets a translation matrix.

Arguments:       m        matrix to be set

                xT        x component of translation.

                yT        y component of translation.

                zT        z component of translation.

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Sets m to a pure translation: identity in the upper-left 3x3 block and
// (xT, yT, zT) in the fourth column.
void C_MTXTrans ( Mtx m, f32 xT, f32 yT, f32 zT )
{
    f32 offset[3];
    int row, col;

    ASSERTMSG( (m != 0), MTX_TRANS_1 );

    offset[0] = xT;
    offset[1] = yT;
    offset[2] = zT;

    for ( row = 0; row < 3; row++ )
    {
        // identity part of this row
        for ( col = 0; col < 3; col++ )
        {
            m[row][col] = ( row == col ) ? 1.0f : 0.0f;
        }

        // translation component of this row
        m[row][3] = offset[row];
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single translation matrix setup.  The matrix is written as six
// two-float pairs; the three pairs holding a translation term are built
// locally, the rest come from the file-level constants.  m is not checked.
void PSMTXTrans( Mtx m, f32 xT, f32 yT, f32 zT )
{
    f32x2 row0Hi = {0.0F, xT};      // [m02][m03]
    f32x2 row1Hi = {0.0F, yT};      // [m12][m13]
    f32x2 row2Hi = {1.0F, zT};      // [m22][m23]

    // row 0 : [1 0 | 0 xT]
    __PSQ_ST(m, c10, 0, 0);
    __PSQ_STX(m,  8, row0Hi, 0, 0);

    // row 1 : [0 1 | 0 yT]
    __PSQ_STX(m, 16, c01, 0, 0);
    __PSQ_STX(m, 24, row1Hi, 0, 0);

    // row 2 : [0 0 | 1 zT]
    __PSQ_STX(m, 32, c00, 0, 0);
    __PSQ_STX(m, 40, row2Hi, 0, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXTransApply

Description:    This function performs the operation equivalent to
                MTXTrans + MTXConcat.

Arguments:      src       matrix to be operated.

                dst       resultant matrix from concat.

                xT        x component of translation.

                yT        y component of translation.

                zT        z component of translation.

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Copies src into dst with (xT, yT, zT) added to the translation column.
// When src and dst are the same matrix the 3x3 part is left alone and only
// the fourth column is updated in place.
void C_MTXTransApply ( MTX_CONST Mtx src, Mtx dst, f32 xT, f32 yT, f32 zT )
{
    int row, col;

    ASSERTMSG( (src != 0), MTX_TRANSAPPLY_1 );
    // NOTE(review): the dst check reuses MTX_TRANSAPPLY_1 (MTXScaleApply uses
    // a separate _2 message for dst) -- confirm whether that is intended.
    ASSERTMSG( (dst != 0), MTX_TRANSAPPLY_1 );

    if ( src != dst )
    {
        // carry over the rotation/scale block unchanged
        for ( row = 0; row < 3; row++ )
        {
            for ( col = 0; col < 3; col++ )
            {
                dst[row][col] = src[row][col];
            }
        }
    }

    // offset the translation column
    dst[0][3] = src[0][3] + xT;
    dst[1][3] = src[1][3] + yT;
    dst[2][3] = src[2][3] + zT;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single counterpart of C_MTXTransApply: copies src to dst pair by
// pair while adding (xT, yT, zT) to the fourth column.  No pointer checks.
// The load / add / store order follows the instruction order recorded in the
// assembly this routine was ported from.
void PSMTXTransApply( Mtx src, Mtx dst, f32 xT, f32 yT, f32 zT )
{
    // each translation term sits in lane 0 of its own pair
    f32x2 addX = {xT, 0.0f};
    f32x2 addY = {yT, 0.0f};
    f32x2 addZ = {zT, 0.0f};
    f32x2 row0Lo, row0Hi;           // [m00][m01], [m02][m03]
    f32x2 row1Lo, row1Hi;           // [m10][m11], [m12][m13]
    f32x2 row2Lo, row2Hi;           // [m20][m21], [m22][m23]

    row0Lo = __PSQ_L(src, 0, 0);
    row0Hi = __PSQ_LX(src, 8, 0, 0);
    row1Hi = __PSQ_LX(src, 24, 0, 0);
    row2Hi = __PSQ_LX(src, 40, 0, 0);

    __PSQ_ST(dst, row0Lo, 0, 0);

    // ps_sum1 with these operands presumably yields [m02][xT + m03]
    row0Hi = __PS_SUM1(addX, row0Hi, row0Hi);
    row1Lo = __PSQ_LX(src, 16, 0, 0);
    __PSQ_STX(dst, 8, row0Hi, 0, 0);

    // likewise [m12][yT + m13]
    row1Hi = __PS_SUM1(addY, row1Hi, row1Hi);
    row2Lo = __PSQ_LX(src, 32, 0, 0);
    __PSQ_STX(dst, 16, row1Lo, 0, 0);

    // likewise [m22][zT + m23]
    row2Hi = __PS_SUM1(addZ, row2Hi, row2Hi);
    __PSQ_STX(dst, 24, row1Hi, 0, 0);
    __PSQ_STX(dst, 32, row2Lo, 0, 0);
    __PSQ_STX(dst, 40, row2Hi, 0, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:            MTXScale

Description:     sets a scaling matrix.


Arguments:       m        matrix to be set

                xS        x scale factor.

                yS        y scale factor.

                zS        z scale factor.

Return:         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Sets m to a pure scaling matrix: (xS, yS, zS) on the diagonal and zero
// everywhere else, including the translation column.
void C_MTXScale ( Mtx m, f32 xS, f32 yS, f32 zS )
{
    f32 factor[3];
    int row, col;

    ASSERTMSG( (m != 0), MTX_SCALE_1 );

    factor[0] = xS;
    factor[1] = yS;
    factor[2] = zS;

    for ( row = 0; row < 3; row++ )
    {
        for ( col = 0; col < 4; col++ )
        {
            m[row][col] = ( row == col ) ? factor[row] : 0.0f;
        }
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single scaling matrix setup.  Only the three pairs that contain a
// diagonal element are built locally; every other pair is the zero constant.
// m is not checked.
void PSMTXScale( Mtx m, f32 xS, f32 yS, f32 zS )
{
    f32x2 diag0 = {xS,   0.0F};     // [m00][m01]
    f32x2 diag1 = {0.0F, yS};       // [m10][m11]
    f32x2 diag2 = {zS,   0.0F};     // [m22][m23]

    // row 0 : [xS 0 | 0 0]
    __PSQ_ST(m, diag0, 0, 0);
    __PSQ_STX(m,  8, c00, 0, 0);

    // row 1 : [0 yS | 0 0]
    __PSQ_STX(m, 16, diag1, 0, 0);
    __PSQ_STX(m, 24, c00, 0, 0);

    // row 2 : [0 0 | zS 0]
    __PSQ_STX(m, 32, c00, 0, 0);
    __PSQ_STX(m, 40, diag2, 0, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXScaleApply

Description:    This function performs the operation equivalent to
                MTXScale + MTXConcat

Arguments:      src       matrix to be operated.

                dst       resultant matrix from concat.

                xS        x scale factor.

                yS        y scale factor.

                zS        z scale factor.

Return:         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Writes to dst the matrix src with row 0 multiplied by xS, row 1 by yS and
// row 2 by zS (all four columns of each row, translation included).
// Elements are processed in row-major order, so src and dst may be the same
// matrix.
void C_MTXScaleApply ( MTX_CONST Mtx src, Mtx dst, f32 xS, f32 yS, f32 zS )
{
    f32 rowScale[3];
    int row, col;

    ASSERTMSG( (src != 0), MTX_SCALEAPPLY_1 );
    ASSERTMSG( (dst != 0), MTX_SCALEAPPLY_2 );

    rowScale[0] = xS;
    rowScale[1] = yS;
    rowScale[2] = zS;

    for ( row = 0; row < 3; row++ )
    {
        for ( col = 0; col < 4; col++ )
        {
            dst[row][col] = src[row][col] * rowScale[row];
        }
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/

// Paired-single counterpart of C_MTXScaleApply: each row of src is loaded as
// two pairs, multiplied by that row's scale factor and stored to dst.
// No pointer checks.  The load / multiply / store order follows the
// instruction order recorded in the assembly this routine was ported from.
void PSMTXScaleApply ( MTX_CONST Mtx src, Mtx dst, f32 xS, f32 yS, f32 zS )
{
    // scale factors duplicated into both lanes so a plain pairwise multiply
    // scales both elements of a pair
    f32x2 scaleX = {xS, xS};
    f32x2 scaleY = {yS, yS};
    f32x2 scaleZ = {zS, zS};

    f32x2 row0Lo, row0Hi;           // [m00][m01], [m02][m03]
    f32x2 row1Lo, row1Hi;           // [m10][m11], [m12][m13]
    f32x2 row2Lo, row2Hi;           // [m20][m21], [m22][m23]

    row0Lo = __PSQ_LX(src, 0, 0, 0);
    row0Hi = __PSQ_LX(src, 8, 0, 0);
    row0Lo = __PS_MUL(row0Lo, scaleX);

    row1Lo = __PSQ_LX(src, 16, 0, 0);
    row0Hi = __PS_MUL(row0Hi, scaleX);

    row1Hi = __PSQ_LX(src, 24, 0, 0);
    row1Lo = __PS_MUL(row1Lo, scaleY);

    row2Lo = __PSQ_LX(src, 32, 0, 0);
    __PSQ_STX(dst, 0, row0Lo, 0, 0);
    row1Hi = __PS_MUL(row1Hi, scaleY);

    row2Hi = __PSQ_LX(src, 40, 0, 0);
    __PSQ_STX(dst, 8, row0Hi, 0, 0);
    row2Lo = __PS_MUL(row2Lo, scaleZ);

    __PSQ_STX(dst, 16, row1Lo, 0, 0);
    row2Hi = __PS_MUL(row2Hi, scaleZ);

    __PSQ_STX(dst, 24, row1Hi, 0, 0);
    __PSQ_STX(dst, 32, row2Lo, 0, 0);
    __PSQ_STX(dst, 40, row2Hi, 0, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXReflect

Description:    sets a reflection matrix with respect to a plane.

Arguments:      m        matrix to be set

                p        point on the planar reflector.

                n       normal of the planar reflector.

Return:         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Sets m to the reflection about the plane that passes through point p and
// has normal n.  The 3x3 part is I - 2*n*n^T and the fourth column is
// 2*(p . n)*n.  n is used as given (it is not normalized here) and none of
// the pointers is checked.
void C_MTXReflect ( Mtx m, const Vec *p, const Vec *n )
{
    f32 offXY, offXZ, offYZ;        // shared off-diagonal terms
    f32 twoPDotN;                   // 2 * (p . n)

    offXY    = -2.0f * n->x * n->y;
    offXZ    = -2.0f * n->x * n->z;
    offYZ    = -2.0f * n->y * n->z;
    twoPDotN = 2.0f * C_VECDotProduct(p, n);

    // row 0
    m[0][0] = 1.0f - 2.0f * n->x * n->x;
    m[0][1] = offXY;
    m[0][2] = offXZ;
    m[0][3] = twoPDotN * n->x;

    // row 1
    m[1][0] = offXY;
    m[1][1] = 1.0f - 2.0f * n->y * n->y;
    m[1][2] = offYZ;
    m[1][3] = twoPDotN * n->y;

    // row 2
    m[2][0] = offXZ;
    m[2][1] = offYZ;
    m[2][2] = 1.0f - 2.0f * n->z * n->z;
    m[2][3] = twoPDotN * n->z;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*/
// Paired-single implementation of MTXReflect.
//
// Writes into m the reflection matrix for the plane through point p with
// normal n, using the same element formulas as C_MTXReflect:
//   3x3 part      : I - 2*n*n^T
//   fourth column : 2*(p . n) * n
// n is used as given (it is not normalized here) and none of the pointers
// is checked.  Lane contents after each step are noted as [lane0][lane1];
// arithmetic and stores are interleaved.
void PSMTXReflect ( Mtx m, const Vec *p, const Vec *n )
{
    f32x2    vn_xy, vn_z1, n2vn_xy, n2vn_z1, pdotn;
    f32x2    tmp0, tmp1, tmp2, tmp3;
    f32x2    tmp4, tmp5, tmp6, tmp7;

    // vn_z1 = [nz][1.0F] : LOAD (single-element load; lane 1 reads as 1.0)
    //vn_z1[0] = n->z;
    //vn_z1[1] = 1.0F;
    vn_z1 = __PSQ_LX(n, 8, 1, 0);

    // vn_xy = [nx][ny]   : LOAD
    //vn_xy[0] = n->x;
    //vn_xy[1] = n->y;
    vn_xy = __PSQ_LX(n, 0, 0, 0);

    // tmp0 = [px][py]   : LOAD
    //tmp0[0] = p->x;
    //tmp0[1] = p->y;
    tmp0 = __PSQ_LX(p, 0, 0, 0);

    // n2vn_z1 = [-2nz][-2.0F]   ( -(v*1 + v) )
    n2vn_z1 = __PS_NMADD(vn_z1, c11, vn_z1);

    // tmp1 = [pz][1.0F] : LOAD
    //psq_l       tmp1,  8(p), 1, 0
    //tmp1[0] = p->z;
    //tmp1[1] = 1.0F;
    tmp1 = __PSQ_LX(p, 8, 1, 0);

    // n2vn_xy = [-2nx][-2ny]
    n2vn_xy = __PS_NMADD(vn_xy, c11, vn_xy);

    // tmp4 = [-2nx*nz][-2ny*nz]   : [m20][m21]
    tmp4 = __PS_MULS0(vn_xy, n2vn_z1);

    // pdotn = [-2(px*nx)][-2(py*ny)]
    pdotn = __PS_MUL(n2vn_xy, tmp0);

    // tmp2 = [-2nx*nx][-2nx*ny]
    tmp2 = __PS_MULS0(vn_xy, n2vn_xy);

    // pdotn = [-2(px*nx+py*ny)][?]
    pdotn = __PS_SUM0(pdotn, pdotn, pdotn);

    // tmp3 = [-2nx*ny][-2ny*ny]
    tmp3 = __PS_MULS1(vn_xy, n2vn_xy);

    // tmp4 = [m20][m21] : STORE
    //m[2][0] = tmp4[0];
    //m[2][1] = tmp4[1];
    __PSQ_STX(m, 32, tmp4, 0, 0);

    // tmp2 = [1-2nx*nx][-2nx*ny]  : [m00][m01]
    tmp2  = __PS_SUM0(tmp2, tmp2, c11);

    // pdotn = [2(px*nx+py*ny+pz*nz)][?]
    // (the negated multiply-add flips the sign of the accumulated -2(...) sum)
    pdotn  = __PS_NMADD(n2vn_z1, tmp1, pdotn);

    // tmp3 = [-2nx*ny][1-2ny*ny]  : [m10][m11]
    tmp3 = __PS_SUM1(c11, tmp3, tmp3);

    // tmp2 = [m00][m01] : STORE
    //m[0][0] = tmp2[0];
    //m[0][1] = tmp2[1];
    __PSQ_STX(m, 0, tmp2, 0, 0);

    // tmp5 = [pdotn*nx][pdotn*ny]
    tmp5 = __PS_MULS0(vn_xy, pdotn);

    // tmp6 = [-2nz][pdotn]
    tmp6 = __PS_MERGE00(n2vn_z1, pdotn);

    // tmp3 = [m10][m11] : STORE
    //m[1][0] = tmp3[0];
    //m[1][1] = tmp3[1];
    __PSQ_STX(m, 16, tmp3, 0, 0);

    // tmp7 = [-2nx*nz][pdotn*nx]  : [m02][m03]
    tmp7 = __PS_MERGE00(tmp4, tmp5);

    // tmp6 = [-2nz*nz][pdotn*nz]
    tmp6 = __PS_MULS0(tmp6, vn_z1);

    // tmp5 = [-2ny*nz][pdotn*ny]  : [m12][m13]
    tmp5 = __PS_MERGE11(tmp4, tmp5);

    // tmp7 = [m02][m03] : STORE
    //m[0][2] = tmp7[0];
    //m[0][3] = tmp7[1];
    __PSQ_STX(m, 8, tmp7, 0, 0);

    // tmp6 = [1-2nz*nz][pdotn*nz] : [m22][m23]
    tmp6 = __PS_SUM0(tmp6, tmp6, c11);

    // tmp5 = [m12][m13] : STORE
    //m[1][2] = tmp5[0];
    //m[1][3] = tmp5[1];
    __PSQ_STX(m, 24, tmp5, 0, 0);

    // tmp6 = [m22][m23] : STORE
    //m[2][2] = tmp6[0];
    //m[2][3] = tmp6[1];
    __PSQ_STX(m, 40, tmp6, 0, 0);
}
#endif


/*---------------------------------------------------------------------*

                             VIEW SECTION

*---------------------------------------------------------------------*/

/*---------------------------------------------------------------------*

Name:           MTXLookAt

Description:    compute a matrix to transform points to camera coordinates.

Arguments:      m        matrix to be set

                camPos   camera position.

                camUp    camera 'up' direction.

                target   camera aim point.

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Builds the world-to-camera matrix for a camera at camPos aimed at target,
// with camUp giving the approximate 'up' direction.  The rows of the 3x3
// part are the camera's right, up and backward axes; the fourth column is
// the negated projection of camPos onto each of those axes.
void C_MTXLookAt ( Mtx m, const Point3d *camPos, const Vec *camUp, const Point3d *target )
{
    Vec back;       // unit vector from target towards the camera
    Vec right;      // unit camera X axis
    Vec up;         // camera Y axis

    ASSERTMSG( (m != 0),      MTX_LOOKAT_1    );
    ASSERTMSG( (camPos != 0), MTX_LOOKAT_2    );
    ASSERTMSG( (camUp  != 0), MTX_LOOKAT_3    );
    ASSERTMSG( (target != 0), MTX_LOOKAT_4    );

    // The camera looks down its own -Z axis, so the Z basis vector points
    // from the target back to the camera.
    back.x = camPos->x - target->x;
    back.y = camPos->y - target->y;
    back.z = camPos->z - target->z;
    VECNormalize( &back, &back );

    // right = camUp x back, normalized
    VECCrossProduct( camUp, &back, &right );
    VECNormalize( &right, &right );

    // up = back x right; left un-normalized because back and right are
    // already unit length
    VECCrossProduct( &back, &right, &up );

    // row 0 : camera X axis
    m[0][0] = right.x;
    m[0][1] = right.y;
    m[0][2] = right.z;
    m[0][3] = -( camPos->x * right.x + camPos->y * right.y + camPos->z * right.z );

    // row 1 : camera Y axis
    m[1][0] = up.x;
    m[1][1] = up.y;
    m[1][2] = up.z;
    m[1][3] = -( camPos->x * up.x + camPos->y * up.y + camPos->z * up.z );

    // row 2 : camera Z axis
    m[2][0] = back.x;
    m[2][1] = back.y;
    m[2][2] = back.z;
    m[2][3] = -( camPos->x * back.x + camPos->y * back.y + camPos->z * back.z );
}

/*---------------------------------------------------------------------*


                       TEXTURE PROJECTION SECTION


*---------------------------------------------------------------------*/

/*---------------------------------------------------------------------*

Name:           MTXLightFrustum

Description:    Compute a 3x4 projection matrix for texture projection

Arguments:      m        3x4 matrix to be set

                t        top coord. of view volume at the near clipping plane

                b        bottom coord of view volume at the near clipping plane

                lf       left coord. of view volume at near clipping plane

                r        right coord. of view volume at near clipping plane

                n        positive distance from camera to near clipping plane

                scaleS   scale in the S direction for projected coordinates
                         (usually 0.5)

                scaleT   scale in the T direction for projected coordinates
                         (usually 0.5)

                transS   translate in the S direction for projected coordinates
                         (usually 0.5)

                transT   translate in the T direction for projected coordinates
                         (usually 0.5)

Return:         none.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version

    Row 0 holds the S terms and row 1 the T terms: the scale is
    multiplied into the frustum coefficients and the translation is
    subtracted in column 2.  Row 2 is the constant ( 0, 0, -1, 0 ).
 *---------------------------------------------------------------------*/
void C_MTXLightFrustum  ( Mtx m, float t, float b, float lf, float r, float n,
                          float scaleS, float scaleT, float transS,
                          float transT )
{
    f32 invWidth;
    f32 invHeight;

    ASSERTMSG( (m != 0),  MTX_LIGHT_FRUSTUM_1  );
    ASSERTMSG( (t != b),  MTX_LIGHT_FRUSTUM_2  );
    ASSERTMSG( (lf != r), MTX_LIGHT_FRUSTUM_3  );

    // reciprocals of the horizontal and vertical extents at the near plane
    invWidth  = 1.0f / (r - lf);
    invHeight = 1.0f / (t - b);

    // non-zero terms
    m[0][0] = ((2*n) * invWidth) * scaleS;
    m[0][2] = (((r + lf) * invWidth) * scaleS) - transS;
    m[1][1] = ((2*n) * invHeight) * scaleT;
    m[1][2] = (((t + b) * invHeight) * scaleT) - transT;
    m[2][2] = -1.0f;

    // every remaining element is zero
    m[0][1] = 0.0f;    m[0][3] = 0.0f;
    m[1][0] = 0.0f;    m[1][3] = 0.0f;
    m[2][0] = 0.0f;    m[2][1] = 0.0f;    m[2][3] = 0.0f;
}

/*---------------------------------------------------------------------*

Name:           MTXLightPerspective

Description:    compute a 3x4 perspective projection matrix from
                field of view and aspect ratio for texture projection.

Arguments:      m        3x4 matrix to be set

                fovy     total field of view in degrees in the YZ plane

                aspect   ratio of view window width:height (X / Y)

                scaleS   scale in the S direction for projected coordinates
                         (usually 0.5)

                scaleT   scale in the T direction for projected coordinates
                         (usually 0.5)

                transS   translate in the S direction for projected coordinates
                         (usually 0.5)

                transT   translate in the T direction for projected coordinates
                         (usually 0.5)

Return:         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version

    Derives cot( fovY / 2 ) and uses it as the Y term (times scaleT)
    and, divided by aspect, as the X term (times scaleS).  The
    translations are stored negated in column 2, and row 2 is the
    constant ( 0, 0, -1, 0 ).
 *---------------------------------------------------------------------*/
void C_MTXLightPerspective  ( Mtx m, f32 fovY, f32 aspect, float scaleS,
                              float scaleT, float transS, float transT )
{
    f32 halfFovDeg;
    f32 halfFovRad;
    f32 cotHalfFov;

    ASSERTMSG( (m != 0),                            MTX_LIGHT_PERSPECTIVE_1  );
    ASSERTMSG( ( (fovY > 0.0) && ( fovY < 180.0) ), MTX_LIGHT_PERSPECTIVE_2  );
    ASSERTMSG( (aspect != 0),                       MTX_LIGHT_PERSPECTIVE_3  );

    // half of the (YZ) field of view, first in degrees and then in radians
    halfFovDeg = fovY * 0.5f;
    halfFovRad = MTXDegToRad( halfFovDeg );

    cotHalfFov = 1.0f / tanf(halfFovRad);

    // non-zero terms
    m[0][0] = (cotHalfFov / aspect) * scaleS;
    m[0][2] = -transS;
    m[1][1] = cotHalfFov * scaleT;
    m[1][2] = -transT;
    m[2][2] = -1.0f;

    // every remaining element is zero
    m[0][1] = 0.0f;    m[0][3] = 0.0f;
    m[1][0] = 0.0f;    m[1][3] = 0.0f;
    m[2][0] = 0.0f;    m[2][1] = 0.0f;    m[2][3] = 0.0f;
}

/*---------------------------------------------------------------------*

Name:           MTXLightOrtho

Description:    compute a 3x4 orthographic projection matrix for
                texture projection.

Arguments:      m        matrix to be set

                t        top coord. of parallel view volume

                b        bottom coord of parallel view volume

                lf       left coord. of parallel view volume

                r        right coord. of parallel view volume

                scaleS   scale in the S direction for projected coordinates
                         (usually 0.5)

                scaleT   scale in the T direction for projected coordinates
                         (usually 0.5)

                transS   translate in the S direction for projected coordinates
                         (usually 0.5)

                transT   translate in the T direction for projected coordinates
                         (usually 0.5)

Return:         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version

    The S and T scales are multiplied into the diagonal terms of
    rows 0 and 1; column 3 of those rows holds the scaled centering
    offset plus the translation.  Row 2 is the constant ( 0, 0, 0, 1 ).
 *---------------------------------------------------------------------*/
void C_MTXLightOrtho ( Mtx m, f32 t, f32 b, f32 lf, f32 r, float scaleS,
                              float scaleT, float transS, float transT )
{
    f32 invWidth;
    f32 invHeight;

    ASSERTMSG( (m != 0),  MTX_LIGHT_ORTHO_1     );
    ASSERTMSG( (t != b),  MTX_LIGHT_ORTHO_2     );
    ASSERTMSG( (lf != r), MTX_LIGHT_ORTHO_3     );

    // reciprocals of the horizontal and vertical extents of the view volume
    invWidth  = 1.0f / (r - lf);
    invHeight = 1.0f / (t - b);

    // non-zero terms
    m[0][0] = (2.0f * invWidth * scaleS);
    m[0][3] = ((-(r + lf) * invWidth) * scaleS) + transS;
    m[1][1] = (2.0f * invHeight) * scaleT;
    m[1][3] = ((-(t + b) * invHeight)* scaleT) + transT;
    m[2][3] = 1.0f;

    // every remaining element is zero
    m[0][1] = 0.0f;    m[0][2] = 0.0f;
    m[1][0] = 0.0f;    m[1][2] = 0.0f;
    m[2][0] = 0.0f;    m[2][1] = 0.0f;    m[2][2] = 0.0f;
}

/*---------------------------------------------------------------------*

Name:           MTXReorder

Description:    Creates a reordered (column-major) matrix from a
                row-major matrix, using paired single operations.
                Reordered matrices are required for the MTXRO*
                functions, which operate faster than their non-reordered
                counterparts.

Arguments:      src      source matrix.
                dest     destination matrix, note type is ROMtx.

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version

    Transposes the 3x4 source into the 4x3 destination:
    dst[col][row] = src[row][col].  Elements are copied one
    destination row at a time.
 *---------------------------------------------------------------------*/
void C_MTXReorder(MTX_CONST Mtx src, ROMtx dst)
{
    int row;
    int col;

    for ( col = 0; col < 4; col++ )
    {
        for ( row = 0; row < 3; row++ )
        {
            dst[col][row] = src[row][col];
        }
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version

    Reads src as six f32x2 pairs (offsets 0, 8, 16, 24, 32, 40),
    recombines them with the __PS_MERGE* intrinsics, and writes six
    pairs to dest at the same set of offsets.

    Naming: Sab_Scd / Dab_Dcd list the two src elements ([row][col])
    a pair is meant to hold; the D pairs are named in the order they
    are stored to dest.

    NOTE(review): presumably __PS_MERGExy(a, b) yields { a[x], b[y] },
    which is consistent with the variable names -- confirm against the
    intrinsics reference.

    NOTE(review): loads, merges and stores are interleaved in the same
    order as the assembly listing kept in the // comments, presumably
    for instruction scheduling -- keep the order unless verified.
 *---------------------------------------------------------------------*/
void PSMTXReorder(MTX_CONST Mtx src, register ROMtx dest)
{
    // source pairs, two consecutive elements of one src row each
    f32x2 S00_S01, S02_S03, S10_S11, S12_S13, S20_S21, S22_S23;
    // destination pairs, in reordered layout
    f32x2 D00_D10, D11_D21, D02_D12, D22_D03, D13_D23, D20_D01;

    //psq_l       S00_S01, 0(src),  0, 0
    S00_S01 = __PSQ_L(src, 0, 0);

    //psq_l       S10_S11, 16(src), 0, 0
    S10_S11 = __PSQ_LX(src, 16, 0, 0);

    //psq_l       S20_S21, 32(src), 0, 0
    S20_S21 = __PSQ_LX(src, 32, 0, 0);

    //psq_l       S02_S03, 8(src),  0, 0
    S02_S03 = __PSQ_LX(src, 8, 0, 0);

    //ps_merge00  D00_D10, S00_S01, S10_S11
    D00_D10 = __PS_MERGE00(S00_S01, S10_S11);

    //psq_l       S12_S13, 24(src), 0, 0
    S12_S13 = __PSQ_LX(src, 24, 0, 0);

    //ps_merge01  D20_D01, S20_S21, S00_S01
    D20_D01 = __PS_MERGE01(S20_S21, S00_S01);

    //psq_l       S22_S23, 40(src), 0, 0
    S22_S23 = __PSQ_LX(src, 40, 0, 0);

    //ps_merge11  D11_D21, S10_S11, S20_S21
    D11_D21 = __PS_MERGE11(S10_S11, S20_S21);

    //psq_st      D00_D10, 0(dest), 0, 0
    __PSQ_ST(dest, D00_D10, 0, 0);

    //ps_merge00  D02_D12, S02_S03, S12_S13
    D02_D12 = __PS_MERGE00(S02_S03, S12_S13);

    //psq_st      D20_D01, 8(dest), 0, 0
    __PSQ_STX(dest, 8, D20_D01, 0, 0);

    //ps_merge01  D22_D03, S22_S23, S02_S03
    D22_D03 = __PS_MERGE01(S22_S23, S02_S03);

    //psq_st      D11_D21, 16(dest),0, 0
    __PSQ_STX(dest, 16, D11_D21, 0, 0);

    //ps_merge11  D13_D23, S12_S13, S22_S23
    D13_D23 = __PS_MERGE11(S12_S13, S22_S23);

    //psq_st      D02_D12, 24(dest),0, 0
    __PSQ_STX(dest, 24, D02_D12, 0, 0);

    //psq_st      D22_D03, 32(dest),0,0
    __PSQ_STX(dest, 32, D22_D03, 0, 0);

    //psq_st      D13_D23, 40(dest),0,0
    __PSQ_STX(dest, 40, D13_D23, 0, 0);
}

/*===========================================================================*/


extern void _ASM_MTXRotAxisRadInternal(Mtx m, const Vec *axis, f32 sT, f32 cT);

// Evaluates sinf/cosf of rad here and hands both values, together with
// m and axis, to _ASM_MTXRotAxisRadInternal (as its sT / cT arguments).
void ASM_MTXRotAxisRad(Mtx        m,
                       const Vec *axis,
                       f32        rad )
{
    const f32 sinOfAngle = sinf(rad);
    const f32 cosOfAngle = cosf(rad);

    _ASM_MTXRotAxisRadInternal(m, axis, sinOfAngle, cosOfAngle);
}

// Evaluates sinf/cosf of rad here and forwards m, axis and both values
// to ASM_MTXRotTrig (sine first, cosine second).
void ASM_MTXRotRad ( Mtx m, char axis, f32 rad )
{
    const f32 sinOfAngle = sinf(rad);
    const f32 cosOfAngle = cosf(rad);

    ASM_MTXRotTrig( m, axis, sinOfAngle, cosOfAngle );
}

// Writes to r the result of ASM_QUATMultiply applied to the inverse of q
// (first operand) and p (second operand).
void ASM_QUATDivide( const Quaternion *p, const Quaternion *q, Quaternion *r)
{
    Quaternion qInverse;    // scratch: receives the output of ASM_QUATInverse(q)

    ASM_QUATInverse(q, &qInverse);
    ASM_QUATMultiply(&qInverse, p, r);
}
#endif