mtx/src/mtx.c

/*---------------------------------------------------------------------------*
  Project: matrix vector Library
  File:    mtx.c

  Copyright 1998-2006 Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.  They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.


  $Log: mtx.c,v $
  Revision 1.3  02/20/2006 04:25:42  mitu
  changed include path from dolphin/ to revolution/.

  Revision 1.2  12/30/2005 06:01:51  hirose
  Temporary workaround for CW3.0 Alpha3 problem.


    30    02/06/26 13:49 Hirose
    Fixed a register allocation problem with DEBUG build.

    29    02/06/20 11:21 Hirose
    Added MTXConcatArray.

    28    02/04/11 13:10 Hirose
    const type specifier support. (worked by Hiratsu@IRD)

    27    02/02/18 9:27 Hirose
    Workaround for floating-point precision mismatch issue.

    26    9/13/01 4:28p Hirose
    Removed unnecessary possibilities of zero division exception.

    25    8/28/01 3:48p Hirose
    Removed one refinement process in the calculation of square root.

    24    7/30/01 10:21p Hirose
    Changes for function definitions.

    23    7/23/01 8:45p Hirose
    Now all model matrix functions and matrix-vector functions have PS
    version.

    22    7/07/01 7:37p Hirose
    added PS versions of MTXTrans* / MTXScale* functions.

    21    3/29/01 8:23p Hirose
    fixed assertion messages

    20    3/29/01 3:06p Hirose
    added MTXInvXpose

    19    3/16/01 11:43p Hirose
    added PSMTXInverse

    18    2/22/01 11:58p Hirose
    Moved some sections into other files.
    Added some new Paired-Single functions. Etc.

    17    7/31/00 2:07p Carl
    Removed redundant verify in MTXRotRad.
    Minor potential optimization in VECNormalize.

    16    7/21/00 4:19p Carl
    Removed as many divides as possible.

    15    7/21/00 2:58p Carl
    Removed unnecessary VECNormalize in MTXLookAt.

    14    7/12/00 4:41p John
    Substitutes MTXConcat and MTXMultVecArray with their paired-singles
    equivalent for Gekko non-debug builds.

    13    7/07/00 7:09p Dante
    PC Compatibility

    12    5/10/00 1:50p Hirose
    added radian-based rotation matrix functions
    moved paired-single matrix stuff into an another source file

    11    4/13/00 11:26a Danm
    Restore high performance version of ROMulVecArray.

    10    4/10/00 11:56a Danm
    Added 2 matrix skinning support.

    9     3/27/00 6:38p Carl
    Fixed other comments.

    8     3/27/00 6:20p Carl
    Changed comment regarding z scale in MTXFrustum.

    7     3/22/00 2:01p John
    Added VECSquareDistance and VECDistance.

    6     2/14/00 2:04p Mikepc
    de-tabbed code.

    5     2/11/00 4:22p Tian
    Cleaned up reordered paired single code

    4     1/27/00 5:21p Tian
    Documented paired single ops.

    3     1/27/00 4:56p Tian
    Implemented fast paired single matrix ops : PSMTXReorder, PSMTXConcat,
    PSMTXROMultVecArray, PSMTXMultVecArray, PSMTXMultS16VecArray,
    PSMTXROMultS16VecArray

    2     12/06/99 12:40p Alligator
    modify proj mtx to scale/translate z to (0, w) range for hardware

    20    11/17/99 4:32p Ryan
    update to fin win32 compiler warnings

    19    11/12/99 12:08p Mikepc
    changed #include<dolphin/mtx/mtxAssert.h> to #include"mtxAssert.h" to
    reflect new location of mtxAssert.h file.

    18    11/10/99 7:21p Hirose
    added singular case check for VECHalfAngle

    17    11/10/99 4:40p Alligator
    added MTXReflect

    16    10/06/99 12:28p Yasu
    Appended MTXMultVecSR and MTXMultVecArraySR

    15    8/31/99 5:41p Shiki
    Revised to use sqrtf() instead of sqrt().

    14      8/31/99 5:18p Shiki
    Revised to use sinf()/cosf() instead of sin()/cos().

    13      7/28/99 3:38p Ryan

    12      7/28/99 11:30a Ryan
    Added Texture Projection functions

    11      7/02/99 12:53p Mikepc
    changed mtx row/col

  $NoKeywords: $
 *---------------------------------------------------------------------------*/
#ifdef WIN32
#include <win32/win32.h>
#endif

#include <math.h>
#include <revolution/mtx.h>
#include "mtxAssert.h"

#if ( __MWERKS__ == 0x00004100 )
#pragma defer_codegen on    // This will be removed when compiler is fixed.
#endif

/*---------------------------------------------------------------------*
    Constants
 *---------------------------------------------------------------------*/
static f32 Unit01[] = { 0.0f, 1.0f };


/*---------------------------------------------------------------------*


                            GENERAL   SECTION


*---------------------------------------------------------------------*/


/*---------------------------------------------------------------------*

Name:           MTXIdentity

Description:    sets a matrix to identity

Arguments:      m :  matrix to be set

Return   :         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXIdentity ( Mtx m )
{

    ASSERTMSG( (m != 0), MTX_IDENTITY_1 );


    m[0][0] = 1.0f;     m[0][1] = 0.0f;  m[0][2] = 0.0f;  m[0][3] = 0.0f;

    m[1][0] = 0.0f;     m[1][1] = 1.0f;  m[1][2] = 0.0f;  m[1][3] = 0.0f;

    m[2][0] = 0.0f;     m[2][1] = 0.0f;  m[2][2] = 1.0f;  m[2][3] = 0.0f;

}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
            Actually there is not much performance advantage.
 *---------------------------------------------------------------------*/
#ifdef GEKKO
void PSMTXIdentity
(
    register Mtx m    // r3
)
{
    register f32 c_zero = 0.0F; // { 0.0F, 0.0F };
    register f32 c_one  = 1.0F; // { 1.0.0F, 1.0.0F };
    register f32 c_01;
    register f32 c_10;

    asm
    {
        psq_st      c_zero, 8(m),   0, 0     // m[0][2], m[0][3]
        ps_merge01  c_01, c_zero, c_one      // { 0.1F, 1.0F }
        psq_st      c_zero, 24(m),  0, 0     // m[1][2], m[1][3]
        ps_merge10  c_10, c_one, c_zero      // fp2 = { 1.0F, 0.0F }
        psq_st      c_zero, 32(m),  0, 0     // m[2][0], m[2][1]
        psq_st      c_01,   16(m),  0, 0     // m[1][0], m[1][1]
        psq_st      c_10,   0(m),   0, 0     // m[0][0], m[0][1]
        psq_st      c_10,   40(m),  0, 0     // m[2][2], m[2][3]
    }
}
#endif // GEKKO

/*---------------------------------------------------------------------*

Name:           MTXCopy

Description:    copies the contents of one matrix into another

Arguments:      src        source matrix for copy
                dst        destination matrix for copy


Return   :         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXCopy ( const Mtx src, Mtx dst )
{

    ASSERTMSG( (src != 0) , MTX_COPY_1 );
    ASSERTMSG( (dst != 0) , MTX_COPY_2 );

    if( src == dst )
    {
        return;
    }


    dst[0][0] = src[0][0];    dst[0][1] = src[0][1];    dst[0][2] = src[0][2];    dst[0][3] = src[0][3];

    dst[1][0] = src[1][0];    dst[1][1] = src[1][1];    dst[1][2] = src[1][2];    dst[1][3] = src[1][3];

    dst[2][0] = src[2][0];    dst[2][1] = src[2][1];    dst[2][2] = src[2][2];    dst[2][3] = src[2][3];

}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef GEKKO
asm void PSMTXCopy
(
    const register Mtx src,   // r3
    register Mtx dst    // r4
)
{
    nofralloc

    psq_l       fp0, 0(src),   0, 0
    psq_st      fp0, 0(dst),   0, 0
    psq_l       fp1, 8(src),   0, 0
    psq_st      fp1, 8(dst),   0, 0
    psq_l       fp2, 16(src),  0, 0
    psq_st      fp2, 16(dst),  0, 0
    psq_l       fp3, 24(src),  0, 0
    psq_st      fp3, 24(dst),  0, 0
    psq_l       fp4, 32(src),  0, 0
    psq_st      fp4, 32(dst),  0, 0
    psq_l       fp5, 40(src),  0, 0
    psq_st      fp5, 40(dst),  0, 0

    blr
}
#endif // GEKKO

/*---------------------------------------------------------------------*

Name:           MTXConcat

Description:    concatenates two matrices.
                order of operation is A x B = AB.
                ok for any of ab == a == b.

                saves a MTXCopy operation if ab != to a or b.

Arguments:      a        first matrix for concat.
                b        second matrix for concat.
                ab       resultant matrix from concat.

Return   :         none

 *---------------------------------------------------------------------*/

/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXConcat ( const Mtx a, const Mtx b, Mtx ab )
{
    Mtx mTmp;
    MtxPtr m;

    ASSERTMSG( (a  != 0), MTX_CONCAT_1 );
    ASSERTMSG( (b  != 0), MTX_CONCAT_2 );
    ASSERTMSG( (ab != 0), MTX_CONCAT_3 );

    if( (ab == a) || (ab == b) )
    {
        m = mTmp;
    }

    else
    {
        m = ab;
    }

    // compute (a x b) -> m

    m[0][0] = a[0][0]*b[0][0] + a[0][1]*b[1][0] + a[0][2]*b[2][0];
    m[0][1] = a[0][0]*b[0][1] + a[0][1]*b[1][1] + a[0][2]*b[2][1];
    m[0][2] = a[0][0]*b[0][2] + a[0][1]*b[1][2] + a[0][2]*b[2][2];
    m[0][3] = a[0][0]*b[0][3] + a[0][1]*b[1][3] + a[0][2]*b[2][3] + a[0][3];

    m[1][0] = a[1][0]*b[0][0] + a[1][1]*b[1][0] + a[1][2]*b[2][0];
    m[1][1] = a[1][0]*b[0][1] + a[1][1]*b[1][1] + a[1][2]*b[2][1];
    m[1][2] = a[1][0]*b[0][2] + a[1][1]*b[1][2] + a[1][2]*b[2][2];
    m[1][3] = a[1][0]*b[0][3] + a[1][1]*b[1][3] + a[1][2]*b[2][3] + a[1][3];

    m[2][0] = a[2][0]*b[0][0] + a[2][1]*b[1][0] + a[2][2]*b[2][0];
    m[2][1] = a[2][0]*b[0][1] + a[2][1]*b[1][1] + a[2][2]*b[2][1];
    m[2][2] = a[2][0]*b[0][2] + a[2][1]*b[1][2] + a[2][2]*b[2][2];
    m[2][3] = a[2][0]*b[0][3] + a[2][1]*b[1][3] + a[2][2]*b[2][3] + a[2][3];


    // overwrite a or b if needed
    if(m == mTmp)
    {
        C_MTXCopy( mTmp, ab );
    }
}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef GEKKO
asm void PSMTXConcat
(
    const register Mtx mA,    // r3
    const register Mtx mB,    // r4
          register Mtx mAB    // r5
)
{
    nofralloc

#define A00_A01 fp0
#define A02_A03 fp1
#define A10_A11 fp2
#define A12_A13 fp3
#define A20_A21 fp4
#define A22_A23 fp5

#define B00_B01 fp6
#define B02_B03 fp7
#define B10_B11 fp8
#define B12_B13 fp9
#define B20_B21 fp10
#define B22_B23 fp11

#define D00_D01 fp12
#define D02_D03 fp13
#define D10_D11 fp14
#define D12_D13 fp15
#define D20_D21 fp2
#define D22_D23 fp0

#define UNIT01  fp31

    // don't save LR since we don't make any function calls
    //    mflr    r0
    //    stw     r0, 4(r1)
    stwu    r1, -64(r1)
    psq_l   A00_A01, 0(mA), 0, 0
    stfd    fp14, 8(r1)
    psq_l   B00_B01, 0(mB), 0, 0
    addis   r6, 0, Unit01@ha
    psq_l   B02_B03, 8(mB), 0, 0
    stfd    fp15, 16(r1)
    addi    r6, r6, Unit01@l
    stfd    fp31, 40(r1)
    psq_l   B10_B11, 16(mB), 0, 0
    // D00_D01 = b00a00 , b01a00
    ps_muls0 D00_D01, B00_B01, A00_A01
    psq_l   A10_A11, 16(mA), 0, 0
    // D02_D03 = b02a00 , b03a00
    ps_muls0 D02_D03, B02_B03, A00_A01
    psq_l   UNIT01, 0(r6), 0, 0
    // D10_D11 = a10b00 , a10b01
    ps_muls0 D10_D11, B00_B01, A10_A11
    psq_l   B12_B13, 24(mB), 0, 0
    // D12_D13 = a10b02 , a10b03
    ps_muls0 D12_D13, B02_B03, A10_A11
    psq_l   A02_A03, 8(mA), 0, 0
        // fp12 = b10a01 + b00a00 , b11a01 + b01a00
        ps_madds1 D00_D01, B10_B11, A00_A01, D00_D01
    psq_l   A12_A13, 24(mA), 0, 0
        // D10_D11 = a10b00 + a11b10 , a10b01 + a11b11
        ps_madds1 D10_D11, B10_B11, A10_A11, D10_D11
    psq_l   B20_B21, 32(mB), 0, 0
        // D02_D03 = b12a01 + b02a00 , b13a01 + b03a00
        ps_madds1 D02_D03, B12_B13, A00_A01, D02_D03  // YYY LAST TIME FP0 IS USED
    psq_l   B22_B23, 40(mB), 0, 0
        // D12_D13 = a10b02 + a11b12, a10b03+a11b13
        ps_madds1 D12_D13, B12_B13, A10_A11, D12_D13 // YYY LAST TIME FP2 IS USED
    psq_l   A20_A21, 32(mA), 0, 0
    psq_l   A22_A23, 40(mA), 0, 0
            // D00_D01 = b20a02 + b10a01 + b00a00 , b21a02 + b11a01 + b01a00
            ps_madds0 D00_D01, B20_B21, A02_A03, D00_D01 // m00, m01 computed
            // D02_D03 = b12a01 + b02a00 + b22a02 , b13a01 + b03a00 + b23a02
            ps_madds0 D02_D03, B22_B23, A02_A03, D02_D03
            // D10_D11 = a10b00 + a11b10 +a12b20, a10b01 + a11b11 + a12b21
            ps_madds0 D10_D11, B20_B21, A12_A13, D10_D11 // m10, m11 computed
            // D12_D13 = a10b02 + a11b12 + a12b22, a10b03+a11b13 + a12b23 + a13
            ps_madds0 D12_D13, B22_B23, A12_A13, D12_D13


    // store m00m01
    psq_st  D00_D01, 0(mAB), 0, 0 // YYY LAST TIME FP12 IS USED

    // D20_D21 = a20b00, a20b01
    ps_muls0 D20_D21, B00_B01, A20_A21 // YYY LAST TIME FP6 IS USED
                // get a03 from fp1 and add to D02_D03
                ps_madds1 D02_D03, UNIT01, A02_A03, D02_D03 // m02, m03 computed
                // YYY LAST TIME FP1 IS USED
    // D22_D23 = a20b02, a20b03
    ps_muls0 D22_D23, B02_B03, A20_A21 // YYY LAST TIME FP7 IS USED
    // store m10m11
    psq_st  D10_D11, 16(mAB), 0, 0
                // get a13 from fp3 and add to D12_D13
                ps_madds1 D12_D13, UNIT01, A12_A13, D12_D13 // m12, m13 computed
    // store m02m03
    psq_st  D02_D03, 8(mAB), 0, 0 // YYY LAST TIME D02_D03 IS USED

        // D20_D21 = a20b00 + a21b10, a20b01 + a21b11
        ps_madds1 D20_D21, B10_B11, A20_A21, D20_D21 // YYY LAST TIME FP8 IS USED
        // D22_D23 = a20b02 + a21b12, a20b03 + a21b13
        ps_madds1 D22_D23, B12_B13, A20_A21, D22_D23
            // D20_D21 = a20b00 + a21b10 + a22b20, a20b01 + a21b11 + a22b21
            ps_madds0 D20_D21, B20_B21, A22_A23, D20_D21
    // Restore fp14
    lfd    fp14, 8(r1)  // D10_D11
    // store m12m13
    psq_st  D12_D13, 24(mAB), 0, 0
            // D22_D23 = a20b02 + a21b12 + a22b22, a20b03 + a21b13 + a22b23 + a23
            ps_madds0 D22_D23, B22_B23, A22_A23, D22_D23
    // store m20m21
    psq_st  D20_D21, 32(mAB), 0, 0
                // get a23 from fp5 and add to fp17
                ps_madds1 D22_D23, UNIT01, A22_A23, D22_D23
    // restore stack frame
    lfd    fp15, 16(r1) // D12_D13
    // store m22m23
    psq_st  D22_D23, 40(mAB), 0, 0

    lfd    fp31, 40(r1)
    addi   r1, r1, 64


    blr


#undef A00_A01
#undef A02_A03
#undef A10_A11
#undef A12_A13
#undef A20_A21
#undef A22_A23

#undef B00_B01
#undef B02_B03
#undef B10_B11
#undef B12_B13
#undef B20_B21
#undef B22_B23

#undef D00_D01
#undef D02_D03
#undef D10_D11
#undef D12_D13
#undef D20_D21
#undef D22_D23

#undef UNIT01


/*
  / m[0][0] = a[0][0]*b[0][0] + a[0][1]*b[1][0] + a[0][2]*b[2][0];
fp12m[0][1] = a[0][0]*b[0][1] + a[0][1]*b[1][1] + a[0][2]*b[2][1];
  / m[0][2] = a[0][0]*b[0][2] + a[0][1]*b[1][2] + a[0][2]*b[2][2];
fp13m[0][3] = a[0][0]*b[0][3] + a[0][1]*b[1][3] + a[0][2]*b[2][3] + a[0][3];

  / m[1][0] = a[1][0]*b[0][0] + a[1][1]*b[1][0] + a[1][2]*b[2][0];
fp14m[1][1] = a[1][0]*b[0][1] + a[1][1]*b[1][1] + a[1][2]*b[2][1];
  / m[1][2] = a[1][0]*b[0][2] + a[1][1]*b[1][2] + a[1][2]*b[2][2];
fp15m[1][3] = a[1][0]*b[0][3] + a[1][1]*b[1][3] + a[1][2]*b[2][3] + a[1][3];

  / m[2][0] = a[2][0]*b[0][0] + a[2][1]*b[1][0] + a[2][2]*b[2][0];
fp16m[2][1] = a[2][0]*b[0][1] + a[2][1]*b[1][1] + a[2][2]*b[2][1];
  / m[2][2] = a[2][0]*b[0][2] + a[2][1]*b[1][2] + a[2][2]*b[2][2];
fp17m[2][3] = a[2][0]*b[0][3] + a[2][1]*b[1][3] + a[2][2]*b[2][3] + a[2][3];
OPTIMIZATIONS :
  fp0 instead of fp17
  fp2 instead of fp16
    psq_l   fp0, 0(mA), 0, 0    // a[0][0], a[0][1]
    psq_l   fp1, 8(mA), 0, 0    // a[0][2], a[0][3]
    psq_l   fp2, 16(mA), 0, 0   // a[1][0], a[1][1]
    psq_l   fp3, 24(mA), 0, 0   // a[1][2], a[1][3]
    psq_l   fp4, 32(mA), 0, 0   // a[2][0], a[2][1]
    psq_l   fp5, 40(mA), 0, 0   // a[2][2], a[2][3]

    psq_l   fp6, 0(mB), 0, 0    // b[0][0], b[0][1]
    psq_l   fp7, 8(mB), 0, 0    // b[0][2], b[0][3]
    psq_l   fp8, 16(mB), 0, 0   // b[1][0], b[1][1]
    psq_l   fp9, 24(mB), 0, 0   // b[1][2], b[1][3]
    psq_l   fp10, 32(mB), 0, 0   // b[2][0], b[2][1]
    psq_l   fp11, 40(mB), 0, 0   // b[2][2], b[2][3]

*/
}
#endif // GEKKO


/*---------------------------------------------------------------------*

Name:           MTXConcatArray

Description:    concatenates a matrix to an array of matrices.
                order of operation is A x B(array) = AB(array).

Arguments:      a        first matrix for concat.
                srcBase  array base of second matrix for concat.
                dstBase  array base of resultant matrix from concat.
                count    number of matrices in srcBase, dstBase arrays.

                note:      cannot check for array overflow

Return   :         none

 *---------------------------------------------------------------------*/

/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXConcatArray ( const Mtx a, const Mtx* srcBase, Mtx* dstBase, u32 count )
{
    u32 i;

    ASSERTMSG( (a       != 0), "MTXConcatArray(): NULL MtxPtr 'a' " );
    ASSERTMSG( (srcBase != 0), "MTXConcatArray(): NULL MtxPtr 'srcBase' " );
    ASSERTMSG( (dstBase != 0), "MTXConcatArray(): NULL MtxPtr 'dstBase' " );
    ASSERTMSG( (count > 1),    "MTXConcatArray(): count must be greater than 1." );

    for ( i = 0 ; i < count ; i++ )
    {
        C_MTXConcat(a, *srcBase, *dstBase);

        srcBase++;
        dstBase++;
    }
}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef GEKKO

// This code can be compiled only with register allocation optimization.
#if ( defined(__MWERKS__) && defined(_DEBUG) )
#pragma global_optimizer    on
#pragma optimization_level  1
#endif

void PSMTXConcatArray (
    const register Mtx  a,
    const register Mtx* srcBase,
          register Mtx* dstBase,
          register u32  count )
{
    register f32    va0, va1, va2, va3, va4, va5;
    register f32    vb0, vb1, vb2, vb3, vb4, vb5;
    register f32    vd0, vd1, vd2, vd3, vd4, vd5;
    register f32    u01;
    register f32*   u01Ptr = Unit01;

    asm
    {
        // [a00][a01]
        psq_l       va0, 0(a), 0, 0
        // [a02][a03]
        psq_l       va1, 8(a), 0, 0
        // [a10][a11]
        psq_l       va2, 16(a), 0, 0
        // [a12][a13]
        psq_l       va3, 24(a), 0, 0
        // count--
        subi        count, count, 1
        // [a20][a21]
        psq_l       va4, 32(a), 0, 0
        // [a22][a23]
        psq_l       va5, 40(a), 0, 0
        // Loop count
        mtctr       count
        // [0][1]
        psq_l       u01, 0(u01Ptr), 0, 0

        //---------------------------------
        // [b00][b01]
        psq_l       vb0, 0(srcBase), 0, 0
        // [b10][b11]
        psq_l       vb2, 16(srcBase), 0, 0

        // [a00*b00][a00*b01]
        ps_muls0    vd0, vb0, va0
        // [a10*b00][a10*b01]
        ps_muls0    vd2, vb0, va2
        // [a20*b00][a20*b01]
        ps_muls0    vd4, vb0, va4

        // [b20][b21]
        psq_l       vb4, 32(srcBase), 0, 0

        // [a00*b00 + a01*b10][a00*b01 + a01*b11]
        ps_madds1   vd0, vb2, va0, vd0
        // [a10*b00 + a11*b10][a10*b01 + a11*b11]
        ps_madds1   vd2, vb2, va2, vd2
        // [a20*b00 + a21*b10][a20*b01 + a21*b11]
        ps_madds1   vd4, vb2, va4, vd4

        // [b02][b03]
        psq_l       vb1, 8(srcBase), 0, 0

        // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21]
        ps_madds0   vd0, vb4, va1, vd0
        // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21]
        ps_madds0   vd2, vb4, va3, vd2
        // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21]
        ps_madds0   vd4, vb4, va5, vd4

        // [b12][b13]
        psq_l       vb3, 24(srcBase), 0, 0
        // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21]
        psq_st      vd0, 0(dstBase), 0, 0

        // [a00*b02][a00*b03]
        ps_muls0    vd1, vb1, va0
        // [a10*b02][a10*b03]
        ps_muls0    vd3, vb1, va2
        // [a20*b02][a20*b03]
        ps_muls0    vd5, vb1, va4

        // [b22][b23]
        psq_l       vb5, 40(srcBase), 0, 0
        // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21]
        psq_st      vd2, 16(dstBase), 0, 0

        // [a00*b02 + a01*b12][a00*b03 + a01*b13]
        ps_madds1   vd1, vb3, va0, vd1
        // [a10*b02 + a11*b12][a10*b03 + a11*b13]
        ps_madds1   vd3, vb3, va2, vd3
        // [a20*b02 + a21*b12][a20*b03 + a21*b13]
        ps_madds1   vd5, vb3, va4, vd5

    _loop:

        // ++srcBase
        addi        srcBase, srcBase, sizeof(Mtx)

        // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23]
        ps_madds0   vd1, vb5, va1, vd1
        // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23]
        ps_madds0   vd3, vb5, va3, vd3
        // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23]
        ps_madds0   vd5, vb5, va5, vd5

            // [b00][b01]
            psq_l       vb0, 0(srcBase), 0, 0
        // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21]
        psq_st      vd4, 32(dstBase), 0, 0

        // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03]
        ps_madd     vd1, u01, va1, vd1
        // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13]
        ps_madd     vd3, u01, va3, vd3
        // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23]
        ps_madd     vd5, u01, va5, vd5

            // [b10][b11]
            psq_l       vb2, 16(srcBase), 0, 0
        // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03]
        psq_st      vd1, 8(dstBase), 0, 0

            // [a00*b00][a00*b01]
            ps_muls0    vd0, vb0, va0
            // [a10*b00][a10*b01]
            ps_muls0    vd2, vb0, va2
            // [a20*b00][a20*b01]
            ps_muls0    vd4, vb0, va4

            // [b20][b21]
            psq_l       vb4, 32(srcBase), 0, 0
        // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13]
        psq_st      vd3, 24(dstBase), 0, 0

            // [a00*b00 + a01*b10][a00*b01 + a01*b11]
            ps_madds1   vd0, vb2, va0, vd0
            // [a10*b00 + a11*b10][a10*b01 + a11*b11]
            ps_madds1   vd2, vb2, va2, vd2
            // [a20*b00 + a21*b10][a20*b01 + a21*b11]
            ps_madds1   vd4, vb2, va4, vd4

            // [b02][b03]
            psq_l       vb1, 8(srcBase), 0, 0
        // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23]
        psq_st      vd5, 40(dstBase), 0, 0
        // ++dstBase
        addi        dstBase, dstBase, sizeof(Mtx)

            // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21]
            ps_madds0   vd0, vb4, va1, vd0
            // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21]
            ps_madds0   vd2, vb4, va3, vd2
            // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21]
            ps_madds0   vd4, vb4, va5, vd4

            // [b12][b13]
            psq_l       vb3, 24(srcBase), 0, 0
            // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21]
            psq_st      vd0, 0(dstBase), 0, 0

            // [a00*b02][a00*b03]
            ps_muls0    vd1, vb1, va0
            // [a10*b02][a10*b03]
            ps_muls0    vd3, vb1, va2
            // [a20*b02][a20*b03]
            ps_muls0    vd5, vb1, va4

            // [b22][b23]
            psq_l       vb5, 40(srcBase), 0, 0
            // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21]
            psq_st      vd2, 16(dstBase), 0, 0

            // [a00*b02 + a01*b12][a00*b03 + a01*b13]
            ps_madds1   vd1, vb3, va0, vd1
            // [a10*b02 + a11*b12][a10*b03 + a11*b13]
            ps_madds1   vd3, vb3, va2, vd3
            // [a20*b02 + a21*b12][a20*b03 + a21*b13]
            ps_madds1   vd5, vb3, va4, vd5

        // LOOP
        bdnz        _loop

            // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21]
            psq_st      vd4, 32(dstBase), 0, 0

            // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23]
            ps_madds0   vd1, vb5, va1, vd1
            // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23]
            ps_madds0   vd3, vb5, va3, vd3
            // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23]
            ps_madds0   vd5, vb5, va5, vd5

            // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03]
            ps_madd     vd1, u01, va1, vd1
            // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13]
            ps_madd     vd3, u01, va3, vd3
            // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23]
            ps_madd     vd5, u01, va5, vd5

            // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03]
            psq_st      vd1, 8(dstBase), 0, 0
            // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13]
            psq_st      vd3, 24(dstBase), 0, 0
            // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23]
            psq_st      vd5, 40(dstBase), 0, 0
            //---------------------------------
    }
}

#if ( defined(__MWERKS__) && defined(_DEBUG) )
#pragma optimization_level  0
#pragma global_optimizer    reset
#endif

#endif

/*---------------------------------------------------------------------*

Name:           MTXTranspose

Description:    computes the transpose of a matrix.
                As matrices are 3x4, fourth column (translation component) is
                lost and becomes (0,0,0).

                This function is intended for use in computing an
                inverse-transpose matrix to transform normals for lighting.
                In this case, lost translation component doesn't matter.


Arguments:      src       source matrix.
                xPose     destination (transposed) matrix.
                          ok if src == xPose.


Return   :         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXTranspose ( const Mtx src, Mtx xPose )
{
    Mtx mTmp;
    MtxPtr m;


    ASSERTMSG( (src   != 0), MTX_TRANSPOSE_1  );
    ASSERTMSG( (xPose != 0), MTX_TRANSPOSE_2  );


    if(src == xPose)
    {
        m = mTmp;
    }
    else
    {
        m = xPose;
    }


    m[0][0] = src[0][0];   m[0][1] = src[1][0];      m[0][2] = src[2][0];     m[0][3] = 0.0f;
    m[1][0] = src[0][1];   m[1][1] = src[1][1];      m[1][2] = src[2][1];     m[1][3] = 0.0f;
    m[2][0] = src[0][2];   m[2][1] = src[1][2];      m[2][2] = src[2][2];     m[2][3] = 0.0f;


    // copy back if needed
    if( m == mTmp )
    {
        C_MTXCopy( mTmp, xPose );
    }
}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef GEKKO
void PSMTXTranspose ( const register Mtx src, register Mtx xPose )
{
    register f32 c_zero = 0.0F;
    register f32 row0a, row1a, row0b, row1b;
    register f32 trns0, trns1, trns2;

    asm
    {
        psq_l       row0a, 0(src),  0, 0    // [0][0], [0][1]
        stfs        c_zero, 44(xPose)       // 0 -> [2][3]
        psq_l       row1a, 16(src), 0, 0    // [1][0], [1][1]
        ps_merge00  trns0, row0a, row1a     // [0][0], [1][0]
        psq_l       row0b, 8(src),  1, 0    // [0][2], 1
        ps_merge11  trns1, row0a, row1a     // [0][1], [1][1]
        psq_l       row1b, 24(src), 1, 0    // [1][2], 1
        psq_st      trns0, 0(xPose),  0, 0  // [0][0], [1][0] -> [0][0], [0][1]
        psq_l       row0a, 32(src), 0, 0    // [2][0], [2][1]
        ps_merge00  trns2, row0b, row1b     // [0][2], [1][2]
        psq_st      trns1, 16(xPose), 0, 0  // [0][1], [1][1] -> [1][0], [1][1]
        ps_merge00  trns0, row0a, c_zero    // [2][0], 0
        psq_st      trns2, 32(xPose), 0, 0  // [0][2], [1][2] -> [2][0], [2][1]
        ps_merge10  trns1, row0a, c_zero    // [2][1], 0
        psq_st      trns0, 8(xPose),  0, 0  // [2][0], 0 -> [0][2], [0][3]
        lfs         row0b, 40(src)          // [2][2]
        psq_st      trns1, 24(xPose), 0, 0  // [2][1], 0 -> [1][2], [1][3]
        stfs        row0b, 40(xPose)        // [2][2] -> [2][2]
    }
}
#endif // GEKKO

/*---------------------------------------------------------------------*

Name:           MTXInverse

Description:    computes a fast inverse of a matrix.
                this algorithm works for matrices with a fourth row of
                (0,0,0,1).

                for a matrix
                M =  |     A      C      |  where A is the upper 3x3 submatrix,
                     |     0      1      |        C is a 1x3 column vector


                INV(M)     =    |  inv(A)      (inv(A))*(-C)    |
                                |     0               1         |


Arguments:      src       source matrix.
                inv       destination (inverse) matrix.
                          ok if src == inv.


Return   :         0 if src is not invertible.
                1 on success.

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
u32 C_MTXInverse ( const Mtx src, Mtx inv )
{
    Mtx mTmp;
    MtxPtr m;
    f32 det;

    ASSERTMSG( (src != 0), MTX_INVERSE_1 );
    ASSERTMSG( (inv != 0), MTX_INVERSE_2 );

    if( src == inv )
    {
        m = mTmp;
    }
    else
    {
        m = inv;
    }


    // compute the determinant of the upper 3x3 submatrix
    det =   src[0][0]*src[1][1]*src[2][2] + src[0][1]*src[1][2]*src[2][0] + src[0][2]*src[1][0]*src[2][1]
          - src[2][0]*src[1][1]*src[0][2] - src[1][0]*src[0][1]*src[2][2] - src[0][0]*src[2][1]*src[1][2];


    // check if matrix is singular
    if( det == 0.0f )
    {
        return 0;
    }


    // compute the inverse of the upper submatrix:

    // find the transposed matrix of cofactors of the upper submatrix
    // and multiply by (1/det)

    det = 1.0f / det;


    m[0][0] =  (src[1][1]*src[2][2] - src[2][1]*src[1][2]) * det;
    m[0][1] = -(src[0][1]*src[2][2] - src[2][1]*src[0][2]) * det;
    m[0][2] =  (src[0][1]*src[1][2] - src[1][1]*src[0][2]) * det;

    m[1][0] = -(src[1][0]*src[2][2] - src[2][0]*src[1][2]) * det;
    m[1][1] =  (src[0][0]*src[2][2] - src[2][0]*src[0][2]) * det;
    m[1][2] = -(src[0][0]*src[1][2] - src[1][0]*src[0][2]) * det;

    m[2][0] =  (src[1][0]*src[2][1] - src[2][0]*src[1][1]) * det;
    m[2][1] = -(src[0][0]*src[2][1] - src[2][0]*src[0][1]) * det;
    m[2][2] =  (src[0][0]*src[1][1] - src[1][0]*src[0][1]) * det;


    // compute (invA)*(-C)
    m[0][3] = -m[0][0]*src[0][3] - m[0][1]*src[1][3] - m[0][2]*src[2][3];
    m[1][3] = -m[1][0]*src[0][3] - m[1][1]*src[1][3] - m[1][2]*src[2][3];
    m[2][3] = -m[2][0]*src[0][3] - m[2][1]*src[1][3] - m[2][2]*src[2][3];

    // copy back if needed
    if( m == mTmp )
    {
        C_MTXCopy( mTmp,inv );
    }

    return 1;
}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
            Results may be a little bit different from the C version
            because it doesn't perform exactly same calculation.
 *---------------------------------------------------------------------*/
#ifdef GEKKO
asm u32 PSMTXInverse ( const register Mtx src, register Mtx inv )
{
    nofralloc

    // fp0 [ 00 ][ 1.0F ] : Load
    psq_l       fp0, 0(src), 1, 0
    // fp1 [ 01 ][ 02 ]   : Load
    psq_l       fp1, 4(src), 0, 0
    // fp2 [ 10 ][ 1.0F ] : Load
    psq_l       fp2, 16(src), 1, 0
        // fp6 [ 02 ][ 00 ]
        ps_merge10  fp6, fp1, fp0
    // fp3 [ 11 ][ 12 ]   : Load
    psq_l       fp3, 20(src), 0, 0
    // fp4 [ 20 ][ 1.0F ] : Load
    psq_l       fp4, 32(src), 1, 0
        // fp7 [ 12 ][ 10 ]
        ps_merge10  fp7, fp3, fp2
    // fp5 [ 21 ][ 22 ]   : Load
    psq_l       fp5, 36(src), 0, 0

    // fp11[ 11*02 ][ 00*12 ]
    ps_mul      fp11, fp3, fp6
    // fp13[ 21*12 ][ 10*22 ]
    ps_mul      fp13, fp5, fp7
        // fp8 [ 22 ][ 20 ]
        ps_merge10  fp8, fp5, fp4
    // fp11[ 01*12 - 11*02 ][ 10*02 - 00*12 ]  -> free(fp7[ 12 ][ 10 ])
    ps_msub     fp11, fp1, fp7, fp11
    // fp12[ 01*22 ][ 20*02 ]
    ps_mul      fp12, fp1, fp8
    // fp13[ 11*22 - 21*12 ][ 20*12 - 10*22 ]  -> free(fp8[ 22 ][ 20 ])
    ps_msub     fp13, fp3, fp8, fp13
        // fp10[ 20*11 ][ N/A ]
        ps_mul      fp10, fp3, fp4
    // fp12[ 21*02 - 01*22 ][ 00*22 - 20*02 ]  -> free(fp6[ 02 ][ 00 ])
    ps_msub     fp12, fp5, fp6, fp12
        // fp9 [ 00*21 ][ N/A ]
        ps_mul      fp9,  fp0, fp5
        // fp8 [ 10*01 ][ N/A ]
        ps_mul      fp8,  fp1, fp2

    // fp6 [ 0.0F ][ 0.0F ]
    ps_sub      fp6, fp6, fp6
        // fp10[ 10*21 - 20*11 ][ N/A ] -> free(fp5[ 21 ][ 22 ])
        ps_msub     fp10, fp2, fp5, fp10
    // fp7 [ 00*(11*22-21*12) ][ N/A ]
    ps_mul      fp7, fp0, fp13
        // fp9 [ 20*01 - 00*21 ][ N/A ]
        ps_msub     fp9,  fp1, fp4, fp9
    // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) ][ N/A ]
    ps_madd     fp7, fp2, fp12, fp7
        // fp8 [ 00*11 - 10*01 ][ N/A ]
        ps_msub     fp8,  fp0, fp3, fp8
    // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) + 20*(01*12-11*02) ][ N/A ] : det
    ps_madd     fp7, fp4, fp11, fp7

    // ( det == 0 ) ?
    ps_cmpo0    cr0, fp7, fp6
    bne         _regular

    // return value (singular)
    addi        r3, 0, 0
    blr

  _regular:

    // fp0 [ rdet = 1/det ][ N/A ]
    fres        fp0, fp7

    // Refinement : ( E = est. of 1/K ) -> ( E' = 2E - K*E*E )
    ps_add      fp6, fp0, fp0
    ps_mul      fp5, fp0, fp0
    ps_nmsub    fp0, fp7, fp5, fp6


    // fp1 [ 03 ][ 03 ] : Load
    lfs         fp1, 12(src)
        // fp13[ ( 11*22 - 21*12 ) * rdet ][ ( 20*12 - 10*22 ) * rdet ] : i[0][0], i[1][0]
        ps_muls0    fp13, fp13, fp0
    // fp2 [ 13 ][ 13 ] : Load
    lfs         fp2, 28(src)
        // fp12[ ( 21*02 - 01*22 ) * rdet ][ ( 00*22 - 20*02 ) * rdet ] : i[0][1], i[1][1]
        ps_muls0    fp12, fp12, fp0
    // fp3 [ 23 ][ 23 ]   : Load
    lfs         fp3, 44(src)
        // fp11[ ( 01*12 - 11*02 ) * rdet ][ ( 10*02 - 00*12 ) * rdet ] : i[0][2], i[1][2]
        ps_muls0    fp11, fp11, fp0
    // fp5 [ i00 ][ i01 ]
    ps_merge00  fp5, fp13, fp12
        // fp10[ ( 10*21 - 20*11 ) * rdet ] : i[2][0]
        ps_muls0    fp10, fp10, fp0
    // fp4 [ i10 ][ i11 ]
    ps_merge11  fp4, fp13, fp12
        // fp9 [ ( 20*01 - 00*21 ) * rdet ] : i[2][1]
        ps_muls0    fp9,  fp9,  fp0

    // [ i00 ][ i01 ] : Store fp5   -> free(fp5[ i00 ][ i01 ])
    psq_st      fp5,  0(inv), 0, 0
        // fp6 [ i00*03 ][ i10*03 ]
        ps_mul      fp6, fp13, fp1
    // [ i10 ][ i11 ] : Store fp4   -> free(fp4[ i10 ][ i11 ])
    psq_st      fp4,  16(inv), 0, 0
        // fp8 [ ( 00*11 - 10*01 ) * rdet ] : i[2][2]
        ps_muls0    fp8,  fp8,  fp0
        // fp6 [ i00*03+i01*13 ][ i10*03+i11*13 ]
        ps_madd     fp6, fp12, fp2, fp6
    // [ i20 ] : Store fp10
    psq_st      fp10, 32(inv), 1, 0
        // fp6 [ -i00*03-i01*13-i02*23 ][ -i10*03-i11*13-i12*23 ] : i[0][3], i[1][3]
        ps_nmadd    fp6, fp11, fp3, fp6
    // [ i21 ] : Store fp9
    psq_st      fp9,  36(inv), 1, 0
        // fp7 [ i20*03 ][ N/A ]
        ps_mul      fp7, fp10, fp1
        // fp5 [ i02 ][ i03 ]
        ps_merge00  fp5, fp11, fp6
    // [ i22 ] : Store fp8
    psq_st      fp8,  40(inv), 1, 0
        // fp4 [ i12 ][ i13 ]
        ps_merge11  fp4, fp11, fp6
    // [ i02 ][ i03 ] : Store fp5
    psq_st      fp5,  8(inv), 0, 0
        // fp7 [ i20*03+i21*13 ][ N/A ]
        ps_madd     fp7, fp9,  fp2, fp7
    // [ i12 ][ i13 ] : Store fp4
    psq_st      fp4,  24(inv), 0, 0
        // fp7 [ -i20*03-i21*13-i22*23 ][ N/A ] : i[2][3]
        ps_nmadd    fp7, fp8,  fp3, fp7
            // return value (regular)
            addi        r3, 0, 1
    // [ i23 ] : Store fp7
    psq_st      fp7,  44(inv), 1, 0

    blr
}
#endif // GEKKO

/*---------------------------------------------------------------------*

Name:           MTXInvXpose

Description:    computes a fast inverse-transpose of a matrix.
                this algorithm works for matrices with a fourth row of
                (0,0,0,1). Commonly used for calculating normal transform
                matrices.

                This function is equivalent to the combination of
                two functions MTXInverse + MTXTranspose.

Arguments:      src       source matrix.
                invx      destination (inverse-transpose) matrix.
                          ok if src == invx.

Return   :         0 if src is not invertible.
                1 on success.

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
u32 C_MTXInvXpose ( const Mtx src, Mtx invX )
{
    Mtx mTmp;
    MtxPtr m;
    f32 det;

    ASSERTMSG( (src != 0), MTX_INVXPOSE_1 );
    ASSERTMSG( (invX != 0), MTX_INVXPOSE_2 );

    if( src == invX )
    {
        m = mTmp;
    }
    else
    {
        m = invX;
    }

    // compute the determinant of the upper 3x3 submatrix
    det =   src[0][0]*src[1][1]*src[2][2] + src[0][1]*src[1][2]*src[2][0] + src[0][2]*src[1][0]*src[2][1]
          - src[2][0]*src[1][1]*src[0][2] - src[1][0]*src[0][1]*src[2][2] - src[0][0]*src[2][1]*src[1][2];

    // check if matrix is singular
    if( det == 0.0f )
    {
        return 0;
    }

    // compute the inverse-transpose of the upper submatrix:

    // find the transposed matrix of cofactors of the upper submatrix
    // and multiply by (1/det)

    det = 1.0f / det;

    m[0][0] =  (src[1][1]*src[2][2] - src[2][1]*src[1][2]) * det;
    m[0][1] = -(src[1][0]*src[2][2] - src[2][0]*src[1][2]) * det;
    m[0][2] =  (src[1][0]*src[2][1] - src[2][0]*src[1][1]) * det;

    m[1][0] = -(src[0][1]*src[2][2] - src[2][1]*src[0][2]) * det;
    m[1][1] =  (src[0][0]*src[2][2] - src[2][0]*src[0][2]) * det;
    m[1][2] = -(src[0][0]*src[2][1] - src[2][0]*src[0][1]) * det;

    m[2][0] =  (src[0][1]*src[1][2] - src[1][1]*src[0][2]) * det;
    m[2][1] = -(src[0][0]*src[1][2] - src[1][0]*src[0][2]) * det;
    m[2][2] =  (src[0][0]*src[1][1] - src[1][0]*src[0][1]) * det;


    // the fourth columns should be all zero
    m[0][3] = 0.0F;
    m[1][3] = 0.0F;
    m[2][3] = 0.0F;

    // copy back if needed
    if( m == mTmp )
    {
        C_MTXCopy( mTmp,invX );
    }

    return 1;
}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
            Results may be a little bit different from the C version
            because it doesn't perform exactly same calculation.
 *---------------------------------------------------------------------*/
#ifdef GEKKO
asm u32 PSMTXInvXpose ( const register Mtx src, register Mtx invX )
{
    nofralloc

    // fp0 [ 00 ][ 1.0F ] : Load
    psq_l       fp0, 0(src), 1, 0
    // fp1 [ 01 ][ 02 ]   : Load
    psq_l       fp1, 4(src), 0, 0
    // fp2 [ 10 ][ 1.0F ] : Load
    psq_l       fp2, 16(src), 1, 0
        // fp6 [ 02 ][ 00 ]
        ps_merge10  fp6, fp1, fp0
    // fp3 [ 11 ][ 12 ]   : Load
    psq_l       fp3, 20(src), 0, 0
    // fp4 [ 20 ][ 1.0F ] : Load
    psq_l       fp4, 32(src), 1, 0
        // fp7 [ 12 ][ 10 ]
        ps_merge10  fp7, fp3, fp2
    // fp5 [ 21 ][ 22 ]   : Load
    psq_l       fp5, 36(src), 0, 0

        // fp11[ 11*02 ][ 00*12 ]
        ps_mul      fp11, fp3, fp6

        // fp8 [ 22 ][ 20 ]
        ps_merge10  fp8, fp5, fp4

        // fp13[ 21*12 ][ 10*22 ]
        ps_mul      fp13, fp5, fp7
        // fp11[ 01*12 - 11*02 ][ 10*02 - 00*12 ]
        ps_msub     fp11, fp1, fp7, fp11
        // fp12[ 01*22 ][ 20*02 ]
        ps_mul      fp12, fp1, fp8
        // fp13[ 11*22 - 21*12 ][ 20*12 - 10*22 ]
        ps_msub     fp13, fp3, fp8, fp13
        // fp12[ 21*02 - 01*22 ][ 00*22 - 20*02 ]
        ps_msub     fp12, fp5, fp6, fp12

        // fp10[ 20*11 ][ N/A ]
        ps_mul      fp10, fp3, fp4
        // fp9 [ 00*21 ][ N/A ]
        ps_mul      fp9,  fp0, fp5
        // fp8 [ 10*01 ][ N/A ]
        ps_mul      fp8,  fp1, fp2
        // fp10[ 10*21 - 20*11 ][ N/A ]
        ps_msub     fp10, fp2, fp5, fp10
        // fp9 [ 20*01 - 00*21 ][ N/A ]
        ps_msub     fp9,  fp1, fp4, fp9
        // fp8 [ 00*11 - 10*01 ][ N/A ]
        ps_msub     fp8,  fp0, fp3, fp8

        // fp7 [ 00*(11*22-21*12) ][ N/A ]
        ps_mul      fp7, fp0, fp13
        // fp1 [ 0.0F ][ 0.0F ]
        ps_sub      fp1, fp1, fp1
        // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) ][ N/A ]
        ps_madd     fp7, fp2, fp12, fp7
        // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) + 20*(01*12-11*02) ][ N/A ] : det
        ps_madd     fp7, fp4, fp11, fp7

        // ( det == 0 ) ?
        ps_cmpo0    cr0, fp7, fp1
        bne         _regular

        // return value (singular)
        addi        r3, 0, 0
        blr

  _regular:

        // fp0 [ 1/det ][ N/A ]
        fres        fp0, fp7

    // [ ix03 ] : Store fp1
    psq_st      fp1,  12(invX), 1, 0

        // Refinement : ( E = est. of 1/N ) -> ( E' = 2E - N*E*E )
        ps_add      fp6, fp0, fp0
        ps_mul      fp5, fp0, fp0

    // [ ix13 ] : Store fp1
    psq_st      fp1,  28(invX), 1, 0

        ps_nmsub    fp0, fp7, fp5, fp6

    // [ ix23 ] : Store fp1
    psq_st      fp1,  44(invX), 1, 0

        // fp13[ ( 11*22 - 21*12 ) * rdet ][ ( 20*12 - 10*22 ) * rdet ] : ix[0][0], ix[0][1]
        ps_muls0    fp13, fp13, fp0
        // fp12[ ( 21*02 - 01*22 ) * rdet ][ ( 00*22 - 20*02 ) * rdet ] : ix[1][0], ix[1][1]
        ps_muls0    fp12, fp12, fp0
        // fp11[ ( 01*12 - 11*02 ) * rdet ][ ( 10*02 - 00*12 ) * rdet ] : ix[2][0], ix[2][1]
        ps_muls0    fp11, fp11, fp0

    // [ ix00 ][ ix01 ] : Store fp13
    psq_st      fp13,  0(invX), 0, 0
    // [ ix10 ][ ix11 ] : Store fp12
    psq_st      fp12,  16(invX), 0, 0

        // fp10[ ( 10*21 - 20*11 ) * rdet ] : i[0][2]
        ps_muls0    fp10, fp10, fp0

        // fp9 [ ( 20*01 - 00*21 ) * rdet ] : i[1][2]
        ps_muls0    fp9,  fp9,  fp0

    // [ ix20 ][ ix21 ] : Store fp11
    psq_st      fp11,  32(invX), 0, 0
    // [ ix02 ]         : Store fp10
    psq_st      fp10,  8(invX), 1, 0

        // fp8 [ ( 00*11 - 10*01 ) * rdet ] : i[2][2]
        ps_muls0    fp8,  fp8,  fp0

        // return value (regular)
        addi        r3, 0, 1

    // [ ix12 ]         : Store fp9
    psq_st      fp9,   24(invX), 1, 0
    // [ ix22 ]         : Store fp8
    psq_st      fp8,   40(invX), 1, 0

    blr
}
#endif

/*---------------------------------------------------------------------*


                             MODEL  SECTION


*---------------------------------------------------------------------*/


/*---------------------------------------------------------------------*

Name:           MTXRotDeg

Description:    sets a rotation matrix about one of the X, Y or Z axes

Arguments:      m       matrix to be set

                axis    major axis about which to rotate.
                        axis is passed in as a character.
                        it must be one of 'X', 'x', 'Y', 'y', 'Z', 'z'

                deg     rotation angle in degrees.

                        note:  counter-clockwise rotation is positive.

Return   :         none

*---------------------------------------------------------------------*/
// The function is defined as a macro in mtx.h
/*
void MTXRotDeg ( Mtx m, char axis, f32 deg )
{
    MTXRotRad( m, axis, MTXDegToRad( deg ) );
}
*/

/*---------------------------------------------------------------------*

Name:           MTXRotRad

Description:    sets a rotation matrix about one of the X, Y or Z axes

Arguments:      m       matrix to be set

                axis    major axis about which to rotate.
                        axis is passed in as a character.
                        it must be one of 'X', 'x', 'Y', 'y', 'Z', 'z'

                deg     rotation angle in radians.

                        note:  counter-clockwise rotation is positive.

Return   :         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXRotRad ( Mtx m, char axis, f32 rad )
{

    f32 sinA, cosA;

    ASSERTMSG( (m != 0), MTX_ROTRAD_1 );

    // verification of "axis" will occur in MTXRotTrig

    sinA = sinf(rad);
    cosA = cosf(rad);

    C_MTXRotTrig( m, axis, sinA, cosA );
}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef GEKKO
void PSMTXRotRad ( Mtx m, char axis, f32 rad )
{
    f32 sinA, cosA;

    sinA = sinf(rad);
    cosA = cosf(rad);

    PSMTXRotTrig( m, axis, sinA, cosA );
}
#endif // GEKKO

/*---------------------------------------------------------------------*

Name:           MTXRotTrig

Description:    sets a rotation matrix about one of the X, Y or Z axes
                from specified trig ratios

Arguments:      m       matrix to be set

                axis    major axis about which to rotate.
                        axis is passed in as a character.
                        It must be one of 'X', 'x', 'Y', 'y', 'Z', 'z'

                sinA    sine of rotation angle.

                cosA    cosine of rotation angle.

                        note:  counter-clockwise rotation is positive.

Return   :         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXRotTrig ( Mtx m, char axis, f32 sinA, f32 cosA )
{

    ASSERTMSG( (m != 0), MTX_ROTTRIG_1 );

    switch(axis)
    {

    case 'x':
    case 'X':
        m[0][0] =  1.0f;  m[0][1] =  0.0f;    m[0][2] =  0.0f;  m[0][3] = 0.0f;
        m[1][0] =  0.0f;  m[1][1] =  cosA;    m[1][2] = -sinA;  m[1][3] = 0.0f;
        m[2][0] =  0.0f;  m[2][1] =  sinA;    m[2][2] =  cosA;  m[2][3] = 0.0f;
        break;

    case 'y':
    case 'Y':
        m[0][0] =  cosA;  m[0][1] =  0.0f;    m[0][2] =  sinA;  m[0][3] = 0.0f;
        m[1][0] =  0.0f;  m[1][1] =  1.0f;    m[1][2] =  0.0f;  m[1][3] = 0.0f;
        m[2][0] = -sinA;  m[2][1] =  0.0f;    m[2][2] =  cosA;  m[2][3] = 0.0f;
        break;

    case 'z':
    case 'Z':
        m[0][0] =  cosA;  m[0][1] = -sinA;    m[0][2] =  0.0f;  m[0][3] = 0.0f;
        m[1][0] =  sinA;  m[1][1] =  cosA;    m[1][2] =  0.0f;  m[1][3] = 0.0f;
        m[2][0] =  0.0f;  m[2][1] =  0.0f;    m[2][2] =  1.0f;  m[2][3] = 0.0f;
        break;

    default:
        ASSERTMSG( 0, MTX_ROTTRIG_2 );
        break;

    }

}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef GEKKO
void PSMTXRotTrig (
    register Mtx    m,
    register char   axis,
    register f32    sinA,
    register f32    cosA )
{
    register f32    fc0, fc1, nsinA;
    register f32    fw0, fw1, fw2, fw3;

    asm
    {
        frsp        sinA, sinA      // to make sure sinA = single precision
        frsp        cosA, cosA      // to make sure cosA = single precision
    }

    fc0 = 0.0F;
    fc1 = 1.0F;

    asm
    {
        // always lower case
        ori         axis, axis, 0x20
        ps_neg      nsinA, sinA

        // branches
        cmplwi      axis, 'x'
        beq         _case_x
        cmplwi      axis, 'y'
        beq         _case_y
        cmplwi      axis, 'z'
        beq         _case_z
        b           _end

    _case_x:
        psq_st      fc1,  0(m), 1, 0
        psq_st      fc0,  4(m), 0, 0
        ps_merge00  fw0, sinA, cosA
        psq_st      fc0, 12(m), 0, 0
        ps_merge00  fw1, cosA, nsinA
        psq_st      fc0, 28(m), 0, 0
        psq_st      fc0, 44(m), 1, 0
        psq_st      fw0, 36(m), 0, 0
        psq_st      fw1, 20(m), 0, 0
        b           _end;

    _case_y:
        ps_merge00  fw0, cosA, fc0
        ps_merge00  fw1, fc0, fc1
        psq_st      fc0, 24(m), 0, 0
        psq_st      fw0,  0(m), 0, 0
        ps_merge00  fw2, nsinA, fc0
        ps_merge00  fw3, sinA, fc0
        psq_st      fw0, 40(m), 0, 0;
        psq_st      fw1, 16(m), 0, 0;
        psq_st      fw3,  8(m), 0, 0;
        psq_st      fw2, 32(m), 0, 0;
        b           _end;

    _case_z:
        psq_st      fc0,  8(m), 0, 0
        ps_merge00  fw0, sinA, cosA
        ps_merge00  fw2, cosA, nsinA
        psq_st      fc0, 24(m), 0, 0
        psq_st      fc0, 32(m), 0, 0
        ps_merge00  fw1, fc1, fc0
        psq_st      fw0, 16(m), 0, 0
        psq_st      fw2,  0(m), 0, 0
        psq_st      fw1, 40(m), 0, 0

    _end:

    }
}
#endif // GEKKO

/*---------------------------------------------------------------------*

Name:           MTXRotAxisDeg

Description:    Sets a rotation matrix about an arbitrary axis


Arguments:      m       matrix to be set

                axis    ptr to a vector containing the x,y,z axis
                        components.
                        axis does not have to be a unit vector.

                deg     rotation angle in degrees.

                        note:  counter-clockwise rotation is positive.

Return   :         none

*---------------------------------------------------------------------*/
// The function is defined as a macro in mtx.h
/*
void MTXRotAxisDeg ( Mtx m, Vec *axis, f32 deg )
{
    MTXRotAxisRad( m, axis, MTXDegToRad( deg ) );
}
*/

/*---------------------------------------------------------------------*

Name:           MTXRotAxisRad

Description:    Sets a rotation matrix about an arbitrary axis


Arguments:      m       matrix to be set

                axis    ptr to a vector containing the x,y,z axis
                        components.
                        axis does not have to be a unit vector.

                deg     rotation angle in radians.

                        note:  counter-clockwise rotation is positive.

Return   :         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXRotAxisRad( Mtx m, const Vec *axis, f32 rad )
{
    Vec vN;
    f32 s, c;             // sinTheta, cosTheta
    f32 t;                // ( 1 - cosTheta )
    f32 x, y, z;          // x, y, z components of normalized axis
    f32 xSq, ySq, zSq;    // x, y, z squared


    ASSERTMSG( (m    != 0), MTX_ROTAXIS_1  );
    ASSERTMSG( (axis != 0), MTX_ROTAXIS_2  );


    s = sinf(rad);
    c = cosf(rad);
    t = 1.0f - c;

    C_VECNormalize( axis, &vN );

    x = vN.x;
    y = vN.y;
    z = vN.z;

    xSq = x * x;
    ySq = y * y;
    zSq = z * z;

    m[0][0] = ( t * xSq )   + ( c );
    m[0][1] = ( t * x * y ) - ( s * z );
    m[0][2] = ( t * x * z ) + ( s * y );
    m[0][3] =    0.0f;

    m[1][0] = ( t * x * y ) + ( s * z );
    m[1][1] = ( t * ySq )   + ( c );
    m[1][2] = ( t * y * z ) - ( s * x );
    m[1][3] =    0.0f;

    m[2][0] = ( t * x * z ) - ( s * y );
    m[2][1] = ( t * y * z ) + ( s * x );
    m[2][2] = ( t * zSq )   + ( c );
    m[2][3] =    0.0f;

}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef GEKKO

static void __PSMTXRotAxisRadInternal(
    register Mtx    m,
    const register Vec *axis,
    register f32    sT,
    register f32    cT )
{
    register f32    tT, fc0;
    register f32    tmp0, tmp1, tmp2, tmp3, tmp4;
    register f32    tmp5, tmp6, tmp7, tmp8, tmp9;

    tmp9 = 0.5F;
    tmp8 = 3.0F;

    asm
    {
        // to make sure cT = (single precision float value)
        frsp        cT, cT
        // tmp0 = [x][y] : LOAD
        psq_l       tmp0, 0(axis), 0, 0
        // to make sure sT = (single precision float value)
        frsp        sT, sT
        // tmp1 = [z][z] : LOAD
        lfs         tmp1, 8(axis)

        // tmp2 = [x*x][y*y]
        ps_mul      tmp2, tmp0, tmp0
        // tmp7 = [1.0F]
        fadds       tmp7, tmp9, tmp9
        // tmp3 = [x*x+z*z][y*y+z*z]
        ps_madd     tmp3, tmp1, tmp1, tmp2
        // fc0 = [0.0F]
        fsubs       fc0, tmp9, tmp9
        // tmp4 = [S = x*x+y*y+z*z][z]
        ps_sum0     tmp4, tmp3, tmp1, tmp2

        // tT = 1.0F - cT
        fsubs       tT, tmp7, cT

        // tmp5 = [1.0/sqrt(S)] :estimation[E]
        frsqrte     tmp5, tmp4
        // Newton-Rapson refinement step
        // E' = E/2(3.0 - E*E*S)
        fmuls       tmp2, tmp5, tmp5            // E*E
        fmuls       tmp3, tmp5, tmp9            // E/2
        fnmsubs     tmp2, tmp2, tmp4, tmp8      // (3-E*E*S)
        fmuls       tmp5, tmp2, tmp3            // (E/2)(3-E*E*S)

        // cT = [c][c]
        ps_merge00  cT, cT, cT

        // tmp0 = [nx = x/sqrt(S)][ny = y/sqrt(S)]
        ps_muls0    tmp0, tmp0, tmp5
        // tmp1 = [nz = z/sqrt(S)][nz = z/sqrt(S)]
        ps_muls0    tmp1, tmp1, tmp5

        // tmp4 = [t*nx][t*ny]
        ps_muls0    tmp4, tmp0, tT
        // tmp9 = [s*nx][s*ny]
        ps_muls0    tmp9, tmp0, sT
        // tmp5 = [t*nz][t*nz]
        ps_muls0    tmp5, tmp1, tT

        // tmp3 = [t*nx*ny][t*ny*ny]
        ps_muls1    tmp3, tmp4, tmp0
        // tmp2 = [t*nx*nx][t*ny*nx]
        ps_muls0    tmp2, tmp4, tmp0
        // tmp4 = [t*nx*nz][t*ny*nz]
        ps_muls0    tmp4, tmp4, tmp1

        // tmp6 = [t*nx*ny-s*nz][t*nx*ny-s*nz]
        fnmsubs     tmp6, tmp1, sT, tmp3
        // tmp7 = [t*nx*ny+s*nz][t*ny*ny+s*nz]
        fmadds      tmp7, tmp1, sT, tmp3

        // tmp0 = [-s*nx][-s*ny]
        ps_neg      tmp0, tmp9
        // tmp8 = [t*nx*nz+s*ny][0] == [m02][m03]
        ps_sum0     tmp8, tmp4, fc0, tmp9
        // tmp2 = [t*nx*nx+c][t*nx*ny-s*nz] == [m00][m01]
        ps_sum0     tmp2, tmp2, tmp6, cT
        // tmp3 = [t*nx*ny+s*nz][t*ny*ny+c] == [m10][m11]
        ps_sum1     tmp3, cT, tmp7, tmp3
        // tmp6 = [t*ny*nz-s*nx][0] == [m12][m13]
        ps_sum0     tmp6, tmp0, fc0 ,tmp4

            // tmp8 [m02][m03] : STORE
            psq_st      tmp8, 8(m), 0, 0
        // tmp0 = [t*nx*nz-s*ny][t*ny*nz]
        ps_sum0     tmp0, tmp4, tmp4, tmp0
            // tmp2 [m00][m01] : STORE
            psq_st      tmp2, 0(m), 0, 0
        // tmp5 = [t*nz*nz][t*nz*nz]
        ps_muls0    tmp5, tmp5, tmp1
            // tmp3 [m10][m11] : STORE
            psq_st      tmp3, 16(m), 0, 0
        // tmp4 = [t*nx*nz-s*ny][t*ny*nz+s*nx] == [m20][m21]
        ps_sum1     tmp4, tmp9, tmp0, tmp4
            // tmp6 [m12][m13] : STORE
            psq_st      tmp6, 24(m), 0, 0
        // tmp5 = [t*nz*nz+c][0]   == [m22][m23]
        ps_sum0     tmp5, tmp5, fc0, cT
            // tmp4 [m20][m21] : STORE
            psq_st      tmp4, 32(m), 0, 0
            // tmp5 [m22][m23] : STORE
            psq_st      tmp5, 40(m), 0, 0
    }
}

void PSMTXRotAxisRad(
    Mtx             m,
    const Vec      *axis,
    f32             rad )
{
    f32     sinT, cosT;

    sinT = sinf(rad);
    cosT = cosf(rad);

    __PSMTXRotAxisRadInternal(m, axis, sinT, cosT);
}

#endif // GEKKO

/*---------------------------------------------------------------------*

Name:           MTXTrans

Description:    sets a translation matrix.


Arguments:       m       matrix to be set

                xT        x component of translation.

                yT        y component of translation.

                zT        z component of translation.

Return   :         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXTrans ( Mtx m, f32 xT, f32 yT, f32 zT )
{

    ASSERTMSG( (m != 0), MTX_TRANS_1 );


    m[0][0] = 1.0f;  m[0][1] = 0.0f;  m[0][2] = 0.0f;  m[0][3] =  xT;
    m[1][0] = 0.0f;  m[1][1] = 1.0f;  m[1][2] = 0.0f;  m[1][3] =  yT;
    m[2][0] = 0.0f;  m[2][1] = 0.0f;  m[2][2] = 1.0f;  m[2][3] =  zT;

}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef  GEKKO
void PSMTXTrans(
    register Mtx m,
    register f32 xT,
    register f32 yT,
    register f32 zT
)
{
    register f32 c0 = 0.0F;
    register f32 c1 = 1.0F;

    asm
    {
        stfs        xT,     12(m)
        stfs        yT,     28(m)
        psq_st      c0,      4(m), 0, 0
        psq_st      c0,     32(m), 0, 0
        stfs        c0,     16(m)
        stfs        c1,     20(m)
        stfs        c0,     24(m)
        stfs        c1,     40(m)
        stfs        zT,     44(m)
        stfs        c1,      0(m)
    }
}
#endif  // GEKKO

/*---------------------------------------------------------------------*

Name:           MTXTransApply

Description:    This function performs the operation equivalent to
                MTXTrans + MTXConcat.


Arguments:      src       matrix to be operated.

                dst       resultant matrix from concat.

                xT        x component of translation.

                yT        y component of translation.

                zT        z component of translation.

Return   :         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXTransApply ( const Mtx src, Mtx dst, f32 xT, f32 yT, f32 zT )
{
    ASSERTMSG( (src != 0), MTX_TRANSAPPLY_1 );
    ASSERTMSG( (dst != 0), MTX_TRANSAPPLY_1 );

    if ( src != dst )
    {
        dst[0][0] = src[0][0];    dst[0][1] = src[0][1];    dst[0][2] = src[0][2];
        dst[1][0] = src[1][0];    dst[1][1] = src[1][1];    dst[1][2] = src[1][2];
        dst[2][0] = src[2][0];    dst[2][1] = src[2][1];    dst[2][2] = src[2][2];
    }

    dst[0][3] = src[0][3] + xT;
    dst[1][3] = src[1][3] + yT;
    dst[2][3] = src[2][3] + zT;
}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef  GEKKO
asm void PSMTXTransApply(
    const register Mtx src,
    register Mtx dst,
    register f32 xT,
    register f32 yT,
    register f32 zT )
{
    nofralloc;
    psq_l       fp4, 0(src),        0, 0;
    frsp        xT, xT;                     // to make sure xT = single precision
    psq_l       fp5, 8(src),        0, 0;
    frsp        yT, yT;                     // to make sure yT = single precision
    psq_l       fp7, 24(src),       0, 0;
    frsp        zT, zT;                     // to make sure zT = single precision
    psq_l       fp8, 40(src),       0, 0;
    psq_st      fp4, 0(dst),        0, 0;
    ps_sum1     fp5, xT, fp5, fp5;
    psq_l       fp6, 16(src),       0, 0;
    psq_st      fp5, 8(dst),        0, 0;
    ps_sum1     fp7, yT, fp7, fp7;
    psq_l       fp9, 32(src),       0, 0;
    psq_st      fp6, 16(dst),       0, 0;
    ps_sum1     fp8, zT, fp8, fp8;
    psq_st      fp7, 24(dst),       0, 0;
    psq_st      fp9, 32(dst),       0, 0;
    psq_st      fp8, 40(dst),       0, 0;
    blr;
}
#endif  // GEKKO


/*---------------------------------------------------------------------*

Name:            MTXScale

Description:     sets a scaling matrix.


Arguments:       m       matrix to be set

                xS        x scale factor.

                yS        y scale factor.

                zS        z scale factor.

Return   :         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXScale ( Mtx m, f32 xS, f32 yS, f32 zS )
{
    ASSERTMSG( (m != 0), MTX_SCALE_1 );


    m[0][0] = xS;    m[0][1] = 0.0f;  m[0][2] = 0.0f;  m[0][3] = 0.0f;
    m[1][0] = 0.0f;  m[1][1] = yS;    m[1][2] = 0.0f;  m[1][3] = 0.0f;
    m[2][0] = 0.0f;  m[2][1] = 0.0f;  m[2][2] = zS;    m[2][3] = 0.0f;
}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef  GEKKO
void PSMTXScale(
    register Mtx m,
    register f32 xS,
    register f32 yS,
    register f32 zS
)
{
    register f32 c0 = 0.0F;

    asm
    {
        stfs        xS,      0(m)
        psq_st      c0,      4(m), 0, 0
        psq_st      c0,     12(m), 0, 0
        stfs        yS,     20(m)
        psq_st      c0,     24(m), 0, 0
        psq_st      c0,     32(m), 0, 0
        stfs        zS,     40(m)
        stfs        c0,     44(m)
    }
}
#endif  // GEKKO

/*---------------------------------------------------------------------*

Name:           MTXScaleApply

Description:    This function performs the operation equivalent to
                MTXScale + MTXConcat

Arguments:      src       matrix to be operated.

                dst       resultant matrix from concat.

                xS        x scale factor.

                yS        y scale factor.

                zS        z scale factor.

Return   :         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXScaleApply ( const Mtx src, Mtx dst, f32 xS, f32 yS, f32 zS )
{
    ASSERTMSG( (src != 0), MTX_SCALEAPPLY_1 );
    ASSERTMSG( (dst != 0), MTX_SCALEAPPLY_2 );

    dst[0][0] = src[0][0] * xS;     dst[0][1] = src[0][1] * xS;
    dst[0][2] = src[0][2] * xS;     dst[0][3] = src[0][3] * xS;

    dst[1][0] = src[1][0] * yS;     dst[1][1] = src[1][1] * yS;
    dst[1][2] = src[1][2] * yS;     dst[1][3] = src[1][3] * yS;

    dst[2][0] = src[2][0] * zS;     dst[2][1] = src[2][1] * zS;
    dst[2][2] = src[2][2] * zS;     dst[2][3] = src[2][3] * zS;
}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef  GEKKO
asm void PSMTXScaleApply (
    const register Mtx src,
    register Mtx dst,
    register f32 xS,
    register f32 yS,
    register f32 zS )
{
    nofralloc;
    frsp        xS, xS;                     // to make sure xS = single precision
    psq_l       fp4, 0(src),        0, 0;
    frsp        yS, yS;                     // to make sure yS = single precision
    psq_l       fp5, 8(src),        0, 0;
    frsp        zS, zS;                     // to make sure zS = single precision
    ps_muls0    fp4, fp4, xS;
    psq_l       fp6, 16(src),       0, 0;
    ps_muls0    fp5, fp5, xS;
    psq_l       fp7, 24(src),       0, 0;
    ps_muls0    fp6, fp6, yS;
    psq_l       fp8, 32(src),       0, 0;
    psq_st      fp4, 0(dst),        0, 0;
    ps_muls0    fp7, fp7, yS;
    psq_l       fp2, 40(src),       0, 0;
    psq_st      fp5, 8(dst),        0, 0;
    ps_muls0    fp8, fp8, zS;
    psq_st      fp6, 16(dst),       0, 0;
    ps_muls0    fp2, fp2, zS;
    psq_st      fp7, 24(dst),       0, 0;
    psq_st      fp8, 32(dst),       0, 0;
    psq_st      fp2, 40(dst),       0, 0;
    blr;
}
#endif  // GEKKO

/*---------------------------------------------------------------------*

Name:            MTXQuat

Description:     sets a rotation matrix from a quaternion.


Arguments:       m       matrix to be set

                 q        ptr to quaternion.

Return   :          none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXQuat ( Mtx m, const Quaternion *q )
{

    f32 s,xs,ys,zs,wx,wy,wz,xx,xy,xz,yy,yz,zz;


    ASSERTMSG( (m != 0),                         MTX_QUAT_1     );
    ASSERTMSG( (q != 0),                         MTX_QUAT_2     );
    ASSERTMSG( ( q->x || q->y || q->z || q->w ), MTX_QUAT_3     );


    s = 2.0f / ( (q->x * q->x) + (q->y * q->y) + (q->z * q->z) + (q->w * q->w) );

    xs = q->x *  s;     ys = q->y *  s;  zs = q->z *  s;
    wx = q->w * xs;     wy = q->w * ys;  wz = q->w * zs;
    xx = q->x * xs;     xy = q->x * ys;  xz = q->x * zs;
    yy = q->y * ys;     yz = q->y * zs;  zz = q->z * zs;


    m[0][0] = 1.0f - (yy + zz);
    m[0][1] = xy   - wz;
    m[0][2] = xz   + wy;
    m[0][3] = 0.0f;

    m[1][0] = xy   + wz;
    m[1][1] = 1.0f - (xx + zz);
    m[1][2] = yz   - wx;
    m[1][3] = 0.0f;

    m[2][0] = xz   - wy;
    m[2][1] = yz   + wx;
    m[2][2] = 1.0f - (xx + yy);
    m[2][3] = 0.0f;

}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
#ifdef GEKKO
void PSMTXQuat ( register Mtx m, const register Quaternion *q )
{
    register f32    c_zero, c_one, c_two, scale;
    register f32    tmp0, tmp1, tmp2, tmp3, tmp4;
    register f32    tmp5, tmp6, tmp7, tmp8, tmp9;

    c_one = 1.0F;

    asm
    {
        // tmp0 = [qx][qy] : LOAD
        psq_l       tmp0, 0(q), 0, 0
        // tmp1 = [qz][qw] : LOAD
        psq_l       tmp1, 8(q), 0, 0
        // c_zero = [0.0F][0.0F]
        fsubs       c_zero, c_one, c_one
        // c_two  = [2.0F][2.0F]
        fadds       c_two, c_one, c_one
        // tmp2 = [qx*qx][qy*qy]
        ps_mul      tmp2, tmp0, tmp0
        // tmp5 = [qy][qx]
        ps_merge10  tmp5, tmp0, tmp0
        // tmp4 = [qx*qx+qz*qz][qy*qy+qw*qw]
        ps_madd     tmp4, tmp1, tmp1, tmp2
        // tmp3 = [qz*qz][qw*qw]
        ps_mul      tmp3, tmp1, tmp1
            // scale = [qx*qx+qy*qy+qz*qz+qw*qw][?]
            ps_sum0     scale, tmp4, tmp4, tmp4
        // tmp7 = [qy*qw][qx*qw]
        ps_muls1    tmp7, tmp5, tmp1
            // Newton-Rapson refinement (1/X) : E' = 2E-X*E*E
            // tmp9 = [E = Est.(1/X)]
            fres        tmp9, scale
        // tmp4 = [qx*qx+qz*qz][qy*qy+qz*qz]
        ps_sum1     tmp4, tmp3, tmp4, tmp2
            // scale = [2-X*E]
            ps_nmsub    scale, scale, tmp9, c_two
        // tmp6 = [qz*qw][?]
        ps_muls1    tmp6, tmp1, tmp1
            // scale = [E(2-scale*E) = E']
            ps_mul      scale, tmp9, scale
        // tmp2 = [qx*qx+qy*qy]
        ps_sum0     tmp2, tmp2, tmp2, tmp2
            // scale = [s = 2E' = 2.0F/(qx*qx+qy*qy+qz*qz+qw*qw)]
            fmuls       scale, scale, c_two
        // tmp8 = [qx*qy+qz*qw][?]
        ps_madd     tmp8, tmp0, tmp5, tmp6
        // tmp6 = [qx*qy-qz*qw][?]
        ps_msub     tmp6, tmp0, tmp5, tmp6
            // c_zero [m03] : STORE
            psq_st      c_zero, 12(m), 1, 0
        // tmp2 = [1-s(qx*qx+qy*qy)]   : [m22]
        ps_nmsub    tmp2, tmp2, scale, c_one
        // tmp4 = [1-s(qx*qx+qz*qz)][1-s(qy*qy+qz*qz)] : [m11][m00]
        ps_nmsub    tmp4, tmp4, scale, c_one
            // c_zero [m23] : STORE
            psq_st      c_zero, 44(m), 1, 0
        // tmp8 = [s(qx*qy+qz*qw)][?]  : [m10]
        ps_mul      tmp8, tmp8, scale
        // tmp6 = [s(qx*qy-qz*qw)][?]  : [m01]
        ps_mul      tmp6, tmp6, scale
            // tmp2 [m22] : STORE
            psq_st      tmp2, 40(m), 1, 0
        // tmp5 = [qx*qz+qy*qw][qy*qz+qx*qw]
        ps_madds0   tmp5, tmp0, tmp1, tmp7
        // tmp1 = [m10][m11]
        ps_merge00  tmp1, tmp8, tmp4
        // tmp7 = [qx*qz-qy*qw][qy*qz-qx*qw]
        ps_nmsub    tmp7, tmp7, c_two, tmp5
        // tmp0 = [m00][m01]
        ps_merge10  tmp0, tmp4, tmp6
            // tmp1 [m10][m11] : STORE
            psq_st      tmp1, 16(m), 0, 0
        // tmp5 = [s(qx*qz+qy*qw)][s(qy*qz+qx*qw)] : [m02][m21]
        ps_mul      tmp5, tmp5, scale
        // tmp7 = [s(qx*qz-qy*qw)][s(qy*qz-qx*qw)] : [m20][m12]
        ps_mul      tmp7, tmp7, scale
            // tmp0 [m00][m01] : STORE
            psq_st      tmp0,  0(m), 0, 0
            // tmp5 [m02] : STORE
            psq_st      tmp5,  8(m), 1, 0
        // tmp3 = [m12][m13]
        ps_merge10  tmp3, tmp7, c_zero
        // tmp9 = [m20][m21]
        ps_merge01  tmp9, tmp7, tmp5
            // tmp3 [m12][m13] : STORE
            psq_st      tmp3, 24(m), 0, 0
            // tmp9 [m20][m21] : STORE
            psq_st      tmp9, 32(m), 0, 0

    }
}
#endif // GEKKO

/*---------------------------------------------------------------------*

Name:           MTXReflect

Description:    reflect a rotation matrix with respect to a plane.


Arguments:      m       matrix to be set

                p        point on the planar reflector.

                n       normal of the planar reflector.

Return   :         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXReflect ( Mtx m, const Vec *p, const Vec *n )
{
    f32 vxy, vxz, vyz, pdotn;

    vxy   = -2.0f * n->x * n->y;
    vxz   = -2.0f * n->x * n->z;
    vyz   = -2.0f * n->y * n->z;
    pdotn = 2.0f * C_VECDotProduct(p, n);

    m[0][0] = 1.0f - 2.0f * n->x * n->x;
    m[0][1] = vxy;
    m[0][2] = vxz;
    m[0][3] = pdotn * n->x;

    m[1][0] = vxy;
    m[1][1] = 1.0f - 2.0f * n->y * n->y;
    m[1][2] = vyz;
    m[1][3] = pdotn * n->y;

    m[2][0] = vxz;
    m[2][1] = vyz;
    m[2][2] = 1.0f - 2.0f * n->z * n->z;
    m[2][3] = pdotn * n->z;
}

/*---------------------------------------------------------------------*
    Paired-Single assembler version
 *---------------------------------------------------------------------*/
#ifdef GEKKO
void PSMTXReflect( register Mtx m, const register Vec *p, const register Vec *n )
{
    register f32    c_one = 1.0F;
    register f32    vn_xy, vn_z1, n2vn_xy, n2vn_z1, pdotn;
    register f32    tmp0, tmp1, tmp2, tmp3;
    register f32    tmp4, tmp5, tmp6, tmp7;

    asm
    {
            // vn_z1 = [nz][1.0F] : LOAD
            psq_l       vn_z1, 8(n), 1, 0
            // vn_xy = [nx][ny]   : LOAD
            psq_l       vn_xy, 0(n), 0, 0

            // tmp0 = [px][py]   : LOAD
            psq_l       tmp0,  0(p), 0, 0
        // n2vn_z1 = [-2nz][-2.0F]
        ps_nmadd    n2vn_z1, vn_z1, c_one, vn_z1
            // tmp1 = [pz][1.0F] : LOAD
            psq_l       tmp1,  8(p), 1, 0
        // n2vn_xy = [-2nx][-2ny]
        ps_nmadd    n2vn_xy, vn_xy, c_one, vn_xy

        // tmp4 = [-2nx*nz][-2ny*nz]   : [m20][m21]
        ps_muls0    tmp4, vn_xy, n2vn_z1
        // pdotn = [-2(px*nx)][-2(py*ny)]
        ps_mul      pdotn, n2vn_xy, tmp0
        // tmp2 = [-2nx*nx][-2nx*ny]
        ps_muls0    tmp2, vn_xy, n2vn_xy
        // pdotn = [-2(px*nx+py*ny)][?]
        ps_sum0     pdotn, pdotn, pdotn, pdotn
        // tmp3 = [-2nx*ny][-2ny*ny]
        ps_muls1    tmp3, vn_xy, n2vn_xy
            // tmp4 = [m20][m21] : STORE
            psq_st      tmp4, 32(m), 0, 0
        // tmp2 = [1-2nx*nx][-2nx*ny]  : [m00][m01]
        ps_sum0     tmp2, tmp2, tmp2, c_one
        // pdotn = [2(px*nx+py*ny+pz*nz)][?]
        ps_nmadd    pdotn, n2vn_z1, tmp1, pdotn
        // tmp3 = [-2nx*ny][1-2ny*ny]  : [m10][m11]
        ps_sum1     tmp3, c_one, tmp3, tmp3
            // tmp2 = [m00][m01] : STORE
            psq_st      tmp2,  0(m), 0, 0
        // tmp5 = [pdotn*nx][pdotn*ny]
        ps_muls0    tmp5, vn_xy, pdotn
        // tmp6 = [-2nz][pdotn]
        ps_merge00  tmp6, n2vn_z1, pdotn
            // tmp3 = [m10][m11] : STORE
            psq_st      tmp3, 16(m), 0, 0

        // tmp7 = [-2nx*nz][pdotn*nx]  : [m02][m03]
        ps_merge00  tmp7, tmp4, tmp5
        // tmp6 = [-2nz*nz][pdotn*nz]
        ps_muls0    tmp6, tmp6, vn_z1
        // tmp5 = [-2ny*nz][pdotn*ny]  : [m12][m13]
        ps_merge11  tmp5, tmp4, tmp5
            // tmp7 = [m02][m03] : STORE
            psq_st      tmp7,  8(m), 0, 0
        // tmp6 = [1-2nz*nz][pdotn*nz] : [m22][m23]
        ps_sum0     tmp6, tmp6, tmp6, c_one
            // tmp5 = [m12][m13] : STORE
            psq_st      tmp5, 24(m), 0, 0
            // tmp6 = [m22][m23] : STORE
            psq_st      tmp6, 40(m), 0, 0
    }
}
#endif // GEKKO

/*---------------------------------------------------------------------*


                             VIEW   SECTION


*---------------------------------------------------------------------*/


/*---------------------------------------------------------------------*

Name:           MTXLookAt

Description:    compute a matrix to transform points to camera coordinates.


Arguments:      m       matrix to be set

                camPos   camera position.

                camUp    camera 'up' direction.

                target   camera aim point.


Return   :         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXLookAt ( Mtx m, const Point3d *camPos, const Vec *camUp, const Point3d *target )
{

    Vec vLook,vRight,vUp;


    ASSERTMSG( (m != 0),      MTX_LOOKAT_1    );
    ASSERTMSG( (camPos != 0), MTX_LOOKAT_2    );
    ASSERTMSG( (camUp  != 0), MTX_LOOKAT_3    );
    ASSERTMSG( (target != 0), MTX_LOOKAT_4    );


    // compute unit target vector
    // use negative value to look down (-Z) axis
    vLook.x = camPos->x - target->x;
    vLook.y = camPos->y - target->y;
    vLook.z = camPos->z - target->z;
    VECNormalize( &vLook,&vLook );


    // vRight = camUp x vLook
    VECCrossProduct    ( camUp, &vLook, &vRight );
    VECNormalize( &vRight,&vRight );


    // vUp = vLook x vRight
    VECCrossProduct( &vLook, &vRight, &vUp );
    // Don't need to normalize vUp since it should already be unit length
    // VECNormalize( &vUp, &vUp );


    m[0][0] = vRight.x;
    m[0][1] = vRight.y;
    m[0][2] = vRight.z;
    m[0][3] = -( camPos->x * vRight.x + camPos->y * vRight.y + camPos->z * vRight.z );

    m[1][0] = vUp.x;
    m[1][1] = vUp.y;
    m[1][2] = vUp.z;
    m[1][3] = -( camPos->x * vUp.x + camPos->y * vUp.y + camPos->z * vUp.z );

    m[2][0] = vLook.x;
    m[2][1] = vLook.y;
    m[2][2] = vLook.z;
    m[2][3] = -( camPos->x * vLook.x + camPos->y * vLook.y + camPos->z * vLook.z );


}

/*---------------------------------------------------------------------*


                       TEXTURE PROJECTION SECTION


*---------------------------------------------------------------------*/


/*---------------------------------------------------------------------*

Name:           MTXLightFrustum

Description:    Compute a 3x4 projection matrix for texture projection


Arguments:      m        3x4 matrix to be set

                t        top coord. of view volume at the near clipping plane

                b        bottom coord of view volume at the near clipping plane

                l        left coord. of view volume at near clipping plane

                r        right coord. of view volume at near clipping plane

                n            positive distance from camera to near clipping plane

                scaleS   scale in the S direction for projected coordinates
                         (usually 0.5)

                scaleT   scale in the T direction for projected coordinates
                         (usually 0.5)

                transS   translate in the S direction for projected coordinates
                         (usually 0.5)

                transT   translate in the T direction for projected coordinates
                         (usually 0.5)


Return   :         none.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXLightFrustum  ( Mtx m, float t, float b, float l, float r, float n,
                          float scaleS, float scaleT, float transS,
                          float transT )
{
    f32 tmp;


    ASSERTMSG( (m != 0),  MTX_LIGHT_FRUSTUM_1  );
    ASSERTMSG( (t != b),  MTX_LIGHT_FRUSTUM_2  );
    ASSERTMSG( (l != r),  MTX_LIGHT_FRUSTUM_3  );


    // NOTE: Be careful about "l" vs. "1" below!!!

    tmp     =  1.0f / (r - l);
    m[0][0] =  ((2*n) * tmp) * scaleS;
    m[0][1] =  0.0f;
    m[0][2] =  (((r + l) * tmp) * scaleS) - transS;
    m[0][3] =  0.0f;

    tmp     =  1.0f / (t - b);
    m[1][0] =  0.0f;
    m[1][1] =  ((2*n) * tmp) * scaleT;
    m[1][2] =  (((t + b) * tmp) * scaleT) - transT;
    m[1][3] =  0.0f;

    m[2][0] =  0.0f;
    m[2][1] =  0.0f;
    m[2][2] = -1.0f;
    m[2][3] =  0.0f;
}

/*---------------------------------------------------------------------*

Name:           MTXLightPerspective

Description:    compute a 3x4 perspective projection matrix from
                field of view and aspect ratio for texture projection.


Arguments:      m        3x4 matrix to be set

                fovy     total field of view in in degrees in the YZ plane

                aspect       ratio of view window width:height (X / Y)

                scaleS   scale in the S direction for projected coordinates
                         (usually 0.5)

                scaleT   scale in the T direction for projected coordinates
                         (usually 0.5)

                transS   translate in the S direction for projected coordinates
                         (usually 0.5)

                transT   translate in the T direction for projected coordinates
                         (usually 0.5)


Return   :         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXLightPerspective  ( Mtx m, f32 fovY, f32 aspect, float scaleS,
                              float scaleT, float transS, float transT )
{
    f32 angle;
    f32 cot;

    ASSERTMSG( (m != 0),                            MTX_LIGHT_PERSPECTIVE_1  );
    ASSERTMSG( ( (fovY > 0.0) && ( fovY < 180.0) ), MTX_LIGHT_PERSPECTIVE_2  );
    ASSERTMSG( (aspect != 0),                       MTX_LIGHT_PERSPECTIVE_3  );

    // find the cotangent of half the (YZ) field of view
    angle = fovY * 0.5f;
    angle = MTXDegToRad( angle );

    cot = 1.0f / tanf(angle);

    m[0][0] =    (cot / aspect) * scaleS;
    m[0][1] =    0.0f;
    m[0][2] =    -transS;
    m[0][3] =    0.0f;

    m[1][0] =    0.0f;
    m[1][1] =    cot * scaleT;
    m[1][2] =    -transT;
    m[1][3] =    0.0f;

    m[2][0] =    0.0f;
    m[2][1] =    0.0f;
    m[2][2] =   -1.0f;
    m[2][3] =    0.0f;
}

/*---------------------------------------------------------------------*

Name:           MTXLightOrtho

Description:    compute a 3x4 orthographic projection matrix.


Arguments:      m       matrix to be set

                t        top coord. of parallel view volume

                b        bottom coord of parallel view volume

                l        left coord. of parallel view volume

                r        right coord. of parallel view volume

                scaleS   scale in the S direction for projected coordinates
                         (usually 0.5)

                scaleT   scale in the T direction for projected coordinates
                         (usually 0.5)

                transS   translate in the S direction for projected coordinates
                         (usually 0.5)

                transT   translate in the T direction for projected coordinates
                         (usually 0.5)


Return   :         none

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
void C_MTXLightOrtho ( Mtx m, f32 t, f32 b, f32 l, f32 r, float scaleS,
                              float scaleT, float transS, float transT )
{
    f32 tmp;


    ASSERTMSG( (m != 0),  MTX_LIGHT_ORTHO_1     );
    ASSERTMSG( (t != b),  MTX_LIGHT_ORTHO_2     );
    ASSERTMSG( (l != r),  MTX_LIGHT_ORTHO_3     );


    // NOTE: Be careful about "l" vs. "1" below!!!

    tmp     =  1.0f / (r - l);
    m[0][0] =  (2.0f * tmp * scaleS);
    m[0][1] =  0.0f;
    m[0][2] =  0.0f;
    m[0][3] =  ((-(r + l) * tmp) * scaleS) + transS;

    tmp     =  1.0f / (t - b);
    m[1][0] =  0.0f;
    m[1][1] =  (2.0f * tmp) * scaleT;
    m[1][2] =  0.0f;
    m[1][3] =  ((-(t + b) * tmp)* scaleT) + transT;

    m[2][0] =  0.0f;
    m[2][1] =  0.0f;
    m[2][2] =  0.0f;
    m[2][3] =  1.0f;
}

/*===========================================================================*/

#if ( __MWERKS__ == 0x00004100 )
#pragma defer_codegen reset
#endif