/*---------------------------------------------------------------------------*
  Project: matrix vector Library
  File:    mtxvec.c

  Copyright 1998-2011 Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.     They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

 *---------------------------------------------------------------------------*/

#include <math.h>
#include <stdio.h>
#include <cafe/mtx.h>
#include <cafe/mtx/mtx44.h>
#include "mtxAssert.h"
#include "mtx44Assert.h"

/*---------------------------------------------------------------------*

                             VECTOR SECTION

 *---------------------------------------------------------------------*/

//static const f32x2    c_zero  = {0.0F, 0.0F};
// Paired constants for the single Newton refinement step applied to the
// reciprocal square root estimate, E' = (E/2)(3 - X*E*E), in the
// paired-single routines of this file.
static const f32x2    c_half  = {0.5F, 0.5F};
static const f32x2    c_three = {3.0F, 3.0F};

/*---------------------------------------------------------------------*

Name:           VECAdd

Description:    add two vectors.


Arguments:      a    first vector.

                b    second vector.

                ab   resultant vector (a + b).
                     ok if ab == a or ab == b.


Return:         none.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Component-wise sum: ab = a + b.
// Each output component depends only on the matching input components,
// so ab may be the same object as a or b.
void C_VECAdd ( const Vec *a, const Vec *b, Vec *ab )
{
    ASSERTMSG( (a  != 0), VEC_ADD_1 );
    ASSERTMSG( (b  != 0), VEC_ADD_2 );
    ASSERTMSG( (ab != 0), VEC_ADD_3 );

    ab->x = a->x + b->x;
    ab->y = a->y + b->y;
    ab->z = a->z + b->z;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single sum: ab = a + b.  Arguments are not checked.
void PSVECAdd ( const Vec *a, const Vec *b, Vec *ab )
{
    f32x2 lhs;
    f32x2 rhs;
    f32x2 sum;

    // x and y are handled as one pair: load both operands, add, store
    lhs = __PSQ_L(a, 0, 0);
    rhs = __PSQ_L(b, 0, 0);
    sum = __PS_ADD(lhs, rhs);
    __PSQ_ST(ab, sum, 0, 0);

    // z lives at byte offset 8 and goes through the single-value
    // form of the load and store
    lhs = __PSQ_LX(a, 8, 1, 0);
    rhs = __PSQ_LX(b, 8, 1, 0);
    sum = __PS_ADD(lhs, rhs);
    __PSQ_STX(ab, 8, sum, 1, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:           VECSubtract

Description:    subtract one vector from another.


Arguments:      a       first vector.

                b       second vector.

                a_b     resultant vector (a - b).
                        ok if a_b == a or a_b == b.


Return:         none.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Component-wise difference: a_b = a - b.
// Each output component depends only on the matching input components,
// so a_b may be the same object as a or b.
void C_VECSubtract ( const Vec *a, const Vec *b, Vec *a_b )
{
    ASSERTMSG( (a   != 0), VEC_SUBTRACT_1 );
    ASSERTMSG( (b   != 0), VEC_SUBTRACT_2 );
    ASSERTMSG( (a_b != 0), VEC_SUBTRACT_3 );

    a_b->x = a->x - b->x;
    a_b->y = a->y - b->y;
    a_b->z = a->z - b->z;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single difference: ab = a - b.  Arguments are not checked.
void PSVECSubtract ( const Vec *a, const Vec *b, Vec *ab )
{
    f32x2 minuend;
    f32x2 subtrahend;
    f32x2 diff;

    // x and y are handled as one pair: load both operands, subtract, store
    minuend    = __PSQ_L(a, 0, 0);
    subtrahend = __PSQ_L(b, 0, 0);
    diff       = __PS_SUB(minuend, subtrahend);
    __PSQ_ST(ab, diff, 0, 0);

    // z lives at byte offset 8 and goes through the single-value
    // form of the load and store
    minuend    = __PSQ_LX(a, 8, 1, 0);
    subtrahend = __PSQ_LX(b, 8, 1, 0);
    diff       = __PS_SUB(minuend, subtrahend);
    __PSQ_STX(ab, 8, diff, 1, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:           VECScale

Description:    multiply a vector by a scalar.


Arguments:      src     unscaled source vector.

                dst     scaled resultant vector ( src * scale).
                        ok if dst == src.

                scale   scaling factor.


Return:         none.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Uniform scale: dst = src * scale.
// dst may be the same object as src.
void C_VECScale ( const Vec *src, Vec *dst, f32 scale )
{
    ASSERTMSG( (src != 0), VEC_SCALE_1 );
    ASSERTMSG( (dst != 0), VEC_SCALE_2 );

    dst->x = src->x * scale;
    dst->y = src->y * scale;
    dst->z = src->z * scale;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single uniform scale: dst = src * scale.
// Arguments are not checked.  Both parts of src are loaded before anything
// is written, so dst may be the same object as src.
void PSVECScale ( const Vec *src, Vec *dst, f32 scale )
{
    f32x2 V1_XY;
    f32x2 V1_Z;
    f32x2 D1_XY;
    f32x2 D1_Z;
    f32x2 SCALE = {scale, scale};

    // load X | Y
    V1_XY = __PSQ_L(src, 0, 0);

    // load Z (single-value form)
    V1_Z = __PSQ_LX(src, 8, 1, 0);

    // X*scale | Y*scale  (MULS0 multiplies both slots by slot 0 of SCALE)
    D1_XY = __PS_MULS0(V1_XY, SCALE);

    // store X | Y.  The trailing arguments are (0, 0), matching the load
    // above and every other paired store in this file, so the data is
    // written through the same quantization selector it was read with.
    __PSQ_ST(dst, D1_XY, 0, 0);

    // Z*scale
    D1_Z = __PS_MULS0(V1_Z, SCALE);

    // store Z
    __PSQ_STX(dst, 8, D1_Z, 1, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:           VECNormalize

Description:    normalize a vector.


Arguments:      src     non-unit source vector.

                unit    resultant unit vector ( src / src magnitude ).
                        ok if unit == src


Return:         none.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Normalizes a vector: unit = src / |src|.
// unit may be the same object as src.  A zero-length src trips the
// VEC_NORMALIZE_3 assertion.
void C_VECNormalize ( const Vec *src, Vec *unit )
{
    f32 lenSq;
    f32 invLen;

    ASSERTMSG( (src  != 0), VEC_NORMALIZE_1 );
    ASSERTMSG( (unit != 0), VEC_NORMALIZE_2 );

    // squared length first, so the zero case can be checked before dividing
    lenSq = (src->x * src->x) + (src->y * src->y) + (src->z * src->z);

    ASSERTMSG( (lenSq != 0), VEC_NORMALIZE_3 );

    invLen = 1.0f / sqrtf(lenSq);

    unit->x = src->x * invLen;
    unit->y = src->y * invLen;
    unit->z = src->z * invLen;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single normalize: dst = vec1 / |vec1|.
// Arguments are not checked and a zero-length input is not detected.
// The reciprocal length comes from the hardware estimate plus one Newton
// refinement step.
void PSVECNormalize ( const Vec *vec1, Vec *dst )
{
    f32x2 xy, z1;
    f32x2 sq, acc;
    f32x2 lenSq;
    f32x2 est;
    f32x2 estSq, halfEst;

    // X | Y
    xy = __PSQ_L(vec1, 0, 0);

    // X*X | Y*Y
    sq = __PS_MUL(xy, xy);

    // Z | 1
    z1 = __PSQ_LX(vec1, 8, 1, 0);

    // Z*Z+X*X | 1+Y*Y
    acc = __PS_MADD(z1, z1, sq);

    // slot 0: X*X+Z*Z+Y*Y (only slot 0 is consumed by the MULS0 calls below)
    lenSq = __PS_SUM0(acc, z1, sq);

    // estimate E of 1/sqrt(lenSq)
    est = __PS_RSQRTE(lenSq);

    // one Newton step: E' = (E/2) * (3 - lenSq*E*E)
    estSq   = __PS_MUL(est, est);
    halfEst = __PS_MUL(est, c_half);
    estSq   = __PS_NMSUB(estSq, lenSq, c_three);
    est     = __PS_MUL(estSq, halfEst);

    // X/|v| | Y/|v|
    xy = __PS_MULS0(xy, est);
    __PSQ_ST(dst, xy, 0, 0);

    // Z/|v|
    z1 = __PS_MULS0(z1, est);
    __PSQ_STX(dst, 8, z1, 1, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:           VECSquareMag

Description:    compute the square of the magnitude of a vector.


Arguments:      v    source vector.


Return:         square magnitude of the vector.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Returns the squared magnitude of v: x*x + y*y + z*z.
f32 C_VECSquareMag ( const Vec *v )
{
    f32 lenSq;

    ASSERTMSG( (v != 0), VEC_MAG_1 );

    lenSq = (v->x * v->x) + (v->y * v->y) + (v->z * v->z);
    return lenSq;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
 *---------------------------------------------------------------------*/

f32 PSVECSquareMag ( const Vec *v )
{
    f32x2    V1_XY, V1_ZZ, sqmag;

    // load X | Y
    V1_XY = __PSQ_L(v, 0, 0);

    // XX | YY
    V1_XY = __PS_MUL(V1_XY, V1_XY);

    // load Z | Z
    V1_ZZ[0] = v->z;
    V1_ZZ[1] = v->z;

    // XX + ZZ | YY + ZZ
    sqmag = __PS_MADD(V1_ZZ, V1_ZZ, V1_XY);
    sqmag = __PS_SUM0(sqmag, V1_XY, V1_XY);

    return (f32)sqmag[0];
}
#endif

/*---------------------------------------------------------------------*

Name:           VECMag

Description:    compute the magnitude of a vector.


Arguments:      v    source vector.


Return:         magnitude of the vector.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Returns the magnitude of v as the square root of its squared magnitude.
// The null check on v is the one performed inside C_VECSquareMag.
f32 C_VECMag ( const Vec *v )
{
    f32 lenSq = C_VECSquareMag(v);

    return sqrtf( lenSq );
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*/
// Paired-single magnitude of v.  Arguments are not checked.
//
// The squared magnitude is formed first.  When it is non-zero it is
// multiplied by a refined reciprocal square root estimate, because
// X * 1/sqrt(X) = sqrt(X).  A zero squared magnitude is returned unchanged,
// which keeps the estimate from ever being taken of zero.
f32 PSVECMag ( const Vec *v )
{
    f32x2    vz1, vxy;
    f32x2    sqmag, rmag;
    f32x2    nwork0, nwork1;

    // X | Y
    vxy = __PSQ_L(v, 0, 0);

    // X*X | Y*Y
    vxy = __PS_MUL(vxy, vxy);

    // Z | 1 : single-value load (W = 1), the form every other Z load in this
    // file uses.  A paired load at offset 8 would additionally fetch the
    // float that follows z, which is outside the x/y/z data read here.
    vz1 = __PSQ_LX(v, 8, 1, 0);

    // Z*Z+X*X | 1+Y*Y
    sqmag = __PS_MADD(vz1, vz1, vxy);

    // slot 0: Z*Z+X*X+Y*Y  (slot 1 of the MADD result is not used)
    sqmag = __PS_SUM0(sqmag, vxy, vxy);

    if (sqmag[0] != 0)
    {
        // estimate E of 1/sqrt(sqmag)
        rmag = __PS_RSQRTE(sqmag);

        // one Newton step: E' = (E/2) * (3 - sqmag*E*E)
        nwork0 = __PS_MUL(rmag, rmag);
        nwork1 = __PS_MUL(rmag, c_half);
        nwork0 = __PS_NMSUB(nwork0, sqmag, c_three);
        rmag = __PS_MUL(nwork0, nwork1);

        // 1/sqrt(X) * X = sqrt(X)
        sqmag = __PS_MUL(sqmag, rmag);
    }

    return (f32)sqmag[0];
}
#endif

/*---------------------------------------------------------------------*

Name:           VECDotProduct

Description:    compute the dot product of two vectors.


Arguments:      a    first vector.

                b    second vector.

                note:  input vectors do not have to be normalized.
                       input vectors are not normalized within the function.

                       if direct cosine computation of the angle
                       between a and b is desired, a and b should be
                       normalized prior to calling VECDotProduct.


Return:         dot product value.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Returns the dot product a . b.  The inputs are used as given; nothing
// is normalized here.
f32 C_VECDotProduct ( const Vec *a, const Vec *b )
{
    f32 result;

    ASSERTMSG( (a != 0), VEC_DOTPRODUCT_1 );
    ASSERTMSG( (b != 0), VEC_DOTPRODUCT_2 );

    result = (a->x * b->x) + (a->y * b->y) + (a->z * b->z);
    return result;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
f32 PSVECDotProduct ( const Vec *vec1, const Vec *vec2 )
{
    f32x2    fp1, fp2, fp3, fp4, fp5;

    //psq_l    fp2, 4(vec1), 0, 0;
    fp2 = __PSQ_LX(vec1, 4, 0, 0);

    //psq_l    fp3, 4(vec2), 0, 0;
    fp3 = __PSQ_LX(vec2, 4, 0, 0);

    //ps_mul   fp2, fp2, fp3;
    fp2 = __PS_MUL(fp2, fp3);

    //psq_l    fp5, 0(vec1), 0, 0;
    fp5 = __PSQ_L(vec1, 0, 0);

    //psq_l    fp4, 0(vec2), 0, 0;
    fp4 = __PSQ_L(vec2, 0, 0);

    //ps_madd  fp3, fp5, fp4, fp2;
    fp3 = __PS_MADD(fp5, fp4, fp2);

    //ps_sum0  fp1, fp3, fp2, fp2;
    fp1 = __PS_SUM0(fp3, fp2, fp2);

    return (f32)fp1[0];
}
#endif

/*---------------------------------------------------------------------*

Name:           VECCrossProduct

Description:    compute the cross product of two vectors.


Arguments:      a       first vector.

                b       second vector.

                note:  input vectors do not have to be normalized.


                axb     resultant vector.
                        ok if axb == a or axb == b.


Return:         none.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Cross product: axb = a x b.
// All three components are computed into locals before axb is written,
// so axb may be the same object as a or b.
void C_VECCrossProduct ( const Vec *a, const Vec *b, Vec *axb )
{
    f32 cx;
    f32 cy;
    f32 cz;

    ASSERTMSG( (a   != 0), VEC_CROSSPRODUCT_1 );
    ASSERTMSG( (b   != 0), VEC_CROSSPRODUCT_2 );
    ASSERTMSG( (axb != 0), VEC_CROSSPRODUCT_3 );

    cx = ( a->y * b->z ) - ( a->z * b->y );
    cy = ( a->z * b->x ) - ( a->x * b->z );
    cz = ( a->x * b->y ) - ( a->y * b->x );

    axb->x = cx;
    axb->y = cy;
    axb->z = cz;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
            Note that this performs NO error checking.
 *---------------------------------------------------------------------*/
// Paired-single cross product: dst = vec1 x vec2 (A = vec1, B = vec2 in the
// comments below).  Arguments are not checked.
// Every read of vec1 and vec2 happens before the first store to dst.
void PSVECCrossProduct
(
    const Vec *vec1,
    const Vec *vec2,
    Vec *dst
)
{
    f32x2 fp0, fp1;
    f32x2 fp2 = {vec1->z, vec1->z};     // AZ | AZ
    f32x2 fp3 = {vec2->z, vec2->z};     // BZ | BZ
    f32x2 fp4, fp5, fp6, fp7, fp8, fp9, fp10;

    // target result:
    //x =   a.n[VY]*b.n[VZ] - a.n[VZ]*b.n[VY];
    //y =   a.n[VZ]*b.n[VX] - a.n[VX]*b.n[VZ];
    //z =   a.n[VX]*b.n[VY] - a.n[VY]*b.n[VX];

    // BX | BY
    fp1 = __PSQ_L(vec2, 0, 0);

    // AX | AY
    fp0 = __PSQ_L(vec1, 0, 0);

    // BY | BX
    fp6 = __PS_MERGE10(fp1, fp1);

    // BX*AZ | BY*AZ
    fp4 = __PS_MUL(fp1, fp2);

    // BX*AX | BY*AX
    fp7 = __PS_MULS0(fp1, fp0);

    // AX*BZ-BX*AZ | AY*BZ-BY*AZ   (slot 0 is -y, slot 1 is x)
    fp5 = __PS_MSUB(fp0, fp3, fp4);

    // AX*BY-BX*AX | AY*BX-BY*AX   (slot 1 is -z, slot 0 is not used)
    fp8 = __PS_MSUB(fp0, fp6, fp7);

    // AY*BZ-AZ*BY | AY*BZ-AZ*BY   (x in both slots)
    fp9 = __PS_MERGE11(fp5, fp5);

    // AX*BZ-AZ*BX | AY*BX-AX*BY   (-y | -z)
    fp10 = __PS_MERGE01(fp5, fp8);

    // Store X (single-value store at offset 0)
    __PSQ_ST(dst, fp9, 1, 0);

    // AZ*BX-AX*BZ | AX*BY-AY*BX   (y | z after negation)
    fp10 = __PS_NEG(fp10);

    // store YZ as a pair at byte offset 4
    __PSQ_STX(dst, 4, fp10, 0, 0);

}
#endif

/*---------------------------------------------------------------------*

Name:           VECHalfAngle

Description:    compute the vector halfway between two vectors.
                intended for use in computing specular highlights


Arguments:      a     first vector.
                      this must point FROM the light source (tail)
                      TO the surface (head).

                b     second vector.
                      this must point FROM the viewer (tail)
                      TO the surface (head).

                note:     input vectors do not have to be normalized.


                half  resultant normalized 'half-angle' vector.
                      ok if half == a or half == b


Return:         none.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Computes the normalized vector halfway between -a and -b.
// Both inputs are reversed and normalized on local copies, then summed.
// When the sum has zero length (the reversed unit vectors cancel), that
// zero sum is returned instead of being normalized.
// half may be the same object as a or b because only copies are modified
// before the final write.
void C_VECHalfAngle ( const Vec *a, const Vec *b, Vec *half )
{
    Vec revA;
    Vec revB;
    Vec sum;

    ASSERTMSG( (a    != 0), VEC_HALFANGLE_1 );
    ASSERTMSG( (b    != 0), VEC_HALFANGLE_2 );
    ASSERTMSG( (half != 0), VEC_HALFANGLE_3 );

    // reversed copy of a
    revA.x = -a->x;
    revA.y = -a->y;
    revA.z = -a->z;

    // reversed copy of b
    revB.x = -b->x;
    revB.y = -b->y;
    revB.z = -b->z;

    C_VECNormalize( &revA, &revA );
    C_VECNormalize( &revB, &revB );

    C_VECAdd( &revA, &revB, &sum );

    // singular case: nothing to normalize, hand back the sum as is
    if ( !( C_VECDotProduct( &sum, &sum ) > 0.0F ) )
    {
        *half = sum;
        return;
    }

    C_VECNormalize( &sum, half );
}

/*---------------------------------------------------------------------*

Name:           VECReflect

Description:    reflect a vector about a normal to a surface.


Arguments:      src        incident vector.

                normal     normal to surface.

                dst        normalized reflected vector.
                           ok if dst == src


Return:         none.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Reflects the incident vector src about the surface normal and writes the
// normalized result to dst.
// src and normal are only read into local unit vectors before dst is
// written, so dst may be the same object as src.
void C_VECReflect ( const Vec *src, const Vec *normal, Vec *dst )
{
    f32 nDotI;
    Vec unitI;
    Vec unitN;

    ASSERTMSG( (src    != 0), VEC_REFLECT_1 );
    ASSERTMSG( (normal != 0), VEC_REFLECT_2 );
    ASSERTMSG( (dst    != 0), VEC_REFLECT_3 );

    // src is treated as arriving at the surface; flip it so that it and
    // the normal sit tail to tail
    unitI.x = -( src->x );
    unitI.y = -( src->y );
    unitI.z = -( src->z );

    // C_VECNormalize asserts on a zero-magnitude input
    C_VECNormalize( &unitI, &unitI );
    C_VECNormalize( normal, &unitN );

    // cosine of the angle between the two unit vectors
    nDotI = C_VECDotProduct( &unitI, &unitN );

    // R = 2N(N.I) - I
    dst->x = (2.0f * unitN.x * nDotI) - unitI.x;
    dst->y = (2.0f * unitN.y * nDotI) - unitI.y;
    dst->z = (2.0f * unitN.z * nDotI) - unitI.z;

    C_VECNormalize( dst, dst );
}

/*---------------------------------------------------------------------*

Name:           VECSquareDistance

Description:    Returns the square of the distance between vectors
                a and b.  Distance can be calculated using the
                square root of the returned value.


Arguments:      a     first vector.

                b     second vector.


Return:         square of the distance between the two vectors.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Returns the squared distance between a and b, i.e. the squared magnitude
// of (a - b).  No argument checking is done here.
f32 C_VECSquareDistance( const Vec *a, const Vec *b )
{
    f32 dx;
    f32 dy;
    f32 dz;

    dx = a->x - b->x;
    dy = a->y - b->y;
    dz = a->z - b->z;

    return (dx * dx) + (dy * dy) + (dz * dz);
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*/

f32 PSVECSquareDistance( const Vec *a, const Vec *b )
{
    f32x2    v0yz, v1yz, v0xy, v1xy;
    f32x2    dyz, dxy, sqdist;

    v0yz = __PSQ_LX(a, 4, 0, 0);
    v1yz = __PSQ_LX(b, 4, 0, 0);

    dyz = __PS_SUB(v0yz, v1yz); // [Y0-Y1][Z0-Z1]

    v0xy = __PSQ_L(a, 0, 0);
    v1xy = __PSQ_L(b, 0, 0);

    dyz = __PS_MUL(dyz, dyz);              // [dYdY][dZdZ]
    dxy = __PS_SUB(v0xy, v1xy);            // [X0-X1][Y0-Y1]

    sqdist = __PS_MADD(dxy, dxy, dyz);      // [dXdX+dYdY][dYdY+dZdZ]
    sqdist = __PS_SUM0(sqdist, dyz, dyz);   // [dXdX+dYdY+dZdZ][N/A]

    return (f32)sqdist[0];
}
#endif


/*---------------------------------------------------------------------*

Name:           VECDistance

Description:    Returns the distance between vectors a and b.


Arguments:      a     first vector.

                b     second vector.


Return:         distance between the two vectors.

 *---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Returns the distance between a and b as the square root of
// C_VECSquareDistance(a, b).
f32 C_VECDistance( const Vec *a, const Vec *b )
{
    f32 distSq = C_VECSquareDistance( a, b );

    return sqrtf( distSq );
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*/
f32 PSVECDistance( const Vec *a, const Vec *b )
{
    f32x2    v0yz, v1yz, v0xy, v1xy;
    f32x2    dyz, dxy, sqdist, rdist;
    f32x2    nwork0, nwork1;

    //psq_l       v0yz, 4(a), 0, 0           // [Y0][Z0]
    v0yz = __PSQ_LX(a, 4, 0, 0);

    //psq_l       v1yz, 4(b), 0, 0           // [Y1][Z1]
    v1yz = __PSQ_LX(b, 4, 0, 0);

    //ps_sub      dyz, v0yz, v1yz            // [Y0-Y1][Z0-Z1]
    dyz = __PS_SUB(v0yz, v1yz);

    //psq_l       v0xy, 0(a), 0, 0           // [X0][Y0]
    v0xy = __PSQ_L(a, 0, 0);

    //psq_l       v1xy, 0(b), 0, 0           // [X1][Y1]
    v1xy = __PSQ_L(b, 0, 0);

    //ps_mul      dyz, dyz, dyz              // [dYdY][dZdZ]
    dyz = __PS_MUL(dyz, dyz);

    //ps_sub      dxy, v0xy, v1xy            // [X0-X1][Y0-Y1]
    dxy = __PS_SUB(v0xy, v1xy);

    //ps_madd     sqdist, dxy, dxy, dyz      // [dXdX+dYdY][dYdY+dZdZ]
    sqdist = __PS_MADD(dxy, dxy, dyz);

    //ps_sum0     sqdist, sqdist, dyz, dyz   // [dXdX+dYdY+dZdZ][N/A]
    sqdist = __PS_SUM0(sqdist, dyz, dyz);
    
    if (sqdist[0] != 0)
    {
        // 1.0/sqrt : estimation[E]
        //frsqrte     rdist, sqdist
        rdist = __PS_RSQRTE(sqdist);

        // Refinement x 1 : E' = (E/2)(3 - X*E*E)
        //fmul        nwork0, rsqmag, rsqmag
        nwork0 = __PS_MUL(rdist, rdist);

        //fmul        nwork1, rsqmag, c_half
        nwork1 = __PS_MUL(rdist, c_half);

        //fnmsub      nwork0, nwork0, mag, c_three
        nwork0 = __PS_NMSUB(nwork0, sqdist, c_three);

        //fmul        rsqmag, nwork0, nwork1
        rdist = __PS_MUL(nwork0, nwork1);

        // 1/sqrt(X) * X = sqrt(X)
        //fmuls       sqdist, sqdist, rdist
        sqdist = __PS_MUL(sqdist, rdist);
    }

    return (f32)sqdist[0];
}
#endif


/*---------------------------------------------------------------------*

Name:           MTXMultVec

Description:    multiplies a vector by a matrix.
                m x src = dst.


Arguments:      m         matrix.
                src       source vector for multiply.
                dst       resultant vector from multiply.

                note:      ok if src == dst.


Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Transforms a point by the matrix: dst = m x src, with the fourth column
// of m added in (src is treated as having an implicit w of 1).
// The result is built in locals before dst is written, so dst may be the
// same object as src.
void C_MTXMultVec ( MTX_CONST Mtx m, const Vec *src, Vec *dst )
{
    f32 outX;
    f32 outY;
    f32 outZ;

    ASSERTMSG( (m   != 0), MTX_MULTVEC_1 );
    ASSERTMSG( (src != 0), MTX_MULTVEC_2 );
    ASSERTMSG( (dst != 0), MTX_MULTVEC_3 );

    // one matrix row per output component; column 3 is the translation term
    outX = m[0][0]*src->x + m[0][1]*src->y + m[0][2]*src->z + m[0][3];
    outY = m[1][0]*src->x + m[1][1]*src->y + m[1][2]*src->z + m[1][3];
    outZ = m[2][0]*src->x + m[2][1]*src->y + m[2][2]*src->z + m[2][3];

    dst->x = outX;
    dst->y = outY;
    dst->z = outZ;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that NO error checking is performed.
 *---------------------------------------------------------------------*/

// Paired-single point transform: dst = m x src, with src treated as
// (v[0], v[1], v[2], 1).  Arguments are not checked.
// Both parts of src are loaded before the first store to dst, so dst may be
// the same object as src.  Matrix rows are still being loaded after dst[0]
// and dst[1] have been stored.
void PSMTXMultVec ( MTX_CONST Mtx m, const Vec *src, Vec *dst )
{
    f32x2 fp0, fp1, fp2, fp3, fp4, fp5, fp6, fp8, fp9, fp10, fp11, fp12; // fp7 is unused
    const f32x2 zero = {0.0, 0.0};

    // load v[0], v[1]
    fp0 = __PSQ_L(src, 0, 0);

    // load m[0][0], m[0][1]
    fp2 = __PSQ_L(m, 0, 0);

    // load v[2], 1  (single-value load: second slot becomes 1, which is
    // what lets the fourth matrix column be added by the MADD below)
    fp1 = __PSQ_LX(src, 8, 1, 0);

    // m[0][0]*v[0], m[0][1]*v[1]
    fp4 = __PS_MUL(fp2, fp0);

    // load m[0][2], m[0][3]
    fp3 = __PSQ_LX(m, 8, 0, 0);

    // m[0][0]*v[0]+m[0][2]*v[2], m[0][1]*v[1]+m[0][3]
    fp5 = __PS_MADD(fp3, fp1, fp4);

    // load m[1][0], m[1][1]
    fp8 = __PSQ_LX(m, 16, 0, 0);
    fp6 = zero;

    // slot 0: m[0][0]*v[0]+m[0][2]*v[2]+m[0][1]*v[1]+m[0][3]
    // (slot 1 is not stored)
    fp6 = __PS_SUM0(fp5, fp6, fp5);

    // load m[1][2], m[1][3]
    fp9 = __PSQ_LX(m, 24, 0, 0);

    // m[1][0]*v[0], m[1][1]*v[1]
    fp10 = __PS_MUL(fp8, fp0);

    // store dst[0]
    __PSQ_ST(dst, fp6, 1, 0);

    // m[1][0]*v[0]+m[1][2]*v[2], m[1][1]*v[1]+m[1][3]
    fp11 = __PS_MADD(fp9, fp1, fp10);

    // load m[2][0], m[2][1]
    fp2 = __PSQ_LX(m, 32, 0, 0);
    fp12 = zero;

    // slot 0: m[1][0]*v[0]+m[1][2]*v[2]+m[1][1]*v[1]+m[1][3]
    // (slot 1 is not stored)
    fp12 = __PS_SUM0(fp11, fp12, fp11);

    // load m[2][2], m[2][3]
    fp3 = __PSQ_LX(m, 40, 0, 0);

    // m[2][0]*v[0], m[2][1]*v[1]
    fp4 = __PS_MUL(fp2, fp0);

    // store dst[1]
    __PSQ_STX(dst, 4, fp12, 1, 0);

    // m[2][0]*v[0]+m[2][2]*v[2], m[2][1]*v[1]+m[2][3]
    fp5 = __PS_MADD(fp3, fp1, fp4);
    fp6 = zero;

    // slot 0: m[2][0]*v[0]+m[2][2]*v[2]+m[2][1]*v[1]+m[2][3]
    // (slot 1 is not stored)
    fp6 = __PS_SUM0(fp5, fp6, fp5);

    // store dst[2]
    __PSQ_STX(dst, 8, fp6, 1, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXMultVecArray

Description:    multiplies an array of vectors by a matrix.


Arguments:      m         matrix.
                srcBase   start of source vector array.
                dstBase   start of resultant vector array.

                note:     ok if srcBase == dstBase.

                count     number of vectors in srcBase, dstBase arrays
                          note:      cannot check for array overflow

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
// Transforms count points by the matrix: dstBase[i] = m x srcBase[i], with
// the fourth column of m added in (implicit w of 1).
// Each result is built in locals before it is written, so dstBase may be
// the same array as srcBase.  count is asserted to be greater than 1; the
// array bounds themselves cannot be checked here.
void C_MTXMultVecArray ( MTX_CONST Mtx m, const Vec *srcBase, Vec *dstBase, u32 count )
{
    u32 idx;
    f32 outX;
    f32 outY;
    f32 outZ;

    ASSERTMSG( (m       != 0), MTX_MULTVECARRAY_1 );
    ASSERTMSG( (srcBase != 0), MTX_MULTVECARRAY_2 );
    ASSERTMSG( (dstBase != 0), MTX_MULTVECARRAY_3 );
    ASSERTMSG( (count > 1),    MTX_MULTVECARRAY_4 );

    for ( idx = 0 ; idx < count ; idx++ )
    {
        const Vec *in  = &srcBase[idx];
        Vec       *out = &dstBase[idx];

        // one matrix row per output component; column 3 is the translation term
        outX = m[0][0]*in->x + m[0][1]*in->y + m[0][2]*in->z + m[0][3];
        outY = m[1][0]*in->x + m[1][1]*in->y + m[1][2]*in->z + m[1][3];
        outZ = m[2][0]*in->x + m[2][1]*in->y + m[2][2]*in->z + m[2][3];

        out->x = outX;
        out->y = outY;
        out->z = outZ;
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that NO error checking is performed.

                The count should be greater than 1.
 *---------------------------------------------------------------------*/

// Paired-single array transform: applies PSMTXMultVec to each of the count
// source/destination pairs in order.  Arguments are not checked.
void PSMTXMultVecArray ( MTX_CONST Mtx m, const Vec *srcBase, Vec *dstBase, u32 count )
{
    u32 idx;

    for ( idx = 0 ; idx < count ; idx++ )
    {
        PSMTXMultVec(m, &srcBase[idx], &dstBase[idx]);
    }
}
#endif

/*---------------------------------------------------------------------*

Name:         MTXMultVecSR

Description:  multiplies a vector by a matrix 3x3 (Scaling and Rotation)
              component.

              m x src = dst.

Arguments:    m       matrix.
              src     source vector for multiply.
              dst     resultant vector from multiply.

              note:   ok if src == dst.

Return:       none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
/*
 * C_MTXMultVecSR
 *
 * Computes dst = M3x3 * src, where M3x3 is rows 0..2 / columns 0..2 of m.
 * Column 3 of m is never read, so no translation is applied.
 *
 * The source components are read and the full result is built in locals
 * before anything is written, which is what makes src == dst safe.
 */
void C_MTXMultVecSR ( MTX_CONST Mtx m, const Vec *src, Vec *dst )
{
    Vec in;
    Vec out;

    ASSERTMSG( (m   != 0), MTX_MULTVECSR_1 );
    ASSERTMSG( (src != 0), MTX_MULTVECSR_2 );
    ASSERTMSG( (dst != 0), MTX_MULTVECSR_3 );

    // snapshot the source components
    in.x = src->x;
    in.y = src->y;
    in.z = src->z;

    // one dot product per matrix row (3x3 part only)
    out.x = m[0][0]*in.x + m[0][1]*in.y + m[0][2]*in.z;
    out.y = m[1][0]*in.x + m[1][1]*in.y + m[1][2]*in.z;
    out.z = m[2][0]*in.x + m[2][1]*in.y + m[2][2]*in.z;

    // publish the result
    dst->x = out.x;
    dst->y = out.y;
    dst->z = out.z;
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that this performs NO error checking.
 *---------------------------------------------------------------------*/

/*
 * PSMTXMultVecSR
 *
 * Paired-single intrinsics implementation of MTXMultVecSR.  Reads m at
 * byte offsets 0, 8, 16, 24, 32 and 40, reads src at byte offsets 0 and 8,
 * and writes three values to dst at byte offsets 0, 4 and 8.  No argument
 * checking is done.
 *
 * Every load from m and src appears before the first store to dst.
 *
 * NOTE(review): the lane-level comments below assume the usual
 * paired-single semantics for the __PSQ_* / __PS_* intrinsics (W = 1
 * meaning "one float, second lane = 1.0"; __PS_SUM0 folding both lanes
 * into the first lane; __PS_MADD(a, c, b) = a*c + b per lane) -- confirm
 * against the intrinsic reference.  Under that reading the m[r][3] values
 * only ever reach the second lane, which is never stored, so translation
 * does not contribute to dst.
 *
 * NOTE(review): loads and arithmetic are interleaved, presumably for
 * instruction scheduling; keep the statement order when editing.
 */
void PSMTXMultVecSR ( MTX_CONST Mtx m, const Vec *src, Vec *dst )
{
    f32x2 fp0, fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, fp9, fp10, fp11, fp12, fp13;

    // m[0][0], m[0][1]
    fp0 = __PSQ_L(m, 0, 0);

    // fp6 - x y
    fp6 = __PSQ_L(src, 0, 0);

    // m[1][0], m[1][1]
    fp2 = __PSQ_LX(m, 16, 0, 0);

    // fp8 = m00x m01y // next X
    fp8 = __PS_MUL(fp0, fp6);

    // m[2][0], m[2][1]
    fp4 = __PSQ_LX(m, 32, 0, 0);

    // fp10 = m10x m11y // next Y
    fp10 = __PS_MUL(fp2, fp6);

    // fp7 - z,1.0
    // (single-float load from src + 8; equivalent to the two assignments
    //  sketched below, assuming W = 1 fills the second lane with 1.0)
    //fp7[0] = src->z;
    //fp7[1] = 1.0F;
    fp7 = __PSQ_LX(src, 8, 1, 0);

    // fp12 = m20x m21y // next Z
    fp12 = __PS_MUL(fp4, fp6);

    // m[1][2], m[1][3]
    fp3 = __PSQ_LX(m, 24, 0, 0);

    // presumably first lane of fp8 = m00x + m01y -- confirm __PS_SUM0
    fp8 = __PS_SUM0(fp8, fp8, fp8);

    // m[2][2], m[2][3]
    fp5 = __PSQ_LX(m, 40, 0, 0);

    // presumably first lane of fp10 = m10x + m11y
    fp10 = __PS_SUM0(fp10, fp10, fp10);

    // m[0][2], m[0][3]
    fp1 = __PSQ_LX(m, 8, 0, 0);

    // presumably first lane of fp12 = m20x + m21y
    fp12 = __PS_SUM0(fp12, fp12, fp12);

    // first lane: m02*z + (m00x + m01y), assuming MADD is a*c + b per lane
    fp9 = __PS_MADD(fp1, fp7, fp8);

    // store X
    __PSQ_ST(dst, fp9, 1, 0);

    // first lane: m12*z + (m10x + m11y)
    fp11 = __PS_MADD(fp3, fp7, fp10);

    // store Y
    __PSQ_STX(dst, 4, fp11, 1, 0);

    // first lane: m22*z + (m20x + m21y)
    fp13 = __PS_MADD(fp5, fp7, fp12);

    //  store Z
    __PSQ_STX(dst, 8, fp13, 1, 0);
}
#endif

/*---------------------------------------------------------------------*

Name:           MTXMultVecArraySR

Description:    multiplies an array of vectors by the 3x3
                (Scaling and Rotation) component of a matrix.

Arguments:      m        matrix.
                srcBase  start of source vector array.
                dstBase  start of resultant vector array.

                note:    ok if srcBase == dstBase.

                count    number of vectors in srcBase, dstBase arrays
                note:    cannot check for array overflow

Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
/*
 * C_MTXMultVecArraySR
 *
 * For every k in [0, count): dstBase[k] = M3x3 * srcBase[k], where M3x3 is
 * rows 0..2 / columns 0..2 of m.  Column 3 of m is never read, so no
 * translation is applied.
 *
 * Each result is assembled in a local before it is stored, so an element
 * can be transformed in place (srcBase == dstBase).
 *
 * Arguments are checked with ASSERTMSG; the count check requires
 * count > 1.
 */
void C_MTXMultVecArraySR ( MTX_CONST Mtx m, const Vec *srcBase, Vec *dstBase, u32 count )
{
    u32 idx;
    Vec out;

    ASSERTMSG( (m       != 0), MTX_MULTVECARRAYSR_1 );
    ASSERTMSG( (srcBase != 0), MTX_MULTVECARRAYSR_2 );
    ASSERTMSG( (dstBase != 0), MTX_MULTVECARRAYSR_3 );
    ASSERTMSG( (count > 1),    MTX_MULTVECARRAYSR_4 );

    for ( idx = 0; idx < count; idx++ )
    {
        const Vec *in  = &srcBase[idx];
        Vec       *res = &dstBase[idx];

        // one dot product per matrix row (3x3 part only)
        out.x = m[0][0]*in->x + m[0][1]*in->y + m[0][2]*in->z;
        out.y = m[1][0]*in->x + m[1][1]*in->y + m[1][2]*in->z;
        out.z = m[2][0]*in->x + m[2][1]*in->y + m[2][2]*in->z;

        // store only after all three components are known
        res->x = out.x;
        res->y = out.y;
        res->z = out.z;
    }
}

#if !defined(WIN32) && !defined(WIN64)
/*---------------------------------------------------------------------*
    Paired-Single intrinsics version
 *---------------------------------------------------------------------*
                Note that NO error checking is performed.
 *---------------------------------------------------------------------*/

/*
 * PSMTXMultVecArraySR
 *
 * Calls PSMTXMultVecSR once per element, in ascending index order, with
 * &srcBase[k] as the source and &dstBase[k] as the destination for every
 * k in [0, count).  Nothing is validated here; a count of 0 performs no
 * calls.
 */
void PSMTXMultVecArraySR ( MTX_CONST Mtx m, const Vec *srcBase, Vec *dstBase, u32 count )
{
    u32 idx = 0;

    while ( idx < count )
    {
        PSMTXMultVecSR(m, &srcBase[idx], &dstBase[idx]);
        idx++;
    }
}
#endif


/*---------------------------------------------------------------------*

Name:           MTXROMultVecArray

Description:    Multiplies an array of vectors by a reordered matrix,
                using paired single operations.
                OK if source = destination.
                NOTE: number of vertices transformed cannot be less than
                2.

                Note that NO error checking is performed.

Arguments:      m         reordered matrix.
                srcBase   start of source vector array.
                dstBase   start of resultant vector array.
                count     number of vectors in srcBase, dstBase arrays
                          COUNT MUST BE GREATER THAN 1 (I.E. AT LEAST 2).


Return:         none

*---------------------------------------------------------------------*/
/*---------------------------------------------------------------------*
    C version
 *---------------------------------------------------------------------*/
/*
 * C_MTXROMultVecArray
 *
 * For every k in [0, count), transforms srcBase[k] by the reordered
 * matrix m and stores the result in dstBase[k]:
 *
 *     out.c = m[0][c]*x + m[1][c]*y + m[2][c]*z + m[3][c]    (c = 0, 1, 2)
 *
 * i.e. m is indexed [column][row] relative to the Mtx layout, and m[3][c]
 * is added as the translation term (implicit w = 1).
 *
 * Each result is assembled in a local before it is stored, so an element
 * can be transformed in place (srcBase == dstBase).
 *
 * Arguments are checked with ASSERTMSG; the count check requires
 * count > 1.
 * NOTE(review): the message IDs are the MTX_MULTVECARRAY_* ones that
 * C_MTXMultVecArray also uses -- confirm whether dedicated RO messages
 * were intended.
 */
void C_MTXROMultVecArray
(
    MTX_CONST ROMtx  m,      // r3
    const Vec   *srcBase,// r4
    Vec   *dstBase,// r5
    u32    count   // r6
)
{
    u32 idx;
    Vec out;

    ASSERTMSG( (m       != 0), MTX_MULTVECARRAY_1 );
    ASSERTMSG( (srcBase != 0), MTX_MULTVECARRAY_2 );
    ASSERTMSG( (dstBase != 0), MTX_MULTVECARRAY_3 );
    ASSERTMSG( (count > 1),    MTX_MULTVECARRAY_4 );

    for ( idx = 0; idx < count; idx++ )
    {
        const Vec *in  = &srcBase[idx];
        Vec       *res = &dstBase[idx];

        // walk down each column of the reordered matrix
        out.x = m[0][0]*in->x + m[1][0]*in->y + m[2][0]*in->z + m[3][0];
        out.y = m[0][1]*in->x + m[1][1]*in->y + m[2][1]*in->z + m[3][1];
        out.z = m[0][2]*in->x + m[1][2]*in->y + m[2][2]*in->z + m[3][2];

        // store only after all three components are known
        res->x = out.x;
        res->y = out.y;
        res->z = out.z;
    }
}
/*===========================================================================*/