/*---------------------------------------------------------------------------*
  Project: matrix vector Library
  File:    mtxQuat_asm.s

  Copyright 1998-2011 Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.     They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

 *---------------------------------------------------------------------------*/

        // Single-precision constants. The routines in this file build the
        // absolute address of a constant in r5 (lis/ori) and read it with lfs.
        // NOTE(review): the code visible here only reads these values, yet they
        // are placed in the writable .data section - confirm whether a
        // read-only section is available and preferred on this toolchain.
        .data
        .align 2
// 0.5F : read by ASM_QUATNormalize (reciprocal square root refinement)
CONST_0_5F:     .float        0.5
// 1.0F : read by ASM_QUATInverse and ASM_MTXQuat
CONST_1_0F:     .float        1.0
// 3.0F : read by ASM_QUATNormalize (reciprocal square root refinement)
CONST_3_0F:     .float        3.0
// Threshold on the squared magnitude, read by ASM_QUATNormalize
CONST_EPSILON:  .float        0.00001


        .text


////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATAdd(const Quaternion *p, const Quaternion *q, Quaternion *r)
//
// Component-wise sum of two quaternions.
//   In : r3 = p, r4 = q  (each read as two float pairs at byte offsets 0 and 8)
//        r5 = r          (written as two float pairs at byte offsets 0 and 8)
//   Scratch : fp1 - fp6
// The first result pair is stored before the second input pair is loaded.
// All transfers use quantization register 0.
        .global ASM_QUATAdd
#define lhs r3
#define rhs r4
#define dst r5
ASM_QUATAdd:
        .type ASM_QUATAdd, @function
#define lhs_lo fp1
#define rhs_lo fp2
#define sum_lo fp3
#define lhs_hi fp4
#define rhs_hi fp5
#define sum_hi fp6
        // pair at offset 0 : dst[0..1] = lhs[0..1] + rhs[0..1]
        psq_l     rhs_lo, 0(rhs), 0, 0
        psq_l     lhs_lo, 0(lhs), 0, 0
        ps_add    sum_lo, lhs_lo, rhs_lo
        psq_st    sum_lo, 0(dst), 0, 0

        // pair at offset 8 : dst[2..3] = lhs[2..3] + rhs[2..3]
        psq_l     rhs_hi, 8(rhs), 0, 0
        psq_l     lhs_hi, 8(lhs), 0, 0
        ps_add    sum_hi, lhs_hi, rhs_hi
        psq_st    sum_hi, 8(dst), 0, 0
        blr
        .size ASM_QUATAdd,$-ASM_QUATAdd
#undef lhs
#undef rhs
#undef dst
#undef lhs_lo
#undef rhs_lo
#undef sum_lo
#undef lhs_hi
#undef rhs_hi
#undef sum_hi



////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATSubtract(const Quaternion *p, const Quaternion *q, Quaternion *r)
//
// Component-wise difference of two quaternions (first minus second).
//   In : r3 = p, r4 = q  (each read as two float pairs at byte offsets 0 and 8)
//        r5 = r          (written as two float pairs at byte offsets 0 and 8)
//   Scratch : fp1 - fp6
// The first result pair is stored before the second input pair is loaded.
// All transfers use quantization register 0.
        .global ASM_QUATSubtract
#define minuend    r3
#define subtrahend r4
#define dst        r5
ASM_QUATSubtract:
        .type ASM_QUATSubtract, @function
#define min_lo  fp1
#define sub_lo  fp2
#define diff_lo fp3
#define min_hi  fp4
#define sub_hi  fp5
#define diff_hi fp6
        // pair at offset 0 : dst[0..1] = minuend[0..1] - subtrahend[0..1]
        psq_l     sub_lo,  0(subtrahend), 0, 0
        psq_l     min_lo,  0(minuend), 0, 0
        ps_sub    diff_lo, min_lo, sub_lo
        psq_st    diff_lo, 0(dst), 0, 0

        // pair at offset 8 : dst[2..3] = minuend[2..3] - subtrahend[2..3]
        psq_l     sub_hi,  8(subtrahend), 0, 0
        psq_l     min_hi,  8(minuend), 0, 0
        ps_sub    diff_hi, min_hi, sub_hi
        psq_st    diff_hi, 8(dst), 0, 0
        blr
        .size ASM_QUATSubtract,$-ASM_QUATSubtract
#undef minuend
#undef subtrahend
#undef dst
#undef min_lo
#undef sub_lo
#undef diff_lo
#undef min_hi
#undef sub_hi
#undef diff_hi



////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATMultiply(const Quaternion *p, const Quaternion *q, Quaternion *pq)
//
// Quaternion product pq = p * q, with components laid out as [x][y][z][w]:
//   pqx = pw*qx + px*qw + py*qz - pz*qy
//   pqy = pw*qy + py*qw + pz*qx - px*qz
//   pqz = pw*qz + pz*qw + px*qy - py*qx
//   pqw = pw*qw - px*qx - py*qy - pz*qz
//
//   In : r3 = p, r4 = q  (read), r5 = pq (written)
//   Scratch : fp1 - fp12
// Both operands are fully loaded before the first store, so the destination
// may be the same object as either source.
// In the comments below [a][b] denotes the two slots (ps0, ps1) of a
// paired-single register.
#define p  r3
#define q  r4
#define pq r5
        .global ASM_QUATMultiply
ASM_QUATMultiply:
        .type ASM_QUATMultiply, @function
#define pxy   fp1
#define pzw   fp2
#define qxy   fp3
#define qzw   fp4
#define pnxy  fp5
#define pnzw  fp6
#define pnxny fp7
#define pnznw fp8
#define rxy   fp9
#define rzw   fp10
#define sxy   fp11
#define szw   fp12
        // [px][py] : Load
        psq_l       pxy, 0(p), 0, 0
        // [pz][pw] : Load
        psq_l       pzw, 8(p), 0, 0

        // [qx][qy] : Load
        psq_l       qxy, 0(q), 0, 0
        // [-px][-py]
        ps_neg      pnxny, pxy
        // [qz][qw] : Load
        psq_l       qzw, 8(q), 0, 0
        // [-pz][-pw]
        ps_neg      pnznw, pzw

        // [-px][py] : slot 0 from the negated pair, slot 1 from the original
        ps_merge01  pnxy, pnxny, pxy

        // [pz*qx][pw*qx]
        ps_muls0    rxy, pzw, qxy
        // [-px*qx][-py*qx]
        ps_muls0    rzw, pnxny, qxy

        // [-pz][pw] : slot 0 from the negated pair, slot 1 from the original
        ps_merge01  pnzw, pnznw, pzw

        // [-px*qy][py*qy]
        ps_muls1    szw, pnxy, qxy
        // [pz*qx-px*qz][pw*qx+py*qz]
        ps_madds0   rxy, pnxy, qzw, rxy
        // [-pz*qy][pw*qy]
        ps_muls1    sxy, pnzw, qxy
        // [-px*qx-pz*qz][-py*qx+pw*qz]
        ps_madds0   rzw, pnzw, qzw, rzw
        // [-px*qy-pz*qw][py*qy-pw*qw]
        ps_madds1   szw, pnznw, qzw, szw
        // [pw*qx+py*qz][pz*qx-px*qz] : swap the two slots
        ps_merge10  rxy, rxy, rxy
        // [-pz*qy+px*qw][pw*qy+py*qw]
        ps_madds1   sxy, pxy, qzw, sxy
        // [-py*qx+pw*qz][-px*qx-pz*qz] : swap the two slots
        ps_merge10  rzw, rzw, rzw

        // [pw*qx+py*qz-pz*qy+px*qw][pz*qx-px*qz+pw*qy+py*qw] : [pqx][pqy]
        ps_add      rxy, rxy, sxy
        // [pqx][pqy] : Store
        psq_st      rxy, 0(pq), 0, 0
        // [-py*qx+pw*qz+px*qy+pz*qw][-px*qx-pz*qz-py*qy+pw*qw] : [pqz][pqw]
        // (szw holds the negated terms, hence the subtraction)
        ps_sub      rzw, rzw, szw
        // [pqz][pqw] : Store
        psq_st      rzw, 8(pq), 0, 0

        blr
        .size ASM_QUATMultiply,$-ASM_QUATMultiply
#undef p
#undef q
#undef pq
#undef pxy
#undef pzw
#undef qxy
#undef qzw
#undef pnxy
#undef pnzw
#undef pnxny
#undef pnznw
#undef rxy
#undef rzw
#undef sxy
#undef szw



////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATInverse(const Quaternion *src, Quaternion *inv)
//
// Writes the conjugate of src divided by its squared magnitude:
//   mag = x*x + y*y + z*z + w*w
//   inv = [-x][-y][-z][w] * (1 / mag)
// When mag compares equal to zero the divisor is replaced by 1.0F, so the
// plain conjugate is written.
//
//   In : r3 = src (read), r4 = inv (written)
//   Scratch : r5, cr0, fp1 - fp5, fp7 - fp12
// src is fully loaded before the first store to inv.
        .global ASM_QUATInverse
#define src     r3
#define inv     r4
ASM_QUATInverse:
        .type ASM_QUATInverse, @function
#define sxy     fp1
#define szw     fp2
#define izz     fp3
#define iww     fp4
#define mag     fp5
#define norminv fp7
#define nninv   fp8
#define nwork0  fp9
#define c_zero  fp10
#define c_one   fp11
#define c_two   fp12
        // c_one = 1.0F;
        lis         r5, CONST_1_0F@h
        ori         r5, r5, CONST_1_0F@l
        lfs         c_one, 0(r5)

        // load xy
        psq_l       sxy, 0(src), 0, 0

        // mag = [x*x][y*y]
        ps_mul      mag, sxy, sxy
        // c_zero = [0.0F]
        ps_sub      c_zero, c_one, c_one

        // load zw
        psq_l       szw, 8(src), 0, 0

        // mag = [x*x+z*z][y*y+w*w]
        ps_madd     mag, szw, szw, mag
        // c_two = [2.0F]
        ps_add      c_two, c_one, c_one
        // mag = [x*x+y*y+z*z+w*w][N/A]
        ps_sum0     mag, mag, mag, mag

        // zero check : skip the reciprocal when mag == 0
        fcmpu       cr0, mag, c_zero
        beq-        _ASM_QUATInverse_zero

        // norminv = 1.0F / mag : estimation
        fres        norminv, mag
        // Newton-Raphson refinement (x1) : E_next = E * (2 - X*E)
        // nwork0 = 2 - mag*norminv
        ps_nmsub    nwork0, mag, norminv, c_two
        ps_mul      norminv, norminv, nwork0
        b           _ASM_QUATInverse_mulnorm

_ASM_QUATInverse_zero:
        // mag == 0 : use 1.0F as the scale factor
        fmr         norminv, c_one

_ASM_QUATInverse_mulnorm:
        // nninv = [ -norminv ]
        ps_neg      nninv, norminv

        // iww = [ w*norminv ][ N/A ]
        ps_muls1    iww, norminv, szw
        // sxy = [ -x*norminv ][ -y*norminv ]
        ps_muls0    sxy, sxy, nninv

        // store w (single float at byte offset 12)
        psq_st      iww, 12(inv), 1, 0

        // izz = [ -z*norminv ][ N/A ]
        ps_muls0    izz, szw, nninv

        // store xy
        psq_st      sxy, 0(inv), 0, 0
        // store z (single float at byte offset 8)
        psq_st      izz, 8(inv), 1, 0

        blr
        .size ASM_QUATInverse,$-ASM_QUATInverse
#undef src
#undef inv
#undef sxy
#undef szw
#undef izz
#undef iww
#undef mag
#undef norminv
#undef nninv
#undef nwork0
#undef c_zero
#undef c_one
#undef c_two


////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATScale(const Quaternion *q, Quaternion *r, f32 scale)
//
// Multiplies every component of the source quaternion by one scalar.
//   In : r3  = q     (read as two float pairs at byte offsets 0 and 8)
//        r4  = r     (written as two float pairs at byte offsets 0 and 8)
//        fp1 = scale (slot 0 is the multiplier)
//   Scratch : fp2, fp3
// Both source pairs are loaded before anything is stored.
        .global ASM_QUATScale
#define q_in   r3
#define q_out  r4
#define factor fp1
ASM_QUATScale:
        .type ASM_QUATScale, @function
#define lo_pair fp2
#define hi_pair fp3
        // fetch [x][y] and [z][w]
        psq_l       lo_pair, 0(q_in), 0, 0
        psq_l       hi_pair, 8(q_in), 0, 0

        // multiply both slots of each pair by slot 0 of factor
        ps_muls0    lo_pair, lo_pair, factor
        ps_muls0    hi_pair, hi_pair, factor

        // write the scaled pairs
        psq_st      lo_pair, 0(q_out), 0, 0
        psq_st      hi_pair, 8(q_out), 0, 0
        blr
        .size ASM_QUATScale,$-ASM_QUATScale
#undef q_in
#undef q_out
#undef factor
#undef lo_pair
#undef hi_pair



////////////////////////////////////////////////////////////////////////////////
// f32 ASM_QUATDotProduct(const Quaternion *p, const Quaternion *q)
//
// Four-component dot product: px*qx + py*qy + pz*qz + pw*qw.
//   In  : r3 = p, r4 = q (both read only)
//   Out : fp1 slot 0 = dot product
//   Scratch : fp2 - fp5
#define p r3
#define q r4
        .global ASM_QUATDotProduct
ASM_QUATDotProduct:
        .type ASM_QUATDotProduct, @function
#define pxy fp2
#define pzw fp3
#define qxy fp4
#define qzw fp5
#define dp  fp1
        // dp = [px*qx][py*qy]
        psq_l       pxy, 0(p), 0, 0
        psq_l       qxy, 0(q), 0, 0
        ps_mul      dp, pxy, qxy

        // dp = [px*qx+pz*qz][py*qy+pw*qw]
        psq_l       pzw, 8(p), 0, 0
        psq_l       qzw, 8(q), 0, 0
        ps_madd     dp, pzw, qzw, dp

        // dp = [px*qx+py*qy+pz*qz+pw*qw][N/A] : slot 0 + slot 1
        ps_sum0     dp, dp, dp, dp

        blr
        .size ASM_QUATDotProduct,$-ASM_QUATDotProduct
// Release the pointer aliases as well, as every other routine in this file
// does, so they do not leak into the code that follows.
#undef p
#undef q
#undef pxy
#undef pzw
#undef qxy
#undef qzw
#undef dp



////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATNormalize(const Quaternion *src, Quaternion *unit)
//
// Scales the source quaternion by 1/sqrt(x*x + y*y + z*z + w*w), using a
// reciprocal square root estimate plus one Newton-Raphson step.
// If the squared magnitude is below CONST_EPSILON the scale factor is forced
// to zero, so all four output components are the inputs multiplied by 0.0F.
//
//   In : r3 = src (read), r4 = unit (written)
//   Scratch : r5, fp1 - fp11
// Both source pairs are loaded before anything is stored.
#define q_in     r3
#define q_out    r4
        .global ASM_QUATNormalize
#define lo_pair  fp1
#define hi_pair  fp2
#define len_sq   fp3
#define inv_len  fp4
#define margin   fp5
#define k_zero   fp6
#define est_sq   fp7
#define half_est fp8
#define k_eps    fp9
#define k_half   fp10
#define k_three  fp11
ASM_QUATNormalize:
        .type ASM_QUATNormalize, @function

        // ---- inputs : [x][y] and [z][w] ----
        psq_l       lo_pair, 0(q_in), 0, 0
        psq_l       hi_pair, 8(q_in), 0, 0

        // ---- constants ----
        // k_eps = CONST_EPSILON
        lis         r5, CONST_EPSILON@h
        ori         r5, r5, CONST_EPSILON@l
        lfs         k_eps, 0(r5)
        // k_half = 0.5F
        lis         r5, CONST_0_5F@h
        ori         r5, r5, CONST_0_5F@l
        lfs         k_half, 0(r5)
        // k_three = 3.0F
        lis         r5, CONST_3_0F@h
        ori         r5, r5, CONST_3_0F@l
        lfs         k_three, 0(r5)
        // k_zero = k_eps - k_eps = 0.0F
        ps_sub      k_zero, k_eps, k_eps

        // ---- squared magnitude ----
        // len_sq = [x*x][y*y]
        ps_mul      len_sq, lo_pair, lo_pair
        // len_sq = [x*x+z*z][y*y+w*w]
        ps_madd     len_sq, hi_pair, hi_pair, len_sq
        // len_sq = [x*x+y*y+z*z+w*w][N/A]
        ps_sum0     len_sq, len_sq, len_sq, len_sq

        // margin = len_sq - k_eps : its sign drives the select below
        ps_sub      margin, len_sq, k_eps

        // ---- reciprocal square root ----
        // inv_len = estimate E of 1.0F / sqrtf(len_sq)
        frsqrte     inv_len, len_sq
        // one Newton-Raphson step : E_next = (E/2) * (3 - len_sq*E*E)
        fmul        half_est, inv_len, k_half
        fmul        est_sq, inv_len, inv_len
        fnmsub      est_sq, est_sq, len_sq, k_three
        fmul        inv_len, est_sq, half_est

        // inv_len = ( len_sq >= k_eps ) ? inv_len : 0
        ps_sel      inv_len, margin, inv_len, k_zero

        // ---- scale and write back ----
        // [x*inv_len][y*inv_len]
        ps_muls0    lo_pair, lo_pair, inv_len
        // [z*inv_len][w*inv_len]
        ps_muls0    hi_pair, hi_pair, inv_len
        psq_st      lo_pair, 0(q_out), 0, 0
        psq_st      hi_pair, 8(q_out), 0, 0

        blr
        .size ASM_QUATNormalize,$-ASM_QUATNormalize

#undef q_in
#undef q_out
#undef lo_pair
#undef hi_pair
#undef len_sq
#undef inv_len
#undef margin
#undef k_zero
#undef est_sq
#undef half_est
#undef k_eps
#undef k_half
#undef k_three

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXQuat(Mtx m, const Quaternion *q)
//
// Builds a rotation matrix from a quaternion and writes twelve floats
// (three rows of four, byte offsets 0 - 47) to m.
// With s = 2 / (qx*qx + qy*qy + qz*qz + qw*qw):
//   m00 = 1-s(qy*qy+qz*qz)  m01 = s(qx*qy-qz*qw)    m02 = s(qx*qz+qy*qw)    m03 = 0
//   m10 = s(qx*qy+qz*qw)    m11 = 1-s(qx*qx+qz*qz)  m12 = s(qy*qz-qx*qw)    m13 = 0
//   m20 = s(qx*qz-qy*qw)    m21 = s(qy*qz+qx*qw)    m22 = 1-s(qx*qx+qy*qy)  m23 = 0
// The reciprocal is an fres estimate plus one Newton-Raphson step. The
// squared magnitude is not checked against zero.
//
//   In : r3 = m (written), r4 = q (read)
//   Scratch : r0, r5, fp1 - fp13
//   Saved/restored : fp14 (both its paired-single and its double image)
//
// Stack frame (24 bytes, offsets from r1 after stwu):
//    0 : back chain
//    8 : fp14 paired-single image (psq_st)
//   16 : fp14 double image (stfd)
//   28 : LR, written into the word above this frame
// LR is saved and restored although the body contains no call.
#define m  r3
#define q  r4
#define c_zero fp1
#define c_one  fp2
#define c_two  fp3
#define scale  fp4
#define tmp0   fp5
#define tmp1   fp6
#define tmp2   fp7
#define tmp3   fp8
#define tmp4   fp9
#define tmp5   fp10
#define tmp6   fp11
#define tmp7   fp12
#define tmp8   fp13
#define tmp9   fp14

        .global ASM_MTXQuat
ASM_MTXQuat:
        .type ASM_MTXQuat, @function
        
        mflr        r0
        stwu        r1, -24(r1)
        stw         r0, 28(r1)

        // fp14 is used as tmp9 : keep both of its images for the caller
        psq_st      fp14, 8(r1), 0, 0
        stfd        fp14, 16(r1)

        // c_one = 1.0F;
        lis         r5, CONST_1_0F@h
        ori         r5, r5, CONST_1_0F@l
        lfs         c_one, 0(r5)

        // tmp0 = [qx][qy] : LOAD
        psq_l       tmp0, 0(q), 0, 0
        // tmp1 = [qz][qw] : LOAD
        psq_l       tmp1, 8(q), 0, 0
        // c_zero = [0.0F][0.0F]
        fsubs       c_zero, c_one, c_one
        // c_two  = [2.0F][2.0F]
        fadds       c_two, c_one, c_one
        // tmp2 = [qx*qx][qy*qy]
        ps_mul      tmp2, tmp0, tmp0
        // tmp5 = [qy][qx]
        ps_merge10  tmp5, tmp0, tmp0
        // tmp4 = [qx*qx+qz*qz][qy*qy+qw*qw]
        ps_madd     tmp4, tmp1, tmp1, tmp2
        // tmp3 = [qz*qz][qw*qw]
        ps_mul      tmp3, tmp1, tmp1
        // scale = [qx*qx+qy*qy+qz*qz+qw*qw][?]
        ps_sum0     scale, tmp4, tmp4, tmp4
        // tmp7 = [qy*qw][qx*qw]
        ps_muls1    tmp7, tmp5, tmp1
        // Newton-Raphson refinement (1/X) : E' = 2E-X*E*E
        // tmp9 = [E = Est.(1/X)]
        fres        tmp9, scale
        // tmp4 = [qx*qx+qz*qz][qy*qy+qz*qz]
        ps_sum1     tmp4, tmp3, tmp4, tmp2
        // scale = [2-X*E]
        ps_nmsub    scale, scale, tmp9, c_two
        // tmp6 = [qz*qw][?]
        ps_muls1    tmp6, tmp1, tmp1
        // scale = [E(2-X*E) = E']
        ps_mul      scale, tmp9, scale
        // tmp2 = [qx*qx+qy*qy]
        ps_sum0     tmp2, tmp2, tmp2, tmp2
        // scale = [s = 2E' = 2.0F/(qx*qx+qy*qy+qz*qz+qw*qw)]
        fmuls       scale, scale, c_two
        // tmp8 = [qx*qy+qz*qw][?]
        ps_madd     tmp8, tmp0, tmp5, tmp6
        // tmp6 = [qx*qy-qz*qw][?]
        ps_msub     tmp6, tmp0, tmp5, tmp6
        // c_zero [m03] : STORE
        psq_st      c_zero, 12(m), 1, 0
        // tmp2 = [1-s(qx*qx+qy*qy)]   : [m22]
        ps_nmsub    tmp2, tmp2, scale, c_one
        // tmp4 = [1-s(qx*qx+qz*qz)][1-s(qy*qy+qz*qz)] : [m11][m00]
        ps_nmsub    tmp4, tmp4, scale, c_one
        // c_zero [m23] : STORE
        psq_st      c_zero, 44(m), 1, 0
        // tmp8 = [s(qx*qy+qz*qw)][?]  : [m10]
        ps_mul      tmp8, tmp8, scale
        // tmp6 = [s(qx*qy-qz*qw)][?]  : [m01]
        ps_mul      tmp6, tmp6, scale
        // tmp2 [m22] : STORE
        psq_st      tmp2, 40(m), 1, 0
        // tmp5 = [qx*qz+qy*qw][qy*qz+qx*qw]
        ps_madds0   tmp5, tmp0, tmp1, tmp7
        // tmp1 = [m10][m11]
        ps_merge00  tmp1, tmp8, tmp4
        // tmp7 = tmp5 - 2*tmp7 = [qx*qz-qy*qw][qy*qz-qx*qw]
        ps_nmsub    tmp7, tmp7, c_two, tmp5
        // tmp0 = [m00][m01]
        ps_merge10  tmp0, tmp4, tmp6
        // tmp1 [m10][m11] : STORE
        psq_st      tmp1, 16(m), 0, 0
        // tmp5 = [s(qx*qz+qy*qw)][s(qy*qz+qx*qw)] : [m02][m21]
        ps_mul      tmp5, tmp5, scale
        // tmp7 = [s(qx*qz-qy*qw)][s(qy*qz-qx*qw)] : [m20][m12]
        ps_mul      tmp7, tmp7, scale
        // tmp0 [m00][m01] : STORE
        psq_st      tmp0,  0(m), 0, 0
        // tmp5 [m02] : STORE
        psq_st      tmp5,  8(m), 1, 0
        // tmp3 = [m12][m13]
        ps_merge10  tmp3, tmp7, c_zero
        // tmp9 = [m20][m21]
        ps_merge01  tmp9, tmp7, tmp5
        // tmp3 [m12][m13] : STORE
        psq_st      tmp3, 24(m), 0, 0
        // tmp9 [m20][m21] : STORE
        psq_st      tmp9, 32(m), 0, 0

        // restore fp14 : paired-single image first, then the double image
        psq_l       fp14, 8(r1), 0, 0
        lfd         fp14, 16(r1)

        lwz         r0, 28(r1)
        mtlr        r0
        addi        r1, r1, 24

        blr
        .size ASM_MTXQuat,$-ASM_MTXQuat
#undef m
#undef q
#undef c_zero
#undef c_one
#undef c_two
#undef scale
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef tmp5
#undef tmp6
#undef tmp7
#undef tmp8
#undef tmp9