/*---------------------------------------------------------------------------*
  Project:  matrix vector Library
  File:     mtxQuat_asm.s

  Copyright 1998-2011 Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.  They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.
 *---------------------------------------------------------------------------*/

// Conventions used throughout this file:
//   * Pointer arguments arrive in r3, r4, r5 (in that order); an f32
//     argument / return value uses fp1.
//   * A Quaternion is accessed as two register pairs: [x][y] at offset 0
//     and [z][w] at offset 8.
//   * "[a][b]" in a comment means ps0 = a, ps1 = b of a paired-single register.
//   * psq_l / psq_st are always used with quantization register 0; a W field
//     of 0 transfers both slots, a W field of 1 transfers ps0 only.
//     NOTE(review): this presumes GQR0 is configured for unscaled f32 -
//     confirm against the platform setup code.

        .data
        .align 2
CONST_0_5F:     .float 0.5
CONST_1_0F:     .float 1.0
CONST_3_0F:     .float 3.0
CONST_EPSILON:  .float 0.00001

        .text

////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATAdd(const Quaternion *p, const Quaternion *q, Quaternion *r)
//
// Component-wise sum: r = p + q.
//   In:  r3 = p, r4 = q, r5 = r
//   Clobbers: fp1-fp6
        .global ASM_QUATAdd
#define p       r3
#define q       r4
#define r       r5
ASM_QUATAdd:
        .type ASM_QUATAdd, @function
#define pxy     fp1
#define qxy     fp2
#define rxy     fp3
#define pzw     fp4
#define qzw     fp5
#define rzw     fp6
        // [rx][ry] = [px+qx][py+qy]
        psq_l       pxy, 0(p), 0, 0
        psq_l       qxy, 0(q), 0, 0
        ps_add      rxy, pxy, qxy
        psq_st      rxy, 0(r), 0, 0
        // [rz][rw] = [pz+qz][pw+qw]
        psq_l       pzw, 8(p), 0, 0
        psq_l       qzw, 8(q), 0, 0
        ps_add      rzw, pzw, qzw
        psq_st      rzw, 8(r), 0, 0
        blr
        .size ASM_QUATAdd,$-ASM_QUATAdd
#undef p
#undef q
#undef r
#undef pxy
#undef qxy
#undef rxy
#undef pzw
#undef qzw
#undef rzw

////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATSubtract(const Quaternion *p, const Quaternion *q, Quaternion *r)
//
// Component-wise difference: r = p - q.
//   In:  r3 = p, r4 = q, r5 = r
//   Clobbers: fp1-fp6
#define p       r3
#define q       r4
#define r       r5
        .global ASM_QUATSubtract
ASM_QUATSubtract:
        .type ASM_QUATSubtract, @function
#define pxy     fp1
#define qxy     fp2
#define rxy     fp3
#define pzw     fp4
#define qzw     fp5
#define rzw     fp6
        // [rx][ry] = [px-qx][py-qy]
        psq_l       pxy, 0(p), 0, 0
        psq_l       qxy, 0(q), 0, 0
        ps_sub      rxy, pxy, qxy
        psq_st      rxy, 0(r), 0, 0
        // [rz][rw] = [pz-qz][pw-qw]
        psq_l       pzw, 8(p), 0, 0
        psq_l       qzw, 8(q), 0, 0
        ps_sub      rzw, pzw, qzw
        psq_st      rzw, 8(r), 0, 0
        blr
        .size ASM_QUATSubtract,$-ASM_QUATSubtract
#undef p
#undef q
#undef r
#undef pxy
#undef qxy
#undef rxy
#undef pzw
#undef qzw
#undef rzw

////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATMultiply(const Quaternion *p, const Quaternion *q, Quaternion *pq)
//
// Quaternion product pq = p * q.  Both inputs are fully loaded into
// registers before the first store.
//   In:  r3 = p, r4 = q, r5 = pq
//   Clobbers: fp1-fp12
#define p       r3
#define q       r4
#define pq      r5
        .global ASM_QUATMultiply
ASM_QUATMultiply:
        .type ASM_QUATMultiply, @function
#define pxy     fp1
#define pzw     fp2
#define qxy     fp3
#define qzw     fp4
#define pnxy    fp5
#define pnzw    fp6
#define pnxny   fp7
#define pnznw   fp8
#define rxy     fp9
#define rzw     fp10
#define sxy     fp11
#define szw     fp12
        // [px][py] : Load
        psq_l       pxy, 0(p), 0, 0
        // [pz][pw] : Load
        psq_l       pzw, 8(p), 0, 0
        // [qx][qy] : Load
        psq_l       qxy, 0(q), 0, 0
        // [-px][-py]
        ps_neg      pnxny, pxy
        // [qz][qw] : Load
        psq_l       qzw, 8(q), 0, 0
        // [-pz][-pw]
        ps_neg      pnznw, pzw
        // [-px][py]
        ps_merge01  pnxy, pnxny, pxy
        // [pz*qx][pw*qx]
        ps_muls0    rxy, pzw, qxy
        // [-px*qx][-py*qx]
        ps_muls0    rzw, pnxny, qxy
        // [-pz][pw]
        ps_merge01  pnzw, pnznw, pzw
        // [-px*qy][py*qy]
        ps_muls1    szw, pnxy, qxy
        // [pz*qx-px*qz][pw*qx+py*qz]
        ps_madds0   rxy, pnxy, qzw, rxy
        // [-pz*qy][pw*qy]
        ps_muls1    sxy, pnzw, qxy
        // [-px*qx-pz*qz][-py*qx+pw*qz]
        ps_madds0   rzw, pnzw, qzw, rzw
        // [-px*qy-pz*qw][py*qy-pw*qw]
        ps_madds1   szw, pnznw, qzw, szw
        // [pw*qx+py*qz][pz*qx-px*qz]
        ps_merge10  rxy, rxy, rxy
        // [-pz*qy+px*qw][pw*qy+py*qw]
        ps_madds1   sxy, pxy, qzw, sxy
        // [-py*qx+pw*qz][-px*qx-pz*qz]
        ps_merge10  rzw, rzw, rzw
        // [pw*qx+py*qz-pz*qy+px*qw][pz*qx-px*qz+pw*qy+py*qw] : [pqx][pqy]
        ps_add      rxy, rxy, sxy
        // [pqx][pqy] : Store
        psq_st      rxy, 0(pq), 0, 0
        // [-py*qx+pw*qz+px*qy+pz*qw][-px*qx-pz*qz-py*qy+pw*qw] : [pqz][pqw]
        ps_sub      rzw, rzw, szw
        // [pqz][pqw] : Store
        psq_st      rzw, 8(pq), 0, 0
        blr
        .size ASM_QUATMultiply,$-ASM_QUATMultiply
#undef p
#undef q
#undef pq
#undef pxy
#undef pzw
#undef qxy
#undef qzw
#undef pnxy
#undef pnzw
#undef pnxny
#undef pnznw
#undef rxy
#undef rzw
#undef sxy
#undef szw

////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATInverse(const Quaternion *src, Quaternion *inv)
//
// inv = [-x][-y][-z][w] * (1 / (x*x + y*y + z*z + w*w)).
// When the squared magnitude compares equal to 0.0F the scale factor is
// forced to 1.0F instead of taking the reciprocal.
// The reciprocal is a hardware estimate (fres) refined by one
// Newton-Raphson step.
//   In:  r3 = src, r4 = inv
//   Clobbers: r5, cr0, fp1-fp5, fp7-fp12
        .global ASM_QUATInverse
#define src     r3
#define inv     r4
ASM_QUATInverse:
        .type ASM_QUATInverse, @function
#define sxy     fp1
#define szw     fp2
#define izz     fp3
#define iww     fp4
#define mag     fp5
#define norminv fp7
#define nninv   fp8
#define nwork0  fp9
#define c_zero  fp10
#define c_one   fp11
#define c_two   fp12
        // c_one = 1.0F;
        lis         r5, CONST_1_0F@h
        ori         r5, r5, CONST_1_0F@l
        lfs         c_one, 0(r5)
        // load xy
        psq_l       sxy, 0(src), 0, 0
        // mag = [x*x][y*y]
        ps_mul      mag, sxy, sxy
        // c_zero = [0.0F]
        ps_sub      c_zero, c_one, c_one
        // load zw
        psq_l       szw, 8(src), 0, 0
        // mag = [x*x+z*z][y*y+w*w]
        ps_madd     mag, szw, szw, mag
        // c_two = [2.0F]
        ps_add      c_two, c_one, c_one
        // mag = [x*x+y*y+z*z+w*w][N/A]
        ps_sum0     mag, mag, mag, mag
        // zero check
        fcmpu       cr0, mag, c_zero
        beq-        _ASM_QUATInverse_zero
        // norminv = 1.0F / mag : estimation
        fres        norminv, mag
        // Newton-Raphson refinement (x1) : E' = 2E-X*E*E = E*(2-X*E)
        ps_nmsub    nwork0, mag, norminv, c_two
        ps_mul      norminv, norminv, nwork0
        b           _ASM_QUATInverse_mulnorm
_ASM_QUATInverse_zero:
        // mag == 0 : use a scale factor of 1.0F
        fmr         norminv, c_one
_ASM_QUATInverse_mulnorm:
        // nninv = [ -norminv ]
        ps_neg      nninv, norminv
        // iww = [ w*norminv ][ N/A ]
        ps_muls1    iww, norminv, szw
        // sxy = [ -x*norminv ][ -y*norminv ]
        ps_muls0    sxy, sxy, nninv
        // store w
        psq_st      iww, 12(inv), 1, 0
        // izz = [ -z*norminv ][ N/A ]
        ps_muls0    izz, szw, nninv
        // store xy
        psq_st      sxy, 0(inv), 0, 0
        // store z
        psq_st      izz, 8(inv), 1, 0
        blr
        .size ASM_QUATInverse,$-ASM_QUATInverse
#undef src
#undef inv
#undef sxy
#undef szw
#undef izz
#undef iww
#undef mag
#undef norminv
#undef nninv
#undef nwork0
#undef c_zero
#undef c_one
#undef c_two

////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATScale(const Quaternion *q, Quaternion *r, f32 scale)
//
// r = q * scale (every component multiplied by ps0 of fp1).
//   In:  r3 = q, r4 = r, fp1 = scale
//   Clobbers: fp2, fp3
        .global ASM_QUATScale
#define q       r3
#define r       r4
#define scale   fp1
ASM_QUATScale:
        .type ASM_QUATScale, @function
#define rxy     fp2
#define rzw     fp3
        psq_l       rxy, 0(q), 0, 0
        psq_l       rzw, 8(q), 0, 0
        // [x*scale][y*scale]
        ps_muls0    rxy, rxy, scale
        psq_st      rxy, 0(r), 0, 0
        // [z*scale][w*scale]
        ps_muls0    rzw, rzw, scale
        psq_st      rzw, 8(r), 0, 0
        blr
        .size ASM_QUATScale,$-ASM_QUATScale
#undef q
#undef r
#undef scale
#undef rxy
#undef rzw

////////////////////////////////////////////////////////////////////////////////
// f32 ASM_QUATDotProduct(const Quaternion *p, const Quaternion *q)
//
// Returns px*qx + py*qy + pz*qz + pw*qw in ps0 of fp1.
//   In:  r3 = p, r4 = q
//   Out: fp1 = dot product
//   Clobbers: fp2-fp5
#define p       r3
#define q       r4
        .global ASM_QUATDotProduct
ASM_QUATDotProduct:
        .type ASM_QUATDotProduct, @function
#define pxy     fp2
#define pzw     fp3
#define qxy     fp4
#define qzw     fp5
#define dp      fp1
        psq_l       pxy, 0(p), 0, 0
        psq_l       qxy, 0(q), 0, 0
        // dp = [px*qx][py*qy]
        ps_mul      dp, pxy, qxy
        psq_l       pzw, 8(p), 0, 0
        psq_l       qzw, 8(q), 0, 0
        // dp = [px*qx+pz*qz][py*qy+pw*qw]
        ps_madd     dp, pzw, qzw, dp
        // dp = [px*qx+py*qy+pz*qz+pw*qw][N/A]
        ps_sum0     dp, dp, dp, dp
        blr
        .size ASM_QUATDotProduct,$-ASM_QUATDotProduct
#undef p
#undef q
#undef pxy
#undef pzw
#undef qxy
#undef qzw
#undef dp

////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATNormalize(const Quaternion *src, Quaternion *unit)
//
// unit = src * (1 / sqrt(x*x + y*y + z*z + w*w)).
// When the squared magnitude is below CONST_EPSILON the scale factor is
// replaced by 0.0F, so all four output components are src * 0.
// The reciprocal square root is a hardware estimate (frsqrte) refined by
// one Newton-Raphson step.
//   In:  r3 = src, r4 = unit
//   Clobbers: r5, fp1-fp11
#define src     r3
#define unit    r4
        .global ASM_QUATNormalize
#define sxy     fp1
#define szw     fp2
#define mag     fp3
#define rsqmag  fp4
#define diff    fp5
#define c_zero  fp6
#define nwork0  fp7
#define nwork1  fp8
#define epsilon fp9
#define c_half  fp10
#define c_three fp11
ASM_QUATNormalize:
        .type ASM_QUATNormalize, @function
        // epsilon = QUAT_EPSILON;
        lis         r5, CONST_EPSILON@h
        ori         r5, r5, CONST_EPSILON@l
        lfs         epsilon, 0(r5)
        // c_half = 0.5F;
        lis         r5, CONST_0_5F@h
        ori         r5, r5, CONST_0_5F@l
        lfs         c_half, 0(r5)
        // c_three = 3.0F;
        lis         r5, CONST_3_0F@h
        ori         r5, r5, CONST_3_0F@l
        lfs         c_three, 0(r5)
        psq_l       sxy, 0(src), 0, 0
        // mag = [x*x][y*y]
        ps_mul      mag, sxy, sxy
        psq_l       szw, 8(src), 0, 0
        // c_zero = [0.0F]
        ps_sub      c_zero, epsilon, epsilon
        // mag = [x*x+z*z][y*y+w*w]
        ps_madd     mag, szw, szw, mag
        // mag = [x*x+y*y+z*z+w*w][N/A]
        ps_sum0     mag, mag, mag, mag
        // rsqmag = 1.0F / sqrtf(mag) : estimation
        frsqrte     rsqmag, mag
        // diff = mag - epsilon
        ps_sub      diff, mag, epsilon
        // Newton-Raphson refinement (x1) : E' = (E/2)(3 - X * E * E)
        fmul        nwork0, rsqmag, rsqmag
        fmul        nwork1, rsqmag, c_half
        fnmsub      nwork0, nwork0, mag, c_three
        fmul        rsqmag, nwork0, nwork1
        // rsqmag = ( mag >= epsilon ) ? rsqmag : 0
        ps_sel      rsqmag, diff, rsqmag, c_zero
        // sxy = [x*rsqmag][y*rsqmag]
        ps_muls0    sxy, sxy, rsqmag
        // szw = [z*rsqmag][w*rsqmag]
        ps_muls0    szw, szw, rsqmag
        psq_st      sxy, 0(unit), 0, 0
        psq_st      szw, 8(unit), 0, 0
        blr
        .size ASM_QUATNormalize,$-ASM_QUATNormalize
#undef src
#undef unit
#undef sxy
#undef szw
#undef mag
#undef rsqmag
#undef diff
#undef c_zero
#undef nwork0
#undef nwork1
#undef epsilon
#undef c_half
#undef c_three

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXQuat(Mtx m, const Quaternion *q)
//
// Builds a rotation matrix from quaternion q and writes twelve f32 values
// to m at byte offsets 0..44 (three rows of four: m00..m03, m10..m13,
// m20..m23).  The fourth column (m03, m13, m23) is written as 0.0F.
// The quaternion does not have to be unit length: the terms are scaled by
// s = 2 / (qx*qx+qy*qy+qz*qz+qw*qw), using fres plus one Newton-Raphson step.
//   In:  r3 = m, r4 = q
//   Clobbers: r0, r5, fp1-fp13
//   Stack: 24-byte frame; fp14 is saved on entry and restored on exit
//          (both paired-single slots and the full double value).
#define m       r3
#define q       r4
#define c_zero  fp1
#define c_one   fp2
#define c_two   fp3
#define scale   fp4
#define tmp0    fp5
#define tmp1    fp6
#define tmp2    fp7
#define tmp3    fp8
#define tmp4    fp9
#define tmp5    fp10
#define tmp6    fp11
#define tmp7    fp12
#define tmp8    fp13
#define tmp9    fp14
        .global ASM_MTXQuat
ASM_MTXQuat:
        .type ASM_MTXQuat, @function
        // Prologue: open a 24-byte frame and save LR and fp14 (tmp9).
        // NOTE(review): this routine makes no calls, so the LR save/restore
        // looks unnecessary - kept as in the original.
        mflr        r0
        stwu        r1, -24(r1)
        stw         r0, 28(r1)
        psq_st      fp14, 8(r1), 0, 0
        stfd        fp14, 16(r1)
        // c_one = 1.0F;
        lis         r5, CONST_1_0F@h
        ori         r5, r5, CONST_1_0F@l
        lfs         c_one, 0(r5)
        // tmp0 = [qx][qy] : LOAD
        psq_l       tmp0, 0(q), 0, 0
        // tmp1 = [qz][qw] : LOAD
        psq_l       tmp1, 8(q), 0, 0
        // c_zero = [0.0F][0.0F]
        fsubs       c_zero, c_one, c_one
        // c_two = [2.0F][2.0F]
        fadds       c_two, c_one, c_one
        // tmp2 = [qx*qx][qy*qy]
        ps_mul      tmp2, tmp0, tmp0
        // tmp5 = [qy][qx]
        ps_merge10  tmp5, tmp0, tmp0
        // tmp4 = [qx*qx+qz*qz][qy*qy+qw*qw]
        ps_madd     tmp4, tmp1, tmp1, tmp2
        // tmp3 = [qz*qz][qw*qw]
        ps_mul      tmp3, tmp1, tmp1
        // scale = [qx*qx+qy*qy+qz*qz+qw*qw][?]
        ps_sum0     scale, tmp4, tmp4, tmp4
        // tmp7 = [qy*qw][qx*qw]
        ps_muls1    tmp7, tmp5, tmp1
        // Newton-Raphson refinement (1/X) : E' = 2E-X*E*E
        // tmp9 = [E = Est.(1/X)]
        fres        tmp9, scale
        // tmp4 = [qx*qx+qz*qz][qy*qy+qz*qz]
        ps_sum1     tmp4, tmp3, tmp4, tmp2
        // scale = [2-X*E]
        ps_nmsub    scale, scale, tmp9, c_two
        // tmp6 = [qz*qw][?]
        ps_muls1    tmp6, tmp1, tmp1
        // scale = [E(2-scale*E) = E']
        ps_mul      scale, tmp9, scale
        // tmp2 = [qx*qx+qy*qy]
        ps_sum0     tmp2, tmp2, tmp2, tmp2
        // scale = [s = 2E' = 2.0F/(qx*qx+qy*qy+qz*qz+qw*qw)]
        fmuls       scale, scale, c_two
        // tmp8 = [qx*qy+qz*qw][?]
        ps_madd     tmp8, tmp0, tmp5, tmp6
        // tmp6 = [qx*qy-qz*qw][?]
        ps_msub     tmp6, tmp0, tmp5, tmp6
        // c_zero [m03] : STORE
        psq_st      c_zero, 12(m), 1, 0
        // tmp2 = [1-s(qx*qx+qy*qy)] : [m22]
        ps_nmsub    tmp2, tmp2, scale, c_one
        // tmp4 = [1-s(qx*qx+qz*qz)][1-s(qy*qy+qz*qz)] : [m11][m00]
        ps_nmsub    tmp4, tmp4, scale, c_one
        // c_zero [m23] : STORE
        psq_st      c_zero, 44(m), 1, 0
        // tmp8 = [s(qx*qy+qz*qw)][?] : [m10]
        ps_mul      tmp8, tmp8, scale
        // tmp6 = [s(qx*qy-qz*qw)][?] : [m01]
        ps_mul      tmp6, tmp6, scale
        // tmp2 [m22] : STORE
        psq_st      tmp2, 40(m), 1, 0
        // tmp5 = [qx*qz+qy*qw][qy*qz+qx*qw]
        ps_madds0   tmp5, tmp0, tmp1, tmp7
        // tmp1 = [m10][m11]
        ps_merge00  tmp1, tmp8, tmp4
        // tmp7 = [qx*qz-qy*qw][qy*qz-qx*qw]
        ps_nmsub    tmp7, tmp7, c_two, tmp5
        // tmp0 = [m00][m01]
        ps_merge10  tmp0, tmp4, tmp6
        // tmp1 [m10][m11] : STORE
        psq_st      tmp1, 16(m), 0, 0
        // tmp5 = [s(qx*qz+qy*qw)][s(qy*qz+qx*qw)] : [m02][m21]
        ps_mul      tmp5, tmp5, scale
        // tmp7 = [s(qx*qz-qy*qw)][s(qy*qz-qx*qw)] : [m20][m12]
        ps_mul      tmp7, tmp7, scale
        // tmp0 [m00][m01] : STORE
        psq_st      tmp0, 0(m), 0, 0
        // tmp5 [m02] : STORE
        psq_st      tmp5, 8(m), 1, 0
        // tmp3 = [m12][m13]
        ps_merge10  tmp3, tmp7, c_zero
        // tmp9 = [m20][m21]
        ps_merge01  tmp9, tmp7, tmp5
        // tmp3 [m12][m13] : STORE
        psq_st      tmp3, 24(m), 0, 0
        // tmp9 [m20][m21] : STORE
        psq_st      tmp9, 32(m), 0, 0
        // Epilogue: restore fp14 (paired-single slots first, then the full
        // double into ps0), restore LR and release the frame.
        psq_l       fp14, 8(r1), 0, 0
        lfd         fp14, 16(r1)
        lwz         r0, 28(r1)
        mtlr        r0
        addi        r1, r1, 24
        blr
        .size ASM_MTXQuat,$-ASM_MTXQuat
#undef m
#undef q
#undef c_zero
#undef c_one
#undef c_two
#undef scale
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef tmp5
#undef tmp6
#undef tmp7
#undef tmp8
#undef tmp9