/*---------------------------------------------------------------------------*
  Project: matrix vector Library
  File:    mtxVec_asm.s

  Copyright 1998-2011 Nintendo. All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law. They may not
  be disclosed to third parties or copied or duplicated in any form, in
  whole or in part, without the prior written consent of Nintendo.
 *---------------------------------------------------------------------------*/

// PowerPC (Gekko/Broadway) paired-single implementation.
// Notes on the instruction set used throughout this file:
//   psq_l/psq_st with W=0 move a pair of f32s; with W=1 they move one f32
//   and the second slot becomes 1.0f.  GQR0 is assumed configured for
//   unscaled f32 (the hardware default) -- TODO confirm against startup code.
//   lfs loads a scalar into both slots of a paired-single register.
//   EABI: r3..r5 carry the pointer arguments, f1 carries the f32 return;
//   only volatile registers (r0, r3-r12, f0-f13) are touched.

        .data
        .align 2

// Constants for the Newton-Raphson refinement of frsqrte estimates.
CONST_0_0F: .float 0.0
CONST_0_5F: .float 0.5
CONST_3_0F: .float 3.0

        .text

// vec library definitions
#define RET_REG fp1
#define V1_XY fp2
#define V1_Z fp3
#define V2_XY fp4
#define V2_Z fp5
#define D1_XY fp6
#define D1_Z fp7
#define D2_XY fp8
#define D2_Z fp9
#define W1_XY fp10
#define W1_Z fp11
#define W2_XY fp12
#define W2_Z fp13

////////////////////////////////////////////////////////////////////////////////
// void ASM_VECCrossProduct(const Vec* vec1, const Vec* vec2, Vec* dst)
//
// dst = vec1 x vec2:
//   x = a.y*b.z - a.z*b.y
//   y = a.z*b.x - a.x*b.z
//   z = a.x*b.y - a.y*b.x
// All loads complete before the first store, so dst may alias either input.
#define vec1 r3
#define vec2 r4
#define dst r5

        .global ASM_VECCrossProduct
ASM_VECCrossProduct:
        .type ASM_VECCrossProduct, @function
        // fp1 = [BX | BY]
        psq_l fp1, 0(vec2), 0, 0
        // fp2 = [AZ | AZ]
        lfs fp2, 8(vec1)
        // fp0 = [AX | AY]
        psq_l fp0, 0(vec1), 0, 0
        // fp6 = [BY | BX]
        ps_merge10 fp6, fp1, fp1
        // fp3 = [BZ | BZ]
        lfs fp3, 8(vec2)
        // fp4 = [BX*AZ | BY*AZ]
        ps_mul fp4, fp1, fp2
        // fp7 = [BX*AX | BY*AX]
        ps_muls0 fp7, fp1, fp0
        // fp5 = [AX*BZ-BX*AZ | AY*BZ-BY*AZ]   slot1 = +result.x, slot0 = -result.y
        ps_msub fp5, fp0, fp3, fp4
        // fp8 = [AX*BY-BX*AX | AY*BX-BY*AX]   only slot1 (= -result.z) is used
        ps_msub fp8, fp0, fp6, fp7
        // fp9 = [AY*BZ-AZ*BY | AY*BZ-AZ*BY]   result.x in both slots
        ps_merge11 fp9, fp5, fp5
        // fp10 = [AX*BZ-AZ*BX | AY*BX-AX*BY]  = [-result.y | -result.z]
        ps_merge01 fp10, fp5, fp8
        // store dst->x
        psq_st fp9, 0(dst), 1, 0
        // fp10 = [AZ*BX-AX*BZ | AX*BY-AY*BX]  = [+result.y | +result.z]
        ps_neg fp10, fp10
        // store dst->y, dst->z as a pair
        psq_st fp10, 4(dst), 0, 0
        blr
        .size ASM_VECCrossProduct,$-ASM_VECCrossProduct
#undef vec1
#undef vec2
#undef dst

////////////////////////////////////////////////////////////////////////////////
// void ASM_VECAdd(const Vec* vec1, const Vec* vec2, Vec* dst)
//
// dst = vec1 + vec2, componentwise.
        .global ASM_VECAdd
#define vec1 r3
#define vec2 r4
#define dst r5
ASM_VECAdd:
        .type ASM_VECAdd, @function
        // load both vectors' X|Y pairs
        psq_l V1_XY, 0(vec1), 0, 0;
        psq_l V2_XY, 0(vec2), 0, 0;
        // add X|Y
        ps_add D1_XY, V1_XY, V2_XY;
        // store result X|Y
        psq_st D1_XY, 0(dst), 0, 0;
        // load both vectors' Z (single element, W=1)
        psq_l V1_Z, 8(vec1), 1, 0;
        psq_l V2_Z, 8(vec2), 1, 0;
        // add Z
        ps_add D1_Z, V1_Z, V2_Z;
        // store result Z
        psq_st D1_Z, 8(dst), 1, 0;
        blr
        .size ASM_VECAdd,$-ASM_VECAdd
#undef vec1
#undef vec2
#undef dst

////////////////////////////////////////////////////////////////////////////////
// void ASM_VECSubtract(const Vec* vec1, const Vec* vec2, Vec* dst)
//
// dst = vec1 - vec2, componentwise.
        .global ASM_VECSubtract
#define vec1 r3
#define vec2 r4
#define dst r5
ASM_VECSubtract:
        .type ASM_VECSubtract, @function
        // load both vectors' X|Y pairs
        psq_l V1_XY, 0(vec1), 0, 0;
        psq_l V2_XY, 0(vec2), 0, 0;
        // subtract X|Y
        ps_sub D1_XY, V1_XY, V2_XY;
        // store result X|Y
        psq_st D1_XY, 0(dst), 0, 0;
        // load both vectors' Z (single element, W=1)
        psq_l V1_Z, 8(vec1), 1, 0;
        psq_l V2_Z, 8(vec2), 1, 0;
        // subtract Z
        ps_sub D1_Z, V1_Z, V2_Z;
        // store result Z
        psq_st D1_Z, 8(dst), 1, 0;
        blr
        .size ASM_VECSubtract,$-ASM_VECSubtract
#undef vec1
#undef vec2
#undef dst

////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECSquareMag(const Vec* vec1)
//
// Returns x*x + y*y + z*z in f1.
        .global ASM_VECSquareMag
#define vec1 r3
#define sqmag f1
#define vxy f2
#define vzz f4
ASM_VECSquareMag:
        .type ASM_VECSquareMag, @function
        // vxy = [X | Y]
        psq_l vxy, 0(vec1), 0, 0
        // vxy = [X*X | Y*Y]
        ps_mul vxy, vxy, vxy
        // vzz = [Z | Z]
        lfs vzz, 8(vec1)
        // sqmag = [X*X+Z*Z | Y*Y+Z*Z]
        ps_madd sqmag, vzz, vzz, vxy
        // sqmag[0] = (X*X+Z*Z) + Y*Y  -- full dot in the return slot
        ps_sum0 sqmag, sqmag, vxy, vxy
        blr
        .size ASM_VECSquareMag,$-ASM_VECSquareMag
#undef vec1
#undef vxy
#undef vzz
#undef sqmag

////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECSquareDistance(const Vec* a, const Vec* b)
//
// Returns |a - b|^2 in f1.
        .global ASM_VECSquareDistance
#define a r3
#define b r4
#define v0yz f2
#define v1yz f3
#define v0xy f4
#define v1xy f5
#define dyz f6
#define dxy f7
#define sqdist f1
ASM_VECSquareDistance:
        .type ASM_VECSquareDistance, @function
        psq_l v0yz, 4(a), 0, 0            // [Y0][Z0]
        psq_l v1yz, 4(b), 0, 0            // [Y1][Z1]
        ps_sub dyz, v0yz, v1yz            // [Y0-Y1][Z0-Z1]
        psq_l v0xy, 0(a), 0, 0            // [X0][Y0]
        psq_l v1xy, 0(b), 0, 0            // [X1][Y1]
        ps_mul dyz, dyz, dyz              // [dYdY][dZdZ]
        ps_sub dxy, v0xy, v1xy            // [X0-X1][Y0-Y1]
        ps_madd sqdist, dxy, dxy, dyz     // [dXdX+dYdY][dYdY+dZdZ]
        ps_sum0 sqdist, sqdist, dyz, dyz  // [dXdX+dYdY+dZdZ][N/A]
        blr
        .size ASM_VECSquareDistance,$-ASM_VECSquareDistance
#undef a
#undef b
#undef v0yz
#undef v1yz
#undef v0xy
#undef v1xy
#undef dyz
#undef dxy
#undef sqdist

////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECMag(const Vec* v)
//
// Returns sqrt(x*x + y*y + z*z) in f1 via frsqrte + one Newton-Raphson
// refinement, then sqrt(X) = X * (1/sqrt(X)).  A zero-magnitude input
// short-circuits to 0.0 (frsqrte(0) would produce infinity).
// r4 (an unused argument slot, volatile) is used as scratch.
        .global ASM_VECMag
#define v r3
#define vxy f2
#define vzz f3
#define sqmag f1
#define rmag f4
#define nwork0 f5
#define nwork1 f6
#define c_three f7
#define c_half f8
#define c_zero f9
ASM_VECMag:
        .type ASM_VECMag, @function
        // c_half = 0.5F;
        lis r4, CONST_0_5F@h
        ori r4, r4, CONST_0_5F@l
        lfs c_half, 0(r4)
        // square magnitude: sqmag[0] = X*X + Y*Y + Z*Z
        psq_l vxy, 0(v), 0, 0
        ps_mul vxy, vxy, vxy
        lfs vzz, 8(v)
        // c_zero = 0.5 - 0.5 (exact zero, no extra memory load needed)
        fsubs c_zero, c_half, c_half
        ps_madd sqmag, vzz, vzz, vxy
        ps_sum0 sqmag, sqmag, vxy, vxy
        // zero check: return 0.0 (already in f1) for a zero vector
        fcmpu cr0, sqmag, c_zero
        beq- _ASM_VECMag_exit
        // 1.0/sqrt : estimation[E]
        frsqrte rmag, sqmag
        // c_three = 3.0F;
        lis r4, CONST_3_0F@h
        ori r4, r4, CONST_3_0F@l
        lfs c_three, 0(r4)
        // refinement x 1 : E' = (E/2)(3 - X*E*E)
        fmuls nwork0, rmag, rmag
        fmuls nwork1, rmag, c_half
        fnmsubs nwork0, nwork0, sqmag, c_three
        fmuls rmag, nwork0, nwork1
        // 1/sqrt(X) * X = sqrt(X)
        fmuls sqmag, sqmag, rmag
_ASM_VECMag_exit:
        blr
        .size ASM_VECMag,$-ASM_VECMag
#undef v
#undef vxy
#undef vzz
#undef sqmag
#undef rmag
#undef nwork0
#undef nwork1
#undef c_three
#undef c_half
#undef c_zero
////////////////////////////////////////////////////////////////////////////////
// void ASM_VECScale(const Vec *src, Vec *dst, f32 mult)
//
// dst = src * mult, componentwise.  mult arrives in f1 per the EABI.
        .global ASM_VECScale
#define src r3
#define dst r4
#define mult f1
#define vxy f2
#define vz f3
#define rxy f4
#define rz f5
ASM_VECScale:
        .type ASM_VECScale, @function
        // load X|Y pair
        psq_l vxy, 0(src), 0, 0
        // load Z (single element)
        psq_l vz, 8(src), 1, 0
        // scale X|Y by mult (slot 0 of f1)
        ps_muls0 rxy, vxy, mult
        // store result X|Y
        psq_st rxy, 0(dst), 0, 0
        // scale Z
        ps_muls0 rz, vz, mult
        // store result Z
        psq_st rz, 8(dst), 1, 0
        blr
        .size ASM_VECScale,$-ASM_VECScale
#undef src
#undef dst
#undef mult
#undef vxy
#undef vz
#undef rxy
#undef rz

////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECDistance(const Vec *a, const Vec *b)
//
// Returns |a - b| in f1: squared distance, then sqrt via frsqrte + one
// Newton-Raphson refinement.  Zero distance short-circuits to 0.0.
// NOTE: the original loaded CONST_0_0F into c_zero and then immediately
// overwrote it with (c_half - c_half); the dead lis/ori/lfs trio has been
// removed -- c_zero is produced arithmetically, which is an exact 0.0f.
        .global ASM_VECDistance
#define a r3
#define b r4
#define sqdist f1
#define v0yz f2
#define v1yz f3
#define v0xy f4
#define v1xy f5
#define dyz f6
#define dxy f7
#define rdist f8
#define nwork0 f9
#define nwork1 f10
#define c_half f11
#define c_three f12
#define c_zero f13
ASM_VECDistance:
        .type ASM_VECDistance, @function
        psq_l v0yz, 4(a), 0, 0            // [Y0][Z0]
        psq_l v1yz, 4(b), 0, 0            // [Y1][Z1]
        ps_sub dyz, v0yz, v1yz            // [Y0-Y1][Z0-Z1]
        psq_l v0xy, 0(a), 0, 0            // [X0][Y0]
        psq_l v1xy, 0(b), 0, 0            // [X1][Y1]
        ps_mul dyz, dyz, dyz              // [dYdY][dZdZ]
        ps_sub dxy, v0xy, v1xy            // [X0-X1][Y0-Y1]
        // c_half = 0.5F;
        lis r5, CONST_0_5F@h
        ori r5, r5, CONST_0_5F@l
        lfs c_half, 0(r5)
        ps_madd sqdist, dxy, dxy, dyz     // [dXdX+dYdY][dYdY+dZdZ]
        // c_zero = 0.5 - 0.5 (exact zero)
        fsubs c_zero, c_half, c_half
        ps_sum0 sqdist, sqdist, dyz, dyz  // [dXdX+dYdY+dZdZ][N/A]
        // zero check: return 0.0 (already in f1) for coincident points
        fcmpu cr0, c_zero, sqdist
        beq- _ASM_VECDistance_exit
        // c_three = 3.0F;
        lis r5, CONST_3_0F@h
        ori r5, r5, CONST_3_0F@l
        lfs c_three, 0(r5)
        // 1.0/sqrt : estimation[E]
        frsqrte rdist, sqdist
        // refinement x 1 : E' = (E/2)(3 - X*E*E)
        fmuls nwork0, rdist, rdist
        fmuls nwork1, rdist, c_half
        fnmsubs nwork0, nwork0, sqdist, c_three
        fmuls rdist, nwork0, nwork1
        // 1/sqrt(X) * X = sqrt(X)
        fmuls sqdist, sqdist, rdist
_ASM_VECDistance_exit:
        blr
        .size ASM_VECDistance,$-ASM_VECDistance
#undef a
#undef b
#undef sqdist
#undef v0yz
#undef v1yz
#undef v0xy
#undef v1xy
#undef dyz
#undef dxy
#undef rdist
#undef nwork0
#undef nwork1
#undef c_half
#undef c_three
#undef c_zero

////////////////////////////////////////////////////////////////////////////////
// void ASM_VECNormalize(const Vec *vec1, Vec *dst)
//
// dst = vec1 / |vec1| via frsqrte + one Newton-Raphson refinement.
// NOTE(review): there is no zero check here -- a zero-length input yields
// NaNs via frsqrte(0) = inf; callers are presumably expected to guarantee
// a non-zero vector.
        .global ASM_VECNormalize
#define vec1 r3
#define dst r4
#define rsqrt f1
#define v1_xy f2
#define v1_z f3
#define xx_yy f4
#define xx_zz f5
#define sqsum f6
#define nwork0 f7
#define nwork1 f8
#define c_half f9
#define c_three f10
ASM_VECNormalize:
        .type ASM_VECNormalize, @function
        // c_half = 0.5F;
        lis r5, CONST_0_5F@h
        ori r5, r5, CONST_0_5F@l
        lfs c_half, 0(r5)
        // [X | Y]
        psq_l v1_xy, 0(vec1), 0, 0;
        // [X*X | Y*Y]
        ps_mul xx_yy, v1_xy, v1_xy;
        // [Z | 1]  (W=1 load fills slot 1 with 1.0)
        psq_l v1_z, 8(vec1), 1, 0;
        // [X*X+Z*Z | Y*Y+1]
        ps_madd xx_zz, v1_z, v1_z, xx_yy;
        // sqsum[0] = X*X+Z*Z+Y*Y
        ps_sum0 sqsum, xx_zz, v1_z, xx_yy;
        // c_three = 3.0F;
        lis r5, CONST_3_0F@h
        ori r5, r5, CONST_3_0F@l
        lfs c_three, 0(r5)
        // 1.0/sqrt : estimation[E]
        frsqrte rsqrt, sqsum;
        // Newton's refinement x 1: E' = (E/2)(3 - sqsum*E*E)
        fmuls nwork0, rsqrt, rsqrt;
        fmuls nwork1, rsqrt, c_half;
        fnmsubs nwork0, nwork0, sqsum, c_three;
        fmuls rsqrt, nwork0, nwork1;
        // X * 1/mag | Y * 1/mag
        ps_muls0 v1_xy, v1_xy, rsqrt;
        psq_st v1_xy, 0(dst), 0, 0;
        // Z * 1/mag
        ps_muls0 v1_z, v1_z, rsqrt;
        psq_st v1_z, 8(dst), 1, 0;
        blr
        .size ASM_VECNormalize,$-ASM_VECNormalize
#undef vec1
#undef dst
#undef sqsum
#undef v1_xy
#undef v1_z
#undef xx_yy
#undef xx_zz
#undef rsqrt
#undef nwork0
#undef nwork1
#undef c_half
#undef c_three

////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECDotProduct(const Vec *a, const Vec *b)
//
// Returns a.x*b.x + a.y*b.y + a.z*b.z in f1.
        .global ASM_VECDotProduct
#define a r3
#define b r4
ASM_VECDotProduct:
        .type ASM_VECDotProduct, @function
        // fp2 = [Ay | Az], fp3 = [By | Bz]
        psq_l fp2, 4(a), 0, 0;
        psq_l fp3, 4(b), 0, 0;
        // fp2 = [Ay*By | Az*Bz]
        ps_mul fp2, fp2, fp3;
        // fp5 = [Ax | Ay], fp4 = [Bx | By]
        psq_l fp5, 0(a), 0, 0;
        psq_l fp4, 0(b), 0, 0;
        // fp3 = [Ax*Bx+Ay*By | Ay*By+Az*Bz]
        ps_madd fp3, fp5, fp4, fp2;
        // fp1[0] = Ax*Bx+Ay*By + Az*Bz
        ps_sum0 fp1, fp3, fp2, fp2;
        blr
        .size ASM_VECDotProduct,$-ASM_VECDotProduct
#undef a
#undef b

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVec(const Mtx m, const Vec *src, Vec *dst)
//
// dst = m * (src, 1): full 3x4 transform including the translation column.
// The W=1 load of src gives [v2 | 1.0], so each ps_madd folds m[r][3] in
// via the implicit 1.0.  ps_sum0's B operand only feeds the (unstored)
// slot 1, so passing the destination register uninitialized is harmless.
// Stores trail the loads of each row, so dst may alias src.
#define m r3
#define src r4
#define dst r5
        .global ASM_MTXMultVec
ASM_MTXMultVec:
        .type ASM_MTXMultVec, @function
        // fp0 = [v0 | v1]
        psq_l fp0, 0(src), 0, 0
        // fp2 = [m00 | m01]
        psq_l fp2, 0(m), 0, 0
        // fp1 = [v2 | 1.0]
        psq_l fp1, 8(src), 1, 0
        // fp4 = [m00*v0 | m01*v1]
        ps_mul fp4, fp2, fp0
        // fp3 = [m02 | m03]
        psq_l fp3, 8(m), 0, 0
        // fp5 = [m00*v0+m02*v2 | m01*v1+m03]
        ps_madd fp5, fp3, fp1, fp4
        // fp8 = [m10 | m11]
        psq_l fp8, 16(m), 0, 0
        // fp6[0] = m00*v0+m02*v2 + m01*v1+m03   (slot 1 = don't-care)
        ps_sum0 fp6, fp5, fp6, fp5
        // fp9 = [m12 | m13]
        psq_l fp9, 24(m), 0, 0
        // fp10 = [m10*v0 | m11*v1]
        ps_mul fp10, fp8, fp0
        // store dst->x
        psq_st fp6, 0(dst), 1, 0
        // fp11 = [m10*v0+m12*v2 | m11*v1+m13]
        ps_madd fp11, fp9, fp1, fp10
        // fp2 = [m20 | m21]
        psq_l fp2, 32(m), 0, 0
        // fp12[0] = m10*v0+m12*v2 + m11*v1+m13  (slot 1 = don't-care)
        ps_sum0 fp12, fp11, fp12, fp11
        // fp3 = [m22 | m23]
        psq_l fp3, 40(m), 0, 0
        // fp4 = [m20*v0 | m21*v1]
        ps_mul fp4, fp2, fp0
        // store dst->y
        psq_st fp12, 4(dst), 1, 0
        // fp5 = [m20*v0+m22*v2 | m21*v1+m23]
        ps_madd fp5, fp3, fp1, fp4
        // fp6[0] = m20*v0+m22*v2 + m21*v1+m23   (slot 1 = don't-care)
        ps_sum0 fp6, fp5, fp6, fp5
        // store dst->z
        psq_st fp6, 8(dst), 1, 0
        blr
        .size ASM_MTXMultVec,$-ASM_MTXMultVec
#undef m
#undef src
#undef dst

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVecArray(const Mtx m, const Vec *srcBase, Vec *dstBase, u32 count)
//
// dstBase[i] = m * (srcBase[i], 1) for i in [0, count).
// Software-pipelined: one vector's multiplies overlap the previous
// vector's sum/store.  count must be >= 1 (the loop is unrolled by one;
// CTR is loaded with count-1).  The columns of the upper 2x4 of m are
// pre-transposed with ps_merge so each result pair (x', y') is built with
// madds broadcasts.  dstBase is biased by -4 so psq_stu pre-increments
// line up with the 12-byte Vec stride.
#define m r3
#define srcBase r4
#define dstBase r5
#define count r6
        .global ASM_MTXMultVecArray
ASM_MTXMultVecArray:
        .type ASM_MTXMultVecArray, @function
        // fp13 = [m00 | m01] : LOAD
        psq_l fp13, 0(m), 0, 0
        // fp12 = [m10 | m11] : LOAD
        psq_l fp12, 16(m), 0, 0
        // decrement loop count due to unrolling
        subi count, count, 1
        // fp11 = [m02 | m03] : LOAD
        psq_l fp11, 8(m), 0, 0
        // fp0 = [m00 | m10]
        ps_merge00 fp0, fp13, fp12
        // base pointer adjustment (psq_stu pre-increments by 4 first time)
        subi dstBase, dstBase, 4
        // fp10 = [m12 | m13] : LOAD
        psq_l fp10, 24(m), 0, 0
        // fp1 = [m01 | m11]
        ps_merge11 fp1, fp13, fp12
        // loop counter
        mtctr count
        // fp4 = [m20 | m21] : LOAD
        psq_l fp4, 32(m), 0, 0
        // fp2 = [m02 | m12]
        ps_merge00 fp2, fp11, fp10
        // fp5 = [m22 | m23] : LOAD
        psq_l fp5, 40(m), 0, 0
        // fp3 = [m03 | m13]
        ps_merge11 fp3, fp11, fp10
        // ---- prologue: start vector 0 ----
        // fp6 = [v0 | v1] : LOAD
        psq_l fp6, 0(srcBase), 0, 0
        // fp7 = [v2 | 1.0F] : LOAD
        psq_lu fp7, 8(srcBase), 1, 0
        // fp8 = [m00*v0+m03 | m10*v0+m13]
        ps_madds0 fp8, fp0, fp6, fp3
        // fp9 = [m20*v0 | m21*v1]
        ps_mul fp9, fp4, fp6
        // fp8 = [m00*v0+m01*v1+m03 | m10*v0+m11*v1+m13]
        ps_madds1 fp8, fp1, fp6, fp8
        // fp10 = [m20*v0+m22*v2 | m21*v1+m23*1.0F]
        ps_madd fp10, fp5, fp7, fp9
_ASM_MTXMultVecArray_mloop:
        //-------- unrolled loop: finish vector i while starting i+1 --------
        // fp6 = [v0 | v1] : LOAD (next vector)
        psq_lu fp6, 4(srcBase), 0, 0
        // fp12 = [m00*v0+m01*v1+m02*v2+m03 | m10*v0+m11*v1+m12*v2+m13]
        ps_madds0 fp12, fp2, fp7, fp8
        // fp7 = [v2 | 1.0F] : LOAD (next vector)
        psq_lu fp7, 8(srcBase), 1, 0
        // fp13[0] = m20*v0+m21*v1+m22*v2+m23   (slot 1 = don't-care)
        ps_sum0 fp13, fp10, fp9, fp10
        // fp8 = [m00*v0+m03 | m10*v0+m13] (next vector)
        ps_madds0 fp8, fp0, fp6, fp3
        // fp9 = [m20*v0 | m21*v1] (next vector)
        ps_mul fp9, fp4, fp6
        // STORE x', y'
        psq_stu fp12, 4(dstBase), 0, 0
        // fp8 = [m00*v0+m01*v1+m03 | m10*v0+m11*v1+m13] (next vector)
        ps_madds1 fp8, fp1, fp6, fp8
        // STORE z'
        psq_stu fp13, 8(dstBase), 1, 0
        // fp10 = [m20*v0+m22*v2 | m21*v1+m23*1.0F] (next vector)
        ps_madd fp10, fp5, fp7, fp9
        // LOOP
        bdnz _ASM_MTXMultVecArray_mloop
        // ---- epilogue: finish the last vector ----
        // fp12 = [m00*v0+m01*v1+m02*v2+m03 | m10*v0+m11*v1+m12*v2+m13]
        ps_madds0 fp12, fp2, fp7, fp8
        // fp13[0] = m20*v0+m21*v1+m22*v2+m23
        ps_sum0 fp13, fp10, fp9, fp10
        // STORE x', y'
        psq_stu fp12, 4(dstBase), 0, 0
        // STORE z'
        psq_stu fp13, 8(dstBase), 1, 0
        blr
        .size ASM_MTXMultVecArray,$-ASM_MTXMultVecArray
#undef m
#undef srcBase
#undef dstBase
#undef count

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVecSR(const Mtx m, const Vec *src, Vec *dst)
//
// dst = upper-3x3(m) * src: rotation/scale only, no translation.
// The m[r][3] slots are loaded but end up only in the unstored slot 1
// of each ps_madd result, so the translation column has no effect.
        .global ASM_MTXMultVecSR
#define m r3
#define src r4
#define dst r5
ASM_MTXMultVecSR:
        .type ASM_MTXMultVecSR, @function
        psq_l fp0, 0(m), 0, 0            // [m00 | m01]  (GQR0 = f32)
        // fp6 = [x | y]
        psq_l fp6, 0(src), 0, 0
        psq_l fp2, 16(m), 0, 0           // [m10 | m11]
        // fp8 = [m00*x | m01*y]          -- next X
        ps_mul fp8, fp0, fp6
        psq_l fp4, 32(m), 0, 0           // [m20 | m21]
        // fp10 = [m10*x | m11*y]         -- next Y
        ps_mul fp10, fp2, fp6
        psq_l fp7, 8(src), 1, 0          // fp7 = [z | 1.0]
        // fp12 = [m20*x | m21*y]         -- next Z; last fp6 use
        ps_mul fp12, fp4, fp6
        psq_l fp3, 24(m), 0, 0           // [m12 | m13]
        // fp8[0] = m00*x + m01*y
        ps_sum0 fp8, fp8, fp8, fp8
        psq_l fp5, 40(m), 0, 0           // [m22 | m23]
        // fp10[0] = m10*x + m11*y
        ps_sum0 fp10, fp10, fp10, fp10
        psq_l fp1, 8(m), 0, 0            // [m02 | m03]
        // fp12[0] = m20*x + m21*y
        ps_sum0 fp12, fp12, fp12, fp12
        // fp9[0] = m02*z + (m00*x + m01*y)
        ps_madd fp9, fp1, fp7, fp8
        psq_st fp9, 0(dst), 1, 0         // store X
        // fp11[0] = m12*z + (m10*x + m11*y)
        ps_madd fp11, fp3, fp7, fp10
        psq_st fp11, 4(dst), 1, 0        // store Y
        // fp13[0] = m22*z + (m20*x + m21*y)
        ps_madd fp13, fp5, fp7, fp12
        psq_st fp13, 8(dst), 1, 0        // store Z
        blr
        .size ASM_MTXMultVecSR,$-ASM_MTXMultVecSR
#undef m
#undef src
#undef dst
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVecArraySR(const Mtx m, const Vec *srcBase, Vec *dstBase, u32 count)
//
// dstBase[i] = upper-3x3(m) * srcBase[i] for i in [0, count):
// rotation/scale only, no translation.  Software-pipelined like
// ASM_MTXMultVecArray: one vector's multiplies overlap the previous
// vector's sum/store.  count must be >= 1 (the loop is unrolled by one;
// CTR is loaded with count-1).  W=1 loads of the m[r][2] pairs put 1.0
// in slot 1, keeping the translation column out of the stored slot 0.
// dstBase is biased by -4 so psq_stu pre-increments match the 12-byte
// Vec stride.
#define m r3
#define srcBase r4
#define dstBase r5
#define count r6
        .global ASM_MTXMultVecArraySR
ASM_MTXMultVecArraySR:
        .type ASM_MTXMultVecArraySR, @function
        // fp13 = [m00 | m01] : LOAD
        psq_l fp13, 0(m), 0, 0
        // fp12 = [m10 | m11] : LOAD
        psq_l fp12, 16(m), 0, 0
        // decrement loop count due to unrolling
        subi count, count, 1
        // fp11 = [m02 | 1.0F] : LOAD (W=1 masks out m03)
        psq_l fp11, 8(m), 1, 0
        // fp0 = [m00 | m10]
        ps_merge00 fp0, fp13, fp12
        // base pointer adjustment (psq_stu pre-increments by 4 first time)
        subi dstBase, dstBase, 4
        // fp10 = [m12 | 1.0F] : LOAD (W=1 masks out m13)
        psq_l fp10, 24(m), 1, 0
        // fp1 = [m01 | m11]
        ps_merge11 fp1, fp13, fp12
        // loop counter
        mtctr count
        // fp3 = [m20 | m21] : LOAD
        psq_l fp3, 32(m), 0, 0
        // fp2 = [m02 | m12]
        ps_merge00 fp2, fp11, fp10
        // fp4 = [m22 | 1.0F] : LOAD (W=1 masks out m23)
        psq_l fp4, 40(m), 1, 0
        // ---- prologue: start vector 0 ----
        // fp6 = [v0 | v1] : LOAD
        psq_l fp6, 0(srcBase), 0, 0
        // fp7 = [v2 | 1.0F] : LOAD
        psq_lu fp7, 8(srcBase), 1, 0
        // fp8 = [m00*v0 | m10*v0]
        ps_muls0 fp8, fp0, fp6
        // fp9 = [m20*v0 | m21*v1]
        ps_mul fp9, fp3, fp6
        // fp8 = [m00*v0+m01*v1 | m10*v0+m11*v1]
        ps_madds1 fp8, fp1, fp6, fp8
        // fp10 = [m20*v0+m22*v2 | don't-care]
        ps_madd fp10, fp4, fp7, fp9
_ASM_MTXMultVecArraySR_mloop:
        //-------- unrolled loop: finish vector i while starting i+1 --------
        // fp6 = [v0 | v1] : LOAD (next vector)
        psq_lu fp6, 4(srcBase), 0, 0
        // fp12 = [m00*v0+m01*v1+m02*v2 | m10*v0+m11*v1+m12*v2]
        ps_madds0 fp12, fp2, fp7, fp8
        // fp7 = [v2 | 1.0F] : LOAD (next vector)
        psq_lu fp7, 8(srcBase), 1, 0
        // fp13[0] = (m20*v0+m22*v2) + m21*v1   (slot 1 = don't-care)
        ps_sum0 fp13, fp10, fp9, fp9
        // fp8 = [m00*v0 | m10*v0] (next vector)
        ps_muls0 fp8, fp0, fp6
        // fp9 = [m20*v0 | m21*v1] (next vector)
        ps_mul fp9, fp3, fp6
        // STORE x', y'
        psq_stu fp12, 4(dstBase), 0, 0
        // fp8 = [m00*v0+m01*v1 | m10*v0+m11*v1] (next vector)
        ps_madds1 fp8, fp1, fp6, fp8
        // STORE z'
        psq_stu fp13, 8(dstBase), 1, 0
        // fp10 = [m20*v0+m22*v2 | don't-care] (next vector)
        ps_madd fp10, fp4, fp7, fp9
        // LOOP
        bdnz _ASM_MTXMultVecArraySR_mloop
        // ---- epilogue: finish the last vector ----
        // fp12 = [m00*v0+m01*v1+m02*v2 | m10*v0+m11*v1+m12*v2]
        ps_madds0 fp12, fp2, fp7, fp8
        // fp13[0] = m20*v0+m21*v1+m22*v2
        ps_sum0 fp13, fp10, fp9, fp9
        // STORE x', y'
        psq_stu fp12, 4(dstBase), 0, 0
        // STORE z'
        psq_stu fp13, 8(dstBase), 1, 0
        blr
        .size ASM_MTXMultVecArraySR,$-ASM_MTXMultVecArraySR
#undef m
#undef srcBase
#undef dstBase
#undef count