/*---------------------------------------------------------------------------*
  Project: matrix vector Library
  File:    mtx_asm.s

  Copyright 1998-2011 Nintendo. All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law. They may not
  be disclosed to third parties or copied or duplicated in any form, in
  whole or in part, without the prior written consent of Nintendo.
 *---------------------------------------------------------------------------*/

// Gekko/Broadway (GameCube/Wii CPU) paired-single matrix kernels.
//
// A Mtx is a row-major 3x4 array of f32 (48 bytes): offset of m[r][c] is
// (r*4 + c) * 4.  Every psq_l/psq_st below selects quantization register 0
// (last operand 0); this assumes GQR0 is left at its default "two unscaled
// f32" format — NOTE(review): confirm GQR0 setup at runtime.
// lfs replicates the loaded single into both paired-single slots (ps0, ps1).
//
// The #define register aliases mean this .s file must be run through the
// C preprocessor before assembling.

        .data
        .align 2

// { 0.0F, 1.0F } constant pair; used by the concat routines to fold the
// "+ a[row][3]" translation term into the last column of a product.
Unit01:
        .float 0.0
        .float 1.0

CONST_0_0F:
        .float 0.0
CONST_0_5F:
        .float 0.5
CONST_1_0F:
        .float 1.0
CONST_3_0F:
        .float 3.0

        .text

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXIdentity(Mtx m)
//
// Sets m to the 3x4 identity matrix.
// In:       r3 = m
// Clobbers: r4, r5, f1-f4
#define m      r3
#define c_01   f1
#define c_10   f2
#define c_zero f3
#define c_one  f4

        .global ASM_MTXIdentity
ASM_MTXIdentity:
        .type ASM_MTXIdentity, @function
        // c_zero = 0.0F (both ps slots)
        lis     r4, CONST_0_0F@h
        ori     r4, r4, CONST_0_0F@l
        lfs     c_zero, 0(r4)
        psq_st  c_zero, 8(m), 0, 0      // m[0][2], m[0][3] = 0, 0
        // c_one = 1.0F (both ps slots)
        lis     r5, CONST_1_0F@h
        ori     r5, r5, CONST_1_0F@l
        lfs     c_one, 0(r5)
        ps_merge01 c_01, c_zero, c_one  // c_01 = { 0.0F, 1.0F }
        psq_st  c_zero, 24(m), 0, 0     // m[1][2], m[1][3] = 0, 0
        ps_merge10 c_10, c_one, c_zero  // c_10 = { 1.0F, 0.0F }
        psq_st  c_zero, 32(m), 0, 0     // m[2][0], m[2][1] = 0, 0
        psq_st  c_01, 16(m), 0, 0       // m[1][0], m[1][1] = 0, 1
        psq_st  c_10, 0(m), 0, 0        // m[0][0], m[0][1] = 1, 0
        psq_st  c_10, 40(m), 0, 0       // m[2][2], m[2][3] = 1, 0
        blr
        .size ASM_MTXIdentity,$-ASM_MTXIdentity
#undef m
#undef c_01
#undef c_10
#undef c_zero
#undef c_one

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXCopy(const Mtx src, Mtx dst)
//
// Copies the 48-byte matrix src into dst, one row-pair (8 bytes) at a time.
// In:       r3 = src, r4 = dst
// Clobbers: fp0-fp5
#define src r3
#define dst r4

        .global ASM_MTXCopy
ASM_MTXCopy:
        .type ASM_MTXCopy, @function
        psq_l   fp0, 0(src), 0, 0
        psq_st  fp0, 0(dst), 0, 0
        psq_l   fp1, 8(src), 0, 0
        psq_st  fp1, 8(dst), 0, 0
        psq_l   fp2, 16(src), 0, 0
        psq_st  fp2, 16(dst), 0, 0
        psq_l   fp3, 24(src), 0, 0
        psq_st  fp3, 24(dst), 0, 0
        psq_l   fp4, 32(src), 0, 0
        psq_st  fp4, 32(dst), 0, 0
        psq_l   fp5, 40(src), 0, 0
        psq_st  fp5, 40(dst), 0, 0
        blr
        .size ASM_MTXCopy,$-ASM_MTXCopy
#undef src
#undef dst

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXConcat(const Mtx mA, const Mtx mB, Mtx mAB)
//
// mAB = mA * mB (treating each Mtx as a 4x4 with implicit last row 0 0 0 1).
// The "+ aX3" translation terms are folded in by multiplying the A row's
// last pair against Unit01 = { 0, 1 } (ps_madds1 adds aX3 only to the ps1
// element).  mAB may alias mA or mB: every input element is loaded before
// the corresponding output is stored.
// In:       r3 = mA, r4 = mB, r5 = mAB
// Clobbers: r6, fp0-fp13 (fp14, fp15, fp31 saved/restored on a 64-byte frame;
//           both the paired image (psq_st) and the 64-bit image (stfd) of
//           each callee-saved FPR are preserved)
#define mA  r3
#define mB  r4
#define mAB r5

        .global ASM_MTXConcat
ASM_MTXConcat:
        .type ASM_MTXConcat, @function
// Register roles: A/B = input rows, D = destination rows.  D20_D21/D22_D23
// deliberately reuse fp2/fp0 after the last read of A10_A11/A00_A01.
#define A00_A01 fp0
#define A02_A03 fp1
#define A10_A11 fp2
#define A12_A13 fp3
#define A20_A21 fp4
#define A22_A23 fp5
#define B00_B01 fp6
#define B02_B03 fp7
#define B10_B11 fp8
#define B12_B13 fp9
#define B20_B21 fp10
#define B22_B23 fp11
#define D00_D01 fp12
#define D02_D03 fp13
#define D10_D11 fp14
#define D12_D13 fp15
#define D20_D21 fp2
#define D22_D23 fp0
#define UNIT01  fp31

        // don't save LR since we don't make any function calls
        // mflr r0
        // stw r0, 4(r1)
        stwu    r1, -64(r1)
        psq_l   A00_A01, 0(mA), 0, 0
        psq_st  fp14, 8(r1), 0, 0
        stfd    fp14, 16(r1)
        psq_l   B00_B01, 0(mB), 0, 0
        addis   r6, 0, Unit01@ha
        psq_l   B02_B03, 8(mB), 0, 0
        psq_st  fp15, 24(r1), 0, 0
        stfd    fp15, 32(r1)
        addi    r6, r6, Unit01@l
        psq_st  fp31, 40(r1), 0, 0
        stfd    fp31, 48(r1)
        psq_l   B10_B11, 16(mB), 0, 0
        ps_muls0 D00_D01, B00_B01, A00_A01       // D00_D01 = b00*a00, b01*a00
        psq_l   A10_A11, 16(mA), 0, 0
        ps_muls0 D02_D03, B02_B03, A00_A01       // D02_D03 = b02*a00, b03*a00
        psq_l   UNIT01, 0(r6), 0, 0
        ps_muls0 D10_D11, B00_B01, A10_A11       // D10_D11 = a10*b00, a10*b01
        psq_l   B12_B13, 24(mB), 0, 0
        ps_muls0 D12_D13, B02_B03, A10_A11       // D12_D13 = a10*b02, a10*b03
        psq_l   A02_A03, 8(mA), 0, 0
        ps_madds1 D00_D01, B10_B11, A00_A01, D00_D01  // += b10*a01, b11*a01
        psq_l   A12_A13, 24(mA), 0, 0
        ps_madds1 D10_D11, B10_B11, A10_A11, D10_D11  // += a11*b10, a11*b11
        psq_l   B20_B21, 32(mB), 0, 0
        ps_madds1 D02_D03, B12_B13, A00_A01, D02_D03  // += b12*a01, b13*a01 (last read of fp0 as A00_A01)
        psq_l   B22_B23, 40(mB), 0, 0
        ps_madds1 D12_D13, B12_B13, A10_A11, D12_D13  // += a11*b12, a11*b13 (last read of fp2 as A10_A11)
        psq_l   A20_A21, 32(mA), 0, 0
        psq_l   A22_A23, 40(mA), 0, 0
        ps_madds0 D00_D01, B20_B21, A02_A03, D00_D01  // += b20*a02, b21*a02 : m00, m01 done
        ps_madds0 D02_D03, B22_B23, A02_A03, D02_D03  // += b22*a02, b23*a02
        ps_madds0 D10_D11, B20_B21, A12_A13, D10_D11  // += a12*b20, a12*b21 : m10, m11 done
        ps_madds0 D12_D13, B22_B23, A12_A13, D12_D13  // += a12*b22, a12*b23
        psq_st  D00_D01, 0(mAB), 0, 0                 // store m00, m01 (frees fp12)
        ps_muls0 D20_D21, B00_B01, A20_A21            // D20_D21 = a20*b00, a20*b01 (last read of fp6)
        ps_madds1 D02_D03, UNIT01, A02_A03, D02_D03   // += { 0, a03 } : m02, m03 done (last read of fp1)
        ps_muls0 D22_D23, B02_B03, A20_A21            // D22_D23 = a20*b02, a20*b03 (last read of fp7)
        psq_st  D10_D11, 16(mAB), 0, 0                // store m10, m11
        ps_madds1 D12_D13, UNIT01, A12_A13, D12_D13   // += { 0, a13 } : m12, m13 done
        psq_st  D02_D03, 8(mAB), 0, 0                 // store m02, m03 (last use of D02_D03)
        ps_madds1 D20_D21, B10_B11, A20_A21, D20_D21  // += a21*b10, a21*b11 (last read of fp8)
        ps_madds1 D22_D23, B12_B13, A20_A21, D22_D23  // += a21*b12, a21*b13
        ps_madds0 D20_D21, B20_B21, A22_A23, D20_D21  // += a22*b20, a22*b21 : m20, m21 done
        // restore fp14 (was D10_D11)
        psq_l   fp14, 8(r1), 0, 0
        lfd     fp14, 16(r1)
        psq_st  D12_D13, 24(mAB), 0, 0                // store m12, m13
        ps_madds0 D22_D23, B22_B23, A22_A23, D22_D23  // += a22*b22, a22*b23
        psq_st  D20_D21, 32(mAB), 0, 0                // store m20, m21
        ps_madds1 D22_D23, UNIT01, A22_A23, D22_D23   // += { 0, a23 } : m22, m23 done
        // restore fp15 (was D12_D13)
        psq_l   fp15, 24(r1), 0, 0
        lfd     fp15, 32(r1)
        psq_st  D22_D23, 40(mAB), 0, 0                // store m22, m23
        psq_l   fp31, 40(r1), 0, 0
        lfd     fp31, 48(r1)
        addi    r1, r1, 64
        blr
        .size ASM_MTXConcat,$-ASM_MTXConcat
#undef mA
#undef mB
#undef mAB
#undef A00_A01
#undef A02_A03
#undef A10_A11
#undef A12_A13
#undef A20_A21
#undef A22_A23
#undef B00_B01
#undef B02_B03
#undef B10_B11
#undef B12_B13
#undef B20_B21
#undef B22_B23
#undef D00_D01
#undef D02_D03
#undef D10_D11
#undef D12_D13
#undef D20_D21
#undef D22_D23
#undef UNIT01

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXConcatArray(const Mtx a, const Mtx* srcBase, Mtx* dstBase, u32 count)
//
// dstBase[i] = a * srcBase[i] for i in [0, count).  Software-pipelined:
// the first half of matrix 0 is computed before the loop; each loop pass
// finishes matrix i and starts matrix i+1; the tail after bdnz finishes
// the last matrix.  CTR is loaded with count-1, so the loop body runs
// count-1 times.
// NOTE(review): count == 1 would make mtctr load 0 and bdnz wrap the
// counter; the caller apparently must pass count >= 2 — confirm contract.
// In:       r3 = a, r4 = srcBase, r5 = dstBase, r6 = count
// Clobbers: r4-r7, f0-f13, CTR (f14-f18 saved/restored on an 88-byte frame)
#define a       r3
#define srcBase r4
#define dstBase r5
#define count   r6

        .global ASM_MTXConcatArray
ASM_MTXConcatArray:
        .type ASM_MTXConcatArray, @function
// Register roles: va* = rows of the constant left matrix a, vb* = rows of
// the current source matrix, vd* = rows of the current destination matrix,
// u01 = { 0, 1 } translation-fold constant.
#define va0 f0
#define va1 f1
#define va2 f2
#define va3 f3
#define va4 f4
#define va5 f5
#define vb0 f6
#define vb1 f7
#define vb2 f8
#define vb3 f9
#define vb4 f10
#define vb5 f11
#define vd0 f12
#define vd1 f13
#define vd2 f14
#define vd3 f15
#define vd4 f16
#define vd5 f17
#define u01 f18
#define u01Ptr r7
#define sizeof_Mtx 48

        mflr    r0
        stwu    r1, -88(r1)
        stw     r0, 92(r1)
        // save callee-saved FPRs f14-f18 (paired image + 64-bit image)
        psq_st  f14, 8(r1), 0, 0
        stfd    f14, 16(r1)
        psq_st  f15, 24(r1), 0, 0
        stfd    f15, 32(r1)
        psq_st  f16, 40(r1), 0, 0
        stfd    f16, 48(r1)
        psq_st  f17, 56(r1), 0, 0
        stfd    f17, 64(r1)
        psq_st  f18, 72(r1), 0, 0
        stfd    f18, 80(r1)
        lis     u01Ptr, Unit01@h
        ori     u01Ptr, u01Ptr, Unit01@l
        psq_l   va0, 0(a), 0, 0         // [a00][a01]
        psq_l   va1, 8(a), 0, 0         // [a02][a03]
        psq_l   va2, 16(a), 0, 0        // [a10][a11]
        psq_l   va3, 24(a), 0, 0        // [a12][a13]
        subi    count, count, 1         // loop runs count-1 times
        psq_l   va4, 32(a), 0, 0        // [a20][a21]
        psq_l   va5, 40(a), 0, 0        // [a22][a23]
        mtctr   count
        psq_l   u01, 0(u01Ptr), 0, 0    // [0][1]
        //--------------------------------- pipeline prologue: first half of matrix 0
        psq_l   vb0, 0(srcBase), 0, 0   // [b00][b01]
        psq_l   vb2, 16(srcBase), 0, 0  // [b10][b11]
        ps_muls0 vd0, vb0, va0          // [a00*b00][a00*b01]
        ps_muls0 vd2, vb0, va2          // [a10*b00][a10*b01]
        ps_muls0 vd4, vb0, va4          // [a20*b00][a20*b01]
        psq_l   vb4, 32(srcBase), 0, 0  // [b20][b21]
        ps_madds1 vd0, vb2, va0, vd0    // += a01*[b10][b11]
        ps_madds1 vd2, vb2, va2, vd2    // += a11*[b10][b11]
        ps_madds1 vd4, vb2, va4, vd4    // += a21*[b10][b11]
        psq_l   vb1, 8(srcBase), 0, 0   // [b02][b03]
        ps_madds0 vd0, vb4, va1, vd0    // += a02*[b20][b21] : d00, d01 done
        ps_madds0 vd2, vb4, va3, vd2    // += a12*[b20][b21] : d10, d11 done
        ps_madds0 vd4, vb4, va5, vd4    // += a22*[b20][b21] : d20, d21 done
        psq_l   vb3, 24(srcBase), 0, 0  // [b12][b13]
        psq_st  vd0, 0(dstBase), 0, 0   // store d00, d01
        ps_muls0 vd1, vb1, va0          // [a00*b02][a00*b03]
        ps_muls0 vd3, vb1, va2          // [a10*b02][a10*b03]
        ps_muls0 vd5, vb1, va4          // [a20*b02][a20*b03]
        psq_l   vb5, 40(srcBase), 0, 0  // [b22][b23]
        psq_st  vd2, 16(dstBase), 0, 0  // store d10, d11
        ps_madds1 vd1, vb3, va0, vd1    // += a01*[b12][b13]
        ps_madds1 vd3, vb3, va2, vd3    // += a11*[b12][b13]
        ps_madds1 vd5, vb3, va4, vd5    // += a21*[b12][b13]
_ASM_MTXConcatArray_loop:
        // finish matrix i (columns 2/3 + translation), start matrix i+1
        addi    srcBase, srcBase, sizeof_Mtx
        ps_madds0 vd1, vb5, va1, vd1    // += a02*[b22][b23]
        ps_madds0 vd3, vb5, va3, vd3    // += a12*[b22][b23]
        ps_madds0 vd5, vb5, va5, vd5    // += a22*[b22][b23]
        psq_l   vb0, 0(srcBase), 0, 0   // next [b00][b01]
        psq_st  vd4, 32(dstBase), 0, 0  // store d20, d21
        ps_madd vd1, u01, va1, vd1      // += { 0, a03 } : d02, d03 done
        ps_madd vd3, u01, va3, vd3      // += { 0, a13 } : d12, d13 done
        ps_madd vd5, u01, va5, vd5      // += { 0, a23 } : d22, d23 done
        psq_l   vb2, 16(srcBase), 0, 0  // next [b10][b11]
        psq_st  vd1, 8(dstBase), 0, 0   // store d02, d03
        ps_muls0 vd0, vb0, va0          // next [a00*b00][a00*b01]
        ps_muls0 vd2, vb0, va2          // next [a10*b00][a10*b01]
        ps_muls0 vd4, vb0, va4          // next [a20*b00][a20*b01]
        psq_l   vb4, 32(srcBase), 0, 0  // next [b20][b21]
        psq_st  vd3, 24(dstBase), 0, 0  // store d12, d13
        ps_madds1 vd0, vb2, va0, vd0
        ps_madds1 vd2, vb2, va2, vd2
        ps_madds1 vd4, vb2, va4, vd4
        psq_l   vb1, 8(srcBase), 0, 0   // next [b02][b03]
        psq_st  vd5, 40(dstBase), 0, 0  // store d22, d23 (matrix i complete)
        addi    dstBase, dstBase, sizeof_Mtx
        ps_madds0 vd0, vb4, va1, vd0    // next d00, d01 done
        ps_madds0 vd2, vb4, va3, vd2    // next d10, d11 done
        ps_madds0 vd4, vb4, va5, vd4    // next d20, d21 done
        psq_l   vb3, 24(srcBase), 0, 0  // next [b12][b13]
        psq_st  vd0, 0(dstBase), 0, 0
        ps_muls0 vd1, vb1, va0
        ps_muls0 vd3, vb1, va2
        ps_muls0 vd5, vb1, va4
        psq_l   vb5, 40(srcBase), 0, 0  // next [b22][b23]
        psq_st  vd2, 16(dstBase), 0, 0
        ps_madds1 vd1, vb3, va0, vd1
        ps_madds1 vd3, vb3, va2, vd3
        ps_madds1 vd5, vb3, va4, vd5
        bdnz    _ASM_MTXConcatArray_loop
        // pipeline epilogue: finish the last matrix
        psq_st  vd4, 32(dstBase), 0, 0  // store d20, d21
        ps_madds0 vd1, vb5, va1, vd1
        ps_madds0 vd3, vb5, va3, vd3
        ps_madds0 vd5, vb5, va5, vd5
        ps_madd vd1, u01, va1, vd1      // += { 0, a03 }
        ps_madd vd3, u01, va3, vd3      // += { 0, a13 }
        ps_madd vd5, u01, va5, vd5      // += { 0, a23 }
        psq_st  vd1, 8(dstBase), 0, 0
        psq_st  vd3, 24(dstBase), 0, 0
        psq_st  vd5, 40(dstBase), 0, 0
        // restore callee-saved FPRs and tear down the frame
        psq_l   f14, 8(r1), 0, 0
        lfd     f14, 16(r1)
        psq_l   f15, 24(r1), 0, 0
        lfd     f15, 32(r1)
        psq_l   f16, 40(r1), 0, 0
        lfd     f16, 48(r1)
        psq_l   f17, 56(r1), 0, 0
        lfd     f17, 64(r1)
        psq_l   f18, 72(r1), 0, 0
        lfd     f18, 80(r1)
        lwz     r0, 92(r1)
        mtlr    r0
        addi    r1, r1, 88
        blr
        .size ASM_MTXConcatArray,$-ASM_MTXConcatArray
#undef a
#undef srcBase
#undef dstBase
#undef count
#undef va0
#undef va1
#undef va2
#undef va3
#undef va4
#undef va5
#undef vb0
#undef vb1
#undef vb2
#undef vb3
#undef vb4
#undef vb5
#undef vd0
#undef vd1
#undef vd2
#undef vd3
#undef vd4
#undef vd5
#undef u01
#undef u01Ptr

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXTranspose(const Mtx src, Mtx xPose)
//
// Transposes the 3x3 rotation part of src into xPose and zeroes xPose's
// fourth column (the [2][0], 0 / [2][1], 0 merged stores write the zeros
// into [0][3] and [1][3]; [2][3] is written explicitly).
// In:       r3 = src, r4 = xPose
// Clobbers: r5, fp1-fp8
#define src   r3
#define xPose r4
#define c_zero fp1
#define row0a fp2
#define row1a fp3
#define row0b fp4
#define row1b fp5
#define trns0 fp6
#define trns1 fp7
#define trns2 fp8

        .global ASM_MTXTranspose
ASM_MTXTranspose:
        .type ASM_MTXTranspose, @function
        // c_zero = 0.0F
        lis     r5, CONST_0_0F@h
        ori     r5, r5, CONST_0_0F@l
        lfs     c_zero, 0(r5)
        psq_l   row0a, 0(src), 0, 0     // [0][0], [0][1]
        stfs    c_zero, 44(xPose)       // 0 -> [2][3]
        psq_l   row1a, 16(src), 0, 0    // [1][0], [1][1]
        ps_merge00 trns0, row0a, row1a  // [0][0], [1][0]
        psq_l   row0b, 8(src), 1, 0     // [0][2], 1
        ps_merge11 trns1, row0a, row1a  // [0][1], [1][1]
        psq_l   row1b, 24(src), 1, 0    // [1][2], 1
        psq_st  trns0, 0(xPose), 0, 0   // [0][0], [1][0] -> [0][0], [0][1]
        psq_l   row0a, 32(src), 0, 0    // [2][0], [2][1]
        ps_merge00 trns2, row0b, row1b  // [0][2], [1][2]
        psq_st  trns1, 16(xPose), 0, 0  // [0][1], [1][1] -> [1][0], [1][1]
        ps_merge00 trns0, row0a, c_zero // [2][0], 0
        psq_st  trns2, 32(xPose), 0, 0  // [0][2], [1][2] -> [2][0], [2][1]
        ps_merge10 trns1, row0a, c_zero // [2][1], 0
        psq_st  trns0, 8(xPose), 0, 0   // [2][0], 0 -> [0][2], [0][3]
        lfs     row0b, 40(src)          // [2][2]
        psq_st  trns1, 24(xPose), 0, 0  // [2][1], 0 -> [1][2], [1][3]
        stfs    row0b, 40(xPose)        // [2][2] -> [2][2]
        blr
        .size ASM_MTXTranspose,$-ASM_MTXTranspose
#undef src
#undef xPose
#undef c_zero
#undef row0a
#undef row1a
#undef row0b
#undef row1b
#undef trns0
#undef trns1
#undef trns2

////////////////////////////////////////////////////////////////////////////////
// u32 ASM_MTXInverse(const Mtx src, Mtx inv)
//
// inv = src^-1 via the adjugate of the 3x3 part and its determinant; the
// translation column is set to -(R^-1 * t).  1/det comes from fres plus
// one Newton-Raphson refinement, so the result is an approximation.
// Returns r3 = 0 if det == 0 (singular; inv untouched), else r3 = 1.
// In:       r3 = src, r4 = inv
// Clobbers: r3, fp0-fp13, cr0
// Matrix element shorthand in comments: "rc" means src[r][c].
#define src r3
#define inv r4

        .global ASM_MTXInverse
ASM_MTXInverse:
        .type ASM_MTXInverse, @function
        psq_l   fp0, 0( src ), 1, 0     // fp0 [ 00 ][ 1.0F ]
        psq_l   fp1, 4( src ), 0, 0     // fp1 [ 01 ][ 02 ]
        psq_l   fp2, 16( src ), 1, 0    // fp2 [ 10 ][ 1.0F ]
        ps_merge10 fp6, fp1, fp0        // fp6 [ 02 ][ 00 ]
        psq_l   fp3, 20( src ), 0, 0    // fp3 [ 11 ][ 12 ]
        psq_l   fp4, 32( src ), 1, 0    // fp4 [ 20 ][ 1.0F ]
        ps_merge10 fp7, fp3, fp2        // fp7 [ 12 ][ 10 ]
        psq_l   fp5, 36( src ), 0, 0    // fp5 [ 21 ][ 22 ]
        ps_mul  fp11, fp3, fp6          // fp11[ 11*02 ][ 00*12 ]
        ps_merge10 fp8, fp5, fp4        // fp8 [ 22 ][ 20 ]
        ps_mul  fp13, fp5, fp7          // fp13[ 21*12 ][ 10*22 ]
        ps_msub fp11, fp1, fp7, fp11    // fp11[ 01*12 - 11*02 ][ 10*02 - 00*12 ]
        ps_mul  fp12, fp1, fp8          // fp12[ 01*22 ][ 20*02 ]
        ps_msub fp13, fp3, fp8, fp13    // fp13[ 11*22 - 21*12 ][ 20*12 - 10*22 ]
        ps_mul  fp10, fp3, fp4          // fp10[ 20*11 ][ N/A ]
        ps_msub fp12, fp5, fp6, fp12    // fp12[ 21*02 - 01*22 ][ 00*22 - 20*02 ]
        ps_mul  fp7, fp0, fp13          // fp7 [ 00*(11*22-21*12) ][ N/A ]
        ps_mul  fp9, fp0, fp5           // fp9 [ 00*21 ][ N/A ]
        ps_mul  fp8, fp1, fp2           // fp8 [ 10*01 ][ N/A ]
        ps_madd fp7, fp2, fp12, fp7     // fp7 [ + 10*(21*02-01*22) ][ N/A ]
        ps_sub  fp6, fp6, fp6           // fp6 [ 0.0F ][ 0.0F ]
        ps_msub fp10, fp2, fp5, fp10    // fp10[ 10*21 - 20*11 ][ N/A ]
        ps_madd fp7, fp4, fp11, fp7     // fp7 [ + 20*(01*12-11*02) ][ N/A ] : det
        ps_msub fp9, fp1, fp4, fp9      // fp9 [ 20*01 - 00*21 ][ N/A ]
        ps_msub fp8, fp0, fp3, fp8      // fp8 [ 00*11 - 10*01 ][ N/A ]
        // ( det == 0 ) ?
        ps_cmpo0 cr0, fp7, fp6
        bne     _ASM_MTXInverse_regular
        addi    r3, 0, 0                // return 0 : singular
        blr
_ASM_MTXInverse_regular:
        fres    fp0, fp7                // fp0 [ ~1/det ][ N/A ] : estimate
        // Newton's approximation: E' = (2 - K*E) * E refines E ~= 1/K
        ps_add  fp6, fp0, fp0
        ps_mul  fp5, fp7, fp0
        ps_nmsub fp0, fp0, fp5, fp6     // fp0 = refined 1/det
        lfs     fp1, 12(src)            // fp1 [ 03 ][ 03 ]
        ps_muls0 fp13, fp13, fp0        // i[0][0], i[1][0]
        lfs     fp2, 28(src)            // fp2 [ 13 ][ 13 ]
        ps_muls0 fp12, fp12, fp0        // i[0][1], i[1][1]
        lfs     fp3, 44(src)            // fp3 [ 23 ][ 23 ]
        ps_muls0 fp11, fp11, fp0        // i[0][2], i[1][2]
        ps_merge00 fp5, fp13, fp12      // fp5 [ i00 ][ i01 ]
        ps_merge11 fp4, fp13, fp12      // fp4 [ i10 ][ i11 ]
        ps_mul  fp6, fp13, fp1          // fp6 [ i00*03 ][ i10*03 ]
        psq_st  fp5, 0(inv), 0, 0       // store i00, i01
        psq_st  fp4, 16(inv), 0, 0      // store i10, i11
        ps_muls0 fp10, fp10, fp0        // i[2][0]
        ps_muls0 fp9, fp9, fp0          // i[2][1]
        ps_madd fp6, fp12, fp2, fp6     // fp6 [ i00*03+i01*13 ][ i10*03+i11*13 ]
        psq_st  fp10, 32(inv), 1, 0     // store i20
        ps_muls0 fp8, fp8, fp0          // i[2][2]
        ps_nmadd fp6, fp11, fp3, fp6    // fp6 [ -(i00*03+i01*13+i02*23) ][ -(i10*03+i11*13+i12*23) ] : i03, i13
        psq_st  fp9, 36(inv), 1, 0      // store i21
        ps_mul  fp7, fp10, fp1          // fp7 [ i20*03 ][ N/A ]
        ps_merge00 fp5, fp11, fp6       // fp5 [ i02 ][ i03 ]
        psq_st  fp8, 40(inv), 1, 0      // store i22
        ps_madd fp7, fp9, fp2, fp7      // fp7 [ i20*03+i21*13 ][ N/A ]
        ps_merge11 fp4, fp11, fp6       // fp4 [ i12 ][ i13 ]
        psq_st  fp5, 8(inv), 0, 0       // store i02, i03
        ps_nmadd fp7, fp8, fp3, fp7     // fp7 [ -(i20*03+i21*13+i22*23) ][ N/A ] : i23
        psq_st  fp4, 24(inv), 0, 0      // store i12, i13
        psq_st  fp7, 44(inv), 1, 0      // store i23
        addi    r3, 0, 1                // return 1 : regular
        blr
        .size ASM_MTXInverse,$-ASM_MTXInverse
#undef src
#undef inv

////////////////////////////////////////////////////////////////////////////////
// u32 ASM_MTXInvXpose(const Mtx src, Mtx invX)
//
// invX = transpose of the inverse of src's 3x3 part (for normal
// transformation); the translation column is zeroed.  Same adjugate/det
// computation as ASM_MTXInverse, but the adjugate rows are stored
// untransposed, which yields (src^-1)^T directly.
// Returns r3 = 0 if det == 0 (singular; the fourth column may already
// have been zeroed when regular), else r3 = 1.
// In:       r3 = src, r4 = invX
// Clobbers: r3, fp0-fp13, cr0
#define src  r3
#define invX r4

        .global ASM_MTXInvXpose
ASM_MTXInvXpose:
        .type ASM_MTXInvXpose, @function
        psq_l   fp0, 0( src ), 1, 0     // fp0 [ 00 ][ 1.0F ]
        psq_l   fp1, 4( src ), 0, 0     // fp1 [ 01 ][ 02 ]
        psq_l   fp2, 16( src ), 1, 0    // fp2 [ 10 ][ 1.0F ]
        ps_merge10 fp6, fp1, fp0        // fp6 [ 02 ][ 00 ]
        psq_l   fp3, 20( src ), 0, 0    // fp3 [ 11 ][ 12 ]
        psq_l   fp4, 32( src ), 1, 0    // fp4 [ 20 ][ 1.0F ]
        ps_merge10 fp7, fp3, fp2        // fp7 [ 12 ][ 10 ]
        psq_l   fp5, 36( src ), 0, 0    // fp5 [ 21 ][ 22 ]
        ps_mul  fp11, fp3, fp6          // fp11[ 11*02 ][ 00*12 ]
        ps_merge10 fp8, fp5, fp4        // fp8 [ 22 ][ 20 ]
        ps_mul  fp13, fp5, fp7          // fp13[ 21*12 ][ 10*22 ]
        ps_msub fp11, fp1, fp7, fp11    // fp11[ 01*12 - 11*02 ][ 10*02 - 00*12 ]
        ps_mul  fp12, fp1, fp8          // fp12[ 01*22 ][ 20*02 ]
        ps_msub fp13, fp3, fp8, fp13    // fp13[ 11*22 - 21*12 ][ 20*12 - 10*22 ]
        ps_mul  fp10, fp3, fp4          // fp10[ 20*11 ][ N/A ]
        ps_msub fp12, fp5, fp6, fp12    // fp12[ 21*02 - 01*22 ][ 00*22 - 20*02 ]
        ps_mul  fp7, fp0, fp13          // fp7 [ 00*(11*22-21*12) ][ N/A ]
        ps_mul  fp9, fp0, fp5           // fp9 [ 00*21 ][ N/A ]
        ps_mul  fp8, fp1, fp2           // fp8 [ 10*01 ][ N/A ]
        ps_madd fp7, fp2, fp12, fp7     // fp7 [ + 10*(21*02-01*22) ][ N/A ]
        ps_sub  fp6, fp6, fp6           // fp6 [ 0.0F ][ 0.0F ]
        ps_msub fp10, fp2, fp5, fp10    // fp10[ 10*21 - 20*11 ][ N/A ]
        ps_madd fp7, fp4, fp11, fp7     // fp7 [ + 20*(01*12-11*02) ][ N/A ] : det
        ps_msub fp9, fp1, fp4, fp9      // fp9 [ 20*01 - 00*21 ][ N/A ]
        ps_msub fp8, fp0, fp3, fp8      // fp8 [ 00*11 - 10*01 ][ N/A ]
        // ( det == 0 ) ?
        ps_cmpo0 cr0, fp7, fp6
        //bne _regular
        bne     _ASM_MTXInvXpose_regular
        addi    r3, 0, 0                // return 0 : singular
        blr
_ASM_MTXInvXpose_regular:
        fres    fp0, fp7                // fp0 [ ~1/det ][ N/A ] : estimate
        psq_st  fp6, 12(invX),1, 0      // 0 -> [0][3]
        // Newton's approximation: E' = (2 - K*E) * E refines E ~= 1/K
        ps_add  fp4, fp0, fp0
        ps_mul  fp5, fp7, fp0
        psq_st  fp6, 28(invX),1, 0      // 0 -> [1][3]
        ps_nmsub fp0, fp0, fp5, fp4     // fp0 = refined 1/det
        psq_st  fp6, 44(invX),1, 0      // 0 -> [2][3]
        ps_muls0 fp13, fp13, fp0        // ix[0][0], ix[0][1]
        ps_muls0 fp12, fp12, fp0        // ix[1][0], ix[1][1]
        psq_st  fp13, 0( invX ), 0, 0   // store ix00, ix01
        ps_muls0 fp11, fp11, fp0        // ix[2][0], ix[2][1]
        psq_st  fp12, 16( invX ), 0, 0  // store ix10, ix11
        ps_muls0 fp10, fp10, fp0        // ix[0][2]
        psq_st  fp11, 32( invX ), 0, 0  // store ix20, ix21
        ps_muls0 fp9, fp9, fp0          // ix[1][2]
        psq_st  fp10, 8( invX ), 1, 0   // store ix02
        ps_muls0 fp8, fp8, fp0          // ix[2][2]
        psq_st  fp9, 24( invX ), 1, 0   // store ix12
        psq_st  fp8, 40( invX ), 1, 0   // store ix22
        addi    r3, 0, 1                // return 1 : regular
        blr
        .size ASM_MTXInvXpose,$-ASM_MTXInvXpose
#undef src
#undef invX

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXReflect(Mtx m, const Vec *p, const Vec *n)
//
// Builds in m the reflection matrix across the plane through point p with
// normal n: R = I - 2nn^T, translation = 2(p.n)n.
// In:       r3 = m, r4 = p, r5 = n
// Clobbers: r6, fp0-fp13
#define m r3
#define p r4
#define n r5

        .global ASM_MTXReflect
ASM_MTXReflect:
        .type ASM_MTXReflect, @function
#define c_one   fp1
#define vn_xy   fp2
#define vn_z1   fp3
#define n2vn_xy fp4
#define n2vn_z1 fp5
#define pdotn   fp6
#define tmp0    fp7
#define tmp1    fp8
#define tmp2    fp9
#define tmp3    fp10
#define tmp4    fp11
#define tmp5    fp12
#define tmp6    fp13
#define tmp7    fp0

        // c_one = 1.0F
        lis     r6, CONST_1_0F@h
        ori     r6, r6, CONST_1_0F@l
        lfs     c_one, 0(r6)
        psq_l   vn_z1, 8(n), 1, 0       // vn_z1 = [nz][1.0F]
        psq_l   vn_xy, 0(n), 0, 0       // vn_xy = [nx][ny]
        psq_l   tmp0, 0(p), 0, 0        // tmp0 = [px][py]
        ps_nmadd n2vn_z1, vn_z1, c_one, vn_z1   // n2vn_z1 = [-2nz][-2.0F]
        psq_l   tmp1, 8(p), 1, 0        // tmp1 = [pz][1.0F]
        ps_nmadd n2vn_xy, vn_xy, c_one, vn_xy   // n2vn_xy = [-2nx][-2ny]
        ps_muls0 tmp4, vn_xy, n2vn_z1   // tmp4 = [-2nx*nz][-2ny*nz] : [m20][m21]
        ps_mul  pdotn, n2vn_xy, tmp0    // pdotn = [-2(px*nx)][-2(py*ny)]
        ps_muls0 tmp2, vn_xy, n2vn_xy   // tmp2 = [-2nx*nx][-2nx*ny]
        ps_sum0 pdotn, pdotn, pdotn, pdotn      // pdotn = [-2(px*nx+py*ny)][?]
        ps_muls1 tmp3, vn_xy, n2vn_xy   // tmp3 = [-2nx*ny][-2ny*ny]
        psq_st  tmp4, 32(m), 0, 0       // store m20, m21
        ps_sum0 tmp2, tmp2, tmp2, c_one // tmp2 = [1-2nx*nx][-2nx*ny] : [m00][m01]
        ps_nmadd pdotn, n2vn_z1, tmp1, pdotn    // pdotn = [2(px*nx+py*ny+pz*nz)][?]
        ps_sum1 tmp3, c_one, tmp3, tmp3 // tmp3 = [-2nx*ny][1-2ny*ny] : [m10][m11]
        psq_st  tmp2, 0(m), 0, 0        // store m00, m01
        ps_muls0 tmp5, vn_xy, pdotn     // tmp5 = [pdotn*nx][pdotn*ny]
        ps_merge00 tmp6, n2vn_z1, pdotn // tmp6 = [-2nz][pdotn]
        psq_st  tmp3, 16(m), 0, 0       // store m10, m11
        ps_merge00 tmp7, tmp4, tmp5     // tmp7 = [-2nx*nz][pdotn*nx] : [m02][m03]
        ps_muls0 tmp6, tmp6, vn_z1      // tmp6 = [-2nz*nz][pdotn*nz]
        ps_merge11 tmp5, tmp4, tmp5     // tmp5 = [-2ny*nz][pdotn*ny] : [m12][m13]
        psq_st  tmp7, 8(m), 0, 0        // store m02, m03
        ps_sum0 tmp6, tmp6, tmp6, c_one // tmp6 = [1-2nz*nz][pdotn*nz] : [m22][m23]
        psq_st  tmp5, 24(m), 0, 0       // store m12, m13
        psq_st  tmp6, 40(m), 0, 0       // store m22, m23
        blr
        .size ASM_MTXReflect,$-ASM_MTXReflect
#undef m
#undef p
#undef n
#undef c_one
#undef vn_xy
#undef vn_z1
#undef n2vn_xy
#undef n2vn_z1
#undef pdotn
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef tmp5
#undef tmp6
#undef tmp7

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXScaleApply(const Mtx src, Mtx dst, f32 xS, f32 yS, f32 zS)
//
// Scales each full row of src (including its translation element) by the
// per-axis factor: dst row 0 *= xS, row 1 *= yS, row 2 *= zS.
// In:       r3 = src, r4 = dst, fp1 = xS, fp2 = yS, fp3 = zS
// Clobbers: fp1-fp8 (fp2 is reused for the last row after yS is consumed)
#define src r3
#define dst r4
#define xS  fp1
#define yS  fp2
#define zS  fp3

        .global ASM_MTXScaleApply
ASM_MTXScaleApply:
        .type ASM_MTXScaleApply, @function
        frsp    xS, xS                  // to make sure xS = single precision
        psq_l   fp4, 0(src), 0, 0       // [m00][m01]
        frsp    yS, yS                  // to make sure yS = single precision
        psq_l   fp5, 8(src), 0, 0       // [m02][m03]
        frsp    zS, zS                  // to make sure zS = single precision
        ps_muls0 fp4, fp4, xS
        psq_l   fp6, 16(src), 0, 0      // [m10][m11]
        ps_muls0 fp5, fp5, xS
        psq_l   fp7, 24(src), 0, 0      // [m12][m13]
        ps_muls0 fp6, fp6, yS
        psq_l   fp8, 32(src), 0, 0      // [m20][m21]
        psq_st  fp4, 0(dst), 0, 0
        ps_muls0 fp7, fp7, yS
        psq_l   fp2, 40(src), 0, 0      // [m22][m23] (yS no longer needed)
        psq_st  fp5, 8(dst), 0, 0
        ps_muls0 fp8, fp8, zS
        psq_st  fp6, 16(dst), 0, 0
        ps_muls0 fp2, fp2, zS
        psq_st  fp7, 24(dst), 0, 0
        psq_st  fp8, 32(dst), 0, 0
        psq_st  fp2, 40(dst), 0, 0
        blr
        .size ASM_MTXScaleApply,$-ASM_MTXScaleApply
#undef src
#undef dst
#undef xS
#undef yS
#undef zS

////////////////////////////////////////////////////////////////////////////////
// void _ASM_MTXRotAxisRadInternal(Mtx m, const Vec *axis, f32 sT, f32 cT)
//
// Builds in m the rotation matrix about 'axis' given the precomputed
// sine (sT) and cosine (cT) of the angle; the translation column is zeroed.
// The axis is normalized internally with frsqrte plus one Newton-Raphson
// step, so axis need not be unit length (approximate reciprocal sqrt).
// In:       r3 = m, r4 = axis, fp1 = sT, fp2 = cT
// Clobbers: r5, r6, fp1-fp13 (fp14 saved/restored on a 24-byte frame)
#define m    r3
#define axis r4
#define sT   fp1
#define cT   fp2

        .global _ASM_MTXRotAxisRadInternal
_ASM_MTXRotAxisRadInternal:
        .type _ASM_MTXRotAxisRadInternal, @function
#define tT   fp3
#define fc0  fp4
#define tmp0 fp5
#define tmp1 fp6
#define tmp2 fp7
#define tmp3 fp8
#define tmp4 fp9
#define tmp5 fp10
#define tmp6 fp11
#define tmp7 fp12
#define tmp8 fp13
#define tmp9 fp14

        mflr    r0
        stwu    r1, -24(r1)
        stw     r0, 28(r1)
        psq_st  fp14, 8(r1), 0, 0       // save callee-saved fp14 (paired + 64-bit image)
        stfd    fp14, 16(r1)
        // tmp8 = 3.0F
        lis     r6, CONST_3_0F@h
        ori     r6, r6, CONST_3_0F@l
        lfs     tmp8, 0(r6)
        // tmp9 = 0.5F
        lis     r5, CONST_0_5F@h
        ori     r5, r5, CONST_0_5F@l
        lfs     tmp9, 0(r5)
        frsp    cT, cT                  // to make sure cT = single precision
        psq_l   tmp0, 0(axis), 0, 0     // tmp0 = [x][y]
        frsp    sT, sT                  // to make sure sT = single precision
        lfs     tmp1, 8(axis)           // tmp1 = [z][z]
        ps_mul  tmp2, tmp0, tmp0        // tmp2 = [x*x][y*y]
        fadds   tmp7, tmp9, tmp9        // tmp7 = 1.0F
        ps_madd tmp3, tmp1, tmp1, tmp2  // tmp3 = [x*x+z*z][y*y+z*z]
        fsubs   fc0, tmp9, tmp9         // fc0 = 0.0F
        ps_sum0 tmp4, tmp3, tmp1, tmp2  // tmp4 = [S = x*x+y*y+z*z][z]
        fsubs   tT, tmp7, cT            // tT = 1.0F - cT
        frsqrte tmp5, tmp4              // tmp5 = ~1/sqrt(S) : estimate E
        // Newton-Raphson refinement: E' = E/2 * (3.0 - E*E*S)
        fmuls   tmp2, tmp5, tmp5        // E*E
        fmuls   tmp3, tmp5, tmp9        // E/2
        fnmsubs tmp2, tmp2, tmp4, tmp8  // (3 - E*E*S)
        fmuls   tmp5, tmp2, tmp3        // (E/2)*(3 - E*E*S)
        ps_merge00 cT, cT, cT           // cT = [c][c]
        ps_muls0 tmp0, tmp0, tmp5       // tmp0 = [nx = x/sqrt(S)][ny = y/sqrt(S)]
        ps_muls0 tmp1, tmp1, tmp5       // tmp1 = [nz = z/sqrt(S)][nz]
        ps_muls0 tmp4, tmp0, tT         // tmp4 = [t*nx][t*ny]
        ps_muls0 tmp9, tmp0, sT         // tmp9 = [s*nx][s*ny]
        ps_muls0 tmp5, tmp1, tT         // tmp5 = [t*nz][t*nz]
        ps_muls1 tmp3, tmp4, tmp0       // tmp3 = [t*nx*ny][t*ny*ny]
        ps_muls0 tmp2, tmp4, tmp0       // tmp2 = [t*nx*nx][t*ny*nx]
        // tmp4 = [t*nx*nz][t*ny*nz]
        ps_muls0 tmp4, tmp4, tmp1       // tmp4 = [t*nx*nz][t*ny*nz]
        fnmsubs tmp6, tmp1, sT, tmp3    // tmp6 = [t*nx*ny - s*nz][...]
        fmadds  tmp7, tmp1, sT, tmp3    // tmp7 = [t*nx*ny + s*nz][...]
        ps_neg  tmp0, tmp9              // tmp0 = [-s*nx][-s*ny]
        ps_sum0 tmp8, tmp4, fc0, tmp9   // tmp8 = [t*nx*nz + s*ny][0] : [m02][m03]
        ps_sum0 tmp2, tmp2, tmp6, cT    // tmp2 = [t*nx*nx + c][t*nx*ny - s*nz] : [m00][m01]
        ps_sum1 tmp3, cT, tmp7, tmp3    // tmp3 = [t*nx*ny + s*nz][t*ny*ny + c] : [m10][m11]
        ps_sum0 tmp6, tmp0, fc0 ,tmp4   // tmp6 = [t*ny*nz - s*nx][0] : [m12][m13]
        psq_st  tmp8, 8(m), 0, 0        // store m02, m03
        ps_sum0 tmp0, tmp4, tmp4, tmp0  // tmp0 = [t*nx*nz - s*ny][t*ny*nz]
        psq_st  tmp2, 0(m), 0, 0        // store m00, m01
        ps_muls0 tmp5, tmp5, tmp1       // tmp5 = [t*nz*nz][t*nz*nz]
        psq_st  tmp3, 16(m), 0, 0       // store m10, m11
        ps_sum1 tmp4, tmp9, tmp0, tmp4  // tmp4 = [t*nx*nz - s*ny][t*ny*nz + s*nx] : [m20][m21]
        psq_st  tmp6, 24(m), 0, 0       // store m12, m13
        ps_sum0 tmp5, tmp5, fc0, cT     // tmp5 = [t*nz*nz + c][0] : [m22][m23]
        psq_st  tmp4, 32(m), 0, 0       // store m20, m21
        psq_st  tmp5, 40(m), 0, 0       // store m22, m23
        psq_l   fp14, 8(r1), 0, 0       // restore fp14 and tear down the frame
        lfd     fp14, 16(r1)
        lwz     r0, 28(r1)
        mtlr    r0
        addi    r1, r1, 24
        blr
        .size _ASM_MTXRotAxisRadInternal,$-_ASM_MTXRotAxisRadInternal
#undef m
#undef axis
#undef sT
#undef cT
#undef tT
#undef fc0
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef tmp5
#undef tmp6
#undef tmp7
#undef tmp8
#undef tmp9

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXTrans(Mtx m, f32 xT, f32 yT, f32 zT)
//
// Sets m to a pure translation matrix: identity 3x3 part, last column
// = (xT, yT, zT).
// In:       r3 = m, fp1 = xT, fp2 = yT, fp3 = zT
// Clobbers: r4, r5, fp4, fp5
#define m      r3
#define xT     fp1
#define yT     fp2
#define zT     fp3
#define c_zero fp4
#define c_one  fp5

        .global ASM_MTXTrans
ASM_MTXTrans:
        .type ASM_MTXTrans, @function
        // c_zero = 0.0F
        lis     r4, CONST_0_0F@h
        ori     r4, r4, CONST_0_0F@l
        lfs     c_zero, 0(r4)
        // c_one = 1.0F
        lis     r5, CONST_1_0F@h
        ori     r5, r5, CONST_1_0F@l
        lfs     c_one, 0(r5)
        stfs    xT, 12(m)               // m[0][3] = xT
        stfs    yT, 28(m)               // m[1][3] = yT
        psq_st  c_zero, 4(m), 0, 0      // m[0][1], m[0][2] = 0, 0
        psq_st  c_zero, 32(m), 0, 0     // m[2][0], m[2][1] = 0, 0
        stfs    c_zero, 16(m)           // m[1][0] = 0
        stfs    c_one, 20(m)            // m[1][1] = 1
        stfs    c_zero, 24(m)           // m[1][2] = 0
        stfs    c_one, 40(m)            // m[2][2] = 1
        stfs    zT, 44(m)               // m[2][3] = zT
        stfs    c_one, 0(m)             // m[0][0] = 1
        blr
        .size ASM_MTXTrans,$-ASM_MTXTrans
#undef m
#undef xT
#undef yT
#undef zT
#undef c_zero
#undef c_one

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXTransApply(const Mtx src, Mtx dst, f32 xT, f32 yT, f32 zT)
//
// dst = src with (xT, yT, zT) added to its translation column (last
// element of each row); the rest of the matrix is copied unchanged.
// ps_sum1 adds the scalar only into the ps1 slot, i.e. only m[r][3].
// In:       r3 = src, r4 = dst, fp1 = xT, fp2 = yT, fp3 = zT
// Clobbers: fp1-fp9
#define src r3
#define dst r4
#define xT  fp1
#define yT  fp2
#define zT  fp3

        .global ASM_MTXTransApply
ASM_MTXTransApply:
        .type ASM_MTXTransApply, @function
        psq_l   fp4, 0(src), 0, 0       // [m00][m01]
        frsp    xT, xT;                 // to make sure xT = single precision
        psq_l   fp5, 8(src), 0, 0       // [m02][m03]
        frsp    yT, yT;                 // to make sure yT = single precision
        psq_l   fp7, 24(src), 0, 0      // [m12][m13]
        frsp    zT, zT;                 // to make sure zT = single precision
        psq_l   fp8, 40(src), 0, 0      // [m22][m23]
        psq_st  fp4, 0(dst), 0, 0
        ps_sum1 fp5, xT, fp5, fp5       // [m02][m03 + xT]
        psq_l   fp6, 16(src), 0, 0      // [m10][m11]
        psq_st  fp5, 8(dst), 0, 0
        ps_sum1 fp7, yT, fp7, fp7       // [m12][m13 + yT]
        psq_l   fp9, 32(src), 0, 0      // [m20][m21]
        psq_st  fp6, 16(dst), 0, 0
        ps_sum1 fp8, zT, fp8, fp8       // [m22][m23 + zT]
        psq_st  fp7, 24(dst), 0, 0
        psq_st  fp9, 32(dst), 0, 0
        psq_st  fp8, 40(dst), 0, 0
        blr
        .size ASM_MTXTransApply,$-ASM_MTXTransApply
#undef src
#undef dst
#undef xT
#undef yT
#undef zT

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXScale(Mtx m, f32 xS, f32 yS, f32 zS)
//
// Sets m to a pure scale matrix: diagonal (xS, yS, zS), zeros elsewhere.
// In:       r3 = m, fp1 = xS, fp2 = yS, fp3 = zS
// Clobbers: r4, fp4
#define m      r3
#define xS     fp1
#define yS     fp2
#define zS     fp3
#define c_zero fp4

        .global ASM_MTXScale
ASM_MTXScale:
        .type ASM_MTXScale, @function
        // c_zero = 0.0F
        lis     r4, CONST_0_0F@h
        ori     r4, r4, CONST_0_0F@l
        lfs     c_zero, 0(r4)
        stfs    xS, 0(m)                // m[0][0] = xS
        psq_st  c_zero, 4(m), 0, 0      // m[0][1], m[0][2] = 0, 0
        psq_st  c_zero, 12(m), 0, 0     // m[0][3], m[1][0] = 0, 0
        stfs    yS, 20(m)               // m[1][1] = yS
        psq_st  c_zero, 24(m), 0, 0     // m[1][2], m[1][3] = 0, 0
        psq_st  c_zero, 32(m), 0, 0     // m[2][0], m[2][1] = 0, 0
        stfs    zS, 40(m)               // m[2][2] = zS
        stfs    c_zero, 44(m)           // m[2][3] = 0
        blr
        .size ASM_MTXScale,$-ASM_MTXScale
#undef m
#undef xS
#undef yS
#undef zS

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXRotTrig(Mtx m, char axis, f32 sinA, f32 cosA)
//
// Sets m to a rotation about the named axis ('x'/'y'/'z', case-insensitive
// via the 0x20 OR) given precomputed sinA/cosA; translation column zeroed.
// An unrecognized axis leaves m untouched.
// In:       r3 = m, r4 = axis, fp1 = sinA, fp2 = cosA
// Clobbers: r4, r5, r6, fp1-fp9, cr0
#define m     r3
#define axis  r4
#define sinA  fp1
#define cosA  fp2
#define fc0   fp3
#define fc1   fp4
#define nsinA fp5
#define fw0   fp6
#define fw1   fp7
#define fw2   fp8
#define fw3   fp9

        .global ASM_MTXRotTrig
ASM_MTXRotTrig:
        .type ASM_MTXRotTrig, @function
        frsp    sinA, sinA              // to make sure sinA = single precision
        frsp    cosA, cosA              // to make sure cosA = single precision
        // fc0 = 0.0F
        lis     r5, CONST_0_0F@h
        ori     r5, r5, CONST_0_0F@l
        lfs     fc0, 0(r5)
        // fc1 = 1.0F
        lis     r6, CONST_1_0F@h
        ori     r6, r6, CONST_1_0F@l
        lfs     fc1, 0(r6)
        ori     axis, axis, 0x20        // force lower case
        ps_neg  nsinA, sinA             // nsinA = -sinA
        // dispatch on axis
        cmplwi  axis, 'x'
        beq     _case_x
        cmplwi  axis, 'y'
        beq     _case_y
        cmplwi  axis, 'z'
        beq     _case_z
        b       _end
_case_x:
        psq_st  fc1, 0(m), 1, 0         // m[0][0] = 1
        psq_st  fc0, 4(m), 0, 0         // m[0][1], m[0][2] = 0, 0
        ps_merge00 fw0, sinA, cosA      // fw0 = [sin][cos]
        psq_st  fc0, 12(m), 0, 0        // m[0][3], m[1][0] = 0, 0
        ps_merge00 fw1, cosA, nsinA     // fw1 = [cos][-sin]
        psq_st  fc0, 28(m), 0, 0        // m[1][3], m[2][0] = 0, 0
        psq_st  fc0, 44(m), 1, 0        // m[2][3] = 0
        psq_st  fw0, 36(m), 0, 0        // m[2][1], m[2][2] = sin, cos
        psq_st  fw1, 20(m), 0, 0        // m[1][1], m[1][2] = cos, -sin
        b       _end;
_case_y:
        ps_merge00 fw0, cosA, fc0       // fw0 = [cos][0]
        ps_merge00 fw1, fc0, fc1        // fw1 = [0][1]
        psq_st  fc0, 24(m), 0, 0        // m[1][2], m[1][3] = 0, 0
        psq_st  fw0, 0(m), 0, 0         // m[0][0], m[0][1] = cos, 0
        ps_merge00 fw2, nsinA, fc0      // fw2 = [-sin][0]
        ps_merge00 fw3, sinA, fc0       // fw3 = [sin][0]
        psq_st  fw0, 40(m), 0, 0;       // m[2][2], m[2][3] = cos, 0
        psq_st  fw1, 16(m), 0, 0;       // m[1][0], m[1][1] = 0, 1
        psq_st  fw3, 8(m), 0, 0;        // m[0][2], m[0][3] = sin, 0
        psq_st  fw2, 32(m), 0, 0;       // m[2][0], m[2][1] = -sin, 0
        b       _end;
_case_z:
        psq_st  fc0, 8(m), 0, 0         // m[0][2], m[0][3] = 0, 0
        ps_merge00 fw0, sinA, cosA      // fw0 = [sin][cos]
        ps_merge00 fw2, cosA, nsinA     // fw2 = [cos][-sin]
        psq_st  fc0, 24(m), 0, 0        // m[1][2], m[1][3] = 0, 0
        psq_st  fc0, 32(m), 0, 0        // m[2][0], m[2][1] = 0, 0
        ps_merge00 fw1, fc1, fc0        // fw1 = [1][0]
        psq_st  fw0, 16(m), 0, 0        // m[1][0], m[1][1] = sin, cos
        psq_st  fw2, 0(m), 0, 0         // m[0][0], m[0][1] = cos, -sin
        psq_st  fw1, 40(m), 0, 0        // m[2][2], m[2][3] = 1, 0
_end:
        blr
        .size ASM_MTXRotTrig,$-ASM_MTXRotTrig
#undef m
#undef axis
#undef sinA
#undef cosA
#undef fc0
#undef fc1
#undef nsinA
#undef fw0
#undef fw1
#undef fw2
#undef fw3

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXReorder(const Mtx src, ROMtx dest)
//
// Repacks row-major src into the reordered ROMtx layout via paired-single
// merges.  Register names record the pair each holds: e.g. D22_D03 holds
// src[2][2] and src[0][3].  Output pair order (by dest offset):
// 0: s00,s10  8: s20,s01  16: s11,s21  24: s02,s12  32: s22,s03  40: s13,s23
// In:       r3 = src, r4 = dest
// Clobbers: fp1-fp12
#define src  r3
#define dest r4

        .global ASM_MTXReorder
#define S00_S01 fp1
#define S02_S03 fp2
#define S10_S11 fp3
#define S12_S13 fp4
#define S20_S21 fp5
#define S22_S23 fp6
#define D00_D10 fp7
#define D11_D21 fp8
#define D02_D12 fp9
#define D22_D03 fp10
#define D13_D23 fp11
#define D20_D01 fp12

ASM_MTXReorder:
        .type ASM_MTXReorder, @function
        psq_l   S00_S01, 0(src), 0, 0
        psq_l   S10_S11, 16(src), 0, 0
        psq_l   S20_S21, 32(src), 0, 0
        psq_l   S02_S03, 8(src), 0, 0
        ps_merge00 D00_D10, S00_S01, S10_S11    // [s00][s10]
        psq_l   S12_S13, 24(src), 0, 0
        ps_merge01 D20_D01, S20_S21, S00_S01    // [s20][s01]
        psq_l   S22_S23, 40(src), 0, 0
        ps_merge11 D11_D21, S10_S11, S20_S21    // [s11][s21]
        psq_st  D00_D10, 0(dest), 0, 0
        ps_merge00 D02_D12, S02_S03, S12_S13    // [s02][s12]
        psq_st  D20_D01, 8(dest), 0, 0
        ps_merge01 D22_D03, S22_S23, S02_S03    // [s22][s03]
        psq_st  D11_D21, 16(dest),0, 0
        ps_merge11 D13_D23, S12_S13, S22_S23    // [s13][s23]
        psq_st  D02_D12, 24(dest),0, 0
        psq_st  D22_D03, 32(dest),0,0
        psq_st  D13_D23, 40(dest),0,0
        blr
        .size ASM_MTXReorder,$-ASM_MTXReorder
#undef src
#undef dest
#undef S00_S01
#undef S02_S03
#undef S10_S11
#undef S12_S13
#undef S20_S21
#undef S22_S23
#undef D00_D10
#undef D11_D21
#undef D02_D12
#undef D22_D03
#undef D13_D23
#undef D20_D01