/*---------------------------------------------------------------------------* Project: matrix vector Library File: psmtx.c Copyright 1998, 1999, 2000 Nintendo. All rights reserved. These coded instructions, statements, and computer programs contain proprietary information of Nintendo of America Inc. and/or Nintendo Company Ltd., and are protected by Federal copyright law. They may not be disclosed to third parties or copied or duplicated in any form, in whole or in part, without the prior written consent of Nintendo. $Log: psmtx.c,v $ Revision 1.2 2006/02/20 04:25:42 mitu Changed include path from dolphin/ to revolution/. Revision 1.1.1.1 2005/05/12 02:15:49 yasuh-to Ported from dolphin source tree. NoKeywords: $ 6 2003/08/21 5:17 Dante Changed GQR1 to GQR6 in PSMTXMultS16VecArray 5 2002/04/11 13:11 Hirose const type specifier support. (by Hiratsu@IRD) 4 2001/02/26 11:56p Hirose Avoided use of GQR1 which is reserved by the compiler. 3 2001/02/22 11:49p Hirose Some functions are moved to another file according to arrangement updates. 2 2000/07/12 4:41p John Substitutes MTXConcat and MTXMultVecArray with their paired-singles Equivalent for Gekko non-debug builds. 1 2000/05/10 1:48p Hirose Moved paired-single matrix stuff into an another source file $NoKeywords: $ *---------------------------------------------------------------------------*/ #include #include #include "mtxAssert.h" /*---------------------------------------------------------------------* Special purpose Paired-single optimized code All paired-single code assumes GQR0 = 0. *---------------------------------------------------------------------*/ #ifdef GEKKO /*---------------------------------------------------------------------* Name: PSMTXReorder Description: Creates a reordered (column-major) matrix from a row-major matrix, using paired single operations. Reordered matrices are required for the PSMTXRO* functions, which operate faster than their non-reordered counterparts. Performance: ~15 cycles. Arguments: src: source matrix. dest: destination matrix, note type is ROMtx. Return: none *---------------------------------------------------------------------*/ asm void PSMTXReorder(const register Mtx src, register ROMtx dest) { nofralloc #define S00_S01 fp0 #define S02_S03 fp1 #define S10_S11 fp2 #define S12_S13 fp3 #define S20_S21 fp4 #define S22_S23 fp5 #define D00_D10 fp6 #define D11_D21 fp7 #define D02_D12 fp8 #define D22_D03 fp9 #define D13_D23 fp10 #define D20_D01 fp12 psq_l S00_S01, 0(src), 0, 0 psq_l S10_S11, 16(src), 0, 0 psq_l S20_S21, 32(src), 0, 0 psq_l S02_S03, 8(src), 0, 0 ps_merge00 D00_D10, S00_S01, S10_S11 psq_l S12_S13, 24(src), 0, 0 ps_merge01 D20_D01, S20_S21, S00_S01 psq_l S22_S23, 40(src), 0, 0 ps_merge11 D11_D21, S10_S11, S20_S21 psq_st D00_D10, 0(dest), 0, 0 ps_merge00 D02_D12, S02_S03, S12_S13 psq_st D20_D01, 8(dest), 0, 0 ps_merge01 D22_D03, S22_S23, S02_S03 psq_st D11_D21, 16(dest),0, 0 ps_merge11 D13_D23, S12_S13, S22_S23 psq_st D02_D12, 24(dest),0, 0 psq_st D22_D03, 32(dest),0,0 psq_st D13_D23, 40(dest),0,0 blr #undef S00_S01 #undef S02_S03 #undef S10_S11 #undef S12_S13 #undef S20_S21 #undef S22_S23 #undef D00_D10 #undef D11_D21 #undef D02_D12 #undef D22_D03 #undef D13_D23 #undef D20_D01 } /*---------------------------------------------------------------------* Name: PSMTXROMultVecArray Description: Multiplies an array of vectors by a reordered matrix, using paired single operations. This function is significantly faster than PSMTXMultVecArray, but requires that you have reordered the matrix in advance with PSMTXReorder. OK if source = destination. NOTE: number of vertices transformed cannot be less than 2. Note that NO error checking is performed. Performance : 9.586 - 9.814 cycles per vertex, where count = 70 Arguments: m: reordered matrix. srcBase: start of source vector array. dstBase: start of resultant vector array. count: number of vectors in srcBase, dstBase arrays COUNT MUST BE GREATER THAN 2. Return: none *---------------------------------------------------------------------*/ asm void PSMTXROMultVecArray ( const register ROMtx m, // r3 const register Vec *srcBase,// r4 register Vec *dstBase,// r5 register u32 count // r6 ) { nofralloc #define M00_M10 fp0 #define M20_nnn fp1 #define M01_M11 fp2 #define M21_nnn fp3 #define M02_M12 fp4 #define M22_nnn fp5 #define M03_M13 fp6 #define M23_nnn fp7 // source vectors - 2 3D vectors in 3 PS registers #define SX0_SY0 fp8 #define SZ0_SX1 fp9 #define SY1_SZ1 fp10 // Destination registers - 2 3d vectors in 4 PS registers #define DX0_DY0 fp11 #define DZ0_nnn fp12 #define DX1_DY1 fp13 #define DZ1_nnn fp14 // temp registers for writing back values. These registers store the final // results from the PREVIOUS loop #define WX0_WY0 fp15 #define WZ0_nnn fp16 #define WX1_WY1 fp17 #define WZ1_nnn fp18 stwu r1, -64(r1) stfd fp14, 8(r1) // unrolled once, but since we're dividing by 2, add 1 to ensure if // odd # of vertices, the last one gets x-formed. addi r7, count, -1 stfd fp15, 16(r1) srwi r7, r7, 1 // 2 at a time stfd fp16, 24(r1) stfd fp17, 32(r1) stfd fp18, 40(r1) mtctr r7 // load matrix psq_l M00_M10, 0(m),0,0 addi srcBase, srcBase, -8 psq_l M20_nnn, 8(m),1,0 addi dstBase, dstBase, -4 psq_l M03_M13, 36(m),0,0 psq_lu SX0_SY0, 8(srcBase), 0, 0 psq_l M23_nnn, 44(m),1,0 psq_lu SZ0_SX1, 8(srcBase), 0, 0 // ------------------------------UNROLLED // DX0=M00*SX0+M03, DY0=M10*SX0+M13 // DZ0=M20*SX0+M23 // DX1=M00*SX1+M03, DY1=M10*SX1+M13 // DZ1=M20*SX1+M23 ps_madds0 DX0_DY0, M00_M10, SX0_SY0, M03_M13 psq_l M01_M11, 12(m),0,0 ps_madds0 DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn psq_l M21_nnn, 20(m),1,0 ps_madds1 DX1_DY1, M00_M10, SZ0_SX1, M03_M13 psq_lu SY1_SZ1,8(srcBase), 0, 0 ps_madds1 DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn psq_l M22_nnn, 32(m),1,0 // DX0=M01*SY0+DX0, DY0=M11*SY0+DY0 // DZ0=M21*SY0+DZ0 // DX1=M01*SY1+DX1, DY1=M11*SY1+DY1 // DZ1=M21*SY1+DZ1 ps_madds1 DX0_DY0, M01_M11, SX0_SY0, DX0_DY0 ps_madds1 DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn psq_l M02_M12, 24(m),0,0 ps_madds0 DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1 psq_lu SX0_SY0, 8(srcBase), 0, 0 ps_madds0 DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn // DX0=M02*SZ0+DX0, DY0=M12*SZ0+DY0 // DZ0=M22*SZ0+DZ0 // DX1=M02*SZ1+DX1, DY1=M12*SZ1+DY1 // DZ1=M22*SZ1+DZ1 // Write final values to temp registers ps_madds0 WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0 ps_madds0 WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn psq_lu SZ0_SX1, 8(srcBase), 0, 0 ps_madds1 WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1 ps_madds1 WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn psq_lu SY1_SZ1,8(srcBase), 0, 0 // -------------------------- LOOP START _mloop: ps_madds0 DX0_DY0, M00_M10, SX0_SY0, M03_M13 psq_stu WX0_WY0, 4(dstBase), 0, 0 ps_madds0 DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn psq_stu WZ0_nnn, 8(dstBase), 1, 0 ps_madds1 DX1_DY1, M00_M10, SZ0_SX1, M03_M13 psq_stu WX1_WY1, 4(dstBase), 0, 0 ps_madds1 DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn psq_stu WZ1_nnn, 8(dstBase), 1, 0 ps_madds1 DX0_DY0, M01_M11, SX0_SY0, DX0_DY0 ps_madds1 DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn // YYY LAST SX0_SY0 USAGE FOR THIS ITERATION psq_lu SX0_SY0, 8(srcBase), 0, 0 // NEXT SX0 SY0 ps_madds0 DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1 ps_madds0 DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn // Write final values to temp registers ps_madds0 WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0 ps_madds0 WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn // YYY LAST SZ0_SX1 USAGE FOR THIS ITERATION psq_lu SZ0_SX1, 8(srcBase), 0, 0 // NEXT SZ0 SX1 ps_madds1 WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1 ps_madds1 WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn // YYY LAST SY1_SZ1 USAGE FOR THIS ITERATION psq_lu SY1_SZ1,8(srcBase), 0, 0 // NEXT SY1 SZ1 bdnz+ _mloop // -------------------------- LOOP END psq_stu WX0_WY0, 4(dstBase), 0, 0 rlwinm. r7, count, 0, 31, 31 // check odd psq_stu WZ0_nnn, 8(dstBase), 1, 0 bne _return // Skipped if odd number of vectors psq_stu WX1_WY1, 4(dstBase), 0, 0 // can't put anything here psq_stu WZ1_nnn, 8(dstBase), 1, 0 _return: lfd fp14, 8(r1) lfd fp15, 16(r1) lfd fp16, 24(r1) lfd fp17, 32(r1) lfd fp18, 40(r1) addi r1, r1, 64 blr #undef M00_M10 #undef M20_nnn #undef M01_M11 #undef M21_nnn #undef M02_M12 #undef M22_nnn #undef M03_M13 #undef M23_nnn #undef SX0_SY0 #undef SZ0_SX1 #undef SY1_SZ1 #undef DX0_DY0 #undef DZ0_nnn #undef DX1_DY1 #undef DZ1_nnn #undef WX0_WY0 #undef WZ0_nnn #undef WX1_WY1 #undef WZ1_nnn } /*---------------------------------------------------------------------* Name: PSMTXROSkin2VecArray Description: Multiplies an array of vectors by a reordered matrix, using paired single operations. This function is significantly faster than PSMTXMultVecArray, but requires that you have reordered the matrix in advance with PSMTXReorder. OK if source = destination. NOTE: number of vertices transformed cannot be less than 2. Note that NO error checking is performed. Performance : ~20.8 cycles per vertex, where count = 70 Arguments: m0: first reordered matrix m1: second reordered matrix wtBase: start of per vertex weight array srcBase: start of source vector array. dstBase: start of resultant vector array. count: number of vectors in srcBase, dstBase arrays COUNT MUST BE GREATER THAN 2. Return: none *---------------------------------------------------------------------*/ asm void PSMTXROSkin2VecArray ( const register ROMtx m0, // r3 const register ROMtx m1, // r4 const register f32 *wtBase, // r5 const register Vec *srcBase,// r6 register Vec *dstBase,// r7 register u32 count // r8 ) { nofralloc // transposed matrix #define M00_10 fp0 #define M20 fp1 #define M01_11 fp2 #define M21 fp3 #define M02_12 fp4 #define M22 fp5 #define M03_13 fp6 #define M23 fp7 // source vector - 1 3D vectors in 2 PS registers #define Sx_y fp8 #define Sz fp9 // Destination vector - 1 3d vector in 2 PS registers #define Dx_y fp10 #define Dz fp11 // intermediate vector 1 3D vector in 2 PS registers #define Ix_y fp12 #define Iz fp13 #define M0_00_10 fp14 #define M0_20 fp15 #define M0_01_11 fp16 #define M0_21 fp17 #define M0_02_12 fp18 #define M0_22 fp19 #define M0_03_13 fp20 #define M0_23 fp21 #define M1_00_10 fp22 #define M1_20 fp23 #define M1_01_11 fp24 #define M1_21 fp25 #define M1_02_12 fp26 #define M1_22 fp27 #define M1_03_13 fp28 #define M1_23 fp29 #define Wt fp30 // save FP regs stwu r1, -160(r1) stfd fp14, 8(r1) stfd fp15, 16(r1) stfd fp16, 24(r1) stfd fp17, 32(r1) stfd fp18, 40(r1) stfd fp19, 48(r1) stfd fp20, 56(r1) stfd fp21, 64(r1) stfd fp22, 72(r1) stfd fp23, 80(r1) stfd fp24, 88(r1) stfd fp25, 96(r1) stfd fp26, 104(r1) stfd fp27, 112(r1) stfd fp28, 120(r1) stfd fp29, 128(r1) stfd fp30, 136(r1) // always perform at least one iteration of loop addi r9, count, -1 mtctr r9 addi srcBase, srcBase, -4 addi dstBase, dstBase, -4 addi wtBase, wtBase, -4 // load matrices m0 and (m1-m0) psq_l M0_00_10,0(m0),0,0 psq_l M1_00_10,0(m1),0,0 psq_l M0_20, 8(m0),1,0 psq_l M1_20, 8(m1),1,0 psq_l M0_01_11, 12(m0),0,0 psq_l M1_01_11, 12(m1),0,0 ps_sub M1_00_10,M1_00_10,M0_00_10 psq_l M0_21, 20(m0),1,0 psq_l M1_21, 20(m1),1,0 ps_sub M1_20,M1_20,M0_20 psq_l M0_02_12, 24(m0),0,0 psq_l M1_02_12, 24(m1),0,0 ps_sub M1_01_11,M1_01_11,M0_01_11 psq_l M0_22, 32(m0),1,0 psq_l M1_22, 32(m1),1,0 ps_sub M1_21,M1_21,M0_21 psq_l M0_03_13, 36(m0),0,0 psq_l M1_03_13, 36(m1),0,0 ps_sub M1_02_12,M1_02_12,M0_02_12 psq_l M0_23, 44(m0),1,0 psq_l M1_23, 44(m1),1,0 ps_sub M1_22,M1_22,M0_22 ps_sub M1_03_13,M1_03_13,M0_03_13 ps_sub M1_23,M1_23,M0_23 //start of first iteration psq_lu Wt,4(wtBase),1,0 // Wt = *wtBase++; psq_lu Sx_y, 4(srcBase), 0, 0 // Sx_y = *srcBase++; psq_lu Sz, 8(srcBase), 1, 0 // Sz = *srcBase++; ps_madds0 M00_10,M1_00_10,Wt,M0_00_10 // m = lerp(m0, m1, wt); ps_madds0 M20,M1_20,Wt,M0_20 // m = lerp(m0, m1, wt); ps_madds0 M01_11,M1_01_11,Wt,M0_01_11 // m = lerp(m0, m1, wt); ps_madds0 M21,M1_21,Wt,M0_21 // m = lerp(m0, m1, wt); ps_madds0 M02_12,M1_02_12,Wt,M0_02_12 // m = lerp(m0, m1, wt); ps_madds0 M22,M1_22,Wt,M0_22 // m = lerp(m0, m1, wt); ps_madds0 M03_13,M1_03_13,Wt,M0_03_13 // m = lerp(m0, m1, wt); ps_madds0 M23,M1_23,Wt,M0_23 // m = lerp(m0, m1, wt); ps_madds0 Ix_y, M00_10, Sx_y, M03_13 // Ix_y = M03_13 + M00_10 * Sx; ps_madds0 Iz, M20, Sx_y, M23 // Iz = M23 + M20 * Sx; psq_lu Wt,4(wtBase),1,0 // Wt = *wtBase++; _mloop: ps_madds1 Ix_y,M01_11,Sx_y,Ix_y // Ix_y += M01_11 * Sy; ps_madds1 Iz,M21,Sx_y,Iz // Iz += M21 * Sy; psq_lu Sx_y, 4(srcBase), 0, 0 // Sx_y = *srcBase++; ps_madds0 Dx_y, M02_12, Sz, Ix_y // Dx_y = Ix_y + M01_12 * Sz; ps_madds0 Dz, M22, Sz, Iz // Dz = Iz + M22 * Sz; psq_lu Sz, 8(srcBase), 1, 0 // Sz = *srcBase++; ps_madds0 M00_10,M1_00_10,Wt,M0_00_10 // m = lerp(m0, m1, wt); ps_madds0 M20,M1_20,Wt,M0_20 // m = lerp(m0, m1, wt); ps_madds0 M01_11,M1_01_11,Wt,M0_01_11 // m = lerp(m0, m1, wt); ps_madds0 M21,M1_21,Wt,M0_21 // m = lerp(m0, m1, wt); ps_madds0 M02_12,M1_02_12,Wt,M0_02_12 // m = lerp(m0, m1, wt); ps_madds0 M22,M1_22,Wt,M0_22 // m = lerp(m0, m1, wt); ps_madds0 M03_13,M1_03_13,Wt,M0_03_13 // m = lerp(m0, m1, wt); ps_madds0 M23,M1_23,Wt,M0_23 // m = lerp(m0, m1, wt); psq_stu Dx_y, 4(dstBase), 0, 0 // *dstBase++ = Dx_y; ps_madds0 Ix_y, M00_10, Sx_y, M03_13 // Ix_y = M03_13 + M00_10 * Sx; ps_madds0 Iz, M20, Sx_y, M23 // Iz = M23 + M20 * Sx; psq_stu Dz, 8(dstBase), 1, 0 // *dstBase++ = Dz; psq_lu Wt,4(wtBase),1,0 // Wt = *wtBase++; bdnz+ _mloop _mlend: ps_madds1 Ix_y,M01_11,Sx_y,Ix_y // Ix_y += M01_11 * Sy; ps_madds1 Iz,M21,Sx_y,Iz // Iz += M21 * Sy; ps_madds0 Dx_y, M02_12, Sz, Ix_y // Dx_y = Ix_y + M01_12 * Sz; psq_stu Dx_y, 4(dstBase), 0, 0 // *dstBase++ = Dx_y; ps_madds0 Dz, M22, Sz, Iz // Dz = Iz + M22 * Sz; psq_stu Dz, 8(dstBase), 1, 0 // *dstBase++ = Dz; lfd fp14, 8(r1) lfd fp15, 16(r1) lfd fp16, 24(r1) lfd fp17, 32(r1) lfd fp18, 40(r1) lfd fp19, 48(r1) lfd fp20, 56(r1) lfd fp21, 64(r1) lfd fp22, 72(r1) lfd fp23, 80(r1) lfd fp24, 88(r1) lfd fp25, 96(r1) lfd fp26, 104(r1) lfd fp27, 112(r1) lfd fp28, 120(r1) lfd fp29, 128(r1) lfd fp30, 136(r1) addi r1, r1, 160 blr #undef M00_10 #undef M20 #undef M01_11 #undef M21 #undef M02_12 #undef M22 #undef M03_13 #undef M23 #undef Sx_y #undef Sz #undef Dx_y #undef Dz #undef Ix_y #undef Iz #undef M0_00_10 #undef M0_20 #undef M0_01_11 #undef M0_21 #undef M0_02_12 #undef M0_22 #undef M0_03_13 #undef M0_23 #undef M1_00_10 #undef M1_20 #undef M1_01_11 #undef M1_21 #undef M1_02_12 #undef M1_22 #undef M1_03_13 #undef M1_23 #undef Wt } /*---------------------------------------------------------------------* Name: PSMTXROMultS16VecArray Description: Multiplies an array of signed 16 bit vectors by a reordered matrix, generating a Vec array of floats. No cost in conversion. However, this code does take a hit because it uses mtspr to set up a quantization register to convert S16 -> F32. For production code, the GQR should be set up in advance. OK if source = destination. Note that NO error checking is performed. Performance : 9.671 - 9.900 cycles per vertex where count = 70 Arguments: m: matrix. srcBase: start of source s16 vector array. dstBase: start of resultant vector array. Note that available room should be twice as large as source data. count: number of vectors in srcBase, dstBase arrays COUNT MUST BE GREATER THAN 1. Return: none *---------------------------------------------------------------------*/ asm void PSMTXROMultS16VecArray ( const register ROMtx m, // r3 const register S16Vec *srcBase,// r4 register Vec *dstBase,// r5 register u32 count // r6 ) { nofralloc #define M00_M10 fp0 #define M20_nnn fp1 #define M01_M11 fp2 #define M21_nnn fp3 #define M02_M12 fp4 #define M22_nnn fp5 #define M03_M13 fp6 #define M23_nnn fp7 // source vectors - 2 3D vectors in 3 PS registers #define SX0_SY0 fp8 #define SZ0_SX1 fp9 #define SY1_SZ1 fp10 // Destination registers - 2 3d vectors in 4 PS registers #define DX0_DY0 fp11 #define DZ0_nnn fp12 #define DX1_DY1 fp13 #define DZ1_nnn fp14 // temp registers for writing back values. These registers store the final // results from the PREVIOUS loop #define WX0_WY0 fp15 #define WZ0_nnn fp16 #define WX1_WY1 fp17 #define WZ1_nnn fp18 stwu r1, -64(r1) stfd fp14, 8(r1) // unrolled once, but since we're dividing by 2, add 1 to ensure if // odd # of vertices, the last one gets x-formed. addi r7, count, -1 stfd fp15, 16(r1) srwi r7, r7, 1 // 2 at a time stfd fp16, 24(r1) lis r8, 0x0007 // setup GQR6 stfd fp17, 32(r1) mtspr GQR6, r8 // this will stall like a monkey's butt stfd fp18, 40(r1) mtctr r7 // load matrix psq_l M00_M10, 0(m),0,0 addi srcBase, srcBase, -4 psq_l M20_nnn, 8(m),1,0 addi dstBase, dstBase, -4 psq_l M03_M13, 36(m),0,0 psq_lu SX0_SY0, 4(srcBase), 0, 6 psq_l M23_nnn, 44(m),1,0 psq_lu SZ0_SX1, 4(srcBase), 0, 6 // ------------------------------UNROLLED // DX0=M00*SX0+M03, DY0=M10*SX0+M13 // DZ0=M20*SX0+M23 // DX1=M00*SX1+M03, DY1=M10*SX1+M13 // DZ1=M20*SX1+M23 ps_madds0 DX0_DY0, M00_M10, SX0_SY0, M03_M13 psq_l M01_M11, 12(m),0,0 ps_madds0 DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn psq_l M21_nnn, 20(m),1,0 ps_madds1 DX1_DY1, M00_M10, SZ0_SX1, M03_M13 psq_lu SY1_SZ1, 4(srcBase), 0, 6 ps_madds1 DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn psq_l M22_nnn, 32(m),1,0 // DX0=M01*SY0+DX0, DY0=M11*SY0+DY0 // DZ0=M21*SY0+DZ0 // DX1=M01*SY1+DX1, DY1=M11*SY1+DY1 // DZ1=M21*SY1+DZ1 ps_madds1 DX0_DY0, M01_M11, SX0_SY0, DX0_DY0 ps_madds1 DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn psq_l M02_M12, 24(m),0,0 ps_madds0 DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1 psq_lu SX0_SY0, 4(srcBase), 0, 6 ps_madds0 DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn // DX0=M02*SZ0+DX0, DY0=M12*SZ0+DY0 // DZ0=M22*SZ0+DZ0 // DX1=M02*SZ1+DX1, DY1=M12*SZ1+DY1 // DZ1=M22*SZ1+DZ1 // Write final values to temp registers ps_madds0 WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0 ps_madds0 WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn psq_lu SZ0_SX1, 4(srcBase), 0, 6 ps_madds1 WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1 ps_madds1 WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn psq_lu SY1_SZ1, 4(srcBase), 0, 6 // -------------------------- LOOP START _mloop: ps_madds0 DX0_DY0, M00_M10, SX0_SY0, M03_M13 psq_stu WX0_WY0, 4(dstBase), 0, 0 ps_madds0 DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn psq_stu WZ0_nnn, 8(dstBase), 1, 0 ps_madds1 DX1_DY1, M00_M10, SZ0_SX1, M03_M13 psq_stu WX1_WY1, 4(dstBase), 0, 0 ps_madds1 DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn psq_stu WZ1_nnn, 8(dstBase), 1, 0 ps_madds1 DX0_DY0, M01_M11, SX0_SY0, DX0_DY0 ps_madds1 DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn // YYY LAST SX0_SY0 USAGE FOR THIS ITERATION psq_lu SX0_SY0, 4(srcBase), 0, 6 // NEXT SX0 SY0 ps_madds0 DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1 ps_madds0 DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn // Write final values to temp registers ps_madds0 WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0 ps_madds0 WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn // YYY LAST SZ0_SX1 USAGE FOR THIS ITERATION psq_lu SZ0_SX1, 4(srcBase), 0, 6 // NEXT SZ0 SX1 ps_madds1 WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1 ps_madds1 WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn // YYY LAST SY1_SZ1 USAGE FOR THIS ITERATION psq_lu SY1_SZ1, 4(srcBase), 0, 6 // NEXT SY1 SZ1 bdnz+ _mloop // -------------------------- LOOP END psq_stu WX0_WY0, 4(dstBase), 0, 0 rlwinm. r7, count, 0, 31, 31 // check odd psq_stu WZ0_nnn, 8(dstBase), 1, 0 bne _return // Skipped if odd number of vectors psq_stu WX1_WY1, 4(dstBase), 0, 0 // can't put anything here psq_stu WZ1_nnn, 8(dstBase), 1, 0 _return: lfd fp14, 8(r1) lfd fp15, 16(r1) lfd fp16, 24(r1) lfd fp17, 32(r1) lfd fp18, 40(r1) addi r1, r1, 64 blr #undef M00_M10 #undef M20_nnn #undef M01_M11 #undef M21_nnn #undef M02_M12 #undef M22_nnn #undef M03_M13 #undef M23_nnn #undef SX0_SY0 #undef SZ0_SX1 #undef SY1_SZ1 #undef DX0_DY0 #undef DZ0_nnn #undef DX1_DY1 #undef DZ1_nnn #undef WX0_WY0 #undef WZ0_nnn #undef WX1_WY1 #undef WZ1_nnn } /*---------------------------------------------------------------------* Name: PSMTXMultS16VecArray Description: Multiplies an array of signed 16 bit vectors by a matrix, generating a Vec array of floats. No cost in conversion. However, this code does take a hit because it uses mtspr to set up a quantization register to convert S16 -> F32. For production code, the GQR should be set up in advance. This function takes longer than PSMTXROMultS16VecArray which uses reordered matrices. OK if source = destination. Note that NO error checking is performed. Performance : 13.714 - 13.786 cycles per vertex where count = 70 Arguments: m: matrix. srcBase: start of source vector array. dstBase: start of resultant vector array. Note that available room should be twice as large as source data. count: number of vectors in srcBase, dstBase arrays COUNT MUST BE GREATER THAN 1. Return: none *---------------------------------------------------------------------*/ asm void PSMTXMultS16VecArray ( const register Mtx m, // r3 const register S16Vec *srcBase,// r4 register Vec *dstBase,// r5 register u32 count // r6 ) { nofralloc // cmpwi count, 0 // beq- _return psq_l fp0, 0(m), 0, 0 // [0][0], [0][1] lis r7, 0x0007 // setup GQR6 mtspr GQR6, r7 // this will stall like a monkey's butt // fp6 - x y psq_l fp6, 0(srcBase), 0, 6 subi count, count, 1 // unrolling once // fp7 - z 1 psq_l fp7, 4(srcBase), 1, 6 mtctr count // unused slot here psq_l fp1, 8(m), 0, 0 // [0][2], [0][3] addi srcBase, srcBase, 4 // load ops will add 2, // but we already got the first vertex psq_l fp2, 16(m), 0, 0 // [1][0], [1][1] addi dstBase, dstBase, -4 // store ops will add 4 psq_l fp3, 24(m), 0, 0 // [1][2], [1][3] // ------ first loop starts here // fp8 = m00x m01y // next X ps_mul fp8, fp0, fp6 psq_l fp4, 32(m), 0, 0 // [2][0], [2][1] // fp10 = m10x m11y // next Y ps_mul fp10, fp2, fp6 psq_l fp5, 40(m), 0, 0 // [2][2], [2][3] // fp12 = m20x m21y // next Z ps_mul fp12, fp4, fp6 // YYY last FP6 usage // fp6 - x y psq_lu fp6, 2(srcBase), 0, 6 // advance to x // Potential FP stall here if psq_lu dispatches same // cycle as previous ps_mul // fp8 = m00x + m02z | m01y + m03 ps_madd fp8, fp1, fp7 ,fp8 // fp10 = m10x + m12z | m11y + m13 ps_madd fp10, fp3, fp7 ,fp10 // fp12 = m20x + m22z | m21y + m23 ps_madd fp12, fp5, fp7 ,fp12 // YYY last FP7 usage // fp7 - z 1 psq_lu fp7, 4(srcBase), 1, 6 // advance to z, will be skipped by next lu // Potential FP stall here if psq_lu dispatches same // cycle as previous ps_madd (fp8 dependency) ps_sum0 fp9, fp8, fp8, fp8 // X ready // ------------------- main loop _mloop: ps_sum0 fp11, fp10, fp10, fp10 // Y ready // fp8 = m00x m01y // next X ps_mul fp8, fp0, fp6 ps_sum0 fp13, fp12, fp12, fp12 // Z ready // fp10 = m10x m11y // next Y ps_mul fp10, fp2, fp6 psq_stu fp9, 4(dstBase), 1, 0 // prev X // fp12 = m20x m21y // next Z ps_mul fp12, fp4, fp6 // YYY last FP6 usage psq_stu fp11, 4(dstBase), 1, 0 // prev Y // fp8 = m00x + m02z | m01y + m03 ps_madd fp8, fp1, fp7 ,fp8 psq_stu fp13, 4(dstBase), 1, 0 // prev Z // fp10 = m10x + m12z | m11y + m13 ps_madd fp10, fp3, fp7 ,fp10 // fp6 - x y psq_lu fp6, 2(srcBase), 0, 6 // advance to x // fp12 = m20x + m22z | m21y + m23 ps_madd fp12, fp5, fp7 ,fp12 // YYY last FP7 usage // fp7 - z 1 psq_lu fp7, 4(srcBase), 1, 6 // advance to z, will be skipped by next lu // Potential FP stall here if psq_lu dispatches same // cycle as previous ps_madd ps_sum0 fp9, fp8, fp8, fp8 // X ready bdnz+ _mloop // ------------------- end of loop ps_sum0 fp11, fp10, fp10, fp10 // Y ready ps_sum0 fp13, fp12, fp12, fp12 // Z ready // commit last iteration psq_stu fp9, 4(dstBase), 1, 0 psq_stu fp11, 4(dstBase), 1, 0 psq_stu fp13, 4(dstBase), 1, 0 _return: blr } #endif // GEKKO /*===========================================================================*/