/*---------------------------------------------------------------------------*
  Project:  Matrix vector Library
  File:     psmtx.c

  Copyright 1998-2007 Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.  They may not
  be disclosed to third parties or copied or duplicated in any form, in
  whole or in part, without the prior written consent of Nintendo.

  $Log: psmtx.c,v $
  Revision 1.3  2007/08/30 10:42:41  hirose
  Updated PSMTXROMultVecArray to make it Broadway EABI compliant.
  Removed unsupported functions.

  Revision 1.2  2006/02/20 04:25:42  mitu
  Changed include path from dolphin/ to revolution/.

  Revision 1.1.1.1  2005/05/12 02:15:49  yasuh-to
  Ported from dolphin source tree.

  $NoKeywords: $

    6     2003/08/21 5:17 Dante
    Changed GQR1 to GQR6 in PSMTXMultS16VecArray

    5     2002/04/11 13:11 Hirose
    const type specifier support. (by Hiratsu@IRD)

    4     2001/02/26 11:56p Hirose
    Avoided use of GQR1, which is reserved by the compiler.

    3     2001/02/22 11:49p Hirose
    Some functions are moved to another file according to arrangement
    updates.

    2     2000/07/12 4:41p John
    Substituted MTXConcat and MTXMultVecArray with their paired-singles
    equivalent for Gekko non-debug builds.

    1     2000/05/10 1:48p Hirose
    Moved paired-single matrix stuff into another source file.

  $NoKeywords: $
 *---------------------------------------------------------------------------*/

#include <math.h>
#include <revolution/mtx.h>
#include "mtxAssert.h"

/*---------------------------------------------------------------------*
    Special purpose Paired-single optimized code

    All paired-single code assumes GQR0 = 0.
 *---------------------------------------------------------------------*/
#ifdef GEKKO

/*---------------------------------------------------------------------*
    Name:           PSMTXReorder

    Description:    Creates a reordered (column-major) matrix from a
                    row-major matrix, using paired-single operations.
                    Reordered matrices are required for the PSMTXRO*
                    functions, which operate faster than their
                    non-reordered counterparts.

                    Performance: ~15 cycles.

    Arguments:      src     source matrix.
                    dest    destination matrix; note the type is ROMtx.

    Return:         None.
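
    Note:           For reference, the reordering is equivalent to the
                    following scalar C loop (an illustrative sketch only,
                    assuming ROMtx is laid out as four columns of three
                    rows, i.e. f32[4][3]):

                        u32 r, c;
                        for ( c = 0; c < 4; c++ )
                        {
                            for ( r = 0; r < 3; r++ )
                            {
                                dest[c][r] = src[r][c];
                            }
                        }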
 *---------------------------------------------------------------------*/
asm void PSMTXReorder( const register Mtx src, register ROMtx dest )
{
    nofralloc

#define S00_S01     fp0
#define S02_S03     fp1
#define S10_S11     fp2
#define S12_S13     fp3
#define S20_S21     fp4
#define S22_S23     fp5
#define D00_D10     fp6
#define D11_D21     fp7
#define D02_D12     fp8
#define D22_D03     fp9
#define D13_D23     fp10
#define D20_D01     fp12

    psq_l       S00_S01,  0(src),  0, 0
    psq_l       S10_S11, 16(src),  0, 0
    psq_l       S20_S21, 32(src),  0, 0
    psq_l       S02_S03,  8(src),  0, 0
    ps_merge00  D00_D10, S00_S01, S10_S11
    psq_l       S12_S13, 24(src),  0, 0
    ps_merge01  D20_D01, S20_S21, S00_S01
    psq_l       S22_S23, 40(src),  0, 0
    ps_merge11  D11_D21, S10_S11, S20_S21
    psq_st      D00_D10,  0(dest), 0, 0
    ps_merge00  D02_D12, S02_S03, S12_S13
    psq_st      D20_D01,  8(dest), 0, 0
    ps_merge01  D22_D03, S22_S23, S02_S03
    psq_st      D11_D21, 16(dest), 0, 0
    ps_merge11  D13_D23, S12_S13, S22_S23
    psq_st      D02_D12, 24(dest), 0, 0
    psq_st      D22_D03, 32(dest), 0, 0
    psq_st      D13_D23, 40(dest), 0, 0

    blr

#undef S00_S01
#undef S02_S03
#undef S10_S11
#undef S12_S13
#undef S20_S21
#undef S22_S23
#undef D00_D10
#undef D11_D21
#undef D02_D12
#undef D22_D03
#undef D13_D23
#undef D20_D01
}

/*---------------------------------------------------------------------*
    Name:           PSMTXROMultVecArray

    Description:    Multiplies an array of vectors by a reordered matrix,
                    using paired-single operations. This function is
                    significantly faster than PSMTXMultVecArray, but
                    requires that you have reordered the matrix in
                    advance with PSMTXReorder.

                    OK if source == destination.

                    NOTE: the number of vertices transformed must be
                    greater than 2. NO error checking is performed.

                    Performance: 9.586 - 9.814 cycles per vertex,
                    where count = 70.

    Arguments:      m        reordered matrix.
                    srcBase  start of source vector array.
                    dstBase  start of resultant vector array.
                    count    number of vectors in the srcBase and dstBase
                             arrays. COUNT MUST BE GREATER THAN 2.

    Return:         None.
 *---------------------------------------------------------------------*/
asm void PSMTXROMultVecArray
(
    const register ROMtx  m,        // r3
    const register Vec   *srcBase,  // r4
    register       Vec   *dstBase,  // r5
    register       u32    count     // r6
)
{
    nofralloc

#define M00_M10     fp0
#define M20_nnn     fp1
#define M01_M11     fp2
#define M21_nnn     fp3
#define M02_M12     fp4
#define M22_nnn     fp5
#define M03_M13     fp6
#define M23_nnn     fp7

// Source vectors - 2 3D vectors in 3 PS registers
#define SX0_SY0     fp8
#define SZ0_SX1     fp9
#define SY1_SZ1     fp10

// Destination registers - 2 3D vectors in 4 PS registers
#define DX0_DY0     fp11
#define DZ0_nnn     fp12
#define DX1_DY1     fp13
#define DZ1_nnn     fp14

// Temp registers for writing back values. These registers store the final
// results from the PREVIOUS loop iteration.
#define WX0_WY0     fp15
#define WZ0_nnn     fp16
#define WX1_WY1     fp17
#define WZ1_nnn     fp18

    stwu    r1, -96(rsp)
    stfd    fp14,  8(rsp)
    psq_st  fp14, 16(rsp), 0, 0

    // unrolled once, but since we're dividing by 2, round so that if
    // there is an odd # of vertices, the last one gets x-formed.
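    // The loop works on vector pairs: the code ahead of _mloop transforms
    // the first pair, so CTR is set to (count - 1) >> 1 remaining
    // iterations, each of which stores the previous pair's results while
    // computing the next pair.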
    addi    r7, count, -1
    stfd    fp15, 24(rsp)
    psq_st  fp15, 32(rsp), 0, 0
    srwi    r7, r7, 1                           // 2 at a time
    stfd    fp16, 40(rsp)
    psq_st  fp16, 48(rsp), 0, 0
    stfd    fp17, 56(rsp)
    psq_st  fp17, 64(rsp), 0, 0
    stfd    fp18, 72(rsp)
    psq_st  fp18, 80(rsp), 0, 0
    mtctr   r7

    // Load matrix
    psq_l   M00_M10,  0(m), 0, 0
    addi    srcBase, srcBase, -8
    psq_l   M20_nnn,  8(m), 1, 0
    addi    dstBase, dstBase, -4
    psq_l   M03_M13, 36(m), 0, 0
    psq_lu  SX0_SY0,  8(srcBase), 0, 0
    psq_l   M23_nnn, 44(m), 1, 0
    psq_lu  SZ0_SX1,  8(srcBase), 0, 0

    // ------------------------------ UNROLLED

    // DX0=M00*SX0+M03, DY0=M10*SX0+M13
    // DZ0=M20*SX0+M23
    // DX1=M00*SX1+M03, DY1=M10*SX1+M13
    // DZ1=M20*SX1+M23
    ps_madds0   DX0_DY0, M00_M10, SX0_SY0, M03_M13
    psq_l       M01_M11, 12(m), 0, 0
    ps_madds0   DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
    psq_l       M21_nnn, 20(m), 1, 0
    ps_madds1   DX1_DY1, M00_M10, SZ0_SX1, M03_M13
    psq_lu      SY1_SZ1,  8(srcBase), 0, 0
    ps_madds1   DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
    psq_l       M22_nnn, 32(m), 1, 0

    // DX0=M01*SY0+DX0, DY0=M11*SY0+DY0
    // DZ0=M21*SY0+DZ0
    // DX1=M01*SY1+DX1, DY1=M11*SY1+DY1
    // DZ1=M21*SY1+DZ1
    ps_madds1   DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1   DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    psq_l       M02_M12, 24(m), 0, 0
    ps_madds0   DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    psq_lu      SX0_SY0,  8(srcBase), 0, 0
    ps_madds0   DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    // DX0=M02*SZ0+DX0, DY0=M12*SZ0+DY0
    // DZ0=M22*SZ0+DZ0
    // DX1=M02*SZ1+DX1, DY1=M12*SZ1+DY1
    // DZ1=M22*SZ1+DZ1
    // Write final values to temp registers
    ps_madds0   WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0   WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    psq_lu      SZ0_SX1,  8(srcBase), 0, 0
    ps_madds1   WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1   WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    psq_lu      SY1_SZ1,  8(srcBase), 0, 0

    // -------------------------- LOOP START
_mloop:
    ps_madds0   DX0_DY0, M00_M10, SX0_SY0, M03_M13
    psq_stu     WX0_WY0, 4(dstBase), 0, 0
    ps_madds0   DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
    psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    ps_madds1   DX1_DY1, M00_M10, SZ0_SX1, M03_M13
    psq_stu     WX1_WY1, 4(dstBase), 0, 0
    ps_madds1   DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
    psq_stu     WZ1_nnn, 8(dstBase), 1, 0

    ps_madds1   DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1   DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    // YYY LAST SX0_SY0 USAGE FOR THIS ITERATION
    psq_lu      SX0_SY0, 8(srcBase), 0, 0       // NEXT SX0 SY0
    ps_madds0   DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    ps_madds0   DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    // Write final values to temp registers
    ps_madds0   WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0   WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    // YYY LAST SZ0_SX1 USAGE FOR THIS ITERATION
    psq_lu      SZ0_SX1, 8(srcBase), 0, 0       // NEXT SZ0 SX1
    ps_madds1   WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1   WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    // YYY LAST SY1_SZ1 USAGE FOR THIS ITERATION
    psq_lu      SY1_SZ1, 8(srcBase), 0, 0       // NEXT SY1 SZ1

    bdnz+   _mloop
    // -------------------------- LOOP END

    psq_stu     WX0_WY0, 4(dstBase), 0, 0
    rlwinm.     r7, count, 0, 31, 31            // Check odd
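    // The WX0/WZ0 stores around this point always write the first vector
    // of the final pair. If count is odd, the second vector of that pair
    // lies past the end of the array, so the bne below skips its stores.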
    psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    bne         _return                         // Skipped if odd number of vectors
    psq_stu     WX1_WY1, 4(dstBase), 0, 0       // Can't put anything here
    psq_stu     WZ1_nnn, 8(dstBase), 1, 0

_return:
    psq_l   fp14, 16(rsp), 0, 0
    lfd     fp14,  8(rsp)
    psq_l   fp15, 32(rsp), 0, 0
    lfd     fp15, 24(rsp)
    psq_l   fp16, 48(rsp), 0, 0
    lfd     fp16, 40(rsp)
    psq_l   fp17, 64(rsp), 0, 0
    lfd     fp17, 56(rsp)
    psq_l   fp18, 80(rsp), 0, 0
    lfd     fp18, 72(rsp)
    addi    r1, r1, 96
    blr

#undef M00_M10
#undef M20_nnn
#undef M01_M11
#undef M21_nnn
#undef M02_M12
#undef M22_nnn
#undef M03_M13
#undef M23_nnn
#undef SX0_SY0
#undef SZ0_SX1
#undef SY1_SZ1
#undef DX0_DY0
#undef DZ0_nnn
#undef DX1_DY1
#undef DZ1_nnn
#undef WX0_WY0
#undef WZ0_nnn
#undef WX1_WY1
#undef WZ1_nnn
}

#endif // GEKKO

/*===========================================================================*/
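
/*---------------------------------------------------------------------*
    Usage sketch (illustrative only, not part of the library). Assuming
    the Mtx / ROMtx / Vec types from the mtx header and a caller-supplied
    array of more than 2 vectors, a typical call sequence is:

        ROMtx ro;

        PSMTXReorder( worldMatrix, ro );                 // row-major -> reordered
        PSMTXROMultVecArray( ro, srcVecs, dstVecs, n );  // n > 2
 *---------------------------------------------------------------------*/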