/*---------------------------------------------------------------------------*
  Project:  Matrix vector Library
  File:     psmtx.c

  Copyright 1998-2007 Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.  They may not
  be disclosed to third parties or copied or duplicated in any form, in
  whole or in part, without the prior written consent of Nintendo.

  $Log: psmtx.c,v $
  Revision 1.3  2007/08/30 10:42:41  hirose
  Updated PSMTXROMultVecArray to make it Broadway EABI compliant.
  Removed unsupported functions.

  Revision 1.2  2006/02/20 04:25:42  mitu
  Changed include path from dolphin/ to revolution/.

  Revision 1.1.1.1  2005/05/12 02:15:49  yasuh-to
  Ported from dolphin source tree.

  $NoKeywords: $

    6     2003/08/21 5:17 Dante
    Changed GQR1 to GQR6 in PSMTXMultS16VecArray

    5     2002/04/11 13:11 Hirose
    const type specifier support. (by Hiratsu@IRD)

    4     2001/02/26 11:56p Hirose
    Avoided use of GQR1, which is reserved by the compiler.

    3     2001/02/22 11:49p Hirose
    Some functions are moved to another file according to arrangement
    updates.

    2     2000/07/12 4:41p John
    Substituted MTXConcat and MTXMultVecArray with their paired-singles
    equivalent for Gekko non-debug builds.

    1     2000/05/10 1:48p Hirose
    Moved paired-single matrix stuff into another source file.

  $NoKeywords: $
 *---------------------------------------------------------------------------*/

#include <math.h>
#include <revolution/mtx.h>
#include "mtxAssert.h"

/*---------------------------------------------------------------------*
    Special purpose Paired-single optimized code

    All paired-single code assumes GQR0 = 0.
 *---------------------------------------------------------------------*/
#ifdef GEKKO

/*---------------------------------------------------------------------*
    Name:           PSMTXReorder

    Description:    Creates a reordered (column-major) matrix from a
                    row-major matrix, using paired-single operations.
                    Reordered matrices are required for the PSMTXRO*
                    functions, which operate faster than their
                    non-reordered counterparts.

                    Performance: ~15 cycles.

    Arguments:      src     source matrix.
                    dest    destination matrix; note the type is ROMtx.

    Return:         None.
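
    Note:           For reference, the reordering is equivalent to the
                    following scalar C loop (an illustrative sketch only,
                    assuming ROMtx is laid out as four columns of three
                    rows, i.e. f32[4][3]):

                        u32 r, c;
                        for ( c = 0; c < 4; c++ )
                        {
                            for ( r = 0; r < 3; r++ )
                            {
                                dest[c][r] = src[r][c];
                            }
                        }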
 *---------------------------------------------------------------------*/
asm void PSMTXReorder( const register Mtx src, register ROMtx dest )
{
    nofralloc

#define S00_S01     fp0
#define S02_S03     fp1
#define S10_S11     fp2
#define S12_S13     fp3
#define S20_S21     fp4
#define S22_S23     fp5
#define D00_D10     fp6
#define D11_D21     fp7
#define D02_D12     fp8
#define D22_D03     fp9
#define D13_D23     fp10
#define D20_D01     fp12

    psq_l       S00_S01,  0(src),  0, 0
    psq_l       S10_S11, 16(src),  0, 0
    psq_l       S20_S21, 32(src),  0, 0
    psq_l       S02_S03,  8(src),  0, 0
    ps_merge00  D00_D10, S00_S01, S10_S11
    psq_l       S12_S13, 24(src),  0, 0
    ps_merge01  D20_D01, S20_S21, S00_S01
    psq_l       S22_S23, 40(src),  0, 0
    ps_merge11  D11_D21, S10_S11, S20_S21
    psq_st      D00_D10,  0(dest), 0, 0
    ps_merge00  D02_D12, S02_S03, S12_S13
    psq_st      D20_D01,  8(dest), 0, 0
    ps_merge01  D22_D03, S22_S23, S02_S03
    psq_st      D11_D21, 16(dest), 0, 0
    ps_merge11  D13_D23, S12_S13, S22_S23
    psq_st      D02_D12, 24(dest), 0, 0
    psq_st      D22_D03, 32(dest), 0, 0
    psq_st      D13_D23, 40(dest), 0, 0

    blr

#undef S00_S01
#undef S02_S03
#undef S10_S11
#undef S12_S13
#undef S20_S21
#undef S22_S23
#undef D00_D10
#undef D11_D21
#undef D02_D12
#undef D22_D03
#undef D13_D23
#undef D20_D01
}

/*---------------------------------------------------------------------*
    Name:           PSMTXROMultVecArray

    Description:    Multiplies an array of vectors by a reordered matrix,
                    using paired-single operations. This function is
                    significantly faster than PSMTXMultVecArray, but
                    requires that you have reordered the matrix in
                    advance with PSMTXReorder.

                    OK if source == destination.

                    NOTE: the number of vertices transformed must be
                    greater than 2. NO error checking is performed.

                    Performance: 9.586 - 9.814 cycles per vertex,
                    where count = 70.

    Arguments:      m        reordered matrix.
                    srcBase  start of source vector array.
                    dstBase  start of resultant vector array.
                    count    number of vectors in the srcBase and dstBase
                             arrays. COUNT MUST BE GREATER THAN 2.

    Return:         None.
 *---------------------------------------------------------------------*/
asm void PSMTXROMultVecArray
(
    const register ROMtx  m,        // r3
    const register Vec   *srcBase,  // r4
    register       Vec   *dstBase,  // r5
    register       u32    count     // r6
)
{
    nofralloc

#define M00_M10     fp0
#define M20_nnn     fp1
#define M01_M11     fp2
#define M21_nnn     fp3
#define M02_M12     fp4
#define M22_nnn     fp5
#define M03_M13     fp6
#define M23_nnn     fp7

// Source vectors - 2 3D vectors in 3 PS registers
#define SX0_SY0     fp8
#define SZ0_SX1     fp9
#define SY1_SZ1     fp10

// Destination registers - 2 3D vectors in 4 PS registers
#define DX0_DY0     fp11
#define DZ0_nnn     fp12
#define DX1_DY1     fp13
#define DZ1_nnn     fp14

// Temp registers for writing back values. These registers store the final
// results from the PREVIOUS loop iteration.
#define WX0_WY0     fp15
#define WZ0_nnn     fp16
#define WX1_WY1     fp17
#define WZ1_nnn     fp18

    stwu    r1, -96(rsp)
    stfd    fp14,  8(rsp)
    psq_st  fp14, 16(rsp), 0, 0

    // unrolled once, but since we're dividing by 2, round so that if
    // there is an odd # of vertices, the last one gets x-formed.
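    // The loop works on vector pairs: the code ahead of _mloop transforms
    // the first pair, so CTR is set to (count - 1) >> 1 remaining
    // iterations, each of which stores the previous pair's results while
    // computing the next pair.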
    addi    r7, count, -1
    stfd    fp15, 24(rsp)
    psq_st  fp15, 32(rsp), 0, 0
    srwi    r7, r7, 1                           // 2 at a time
    stfd    fp16, 40(rsp)
    psq_st  fp16, 48(rsp), 0, 0
    stfd    fp17, 56(rsp)
    psq_st  fp17, 64(rsp), 0, 0
    stfd    fp18, 72(rsp)
    psq_st  fp18, 80(rsp), 0, 0
    mtctr   r7

    // Load matrix
    psq_l   M00_M10,  0(m), 0, 0
    addi    srcBase, srcBase, -8
    psq_l   M20_nnn,  8(m), 1, 0
    addi    dstBase, dstBase, -4
    psq_l   M03_M13, 36(m), 0, 0
    psq_lu  SX0_SY0,  8(srcBase), 0, 0
    psq_l   M23_nnn, 44(m), 1, 0
    psq_lu  SZ0_SX1,  8(srcBase), 0, 0

    // ------------------------------ UNROLLED

    // DX0=M00*SX0+M03, DY0=M10*SX0+M13
    // DZ0=M20*SX0+M23
    // DX1=M00*SX1+M03, DY1=M10*SX1+M13
    // DZ1=M20*SX1+M23
    ps_madds0   DX0_DY0, M00_M10, SX0_SY0, M03_M13
    psq_l       M01_M11, 12(m), 0, 0
    ps_madds0   DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
    psq_l       M21_nnn, 20(m), 1, 0
    ps_madds1   DX1_DY1, M00_M10, SZ0_SX1, M03_M13
    psq_lu      SY1_SZ1,  8(srcBase), 0, 0
    ps_madds1   DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
    psq_l       M22_nnn, 32(m), 1, 0

    // DX0=M01*SY0+DX0, DY0=M11*SY0+DY0
    // DZ0=M21*SY0+DZ0
    // DX1=M01*SY1+DX1, DY1=M11*SY1+DY1
    // DZ1=M21*SY1+DZ1
    ps_madds1   DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1   DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    psq_l       M02_M12, 24(m), 0, 0
    ps_madds0   DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    psq_lu      SX0_SY0,  8(srcBase), 0, 0
    ps_madds0   DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    // DX0=M02*SZ0+DX0, DY0=M12*SZ0+DY0
    // DZ0=M22*SZ0+DZ0
    // DX1=M02*SZ1+DX1, DY1=M12*SZ1+DY1
    // DZ1=M22*SZ1+DZ1
    // Write final values to temp registers
    ps_madds0   WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0   WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    psq_lu      SZ0_SX1,  8(srcBase), 0, 0
    ps_madds1   WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1   WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    psq_lu      SY1_SZ1,  8(srcBase), 0, 0

    // -------------------------- LOOP START
_mloop:
    ps_madds0   DX0_DY0, M00_M10, SX0_SY0, M03_M13
    psq_stu     WX0_WY0, 4(dstBase), 0, 0
    ps_madds0   DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
    psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    ps_madds1   DX1_DY1, M00_M10, SZ0_SX1, M03_M13
    psq_stu     WX1_WY1, 4(dstBase), 0, 0
    ps_madds1   DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
    psq_stu     WZ1_nnn, 8(dstBase), 1, 0

    ps_madds1   DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1   DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    // YYY LAST SX0_SY0 USAGE FOR THIS ITERATION
    psq_lu      SX0_SY0, 8(srcBase), 0, 0       // NEXT SX0 SY0
    ps_madds0   DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    ps_madds0   DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    // Write final values to temp registers
    ps_madds0   WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0   WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    // YYY LAST SZ0_SX1 USAGE FOR THIS ITERATION
    psq_lu      SZ0_SX1, 8(srcBase), 0, 0       // NEXT SZ0 SX1
    ps_madds1   WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1   WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    // YYY LAST SY1_SZ1 USAGE FOR THIS ITERATION
    psq_lu      SY1_SZ1, 8(srcBase), 0, 0       // NEXT SY1 SZ1

    bdnz+   _mloop
    // -------------------------- LOOP END

    psq_stu     WX0_WY0, 4(dstBase), 0, 0
    rlwinm.     r7, count, 0, 31, 31            // Check odd
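    // The WX0/WZ0 stores around this point always write the first vector
    // of the final pair. If count is odd, the second vector of that pair
    // lies past the end of the array, so the bne below skips its stores.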
    psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    bne         _return                         // Skipped if odd number of vectors
    psq_stu     WX1_WY1, 4(dstBase), 0, 0       // Can't put anything here
    psq_stu     WZ1_nnn, 8(dstBase), 1, 0

_return:
    psq_l   fp14, 16(rsp), 0, 0
    lfd     fp14,  8(rsp)
    psq_l   fp15, 32(rsp), 0, 0
    lfd     fp15, 24(rsp)
    psq_l   fp16, 48(rsp), 0, 0
    lfd     fp16, 40(rsp)
    psq_l   fp17, 64(rsp), 0, 0
    lfd     fp17, 56(rsp)
    psq_l   fp18, 80(rsp), 0, 0
    lfd     fp18, 72(rsp)
    addi    r1, r1, 96
    blr

#undef M00_M10
#undef M20_nnn
#undef M01_M11
#undef M21_nnn
#undef M02_M12
#undef M22_nnn
#undef M03_M13
#undef M23_nnn
#undef SX0_SY0
#undef SZ0_SX1
#undef SY1_SZ1
#undef DX0_DY0
#undef DZ0_nnn
#undef DX1_DY1
#undef DZ1_nnn
#undef WX0_WY0
#undef WZ0_nnn
#undef WX1_WY1
#undef WZ1_nnn
}

#endif // GEKKO

/*===========================================================================*/
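
/*---------------------------------------------------------------------*
    Usage sketch (illustrative only, not part of the library). Assuming
    the Mtx / ROMtx / Vec types from the mtx header and a caller-supplied
    array of more than 2 vectors, a typical call sequence is:

        ROMtx ro;

        PSMTXReorder( worldMatrix, ro );                 // row-major -> reordered
        PSMTXROMultVecArray( ro, srcVecs, dstVecs, n );  // n > 2
 *---------------------------------------------------------------------*/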