1 /*---------------------------------------------------------------------------*
2   Project: Matrix vector Library
3   File:    psmtx.c
4 
5   Copyright 1998-2007 Nintendo. All rights reserved.
6 
7   These coded instructions, statements, and computer programs contain
8   proprietary information of Nintendo of America Inc. and/or Nintendo
9   Company Ltd., and are protected by Federal copyright law. They may
10   not be disclosed to third parties or copied or duplicated in any form,
11   in whole or in part, without the prior written consent of Nintendo.
12 
13 
14   $Log: psmtx.c,v $
15   Revision 1.3  2007/08/30 10:42:41  hirose
16   Updated PSMTXROMultVecArray to make it Broadway EABI compliant.
17   Removed unsupported functions.
18 
19   Revision 1.2  2006/02/20 04:25:42  mitu
20   Changed include path from dolphin/ to revolution/.
21 
22   Revision 1.1.1.1  2005/05/12 02:15:49  yasuh-to
23   Ported from dolphin sheath tree.
24 NoKeywords: $
25 
26     6     2003/08/21 5:17 Dante
27     Changed GQR1 to GQR6 in PSMTXMultS16VecArray
28 
29     5    2002/04/11 13:11 Hirose
30     const type specifier support. (by Hiratsu@IRD)
31 
32     4     2001/02/26 11:56p Hirose
33     Avoided use of GQR1, which is reserved by the compiler.
34 
35     3     2001/02/22 11:49p Hirose
36     Some functions are moved to another file according to arrangement
37     updates.
38 
39     2    2000/07/12 4:41p John
40     Substituted MTXConcat and MTXMultVecArray with their paired-singles
41     equivalent for Gekko non-debug builds.
42 
43     1     2000/05/10 1:48p Hirose
44     Moved paired-single matrix stuff into another source file
45 
46   $NoKeywords: $
47  *---------------------------------------------------------------------------*/
48 
49 #include <math.h>
50 #include <revolution/mtx.h>
51 #include "mtxAssert.h"
52 
53 
54 /*---------------------------------------------------------------------*
55    Special purpose Paired-single optimized code
56 
57    All paired-single code assumes GQR0 = 0.
58  *---------------------------------------------------------------------*/
59 #ifdef GEKKO
60 
61 /*---------------------------------------------------------------------*
62 
63 Name:           PSMTXReorder
64 
65 Description:    Creates a reordered (column-major) matrix from a
66                 row-major matrix, using paired single operations.
67                 Reordered matrices are required for the PSMTXRO*
68                 functions, which operate faster than their non-reordered
69                 counterparts.
70 
                The source is read as three rows of four floats (row
                stride 16 bytes, byte offsets 0..47).  The destination
                is written as twelve consecutive floats, in the order
                    S00 S10 S20  S01 S11 S21  S02 S12 S22  S03 S13 S23
                where Sij is the source element at row i, column j.

                All six source loads are issued before the first store
                to the destination.

71                 Performance:  ~15 cycles.
72 
73 Arguments:      src       source matrix.
74                 dest     destination matrix, note type is ROMtx.
75 
76 Return:         None.
77 
78 *---------------------------------------------------------------------*/
79 asm void
PSMTXReorder(const register Mtx src,register ROMtx dest)80 PSMTXReorder(const register Mtx src, register ROMtx dest)
81 {
82     nofralloc
// Register aliases.
//   Sij_Sik : source pair, ps0 = src[i][j], ps1 = src[i][k].
//   Dij_Dkl : destination pair, ps0 = src[i][j], ps1 = src[k][l]
//             (the indices are SOURCE row/column indices).
// Only fp0-fp10 and fp12 are used; fp11 is left untouched.
83 #define S00_S01 fp0
84 #define S02_S03 fp1
85 #define S10_S11 fp2
86 #define S12_S13 fp3
87 #define S20_S21 fp4
88 #define S22_S23 fp5
89 #define D00_D10 fp6
90 #define D11_D21 fp7
91 #define D02_D12 fp8
92 #define D22_D03 fp9
93 #define D13_D23 fp10
94 #define D20_D01 fp12
95 
96 
// Loads, merges and stores are interleaved below.  The merge variants
// pick lanes as follows:
//   ps_merge00 D, A, B : D = (A.ps0, B.ps0)
//   ps_merge01 D, A, B : D = (A.ps0, B.ps1)
//   ps_merge11 D, A, B : D = (A.ps1, B.ps1)
// The trailing "0, 0" on psq_l / psq_st selects a two-element access
// through GQR0 (this file assumes GQR0 = 0).
97     psq_l       S00_S01, 0(src),  0, 0    // src row 0, columns 0-1
98     psq_l       S10_S11, 16(src), 0, 0    // src row 1, columns 0-1
99     psq_l       S20_S21, 32(src), 0, 0    // src row 2, columns 0-1
100     psq_l       S02_S03, 8(src),  0, 0    // src row 0, columns 2-3
101     ps_merge00  D00_D10, S00_S01, S10_S11 // (S00, S10)
102     psq_l       S12_S13, 24(src), 0, 0    // src row 1, columns 2-3
103     ps_merge01  D20_D01, S20_S21, S00_S01 // (S20, S01)
104     psq_l       S22_S23, 40(src), 0, 0    // src row 2, columns 2-3 (last load)
105     ps_merge11  D11_D21, S10_S11, S20_S21 // (S11, S21)
106     psq_st      D00_D10, 0(dest), 0, 0    // dest floats 0-1 (first store)
107     ps_merge00  D02_D12, S02_S03, S12_S13 // (S02, S12)
108     psq_st      D20_D01, 8(dest), 0, 0    // dest floats 2-3
109     ps_merge01  D22_D03, S22_S23, S02_S03 // (S22, S03)
110     psq_st      D11_D21, 16(dest),0, 0    // dest floats 4-5
111     ps_merge11  D13_D23, S12_S13, S22_S23 // (S13, S23)
112     psq_st      D02_D12, 24(dest),0, 0    // dest floats 6-7
113     psq_st      D22_D03, 32(dest),0,0     // dest floats 8-9
114     psq_st      D13_D23, 40(dest),0,0     // dest floats 10-11
115 
116     blr
// Drop the register aliases so they do not leak into later functions.
117 #undef S00_S01
118 #undef S02_S03
119 #undef S10_S11
120 #undef S12_S13
121 #undef S20_S21
122 #undef S22_S23
123 #undef D00_D10
124 #undef D11_D21
125 #undef D02_D12
126 #undef D22_D03
127 #undef D13_D23
128 #undef D20_D01
129 
130 }
131 
132 /*---------------------------------------------------------------------*
133 
134 Name:            PSMTXROMultVecArray
135 
136 Description:    Multiplies an array of vectors by a reordered matrix,
137                 using paired single operations.
138                 This function is significantly faster than
139                 PSMTXMultVecArray, but requires that you have reordered
140                 the matrix in advance with PSMTXReorder.
141                 OK if source = destination.
142                 NOTE: the number of vertices transformed must be at
143                 least 3; see the description of count below.
144 
145                 Note that NO error checking is performed.
146 
                The loop is software-pipelined: each pass stores the
                pair of results computed by the previous pass, computes
                the current pair, and loads the source data for the
                next pair.  As a consequence the function READS PAST
                THE END of the source array: 2 vectors (24 bytes) when
                count is even, 3 vectors (36 bytes) when count is odd.
                The values read ahead are never stored; exactly count
                vectors are written to dstBase.

147                 Performance: 9.586 - 9.814 cycles per vertex, where
148                               count = 70
149 
150 Arguments:      m         reordered matrix.
151                 srcBase  start of source vector array.
152                 dstBase  Start of resultant vector array.
153                 count    Number of vectors in srcBase, dstBase arrays
154                           COUNT MUST BE AT LEAST 3.  The loop counter
                          is (count - 1) / 2 and the loop body runs
                          before the counter is tested, so a count of
                          1 or 2 (counter = 0) does not terminate
                          after the intended number of passes.
155 
156 
157 Return:         None.
158 
159 *---------------------------------------------------------------------*/
160 asm void
PSMTXROMultVecArray(const register ROMtx m,const register Vec * srcBase,register Vec * dstBase,register u32 count)161 PSMTXROMultVecArray
162 (
163     const register ROMtx  m,      // r3
164     const register Vec   *srcBase,// r4
165           register Vec   *dstBase,// r5
166           register u32    count   // r6
167 )
168 {
169     nofralloc
// Matrix registers.  Mij_Mkl holds reordered-matrix elements as named
// (ps0, ps1); "nnn" marks a lane whose value is not used.
170 #define M00_M10 fp0
171 #define M20_nnn fp1
172 #define M01_M11 fp2
173 #define M21_nnn fp3
174 #define M02_M12 fp4
175 #define M22_nnn fp5
176 #define M03_M13 fp6
177 #define M23_nnn fp7
178 
179 // Source vectors - 2 3D vectors in 3 PS registers
180 #define SX0_SY0 fp8
181 #define SZ0_SX1 fp9
182 #define SY1_SZ1 fp10
183 // Destination registers - 2 3d vectors in 4 PS registers
184 #define DX0_DY0 fp11
185 #define DZ0_nnn fp12
186 #define DX1_DY1 fp13
187 #define DZ1_nnn fp14
188 // temp registers for writing back values.  These registers store the final
189 // results from the PREVIOUS loop
190 #define WX0_WY0 fp15
191 #define WZ0_nnn fp16
192 #define WX1_WY1 fp17
193 #define WZ1_nnn fp18
194 
// Prologue: build a 96-byte stack frame and save fp14-fp18, interleaved
// with the loop-counter setup.  Each register is saved twice, once with
// stfd and once with psq_st.
// NOTE(review): presumably stfd preserves the full double-precision ps0
// and psq_st preserves the ps1 half of the paired-single register --
// confirm against the CPU manual.
// NOTE(review): rsp is used interchangeably with r1 here; presumably it
// is the assembler's alias for the stack pointer.
195     stwu    r1, -96(rsp)                   // allocate the frame
196     stfd    fp14, 8(rsp)
197     psq_st  fp14, 16(rsp), 0, 0
198     // The loop handles two vectors per pass.  The first pair is computed
199     // before the loop and the last pair is stored after it, so the number
    // of passes is (count - 1) / 2, which covers both odd and even counts.
200     addi    r7, count, -1
201     stfd    fp15, 24(rsp)
202     psq_st  fp15, 32(rsp), 0, 0
203     srwi    r7, r7, 1 // 2 at a time
204     stfd    fp16, 40(rsp)
205     psq_st  fp16, 48(rsp), 0, 0
206     stfd    fp17, 56(rsp)
207     psq_st  fp17, 64(rsp), 0, 0
208     stfd    fp18, 72(rsp)
209     psq_st  fp18, 80(rsp), 0, 0
210     mtctr   r7                             // CTR = (count - 1) >> 1
211     // Load matrix (interleaved with pointer biasing and the first source loads)
212     psq_l   M00_M10, 0(m),0,0
213     addi    srcBase, srcBase, -8           // bias so the first psq_lu 8(srcBase) reads srcBase+0
214     psq_l   M20_nnn, 8(m),1,0              // W=1: single-element load
215     addi    dstBase, dstBase, -4           // bias so the first psq_stu 4(dstBase) writes dstBase+0
216     psq_l   M03_M13, 36(m),0,0
217     psq_lu  SX0_SY0, 8(srcBase), 0, 0      // vector 0: x, y
218     psq_l   M23_nnn, 44(m),1,0
219     psq_lu  SZ0_SX1, 8(srcBase), 0, 0      // vector 0: z, vector 1: x
220 
221     // ------------------------------UNROLLED
    // First pair (vectors 0 and 1), computed ahead of the loop.
    // ps_madds0 D, A, C, B : D = A * C.ps0 + B  (both lanes of A and B)
    // ps_madds1 D, A, C, B : D = A * C.ps1 + B
222 
223     //  DX0=M00*SX0+M03, DY0=M10*SX0+M13
224     //  DZ0=M20*SX0+M23
225     //  DX1=M00*SX1+M03, DY1=M10*SX1+M13
226     //  DZ1=M20*SX1+M23
227 
228     ps_madds0    DX0_DY0, M00_M10, SX0_SY0, M03_M13
229     psq_l   M01_M11, 12(m),0,0
230     ps_madds0    DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
231     psq_l   M21_nnn, 20(m),1,0
232     ps_madds1    DX1_DY1, M00_M10, SZ0_SX1, M03_M13
233     psq_lu SY1_SZ1,8(srcBase), 0, 0        // vector 1: y, z
234     ps_madds1    DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
235     psq_l   M22_nnn, 32(m),1,0
236 
237     //  DX0=M01*SY0+DX0, DY0=M11*SY0+DY0
238     //  DZ0=M21*SY0+DZ0
239     //  DX1=M01*SY1+DX1, DY1=M11*SY1+DY1
240     //  DZ1=M21*SY1+DZ1
241 
242     ps_madds1    DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
243     ps_madds1    DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
244     psq_l   M02_M12, 24(m),0,0
245     ps_madds0    DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
246     psq_lu SX0_SY0, 8(srcBase), 0, 0       // vector 2: x, y (for the first loop pass)
247     ps_madds0    DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn
248 
249     //  DX0=M02*SZ0+DX0, DY0=M12*SZ0+DY0
250     //  DZ0=M22*SZ0+DZ0
251     //  DX1=M02*SZ1+DX1, DY1=M12*SZ1+DY1
252     //  DZ1=M22*SZ1+DZ1
253 
254     // Write final values to temp registers
255     ps_madds0    WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
256     ps_madds0    WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
257     psq_lu SZ0_SX1, 8(srcBase), 0, 0       // vector 2: z, vector 3: x
258     ps_madds1    WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
259     ps_madds1    WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
260     psq_lu SY1_SZ1,8(srcBase), 0, 0        // vector 3: y, z
261 
262     // -------------------------- LOOP START
    // Each pass: store the previous pair (W registers, indented stores),
    // compute the current pair, and load the source data for the next
    // pair (indented loads).  Each X/Y store advances dstBase by 4 and
    // each Z store by 8, i.e. 12 bytes per vector.
263 _mloop:
264     ps_madds0    DX0_DY0, M00_M10, SX0_SY0, M03_M13
265       psq_stu     WX0_WY0, 4(dstBase), 0, 0
266     ps_madds0    DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
267       psq_stu     WZ0_nnn, 8(dstBase), 1, 0 // W=1: single-element store
268     ps_madds1    DX1_DY1, M00_M10, SZ0_SX1, M03_M13
269       psq_stu     WX1_WY1, 4(dstBase), 0, 0
270     ps_madds1    DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
271       psq_stu     WZ1_nnn, 8(dstBase), 1, 0
272     ps_madds1    DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
273     ps_madds1    DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
274     // YYY LAST SX0_SY0 USAGE FOR THIS ITERATION
275       psq_lu SX0_SY0, 8(srcBase), 0, 0 // NEXT SX0 SY0
276     ps_madds0    DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
277     ps_madds0    DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn
278 
279     // Write final values to temp registers
280     ps_madds0    WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
281     ps_madds0    WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
282     // YYY LAST SZ0_SX1 USAGE FOR THIS ITERATION
283       psq_lu SZ0_SX1, 8(srcBase), 0, 0 // NEXT SZ0 SX1
284     ps_madds1    WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
285     ps_madds1    WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
286     // YYY LAST SY1_SZ1 USAGE FOR THIS ITERATION
287       psq_lu SY1_SZ1,8(srcBase), 0, 0 // NEXT SY1 SZ1
288 
289     bdnz+ _mloop    // -------------------------- LOOP END
290 
    // Epilogue: store the last computed pair.  The first vector of the
    // pair is always stored; the second only when count is even.
291     psq_stu     WX0_WY0, 4(dstBase), 0, 0
292     rlwinm.     r7, count, 0, 31, 31 // Check odd: r7 = count & 1, sets CR0
293     psq_stu     WZ0_nnn, 8(dstBase), 1, 0
294     bne     _return                  // taken when count is odd
295     // Skipped if odd number of vectors
296     psq_stu     WX1_WY1, 4(dstBase), 0, 0
297     // Can't put anything here
298     psq_stu     WZ1_nnn, 8(dstBase), 1, 0
299 
300 
    // Restore fp14-fp18 from the frame (psq_l first, then lfd, for each
    // register) and release the 96-byte frame.
301 _return:
302     psq_l   fp14, 16(rsp), 0, 0
303     lfd     fp14, 8(rsp)
304     psq_l   fp15, 32(rsp), 0, 0
305     lfd     fp15, 24(rsp)
306     psq_l   fp16, 48(rsp), 0, 0
307     lfd     fp16, 40(rsp)
308     psq_l   fp17, 64(rsp), 0, 0
309     lfd     fp17, 56(rsp)
310     psq_l   fp18, 80(rsp), 0, 0
311     lfd     fp18, 72(rsp)
312     addi    r1, r1, 96
313     blr
314 
// Drop the register aliases so they do not leak into later functions.
315 #undef M00_M10
316 #undef M20_nnn
317 #undef M01_M11
318 #undef M21_nnn
319 #undef M02_M12
320 #undef M22_nnn
321 #undef M03_M13
322 #undef M23_nnn
323 #undef SX0_SY0
324 #undef SZ0_SX1
325 #undef SY1_SZ1
326 #undef DX0_DY0
327 #undef DZ0_nnn
328 #undef DX1_DY1
329 #undef DZ1_nnn
330 #undef WX0_WY0
331 #undef WZ0_nnn
332 #undef WX1_WY1
333 #undef WZ1_nnn
334 
335 }
336 
337 
338 #endif // GEKKO
339 
340 
341 /*===========================================================================*/
342