1 /*---------------------------------------------------------------------------*
2   Project: matrix vector Library
3   File:    psmtx.c
4 
5   Copyright 1998, 1999, 2000 Nintendo.  All rights reserved.
6 
7   These coded instructions, statements, and computer programs contain
8   proprietary information of Nintendo of America Inc. and/or Nintendo
9   Company Ltd., and are protected by Federal copyright law.  They may
10   not be disclosed to third parties or copied or duplicated in any form,
11   in whole or in part, without the prior written consent of Nintendo.
12 
13 
14   $Log: psmtx.c,v $
15   Revision 1.2  02/20/2006 04:25:42  mitu
16   changed include path from dolphin/ to revolution/.
17 
18   Revision 1.1.1.1  2005/05/12 02:15:49  yasuh-to
19   transitioned from the Dolphin source tree
20 NoKeywords: $
21 
22     6     03/08/21 5:17 Dante
23     Changed GQR1 to GQR6 in PSMTXMultS16VecArray
24 
25     5     02/04/11 13:11 Hirose
26     const type specifier support. (worked by Hiratsu@IRD)
27 
28     4     2/26/01 11:56p Hirose
29     avoided use of GQR1 which is reserved by the compiler.
30 
31     3     2/22/01 11:49p Hirose
32     Some functions are moved to another file according to arrangement
33     updates.
34 
35     2    7/12/00 4:41p John
36     Substitutes MTXConcat and MTXMultVecArray with their paired-singles
37     equivalent for Gekko non-debug builds.
38 
39     1     5/10/00 1:48p Hirose
40     moved paired-single matrix stuff into an another source file
41 
42   $NoKeywords: $
43  *---------------------------------------------------------------------------*/
44 
45 #include <math.h>
46 #include <revolution/mtx.h>
47 #include "mtxAssert.h"
48 
49 
50 /*---------------------------------------------------------------------*
51    Special purpose Paired-single optimized code
52 
53    All paired-single code assumes GQR0 = 0.
54  *---------------------------------------------------------------------*/
55 #ifdef GEKKO
56 
57 /*---------------------------------------------------------------------*
58 
59 Name:           PSMTXReorder
60 
61 Description:    Creates a reordered (column-major) matrix from a
62                 row-major matrix, using paired single operations.
63                 Reordered matrices are required for the PSMTXRO*
64                 functions, which operate faster than their non-reordered
65                 counterparts.
66 
67                 Performance:  ~15 cycles.
68 
69 Arguments:      src       source matrix.
70                 dest     destination matrix, note type is ROMtx.
71 
72 Return   :         none
73 
74 *---------------------------------------------------------------------*/
asm void
PSMTXReorder(const register Mtx src, register ROMtx dest)
{
    // Transposes a row-major 3x4 Mtx into the column-major ROMtx layout
    // required by the PSMTXRO* fast paths, two floats per instruction.
    // No stack frame is needed: only volatile FPRs fp0-fp12 are used.
    nofralloc
// Source pairs: S<row><col>_S<row><col> holds two adjacent row elements.
#define S00_S01 fp0
#define S02_S03 fp1
#define S10_S11 fp2
#define S12_S13 fp3
#define S20_S21 fp4
#define S22_S23 fp5
// Destination pairs: D<row><col> names refer to the transposed positions.
#define D00_D10 fp6
#define D11_D21 fp7
#define D02_D12 fp8
#define D22_D03 fp9
#define D13_D23 fp10
#define D20_D01 fp12

    // Load the three 4-float rows as pairs (GQR0 assumed 0 = raw f32),
    // then interleave with ps_merge so each store writes one output pair.
    // Loads and merges are interleaved to hide load latency.
    psq_l       S00_S01, 0(src),  0, 0
    psq_l       S10_S11, 16(src), 0, 0
    psq_l       S20_S21, 32(src), 0, 0
    psq_l       S02_S03, 8(src),  0, 0
    ps_merge00  D00_D10, S00_S01, S10_S11   // (s00, s10)
    psq_l       S12_S13, 24(src), 0, 0
    ps_merge01  D20_D01, S20_S21, S00_S01   // (s20, s01)
    psq_l       S22_S23, 40(src), 0, 0
    ps_merge11  D11_D21, S10_S11, S20_S21   // (s11, s21)
    psq_st      D00_D10, 0(dest), 0, 0
    ps_merge00  D02_D12, S02_S03, S12_S13   // (s02, s12)
    psq_st      D20_D01, 8(dest), 0, 0
    ps_merge01  D22_D03, S22_S23, S02_S03   // (s22, s03)
    psq_st      D11_D21, 16(dest),0, 0
    ps_merge11  D13_D23, S12_S13, S22_S23   // (s13, s23)
    psq_st      D02_D12, 24(dest),0, 0
    psq_st      D22_D03, 32(dest),0,0
    psq_st      D13_D23, 40(dest),0,0

    blr
#undef S00_S01
#undef S02_S03
#undef S10_S11
#undef S12_S13
#undef S20_S21
#undef S22_S23
#undef D00_D10
#undef D11_D21
#undef D02_D12
#undef D22_D03
#undef D13_D23
#undef D20_D01

}
127 
128 /*---------------------------------------------------------------------*
129 
130 Name:            PSMTXROMultVecArray
131 
132 Description:    Multiplies an array of vectors by a reordered matrix,
133                 using paired single operations.
134                 This function is significantly faster than
135                 PSMTXMultVecArray, but requires that you have reordered
136                 the matrix in advance with PSMTXReorder.
137                 OK if source = destination.
138                 NOTE: number of vertices transformed cannot be less than
139                 2.
140 
141                 Note that NO error checking is performed.
142 
143                 Performance: 9.586 - 9.814 cycles per vertex, where
144                               count = 70
145 
146 Arguments:      m         reordered matrix.
147                 srcBase   start of source vector array.
148                 dstBase   start of resultant vector array.
149                 count     number of vectors in srcBase, dstBase arrays
150                           COUNT MUST BE GREATER THAN 2.
151 
152 
153 Return   :         none
154 
155 *---------------------------------------------------------------------*/
asm void
PSMTXROMultVecArray
(
    const register ROMtx  m,      // r3
    const register Vec   *srcBase,// r4
          register Vec   *dstBase,// r5
          register u32    count   // r6
)
{
    // Transforms 'count' 3D vectors by a reordered (column-major) matrix.
    // Software-pipelined and unrolled 2x: each loop iteration transforms
    // two vectors while storing the two results of the PREVIOUS iteration.
    // Caller contract (see header comment): count must be > 2; no error
    // checking is performed.
    nofralloc
// Reordered matrix columns, one pair (or single) per register.
#define M00_M10 fp0
#define M20_nnn fp1
#define M01_M11 fp2
#define M21_nnn fp3
#define M02_M12 fp4
#define M22_nnn fp5
#define M03_M13 fp6
#define M23_nnn fp7

// source vectors - 2 3D vectors in 3 PS registers
#define SX0_SY0 fp8
#define SZ0_SX1 fp9
#define SY1_SZ1 fp10
// Destination registers - 2 3d vectors in 4 PS registers
#define DX0_DY0 fp11
#define DZ0_nnn fp12
#define DX1_DY1 fp13
#define DZ1_nnn fp14
// temp registers for writing back values.  These registers store the final
// results from the PREVIOUS loop iteration (the write-back stage of the
// software pipeline).
#define WX0_WY0 fp15
#define WZ0_nnn fp16
#define WX1_WY1 fp17
#define WZ1_nnn fp18

    // fp14-fp18 are nonvolatile on this ABI, so spill them to a frame.
    stwu    r1, -64(r1)
    stfd    fp14, 8(r1)
    // Loop runs (count-1)>>1 times: the first pair is handled by the
    // unrolled prologue below, and the odd-count tail is resolved by the
    // rlwinm check after the loop (which skips the second store pair).
    addi    r7, count, -1
    stfd    fp15, 16(r1)
    srwi    r7, r7, 1 // 2 at a time
    stfd    fp16, 24(r1)
    stfd    fp17, 32(r1)
    stfd    fp18, 40(r1)
    mtctr   r7
    // load matrix (GQR0 assumed 0: raw f32 pairs)
    psq_l   M00_M10, 0(m),0,0
    addi    srcBase, srcBase, -8    // bias pointer for psq_lu pre-increment
    psq_l   M20_nnn, 8(m),1,0
    addi    dstBase, dstBase, -4    // bias pointer for psq_stu pre-increment
    psq_l   M03_M13, 36(m),0,0
    psq_lu  SX0_SY0, 8(srcBase), 0, 0
    psq_l   M23_nnn, 44(m),1,0
    psq_lu  SZ0_SX1, 8(srcBase), 0, 0

    // ------------------------------UNROLLED prologue (first vector pair,
    // fills the pipeline; remaining matrix columns are loaded in the gaps)

    //  DX0=M00*SX0+M03, DY0=M10*SX0+M13
    //  DZ0=M20*SX0+M23
    //  DX1=M00*SX1+M03, DY1=M10*SX1+M13
    //  DZ1=M20*SX1+M23

    ps_madds0    DX0_DY0, M00_M10, SX0_SY0, M03_M13
    psq_l   M01_M11, 12(m),0,0
    ps_madds0    DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
    psq_l   M21_nnn, 20(m),1,0
    ps_madds1    DX1_DY1, M00_M10, SZ0_SX1, M03_M13
    psq_lu SY1_SZ1,8(srcBase), 0, 0
    ps_madds1    DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
    psq_l   M22_nnn, 32(m),1,0

    //  DX0=M01*SY0+DX0, DY0=M11*SY0+DY0
    //  DZ0=M21*SY0+DZ0
    //  DX1=M01*SY1+DX1, DY1=M11*SY1+DY1
    //  DZ1=M21*SY1+DZ1

    ps_madds1    DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1    DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    psq_l   M02_M12, 24(m),0,0
    ps_madds0    DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    psq_lu SX0_SY0, 8(srcBase), 0, 0
    ps_madds0    DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    //  DX0=M02*SZ0+DX0, DY0=M12*SZ0+DY0
    //  DZ0=M22*SZ0+DZ0
    //  DX1=M02*SZ1+DX1, DY1=M12*SZ1+DY1
    //  DZ1=M22*SZ1+DZ1

    // Write final values to temp registers
    ps_madds0    WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0    WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    psq_lu SZ0_SX1, 8(srcBase), 0, 0
    ps_madds1    WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1    WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    psq_lu SY1_SZ1,8(srcBase), 0, 0

    // -------------------------- LOOP START
    // Each iteration: store previous pair's results (W*), compute current
    // pair (D* -> W*), and prefetch the next pair's source data.
_mloop:
    ps_madds0    DX0_DY0, M00_M10, SX0_SY0, M03_M13
      psq_stu     WX0_WY0, 4(dstBase), 0, 0
    ps_madds0    DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
      psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    ps_madds1    DX1_DY1, M00_M10, SZ0_SX1, M03_M13
      psq_stu     WX1_WY1, 4(dstBase), 0, 0
    ps_madds1    DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
      psq_stu     WZ1_nnn, 8(dstBase), 1, 0
    ps_madds1    DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1    DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    // YYY LAST SX0_SY0 USAGE FOR THIS ITERATION
      psq_lu SX0_SY0, 8(srcBase), 0, 0 // NEXT SX0 SY0
    ps_madds0    DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    ps_madds0    DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    // Write final values to temp registers
    ps_madds0    WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0    WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    // YYY LAST SZ0_SX1 USAGE FOR THIS ITERATION
      psq_lu SZ0_SX1, 8(srcBase), 0, 0 // NEXT SZ0 SX1
    ps_madds1    WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1    WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    // YYY LAST SY1_SZ1 USAGE FOR THIS ITERATION
      psq_lu SY1_SZ1,8(srcBase), 0, 0 // NEXT SY1 SZ1

    bdnz+ _mloop    // -------------------------- LOOP END

    // Pipeline drain: store the last computed pair.  The second vector's
    // stores are skipped when count is odd (it was a phantom element).
    psq_stu     WX0_WY0, 4(dstBase), 0, 0
    rlwinm.     r7, count, 0, 31, 31 // extract bit 0: check odd
    psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    bne     _return
    // Skipped if odd number of vectors
    psq_stu     WX1_WY1, 4(dstBase), 0, 0
    // can't put anything here
    psq_stu     WZ1_nnn, 8(dstBase), 1, 0


_return:
    // Restore nonvolatile FPRs and tear down the frame.
    lfd     fp14, 8(r1)
    lfd     fp15, 16(r1)
    lfd     fp16, 24(r1)
    lfd     fp17, 32(r1)
    lfd     fp18, 40(r1)
    addi    r1, r1, 64
    blr

#undef M00_M10
#undef M20_nnn
#undef M01_M11
#undef M21_nnn
#undef M02_M12
#undef M22_nnn
#undef M03_M13
#undef M23_nnn
#undef SX0_SY0
#undef SZ0_SX1
#undef SY1_SZ1
#undef DX0_DY0
#undef DZ0_nnn
#undef DX1_DY1
#undef DZ1_nnn
#undef WX0_WY0
#undef WZ0_nnn
#undef WX1_WY1
#undef WZ1_nnn

}
322 
323 /*---------------------------------------------------------------------*
324 
325 Name:           PSMTXROSkin2VecArray
326 
327 Description:    Multiplies an array of vectors by a reordered matrix,
328                 using paired single operations.
329                 This function is significantly faster than
330                 PSMTXMultVecArray, but requires that you have reordered
331                 the matrix in advance with PSMTXReorder.
332                 OK if source = destination.
333                 NOTE: number of vertices transformed cannot be less than
334                 2.
335 
336                 Note that NO error checking is performed.
337 
338                 Performance: ~20.8 cycles per vertex, where
339                               count = 70
340 
341 Arguments:      m0        first reordered matrix
342                 m1        second reordered matrix
343                 wtBase    start of per vertex weight array
344                 srcBase   start of source vector array.
345                 dstBase   start of resultant vector array.
346                 count     number of vectors in srcBase, dstBase arrays
347                           COUNT MUST BE GREATER THAN 2.
348 
349 
350 Return   :         none
351 
352 *---------------------------------------------------------------------*/
asm void
PSMTXROSkin2VecArray
(
    const register ROMtx  m0,     // r3
    const register ROMtx  m1,     // r4
    const register f32   *wtBase, // r5
    const register Vec   *srcBase,// r6
          register Vec   *dstBase,// r7
          register u32    count   // r8
)
{
    // Two-matrix skinning: for each vertex i, blends the two reordered
    // matrices with the per-vertex weight wt[i] (m = m0 + wt*(m1 - m0))
    // and transforms src[i] by the blended matrix.  m1 is converted to
    // (m1 - m0) up front so the per-vertex blend is a single madd per pair.
    // Software-pipelined: the loop body transforms vertex i while the
    // matrix blend for vertex i+1 is already in flight.
    nofralloc
// blended (lerped) matrix for the current vertex
#define M00_10  fp0
#define M20     fp1
#define M01_11  fp2
#define M21     fp3
#define M02_12  fp4
#define M22     fp5
#define M03_13  fp6
#define M23     fp7

// source vector - 1 3D vectors in 2 PS registers
#define Sx_y    fp8
#define Sz      fp9

// Destination vector - 1 3d vector in 2 PS registers
#define Dx_y    fp10
#define Dz      fp11

// intermediate vector 1 3D vector in 2 PS registers
#define Ix_y    fp12
#define Iz      fp13

// first matrix m0 (kept as loaded)
#define M0_00_10    fp14
#define M0_20       fp15
#define M0_01_11    fp16
#define M0_21       fp17
#define M0_02_12    fp18
#define M0_22       fp19
#define M0_03_13    fp20
#define M0_23       fp21

// second matrix; overwritten below with the difference (m1 - m0)
#define M1_00_10    fp22
#define M1_20       fp23
#define M1_01_11    fp24
#define M1_21       fp25
#define M1_02_12    fp26
#define M1_22       fp27
#define M1_03_13    fp28
#define M1_23       fp29

// per-vertex blend weight (replicated by the scalar-load psq_lu ...,1,0)
#define Wt          fp30

    // save nonvolatile FP regs fp14-fp30
    stwu        r1, -160(r1)
    stfd        fp14,  8(r1)
    stfd        fp15, 16(r1)
    stfd        fp16, 24(r1)
    stfd        fp17, 32(r1)
    stfd        fp18, 40(r1)
    stfd        fp19, 48(r1)
    stfd        fp20, 56(r1)
    stfd        fp21, 64(r1)
    stfd        fp22, 72(r1)
    stfd        fp23, 80(r1)
    stfd        fp24, 88(r1)
    stfd        fp25, 96(r1)
    stfd        fp26, 104(r1)
    stfd        fp27, 112(r1)
    stfd        fp28, 120(r1)
    stfd        fp29, 128(r1)
    stfd        fp30, 136(r1)

    // loop runs count-1 times; the first vertex is started by the
    // prologue below and the last is finished after the loop
    addi        r9, count, -1
    mtctr       r9

    // bias pointers for psq_lu/psq_stu pre-increment addressing
    addi        srcBase, srcBase, -4
    addi        dstBase, dstBase, -4
    addi        wtBase,  wtBase,  -4

    // load matrices m0 and (m1-m0); subtractions are interleaved with
    // the loads to hide load latency
    psq_l       M0_00_10,0(m0),0,0
    psq_l       M1_00_10,0(m1),0,0

    psq_l       M0_20, 8(m0),1,0
    psq_l       M1_20, 8(m1),1,0

    psq_l       M0_01_11, 12(m0),0,0
    psq_l       M1_01_11, 12(m1),0,0

    ps_sub      M1_00_10,M1_00_10,M0_00_10

    psq_l       M0_21, 20(m0),1,0
    psq_l       M1_21, 20(m1),1,0

    ps_sub      M1_20,M1_20,M0_20

    psq_l       M0_02_12, 24(m0),0,0
    psq_l       M1_02_12, 24(m1),0,0

    ps_sub      M1_01_11,M1_01_11,M0_01_11

    psq_l       M0_22, 32(m0),1,0
    psq_l       M1_22, 32(m1),1,0

    ps_sub      M1_21,M1_21,M0_21

    psq_l       M0_03_13, 36(m0),0,0
    psq_l       M1_03_13, 36(m1),0,0

    ps_sub      M1_02_12,M1_02_12,M0_02_12

    psq_l       M0_23, 44(m0),1,0
    psq_l       M1_23, 44(m1),1,0

    ps_sub      M1_22,M1_22,M0_22
    ps_sub      M1_03_13,M1_03_13,M0_03_13
    ps_sub      M1_23,M1_23,M0_23

    //start of first iteration (pipeline prologue)
    psq_lu      Wt,4(wtBase),1,0                // Wt = *wtBase++;
    psq_lu      Sx_y, 4(srcBase), 0, 0          // Sx_y = *srcBase++;
    psq_lu      Sz, 8(srcBase), 1, 0            // Sz = *srcBase++;

    ps_madds0   M00_10,M1_00_10,Wt,M0_00_10     // m = lerp(m0, m1, wt);
    ps_madds0   M20,M1_20,Wt,M0_20              // m = lerp(m0, m1, wt);
    ps_madds0   M01_11,M1_01_11,Wt,M0_01_11     // m = lerp(m0, m1, wt);
    ps_madds0   M21,M1_21,Wt,M0_21              // m = lerp(m0, m1, wt);
    ps_madds0   M02_12,M1_02_12,Wt,M0_02_12     // m = lerp(m0, m1, wt);
    ps_madds0   M22,M1_22,Wt,M0_22              // m = lerp(m0, m1, wt);
    ps_madds0   M03_13,M1_03_13,Wt,M0_03_13     // m = lerp(m0, m1, wt);
    ps_madds0   M23,M1_23,Wt,M0_23              // m = lerp(m0, m1, wt);

    ps_madds0   Ix_y, M00_10, Sx_y, M03_13      // Ix_y = M03_13 + M00_10 * Sx;
    ps_madds0   Iz, M20, Sx_y, M23              // Iz   = M23    + M20    * Sx;

    psq_lu      Wt,4(wtBase),1,0                // Wt = *wtBase++; (next vertex)

_mloop:
    // finish transforming the current vertex ...
    ps_madds1   Ix_y,M01_11,Sx_y,Ix_y           // Ix_y += M01_11 * Sy;
    ps_madds1   Iz,M21,Sx_y,Iz                  // Iz   += M21    * Sy;

    psq_lu      Sx_y, 4(srcBase), 0, 0          // Sx_y = *srcBase++;

    ps_madds0   Dx_y, M02_12, Sz, Ix_y          // Dx_y = Ix_y + M02_12 * Sz;
    ps_madds0   Dz, M22, Sz, Iz                 // Dz   = Iz   + M22    * Sz;

    psq_lu      Sz, 8(srcBase), 1, 0            // Sz = *srcBase++;

    // ... while blending the matrix for the next vertex
    ps_madds0   M00_10,M1_00_10,Wt,M0_00_10     // m = lerp(m0, m1, wt);
    ps_madds0   M20,M1_20,Wt,M0_20              // m = lerp(m0, m1, wt);
    ps_madds0   M01_11,M1_01_11,Wt,M0_01_11     // m = lerp(m0, m1, wt);
    ps_madds0   M21,M1_21,Wt,M0_21              // m = lerp(m0, m1, wt);
    ps_madds0   M02_12,M1_02_12,Wt,M0_02_12     // m = lerp(m0, m1, wt);
    ps_madds0   M22,M1_22,Wt,M0_22              // m = lerp(m0, m1, wt);
    ps_madds0   M03_13,M1_03_13,Wt,M0_03_13     // m = lerp(m0, m1, wt);
    ps_madds0   M23,M1_23,Wt,M0_23              // m = lerp(m0, m1, wt);

    psq_stu     Dx_y, 4(dstBase), 0, 0          // *dstBase++ = Dx_y;

    ps_madds0   Ix_y, M00_10, Sx_y, M03_13      // Ix_y = M03_13 + M00_10 * Sx;
    ps_madds0   Iz, M20, Sx_y, M23              // Iz   = M23    + M20    * Sx;

    psq_stu     Dz, 8(dstBase), 1, 0            // *dstBase++ = Dz;

    psq_lu      Wt,4(wtBase),1,0                // Wt = *wtBase++;

    bdnz+       _mloop
_mlend:
    // pipeline epilogue: finish and store the last vertex

    ps_madds1   Ix_y,M01_11,Sx_y,Ix_y           // Ix_y += M01_11 * Sy;
    ps_madds1   Iz,M21,Sx_y,Iz                  // Iz   += M21    * Sy;

    ps_madds0   Dx_y, M02_12, Sz, Ix_y          // Dx_y = Ix_y + M02_12 * Sz;

    psq_stu     Dx_y, 4(dstBase), 0, 0          // *dstBase++ = Dx_y;

    ps_madds0   Dz, M22, Sz, Iz                 // Dz   = Iz   + M22    * Sz;

    psq_stu     Dz, 8(dstBase), 1, 0            // *dstBase++ = Dz;

    // restore nonvolatile FPRs and tear down the frame
    lfd         fp14,  8(r1)
    lfd         fp15, 16(r1)
    lfd         fp16, 24(r1)
    lfd         fp17, 32(r1)
    lfd         fp18, 40(r1)
    lfd         fp19, 48(r1)
    lfd         fp20, 56(r1)
    lfd         fp21, 64(r1)
    lfd         fp22, 72(r1)
    lfd         fp23, 80(r1)
    lfd         fp24, 88(r1)
    lfd         fp25, 96(r1)
    lfd         fp26, 104(r1)
    lfd         fp27, 112(r1)
    lfd         fp28, 120(r1)
    lfd         fp29, 128(r1)
    lfd         fp30, 136(r1)
    addi        r1, r1, 160

    blr

#undef M00_10
#undef M20
#undef M01_11
#undef M21
#undef M02_12
#undef M22
#undef M03_13
#undef M23

#undef Sx_y
#undef Sz

#undef Dx_y
#undef Dz

#undef Ix_y
#undef Iz

#undef  M0_00_10
#undef  M0_20
#undef  M0_01_11
#undef  M0_21
#undef  M0_02_12
#undef  M0_22
#undef  M0_03_13
#undef  M0_23


#undef  M1_00_10
#undef  M1_20
#undef  M1_01_11
#undef  M1_21
#undef  M1_02_12
#undef  M1_22
#undef  M1_03_13
#undef  M1_23

#undef  Wt
}
597 
598 /*---------------------------------------------------------------------*
599 
600 Name:           PSMTXROMultS16VecArray
601 
602 Description:    Multiplies an array of signed 16 bit vectors by a
603                 reordered matrix, generating a Vec array of floats.
604                 No cost in conversion.  However, this code does take a
605                 hit because it uses mtspr to set up a quantization
606                 register to convert S16 -> F32.  For production code,
607                 the GQR should be set up in advance.
608 
609                 OK if source = destination.
610 
611                 Note that NO error checking is performed.
612 
613                 Performance: 9.671 - 9.900 cycles per vertex where
614                               count = 70
615 
616 Arguments:      m         matrix.
617                 srcBase   start of source s16 vector array.
618                 dstBase   start of resultant vector array. Note that
619                           available room should be twice as large as
620                           source data.
621                 count     number of vectors in srcBase, dstBase arrays
622                           COUNT MUST BE GREATER THAN 1.
623 
624 
625 Return   :         none
626 
627 *---------------------------------------------------------------------*/
asm void
PSMTXROMultS16VecArray
(
    const register ROMtx   m,      // r3
    const register S16Vec *srcBase,// r4
          register Vec    *dstBase,// r5
          register u32     count   // r6
)
{
    // Same software-pipelined, 2x-unrolled transform as
    // PSMTXROMultVecArray, but the source vectors are signed 16-bit:
    // GQR6 is programmed here so the quantized loads (psq_lu ...,6)
    // convert s16 -> f32 for free.  Stores use GQR0 (raw f32).
    // Caller contract (see header comment): count must be > 1; no error
    // checking is performed.
    nofralloc
// Reordered matrix columns, one pair (or single) per register.
#define M00_M10 fp0
#define M20_nnn fp1
#define M01_M11 fp2
#define M21_nnn fp3
#define M02_M12 fp4
#define M22_nnn fp5
#define M03_M13 fp6
#define M23_nnn fp7

// source vectors - 2 3D vectors in 3 PS registers
#define SX0_SY0 fp8
#define SZ0_SX1 fp9
#define SY1_SZ1 fp10
// Destination registers - 2 3d vectors in 4 PS registers
#define DX0_DY0 fp11
#define DZ0_nnn fp12
#define DX1_DY1 fp13
#define DZ1_nnn fp14
// temp registers for writing back values.  These registers store the final
// results from the PREVIOUS loop iteration (the write-back stage of the
// software pipeline).
#define WX0_WY0 fp15
#define WZ0_nnn fp16
#define WX1_WY1 fp17
#define WZ1_nnn fp18

    // fp14-fp18 are nonvolatile on this ABI, so spill them to a frame.
    stwu    r1, -64(r1)
    stfd    fp14, 8(r1)
    // Loop runs (count-1)>>1 times: the first pair is handled by the
    // unrolled prologue, and the odd-count tail is resolved by the
    // rlwinm check after the loop (which skips the second store pair).
    addi    r7, count, -1
    stfd    fp15, 16(r1)
    srwi    r7, r7, 1 // 2 at a time
    stfd    fp16, 24(r1)
    // 0x00070000: load type 7 (signed 16-bit), load scale 0 -> plain
    // int-to-float conversion; store half left 0 (f32).
    lis     r8, 0x0007  // setup GQR6
    stfd    fp17, 32(r1)
    mtspr   GQR6, r8    // this will stall like a monkey's butt
    stfd    fp18, 40(r1)
    mtctr   r7
    // load matrix (GQR0 assumed 0: raw f32 pairs)
    psq_l   M00_M10, 0(m),0,0
    addi    srcBase, srcBase, -4    // bias pointer for psq_lu pre-increment
    psq_l   M20_nnn, 8(m),1,0
    addi    dstBase, dstBase, -4    // bias pointer for psq_stu pre-increment
    psq_l   M03_M13, 36(m),0,0
    psq_lu  SX0_SY0, 4(srcBase), 0, 6
    psq_l   M23_nnn, 44(m),1,0
    psq_lu  SZ0_SX1, 4(srcBase), 0, 6


    // ------------------------------UNROLLED prologue (first vector pair,
    // fills the pipeline; remaining matrix columns are loaded in the gaps)

    //  DX0=M00*SX0+M03, DY0=M10*SX0+M13
    //  DZ0=M20*SX0+M23
    //  DX1=M00*SX1+M03, DY1=M10*SX1+M13
    //  DZ1=M20*SX1+M23


    ps_madds0    DX0_DY0, M00_M10, SX0_SY0, M03_M13
    psq_l   M01_M11, 12(m),0,0
    ps_madds0    DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
    psq_l   M21_nnn, 20(m),1,0
    ps_madds1    DX1_DY1, M00_M10, SZ0_SX1, M03_M13
    psq_lu SY1_SZ1, 4(srcBase), 0, 6
    ps_madds1    DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
    psq_l   M22_nnn, 32(m),1,0

    //  DX0=M01*SY0+DX0, DY0=M11*SY0+DY0
    //  DZ0=M21*SY0+DZ0
    //  DX1=M01*SY1+DX1, DY1=M11*SY1+DY1
    //  DZ1=M21*SY1+DZ1

    ps_madds1    DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1    DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    psq_l   M02_M12, 24(m),0,0
    ps_madds0    DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    psq_lu SX0_SY0, 4(srcBase), 0, 6
    ps_madds0    DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    //  DX0=M02*SZ0+DX0, DY0=M12*SZ0+DY0
    //  DZ0=M22*SZ0+DZ0
    //  DX1=M02*SZ1+DX1, DY1=M12*SZ1+DY1
    //  DZ1=M22*SZ1+DZ1

    // Write final values to temp registers
    ps_madds0    WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0    WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    psq_lu SZ0_SX1, 4(srcBase), 0, 6
    ps_madds1    WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1    WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    psq_lu SY1_SZ1, 4(srcBase), 0, 6

    // -------------------------- LOOP START
    // Each iteration: store previous pair's results (W*, as f32),
    // compute current pair, and prefetch the next pair's s16 sources.
_mloop:
    ps_madds0    DX0_DY0, M00_M10, SX0_SY0, M03_M13
      psq_stu     WX0_WY0, 4(dstBase), 0, 0
    ps_madds0    DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
      psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    ps_madds1    DX1_DY1, M00_M10, SZ0_SX1, M03_M13
      psq_stu     WX1_WY1, 4(dstBase), 0, 0
    ps_madds1    DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
      psq_stu     WZ1_nnn, 8(dstBase), 1, 0
    ps_madds1    DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1    DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    // YYY LAST SX0_SY0 USAGE FOR THIS ITERATION
      psq_lu SX0_SY0, 4(srcBase), 0, 6 // NEXT SX0 SY0
    ps_madds0    DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    ps_madds0    DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    // Write final values to temp registers
    ps_madds0    WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0    WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    // YYY LAST SZ0_SX1 USAGE FOR THIS ITERATION
      psq_lu SZ0_SX1, 4(srcBase), 0, 6 // NEXT SZ0 SX1
    ps_madds1    WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1    WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    // YYY LAST SY1_SZ1 USAGE FOR THIS ITERATION
      psq_lu SY1_SZ1, 4(srcBase), 0, 6 // NEXT SY1 SZ1

    bdnz+ _mloop    // -------------------------- LOOP END

    // Pipeline drain: store the last computed pair.  The second vector's
    // stores are skipped when count is odd (it was a phantom element).
    psq_stu     WX0_WY0, 4(dstBase), 0, 0
    rlwinm.     r7, count, 0, 31, 31 // extract bit 0: check odd
    psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    bne     _return
    // Skipped if odd number of vectors
    psq_stu     WX1_WY1, 4(dstBase), 0, 0
    // can't put anything here
    psq_stu     WZ1_nnn, 8(dstBase), 1, 0


_return:
    // Restore nonvolatile FPRs and tear down the frame.
    lfd     fp14, 8(r1)
    lfd     fp15, 16(r1)
    lfd     fp16, 24(r1)
    lfd     fp17, 32(r1)
    lfd     fp18, 40(r1)
    addi    r1, r1, 64
    blr


#undef M00_M10
#undef M20_nnn
#undef M01_M11
#undef M21_nnn
#undef M02_M12
#undef M22_nnn
#undef M03_M13
#undef M23_nnn
#undef SX0_SY0
#undef SZ0_SX1
#undef SY1_SZ1
#undef DX0_DY0
#undef DZ0_nnn
#undef DX1_DY1
#undef DZ1_nnn
#undef WX0_WY0
#undef WZ0_nnn
#undef WX1_WY1
#undef WZ1_nnn

}
799 
800 
801 
802 /*---------------------------------------------------------------------*
803 
804 Name:           PSMTXMultS16VecArray
805 
806 Description:    Multiplies an array of signed 16 bit vectors by a matrix,
807                 generating a Vec array of floats.  No cost in conversion.
808                 However, this code does take a hit because it uses
809                 mtspr to set up a quantization register to convert
810                 S16 -> F32.  For production code, the GQR should be set
811                 up in advance.
812 
813                 This function takes longer than PSMTXROMultS16VecArray
814                 which uses reordered matrices.
815                 OK if source = destination.
816 
817                 Note that NO error checking is performed.
818 
819                 Performance: 13.714 -13.786 cycles per vertex where
820                               count = 70
821 
822 Arguments:      m         matrix.
823                 srcBase   start of source vector array.
824                 dstBase   start of resultant vector array. Note that
825                           available room should be twice as large as
826                           source data.
827                 count     number of vectors in srcBase, dstBase arrays
828                           COUNT MUST BE GREATER THAN 1.
829 
830 
831 Return   :         none
832 
833 *---------------------------------------------------------------------*/
asm void
PSMTXMultS16VecArray
(
    const register Mtx     m,      // r3
    const register S16Vec *srcBase,// r4
          register Vec    *dstBase,// r5
          register u32     count   // r6
)
{
    // Transforms an array of signed 16-bit vectors by a standard
    // (row-major, NOT reordered) matrix, producing f32 vectors.
    // Works row-at-a-time: each source (x,y) pair is multiplied by a row
    // pair, then (z,1) is folded in with ps_madd and the two halves are
    // summed with ps_sum0.  Software-pipelined one vertex deep.
    // Uses only volatile FPRs (fp0-fp13), so no stack frame is needed.
    // Caller contract (see header comment): count must be > 1.
    nofralloc
    //      cmpwi   count, 0
    //      beq-    _return
    psq_l   fp0, 0(m), 0, 0    // [0][0], [0][1]
    // 0x00070000: load type 7 (signed 16-bit), scale 0 -> plain
    // int-to-float conversion on the quantized loads below.
    lis     r7, 0x0007  // setup GQR6
    mtspr   GQR6, r7    // this will stall like a monkey's butt
    // fp6 - x y  (s16 pair dequantized to f32 via GQR6)
    psq_l   fp6, 0(srcBase), 0, 6
    subi    count, count, 1 // unrolling once: last vertex handled after loop
    // fp7 - z 1  (single-element load; second slot is forced to 1.0f,
    // which multiplies the row's translation term below)
    psq_l   fp7, 4(srcBase), 1, 6
    mtctr   count
    // unused slot here
    psq_l   fp1, 8(m), 0, 0    // [0][2], [0][3]
    addi    srcBase, srcBase, 4 // load ops will add 2,
                                // but we already got the first vertex
    psq_l   fp2, 16(m), 0, 0   // [1][0], [1][1]
    addi    dstBase, dstBase, -4 // store ops will add 4
    psq_l   fp3, 24(m), 0, 0   // [1][2], [1][3]




    // ------ first loop starts here (pipeline prologue, first vertex)
    // fp8 = m00x m01y // next X
    ps_mul  fp8, fp0, fp6
    psq_l   fp4, 32(m), 0, 0   // [2][0], [2][1]
    // fp10 = m10x m11y // next Y
    ps_mul  fp10, fp2, fp6
    psq_l   fp5, 40(m), 0, 0   // [2][2], [2][3]
    // fp12 = m20x m21y // next Z
    ps_mul  fp12, fp4, fp6  // YYY last FP6 usage

    // fp6 - x y
    psq_lu  fp6, 2(srcBase), 0, 6 // advance to x
    // Potential FP stall here if psq_lu dispatches same
    // cycle as previous ps_mul

    // fp8 = m00x + m02z  | m01y + m03   (the ,1 load set fp7[1] = 1.0)
    ps_madd fp8, fp1, fp7 ,fp8
    // fp10 = m10x + m12z  | m11y + m13
    ps_madd fp10, fp3, fp7 ,fp10
    // fp12 = m20x + m22z  | m21y + m23
    ps_madd fp12, fp5, fp7 ,fp12 // YYY last FP7 usage

    // fp7 - z 1
    psq_lu  fp7, 4(srcBase), 1, 6 // advance to z, will be skipped by next lu
    // Potential FP stall here if psq_lu dispatches same
    // cycle as previous ps_madd (fp8 dependency)
    ps_sum0 fp9, fp8, fp8, fp8 // X ready (adds the two slots of fp8)

    // ------------------- main loop: store previous vertex's X/Y/Z while
    // computing the current one; prefetch the next vertex's source data
_mloop:
    ps_sum0 fp11, fp10, fp10, fp10 // Y ready
    // fp8 = m00x m01y // next X
    ps_mul  fp8, fp0, fp6
    ps_sum0 fp13, fp12, fp12, fp12 // Z ready
    // fp10 = m10x m11y // next Y
    ps_mul  fp10, fp2, fp6
      psq_stu  fp9,  4(dstBase), 1, 0   // prev X
    // fp12 = m20x m21y // next Z
    ps_mul  fp12, fp4, fp6  // YYY last FP6 usage
      psq_stu  fp11, 4(dstBase), 1, 0   // prev Y
    // fp8 = m00x + m02z  | m01y + m03
    ps_madd fp8, fp1, fp7 ,fp8
      psq_stu  fp13, 4(dstBase), 1, 0   // prev Z
    // fp10 = m10x + m12z  | m11y + m13
    ps_madd fp10, fp3, fp7 ,fp10
      // fp6 - x y
      psq_lu  fp6, 2(srcBase), 0, 6 // advance to x
    // fp12 = m20x + m22z  | m21y + m23
    ps_madd fp12, fp5, fp7 ,fp12 // YYY last FP7 usage

    // fp7 - z 1
    psq_lu  fp7, 4(srcBase), 1, 6 // advance to z, will be skipped by next lu
    // Potential FP stall here if psq_lu dispatches same
    // cycle as previous ps_madd

    ps_sum0 fp9, fp8, fp8, fp8 // X ready
    bdnz+   _mloop
    // ------------------- end of loop (pipeline epilogue)
    ps_sum0 fp11, fp10, fp10, fp10 // Y ready
    ps_sum0 fp13, fp12, fp12, fp12 // Z ready
    // commit last iteration
    psq_stu  fp9,  4(dstBase), 1, 0
    psq_stu  fp11, 4(dstBase), 1, 0
    psq_stu  fp13, 4(dstBase), 1, 0

_return:
    blr
}
934 
935 
936 #endif // GEKKO
937 
938 
939 /*===========================================================================*/
940