/*---------------------------------------------------------------------------*
  Project: Matrix vector Library
  File:    psmtx.c

  Copyright 1998, 1999, 2000 Nintendo. All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law. They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.


  $Log: psmtx.c,v $
  Revision 1.2  2006/02/20 04:25:42  mitu
  changed include path from dolphin/ to revolution/.

  Revision 1.1.1.1  2005/05/12 02:15:49  yasuh-to
  transitioned from the Dolphin source tree
  NoKeywords: $

  6     03/08/21 5:17  Dante
  Changed GQR1 to GQR6 in PSMTXMultS16VecArray

  5     02/04/11 13:11 Hirose
  const type specifier support. (worked by Hiratsu@IRD)

  4     2/26/01 11:56p Hirose
  avoided use of GQR1, which is reserved by the compiler.

  3     2/22/01 11:49p Hirose
  Some functions were moved to another file according to arrangement
  updates.

  2     7/12/00 4:41p John
  Substitutes MTXConcat and MTXMultVecArray with their paired-singles
  equivalents for Gekko non-debug builds.

  1     5/10/00 1:48p Hirose
  moved paired-single matrix stuff into another source file

  $NoKeywords: $
 *---------------------------------------------------------------------------*/

#include <math.h>
#include <revolution/mtx.h>
#include "mtxAssert.h"


/*---------------------------------------------------------------------*
    Special purpose paired-single optimized code

    All paired-single code assumes GQR0 = 0.
 *---------------------------------------------------------------------*/
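
/*---------------------------------------------------------------------*
    Note on the GQR0 = 0 assumption (an explanatory sketch, under the
    usual Gekko graphics quantization register encoding): a GQR packs
    its load and store conversion controls as

        GQR = (LD_SCALE << 24) | (LD_TYPE << 16) | (ST_SCALE << 8) | ST_TYPE

    so a value of 0 selects type 0 (32-bit float) with scale 0 for both
    loads and stores. Every psq_l/psq_st below that names GQR0 therefore
    moves a raw pair of f32 values with no conversion or scaling.
 *---------------------------------------------------------------------*/
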
#ifdef GEKKO

/*---------------------------------------------------------------------*

Name:           PSMTXReorder

Description:    Creates a reordered (column-major) matrix from a
                row-major matrix, using paired-single operations.
                Reordered matrices are required by the PSMTXRO*
                functions, which operate faster than their non-reordered
                counterparts.

Performance:    ~15 cycles.

Arguments:      src   source matrix.
                dest  destination matrix; note that the type is ROMtx.

Return:         none

*---------------------------------------------------------------------*/
asm void
PSMTXReorder( const register Mtx src, register ROMtx dest )
{
    nofralloc
#define S00_S01     fp0
#define S02_S03     fp1
#define S10_S11     fp2
#define S12_S13     fp3
#define S20_S21     fp4
#define S22_S23     fp5
#define D00_D10     fp6
#define D11_D21     fp7
#define D02_D12     fp8
#define D22_D03     fp9
#define D13_D23     fp10
#define D20_D01     fp12


    psq_l       S00_S01,  0(src), 0, 0
    psq_l       S10_S11, 16(src), 0, 0
    psq_l       S20_S21, 32(src), 0, 0
    psq_l       S02_S03,  8(src), 0, 0
    ps_merge00  D00_D10, S00_S01, S10_S11
    psq_l       S12_S13, 24(src), 0, 0
    ps_merge01  D20_D01, S20_S21, S00_S01
    psq_l       S22_S23, 40(src), 0, 0
    ps_merge11  D11_D21, S10_S11, S20_S21
    psq_st      D00_D10,  0(dest), 0, 0
    ps_merge00  D02_D12, S02_S03, S12_S13
    psq_st      D20_D01,  8(dest), 0, 0
    ps_merge01  D22_D03, S22_S23, S02_S03
    psq_st      D11_D21, 16(dest), 0, 0
    ps_merge11  D13_D23, S12_S13, S22_S23
    psq_st      D02_D12, 24(dest), 0, 0
    psq_st      D22_D03, 32(dest), 0, 0
    psq_st      D13_D23, 40(dest), 0, 0

    blr
#undef S00_S01
#undef S02_S03
#undef S10_S11
#undef S12_S13
#undef S20_S21
#undef S22_S23
#undef D00_D10
#undef D11_D21
#undef D02_D12
#undef D22_D03
#undef D13_D23
#undef D20_D01

}

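/*---------------------------------------------------------------------*
    For reference, a plain-C sketch of the shuffle PSMTXReorder
    performs, assuming the usual layouts Mtx = f32[3][4] (row major)
    and ROMtx = f32[4][3] (column major). Illustrative only:
    MTXReorderC is a hypothetical name, and the paired-single routine
    above is what the library actually uses.

        static void MTXReorderC( const Mtx src, ROMtx dest )
        {
            u32 r, c;
            for ( r = 0; r < 3; r++ )
            {
                for ( c = 0; c < 4; c++ )
                {
                    // 3x4 -> 4x3 transpose. Unlike the asm above, which
                    // loads all sources before storing, this naive loop
                    // is NOT safe if src == dest.
                    dest[c][r] = src[r][c];
                }
            }
        }
 *---------------------------------------------------------------------*/
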
/*---------------------------------------------------------------------*

Name:           PSMTXROMultVecArray

Description:    Multiplies an array of vectors by a reordered matrix,
                using paired-single operations.
                This function is significantly faster than
                PSMTXMultVecArray, but requires that you have reordered
                the matrix in advance with PSMTXReorder.
                OK if source == destination.
                NOTE: the number of vertices transformed cannot be less
                than 3.

                Note that NO error checking is performed.

Performance:    9.586 - 9.814 cycles per vertex, where count = 70.

Arguments:      m        reordered matrix.
                srcBase  start of source vector array.
                dstBase  start of resultant vector array.
                count    number of vectors in the srcBase, dstBase
                         arrays. COUNT MUST BE GREATER THAN 2.

Return:         none

*---------------------------------------------------------------------*/
asm void
PSMTXROMultVecArray
(
    const register ROMtx  m,        // r3
    const register Vec   *srcBase,  // r4
    register       Vec   *dstBase,  // r5
    register       u32    count     // r6
)
{
    nofralloc
#define M00_M10     fp0
#define M20_nnn     fp1
#define M01_M11     fp2
#define M21_nnn     fp3
#define M02_M12     fp4
#define M22_nnn     fp5
#define M03_M13     fp6
#define M23_nnn     fp7

// source vectors - 2 3D vectors in 3 PS registers
#define SX0_SY0     fp8
#define SZ0_SX1     fp9
#define SY1_SZ1     fp10
// destination registers - 2 3D vectors in 4 PS registers
#define DX0_DY0     fp11
#define DZ0_nnn     fp12
#define DX1_DY1     fp13
#define DZ1_nnn     fp14
// temp registers for writing back values. These registers hold the final
// results from the PREVIOUS loop iteration.
#define WX0_WY0     fp15
#define WZ0_nnn     fp16
#define WX1_WY1     fp17
#define WZ1_nnn     fp18

    stwu        r1, -64(r1)
    stfd        fp14, 8(r1)
    // The loop is unrolled once (2 vectors per iteration), so the loop
    // count is (count-1)/2; the prologue handles the first pair and the
    // epilogue stores the final one or two results, so an odd vertex
    // count still gets its last vertex transformed.
    addi        r7, count, -1
    stfd        fp15, 16(r1)
    srwi        r7, r7, 1               // 2 at a time
    stfd        fp16, 24(r1)
    stfd        fp17, 32(r1)
    stfd        fp18, 40(r1)
    mtctr       r7
    // load matrix
    psq_l       M00_M10, 0(m), 0, 0
    addi        srcBase, srcBase, -8
    psq_l       M20_nnn, 8(m), 1, 0
    addi        dstBase, dstBase, -4
    psq_l       M03_M13, 36(m), 0, 0
    psq_lu      SX0_SY0, 8(srcBase), 0, 0
    psq_l       M23_nnn, 44(m), 1, 0
    psq_lu      SZ0_SX1, 8(srcBase), 0, 0

    // ------------------------------ UNROLLED

    // DX0 = M00*SX0 + M03, DY0 = M10*SX0 + M13
    // DZ0 = M20*SX0 + M23
    // DX1 = M00*SX1 + M03, DY1 = M10*SX1 + M13
    // DZ1 = M20*SX1 + M23

    ps_madds0   DX0_DY0, M00_M10, SX0_SY0, M03_M13
    psq_l       M01_M11, 12(m), 0, 0
    ps_madds0   DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
    psq_l       M21_nnn, 20(m), 1, 0
    ps_madds1   DX1_DY1, M00_M10, SZ0_SX1, M03_M13
    psq_lu      SY1_SZ1, 8(srcBase), 0, 0
    ps_madds1   DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
    psq_l       M22_nnn, 32(m), 1, 0

    // DX0 = M01*SY0 + DX0, DY0 = M11*SY0 + DY0
    // DZ0 = M21*SY0 + DZ0
    // DX1 = M01*SY1 + DX1, DY1 = M11*SY1 + DY1
    // DZ1 = M21*SY1 + DZ1

    ps_madds1   DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1   DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    psq_l       M02_M12, 24(m), 0, 0
    ps_madds0   DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    psq_lu      SX0_SY0, 8(srcBase), 0, 0
    ps_madds0   DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    // DX0 = M02*SZ0 + DX0, DY0 = M12*SZ0 + DY0
    // DZ0 = M22*SZ0 + DZ0
    // DX1 = M02*SZ1 + DX1, DY1 = M12*SZ1 + DY1
    // DZ1 = M22*SZ1 + DZ1

    // Write final values to temp registers
    ps_madds0   WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0   WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    psq_lu      SZ0_SX1, 8(srcBase), 0, 0
    ps_madds1   WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1   WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    psq_lu      SY1_SZ1, 8(srcBase), 0, 0

    // -------------------------- LOOP START
_mloop:
    ps_madds0   DX0_DY0, M00_M10, SX0_SY0, M03_M13
    psq_stu     WX0_WY0, 4(dstBase), 0, 0
    ps_madds0   DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
    psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    ps_madds1   DX1_DY1, M00_M10, SZ0_SX1, M03_M13
    psq_stu     WX1_WY1, 4(dstBase), 0, 0
    ps_madds1   DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
    psq_stu     WZ1_nnn, 8(dstBase), 1, 0
    ps_madds1   DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1   DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    // YYY LAST SX0_SY0 USAGE FOR THIS ITERATION
    psq_lu      SX0_SY0, 8(srcBase), 0, 0    // NEXT SX0 SY0
    ps_madds0   DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    ps_madds0   DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    // Write final values to temp registers
    ps_madds0   WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0   WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    // YYY LAST SZ0_SX1 USAGE FOR THIS ITERATION
    psq_lu      SZ0_SX1, 8(srcBase), 0, 0    // NEXT SZ0 SX1
    ps_madds1   WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1   WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    // YYY LAST SY1_SZ1 USAGE FOR THIS ITERATION
    psq_lu      SY1_SZ1, 8(srcBase), 0, 0    // NEXT SY1 SZ1

    bdnz+       _mloop    // -------------------------- LOOP END

    psq_stu     WX0_WY0, 4(dstBase), 0, 0
    rlwinm.     r7, count, 0, 31, 31         // check odd
    psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    bne         _return
    // Skipped if odd number of vectors
    psq_stu     WX1_WY1, 4(dstBase), 0, 0
    // can't put anything here
    psq_stu     WZ1_nnn, 8(dstBase), 1, 0


_return:
    lfd         fp14, 8(r1)
    lfd         fp15, 16(r1)
    lfd         fp16, 24(r1)
    lfd         fp17, 32(r1)
    lfd         fp18, 40(r1)
    addi        r1, r1, 64
    blr

#undef M00_M10
#undef M20_nnn
#undef M01_M11
#undef M21_nnn
#undef M02_M12
#undef M22_nnn
#undef M03_M13
#undef M23_nnn
#undef SX0_SY0
#undef SZ0_SX1
#undef SY1_SZ1
#undef DX0_DY0
#undef DZ0_nnn
#undef DX1_DY1
#undef DZ1_nnn
#undef WX0_WY0
#undef WZ0_nnn
#undef WX1_WY1
#undef WZ1_nnn

}

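/*---------------------------------------------------------------------*
    A plain-C sketch of the transform PSMTXROMultVecArray performs
    (illustrative only; MTXROMultVecArrayC is a hypothetical name).
    With ROMtx = f32[4][3] as produced by PSMTXReorder, m[c][r] holds
    row r, column c of the original row-major matrix. Unlike this naive
    loop, the real routine requires count > 2 and, because its loads
    are software-pipelined, appears to read (but never write) a couple
    of vectors past the end of srcBase.

        static void MTXROMultVecArrayC( const ROMtx m, const Vec *srcBase,
                                        Vec *dstBase, u32 count )
        {
            u32 i;
            for ( i = 0; i < count; i++ )
            {
                f32 x = srcBase[i].x;   // copy first: srcBase may equal dstBase
                f32 y = srcBase[i].y;
                f32 z = srcBase[i].z;
                dstBase[i].x = m[0][0]*x + m[1][0]*y + m[2][0]*z + m[3][0];
                dstBase[i].y = m[0][1]*x + m[1][1]*y + m[2][1]*z + m[3][1];
                dstBase[i].z = m[0][2]*x + m[1][2]*y + m[2][2]*z + m[3][2];
            }
        }
 *---------------------------------------------------------------------*/
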
/*---------------------------------------------------------------------*

Name:           PSMTXROSkin2VecArray

Description:    Transforms an array of vectors by a per-vertex weighted
                blend of two reordered matrices (2-matrix skinning),
                using paired-single operations. For each vertex, the
                effective matrix is m0 + wt * (m1 - m0).
                Both matrices must have been reordered in advance with
                PSMTXReorder.
                OK if source == destination.
                NOTE: the number of vertices transformed cannot be less
                than 2.

                Note that NO error checking is performed.

Performance:    ~20.8 cycles per vertex, where count = 70.

Arguments:      m0       first reordered matrix.
                m1       second reordered matrix.
                wtBase   start of per-vertex weight array.
                srcBase  start of source vector array.
                dstBase  start of resultant vector array.
                count    number of vectors in the srcBase, dstBase
                         arrays. COUNT MUST BE GREATER THAN 1.

Return:         none

*---------------------------------------------------------------------*/
asm void
PSMTXROSkin2VecArray
(
    const register ROMtx  m0,       // r3
    const register ROMtx  m1,       // r4
    const register f32   *wtBase,   // r5
    const register Vec   *srcBase,  // r6
    register       Vec   *dstBase,  // r7
    register       u32    count     // r8
)
364 nofralloc
365 // transposed matrix
366 #define M00_10 fp0
367 #define M20 fp1
368 #define M01_11 fp2
369 #define M21 fp3
370 #define M02_12 fp4
371 #define M22 fp5
372 #define M03_13 fp6
373 #define M23 fp7
374
375 // source vector - 1 3D vectors in 2 PS registers
376 #define Sx_y fp8
377 #define Sz fp9
378
379 // Destination vector - 1 3d vector in 2 PS registers
380 #define Dx_y fp10
381 #define Dz fp11
382
383 // intermediate vector 1 3D vector in 2 PS registers
384 #define Ix_y fp12
385 #define Iz fp13
386
387 #define M0_00_10 fp14
388 #define M0_20 fp15
389 #define M0_01_11 fp16
390 #define M0_21 fp17
391 #define M0_02_12 fp18
392 #define M0_22 fp19
393 #define M0_03_13 fp20
394 #define M0_23 fp21
395
396
397 #define M1_00_10 fp22
398 #define M1_20 fp23
399 #define M1_01_11 fp24
400 #define M1_21 fp25
401 #define M1_02_12 fp26
402 #define M1_22 fp27
403 #define M1_03_13 fp28
404 #define M1_23 fp29
405
406 #define Wt fp30
407
408 // save FP regs
409 stwu r1, -160(r1)
410 stfd fp14, 8(r1)
411 stfd fp15, 16(r1)
412 stfd fp16, 24(r1)
413 stfd fp17, 32(r1)
414 stfd fp18, 40(r1)
415 stfd fp19, 48(r1)
416 stfd fp20, 56(r1)
417 stfd fp21, 64(r1)
418 stfd fp22, 72(r1)
419 stfd fp23, 80(r1)
420 stfd fp24, 88(r1)
421 stfd fp25, 96(r1)
422 stfd fp26, 104(r1)
423 stfd fp27, 112(r1)
424 stfd fp28, 120(r1)
425 stfd fp29, 128(r1)
426 stfd fp30, 136(r1)
427
428 // always perform at least one iteration of loop
429 addi r9, count, -1
430 mtctr r9
431
432 addi srcBase, srcBase, -4
433 addi dstBase, dstBase, -4
434 addi wtBase, wtBase, -4
435
    // load matrices m0 and (m1 - m0)
    psq_l       M0_00_10, 0(m0), 0, 0
    psq_l       M1_00_10, 0(m1), 0, 0

    psq_l       M0_20, 8(m0), 1, 0
    psq_l       M1_20, 8(m1), 1, 0

    psq_l       M0_01_11, 12(m0), 0, 0
    psq_l       M1_01_11, 12(m1), 0, 0

    ps_sub      M1_00_10, M1_00_10, M0_00_10

    psq_l       M0_21, 20(m0), 1, 0
    psq_l       M1_21, 20(m1), 1, 0

    ps_sub      M1_20, M1_20, M0_20

    psq_l       M0_02_12, 24(m0), 0, 0
    psq_l       M1_02_12, 24(m1), 0, 0

    ps_sub      M1_01_11, M1_01_11, M0_01_11

    psq_l       M0_22, 32(m0), 1, 0
    psq_l       M1_22, 32(m1), 1, 0

    ps_sub      M1_21, M1_21, M0_21

    psq_l       M0_03_13, 36(m0), 0, 0
    psq_l       M1_03_13, 36(m1), 0, 0

    ps_sub      M1_02_12, M1_02_12, M0_02_12

    psq_l       M0_23, 44(m0), 1, 0
    psq_l       M1_23, 44(m1), 1, 0

    ps_sub      M1_22, M1_22, M0_22
    ps_sub      M1_03_13, M1_03_13, M0_03_13
    ps_sub      M1_23, M1_23, M0_23

    // start of first iteration
    psq_lu      Wt, 4(wtBase), 1, 0          // Wt = *wtBase++;
    psq_lu      Sx_y, 4(srcBase), 0, 0       // Sx_y = *srcBase++;
    psq_lu      Sz, 8(srcBase), 1, 0         // Sz = *srcBase++;

    // m = lerp(m0, m1, wt); for all eight register pairs
    ps_madds0   M00_10, M1_00_10, Wt, M0_00_10
    ps_madds0   M20, M1_20, Wt, M0_20
    ps_madds0   M01_11, M1_01_11, Wt, M0_01_11
    ps_madds0   M21, M1_21, Wt, M0_21
    ps_madds0   M02_12, M1_02_12, Wt, M0_02_12
    ps_madds0   M22, M1_22, Wt, M0_22
    ps_madds0   M03_13, M1_03_13, Wt, M0_03_13
    ps_madds0   M23, M1_23, Wt, M0_23

    ps_madds0   Ix_y, M00_10, Sx_y, M03_13   // Ix_y = M03_13 + M00_10 * Sx;
    ps_madds0   Iz, M20, Sx_y, M23           // Iz   = M23 + M20 * Sx;

    psq_lu      Wt, 4(wtBase), 1, 0          // Wt = *wtBase++;

_mloop:
    ps_madds1   Ix_y, M01_11, Sx_y, Ix_y     // Ix_y += M01_11 * Sy;
    ps_madds1   Iz, M21, Sx_y, Iz            // Iz   += M21 * Sy;

    psq_lu      Sx_y, 4(srcBase), 0, 0       // Sx_y = *srcBase++;

    ps_madds0   Dx_y, M02_12, Sz, Ix_y       // Dx_y = Ix_y + M02_12 * Sz;
    ps_madds0   Dz, M22, Sz, Iz              // Dz   = Iz + M22 * Sz;

    psq_lu      Sz, 8(srcBase), 1, 0         // Sz = *srcBase++;

    // m = lerp(m0, m1, wt); for the next vertex
    ps_madds0   M00_10, M1_00_10, Wt, M0_00_10
    ps_madds0   M20, M1_20, Wt, M0_20
    ps_madds0   M01_11, M1_01_11, Wt, M0_01_11
    ps_madds0   M21, M1_21, Wt, M0_21
    ps_madds0   M02_12, M1_02_12, Wt, M0_02_12
    ps_madds0   M22, M1_22, Wt, M0_22
    ps_madds0   M03_13, M1_03_13, Wt, M0_03_13
    ps_madds0   M23, M1_23, Wt, M0_23

    psq_stu     Dx_y, 4(dstBase), 0, 0       // *dstBase++ = Dx_y;

    ps_madds0   Ix_y, M00_10, Sx_y, M03_13   // Ix_y = M03_13 + M00_10 * Sx;
    ps_madds0   Iz, M20, Sx_y, M23           // Iz   = M23 + M20 * Sx;

    psq_stu     Dz, 8(dstBase), 1, 0         // *dstBase++ = Dz;

    psq_lu      Wt, 4(wtBase), 1, 0          // Wt = *wtBase++;

    bdnz+       _mloop
_mlend:

    ps_madds1   Ix_y, M01_11, Sx_y, Ix_y     // Ix_y += M01_11 * Sy;
    ps_madds1   Iz, M21, Sx_y, Iz            // Iz   += M21 * Sy;

    ps_madds0   Dx_y, M02_12, Sz, Ix_y       // Dx_y = Ix_y + M02_12 * Sz;

    psq_stu     Dx_y, 4(dstBase), 0, 0       // *dstBase++ = Dx_y;

    ps_madds0   Dz, M22, Sz, Iz              // Dz = Iz + M22 * Sz;

    psq_stu     Dz, 8(dstBase), 1, 0         // *dstBase++ = Dz;

    lfd         fp14, 8(r1)
    lfd         fp15, 16(r1)
    lfd         fp16, 24(r1)
    lfd         fp17, 32(r1)
    lfd         fp18, 40(r1)
    lfd         fp19, 48(r1)
    lfd         fp20, 56(r1)
    lfd         fp21, 64(r1)
    lfd         fp22, 72(r1)
    lfd         fp23, 80(r1)
    lfd         fp24, 88(r1)
    lfd         fp25, 96(r1)
    lfd         fp26, 104(r1)
    lfd         fp27, 112(r1)
    lfd         fp28, 120(r1)
    lfd         fp29, 128(r1)
    lfd         fp30, 136(r1)
    addi        r1, r1, 160

    blr

#undef M00_10
#undef M20
#undef M01_11
#undef M21
#undef M02_12
#undef M22
#undef M03_13
#undef M23

#undef Sx_y
#undef Sz

#undef Dx_y
#undef Dz

#undef Ix_y
#undef Iz

#undef M0_00_10
#undef M0_20
#undef M0_01_11
#undef M0_21
#undef M0_02_12
#undef M0_22
#undef M0_03_13
#undef M0_23


#undef M1_00_10
#undef M1_20
#undef M1_01_11
#undef M1_21
#undef M1_02_12
#undef M1_22
#undef M1_03_13
#undef M1_23

#undef Wt
}

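/*---------------------------------------------------------------------*
    A plain-C sketch of the skinning math above (illustrative only;
    MTXROSkin2VecArrayC is a hypothetical name, with the same
    ROMtx = f32[4][3] layout assumption as before). Each vertex blends
    the two matrices with its own weight, then transforms. Note the
    pipelined assembly also appears to read one weight past the end of
    wtBase, which this naive loop does not.

        static void MTXROSkin2VecArrayC( const ROMtx m0, const ROMtx m1,
                                         const f32 *wtBase, const Vec *srcBase,
                                         Vec *dstBase, u32 count )
        {
            u32   i, r, c;
            ROMtx m;
            for ( i = 0; i < count; i++ )
            {
                f32 wt = wtBase[i];
                f32 x  = srcBase[i].x;  // copy first: srcBase may equal dstBase
                f32 y  = srcBase[i].y;
                f32 z  = srcBase[i].z;
                // m = lerp(m0, m1, wt)
                for ( c = 0; c < 4; c++ )
                    for ( r = 0; r < 3; r++ )
                        m[c][r] = m0[c][r] + wt * ( m1[c][r] - m0[c][r] );
                dstBase[i].x = m[0][0]*x + m[1][0]*y + m[2][0]*z + m[3][0];
                dstBase[i].y = m[0][1]*x + m[1][1]*y + m[2][1]*z + m[3][1];
                dstBase[i].z = m[0][2]*x + m[1][2]*y + m[2][2]*z + m[3][2];
            }
        }
 *---------------------------------------------------------------------*/
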
/*---------------------------------------------------------------------*

Name:           PSMTXROMultS16VecArray

Description:    Multiplies an array of signed 16-bit vectors by a
                reordered matrix, generating a Vec array of floats.
                The conversion itself is free; however, this code does
                take a hit because it uses mtspr to set up a
                quantization register for the S16 -> F32 conversion.
                For production code, the GQR should be set up in
                advance.

                OK if source == destination.

                Note that NO error checking is performed.

Performance:    9.671 - 9.900 cycles per vertex, where count = 70.

Arguments:      m        reordered matrix.
                srcBase  start of source s16 vector array.
                dstBase  start of resultant vector array. Note that
                         the available room must be twice as large as
                         the source data.
                count    number of vectors in the srcBase, dstBase
                         arrays. COUNT MUST BE GREATER THAN 2.

Return:         none

*---------------------------------------------------------------------*/
asm void
PSMTXROMultS16VecArray
(
    const register ROMtx   m,        // r3
    const register S16Vec *srcBase,  // r4
    register       Vec    *dstBase,  // r5
    register       u32     count     // r6
)
{
    nofralloc
#define M00_M10     fp0
#define M20_nnn     fp1
#define M01_M11     fp2
#define M21_nnn     fp3
#define M02_M12     fp4
#define M22_nnn     fp5
#define M03_M13     fp6
#define M23_nnn     fp7

// source vectors - 2 3D vectors in 3 PS registers
#define SX0_SY0     fp8
#define SZ0_SX1     fp9
#define SY1_SZ1     fp10
// destination registers - 2 3D vectors in 4 PS registers
#define DX0_DY0     fp11
#define DZ0_nnn     fp12
#define DX1_DY1     fp13
#define DZ1_nnn     fp14
// temp registers for writing back values. These registers hold the final
// results from the PREVIOUS loop iteration.
#define WX0_WY0     fp15
#define WZ0_nnn     fp16
#define WX1_WY1     fp17
#define WZ1_nnn     fp18

    stwu        r1, -64(r1)
    stfd        fp14, 8(r1)
    // The loop is unrolled once (2 vectors per iteration), so the loop
    // count is (count-1)/2; the prologue handles the first pair and the
    // epilogue stores the final one or two results, so an odd vertex
    // count still gets its last vertex transformed.
    addi        r7, count, -1
    stfd        fp15, 16(r1)
    srwi        r7, r7, 1               // 2 at a time
    stfd        fp16, 24(r1)
    lis         r8, 0x0007              // set up GQR6: s16 loads, scale 0
    stfd        fp17, 32(r1)
    mtspr       GQR6, r8                // this will stall like a monkey's butt
    stfd        fp18, 40(r1)
    mtctr       r7
    // load matrix
    psq_l       M00_M10, 0(m), 0, 0
    addi        srcBase, srcBase, -4
    psq_l       M20_nnn, 8(m), 1, 0
    addi        dstBase, dstBase, -4
    psq_l       M03_M13, 36(m), 0, 0
    psq_lu      SX0_SY0, 4(srcBase), 0, 6
    psq_l       M23_nnn, 44(m), 1, 0
    psq_lu      SZ0_SX1, 4(srcBase), 0, 6


    // ------------------------------ UNROLLED

    // DX0 = M00*SX0 + M03, DY0 = M10*SX0 + M13
    // DZ0 = M20*SX0 + M23
    // DX1 = M00*SX1 + M03, DY1 = M10*SX1 + M13
    // DZ1 = M20*SX1 + M23

    ps_madds0   DX0_DY0, M00_M10, SX0_SY0, M03_M13
    psq_l       M01_M11, 12(m), 0, 0
    ps_madds0   DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
    psq_l       M21_nnn, 20(m), 1, 0
    ps_madds1   DX1_DY1, M00_M10, SZ0_SX1, M03_M13
    psq_lu      SY1_SZ1, 4(srcBase), 0, 6
    ps_madds1   DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
    psq_l       M22_nnn, 32(m), 1, 0

    // DX0 = M01*SY0 + DX0, DY0 = M11*SY0 + DY0
    // DZ0 = M21*SY0 + DZ0
    // DX1 = M01*SY1 + DX1, DY1 = M11*SY1 + DY1
    // DZ1 = M21*SY1 + DZ1

    ps_madds1   DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1   DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    psq_l       M02_M12, 24(m), 0, 0
    ps_madds0   DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    psq_lu      SX0_SY0, 4(srcBase), 0, 6
    ps_madds0   DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    // DX0 = M02*SZ0 + DX0, DY0 = M12*SZ0 + DY0
    // DZ0 = M22*SZ0 + DZ0
    // DX1 = M02*SZ1 + DX1, DY1 = M12*SZ1 + DY1
    // DZ1 = M22*SZ1 + DZ1

    // Write final values to temp registers
    ps_madds0   WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0   WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    psq_lu      SZ0_SX1, 4(srcBase), 0, 6
    ps_madds1   WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1   WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    psq_lu      SY1_SZ1, 4(srcBase), 0, 6

    // -------------------------- LOOP START
_mloop:
    ps_madds0   DX0_DY0, M00_M10, SX0_SY0, M03_M13
    psq_stu     WX0_WY0, 4(dstBase), 0, 0
    ps_madds0   DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
    psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    ps_madds1   DX1_DY1, M00_M10, SZ0_SX1, M03_M13
    psq_stu     WX1_WY1, 4(dstBase), 0, 0
    ps_madds1   DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
    psq_stu     WZ1_nnn, 8(dstBase), 1, 0
    ps_madds1   DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
    ps_madds1   DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
    // YYY LAST SX0_SY0 USAGE FOR THIS ITERATION
    psq_lu      SX0_SY0, 4(srcBase), 0, 6    // NEXT SX0 SY0
    ps_madds0   DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
    ps_madds0   DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn

    // Write final values to temp registers
    ps_madds0   WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
    ps_madds0   WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
    // YYY LAST SZ0_SX1 USAGE FOR THIS ITERATION
    psq_lu      SZ0_SX1, 4(srcBase), 0, 6    // NEXT SZ0 SX1
    ps_madds1   WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
    ps_madds1   WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
    // YYY LAST SY1_SZ1 USAGE FOR THIS ITERATION
    psq_lu      SY1_SZ1, 4(srcBase), 0, 6    // NEXT SY1 SZ1

    bdnz+       _mloop    // -------------------------- LOOP END

    psq_stu     WX0_WY0, 4(dstBase), 0, 0
    rlwinm.     r7, count, 0, 31, 31         // check odd
    psq_stu     WZ0_nnn, 8(dstBase), 1, 0
    bne         _return
    // Skipped if odd number of vectors
    psq_stu     WX1_WY1, 4(dstBase), 0, 0
    // can't put anything here
    psq_stu     WZ1_nnn, 8(dstBase), 1, 0


_return:
    lfd         fp14, 8(r1)
    lfd         fp15, 16(r1)
    lfd         fp16, 24(r1)
    lfd         fp17, 32(r1)
    lfd         fp18, 40(r1)
    addi        r1, r1, 64
    blr


#undef M00_M10
#undef M20_nnn
#undef M01_M11
#undef M21_nnn
#undef M02_M12
#undef M22_nnn
#undef M03_M13
#undef M23_nnn
#undef SX0_SY0
#undef SZ0_SX1
#undef SY1_SZ1
#undef DX0_DY0
#undef DZ0_nnn
#undef DX1_DY1
#undef DZ1_nnn
#undef WX0_WY0
#undef WZ0_nnn
#undef WX1_WY1
#undef WZ1_nnn

}
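
/*---------------------------------------------------------------------*
    The GQR6 value used above: lis r8, 0x0007 yields 0x00070000, which
    under the GQR encoding sketched near the top of this file selects
    load type 7 (signed 16-bit) with scale 0 and store type 0 (f32).
    psq_l instructions that name GQR6 therefore convert s16 -> f32 for
    free during the load. As the comments note, production code should
    program the register once at startup instead; a hypothetical helper
    (CodeWarrior-style inline asm, sketch only):

        static void SetupGQR6ForS16( void )
        {
            register u32 val = 7 << 16;     // s16 loads, f32 stores, no scale
            asm { mtspr GQR6, val }
        }
 *---------------------------------------------------------------------*/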


/*---------------------------------------------------------------------*

Name:           PSMTXMultS16VecArray

Description:    Multiplies an array of signed 16-bit vectors by a
                matrix, generating a Vec array of floats. The
                conversion itself is free; however, this code does take
                a hit because it uses mtspr to set up a quantization
                register for the S16 -> F32 conversion. For production
                code, the GQR should be set up in advance.

                This function takes longer than PSMTXROMultS16VecArray,
                which uses reordered matrices.
                OK if source == destination.

                Note that NO error checking is performed.

Performance:    13.714 - 13.786 cycles per vertex, where count = 70.

Arguments:      m        matrix.
                srcBase  start of source s16 vector array.
                dstBase  start of resultant vector array. Note that
                         the available room must be twice as large as
                         the source data.
                count    number of vectors in the srcBase, dstBase
                         arrays. COUNT MUST BE GREATER THAN 1.

Return:         none

*---------------------------------------------------------------------*/
asm void
PSMTXMultS16VecArray
(
    const register Mtx     m,        // r3
    const register S16Vec *srcBase,  // r4
    register       Vec    *dstBase,  // r5
    register       u32     count     // r6
)
{
    nofralloc
    // cmpwi    count, 0
    // beq-     _return
    psq_l       fp0, 0(m), 0, 0         // [0][0], [0][1]
    lis         r7, 0x0007              // set up GQR6: s16 loads, scale 0
    mtspr       GQR6, r7                // this will stall like a monkey's butt
    // fp6 - x y
    psq_l       fp6, 0(srcBase), 0, 6
    subi        count, count, 1         // unrolling once
    // fp7 - z 1
    psq_l       fp7, 4(srcBase), 1, 6
    mtctr       count
    // unused slot here
    psq_l       fp1, 8(m), 0, 0         // [0][2], [0][3]
    addi        srcBase, srcBase, 4     // load ops will add 2,
                                        // but we already got the first vertex
    psq_l       fp2, 16(m), 0, 0        // [1][0], [1][1]
    addi        dstBase, dstBase, -4    // store ops will add 4
    psq_l       fp3, 24(m), 0, 0        // [1][2], [1][3]




    // ------ first loop starts here
    // fp8 = m00x m01y                  // next X
    ps_mul      fp8, fp0, fp6
    psq_l       fp4, 32(m), 0, 0        // [2][0], [2][1]
    // fp10 = m10x m11y                 // next Y
    ps_mul      fp10, fp2, fp6
    psq_l       fp5, 40(m), 0, 0        // [2][2], [2][3]
    // fp12 = m20x m21y                 // next Z
    ps_mul      fp12, fp4, fp6          // YYY last FP6 usage

    // fp6 - x y
    psq_lu      fp6, 2(srcBase), 0, 6   // advance to x
    // Potential FP stall here if psq_lu dispatches the same
    // cycle as the previous ps_mul.

    // fp8 = m00x + m02z | m01y + m03
    ps_madd     fp8, fp1, fp7, fp8
    // fp10 = m10x + m12z | m11y + m13
    ps_madd     fp10, fp3, fp7, fp10
    // fp12 = m20x + m22z | m21y + m23
    ps_madd     fp12, fp5, fp7, fp12    // YYY last FP7 usage

    // fp7 - z 1
    psq_lu      fp7, 4(srcBase), 1, 6   // advance to z, will be skipped by next lu
    // Potential FP stall here if psq_lu dispatches the same
    // cycle as the previous ps_madd (fp8 dependency).
    ps_sum0     fp9, fp8, fp8, fp8      // X ready

    // ------------------- main loop
_mloop:
    ps_sum0     fp11, fp10, fp10, fp10  // Y ready
    // fp8 = m00x m01y                  // next X
    ps_mul      fp8, fp0, fp6
    ps_sum0     fp13, fp12, fp12, fp12  // Z ready
    // fp10 = m10x m11y                 // next Y
    ps_mul      fp10, fp2, fp6
    psq_stu     fp9, 4(dstBase), 1, 0   // prev X
    // fp12 = m20x m21y                 // next Z
    ps_mul      fp12, fp4, fp6          // YYY last FP6 usage
    psq_stu     fp11, 4(dstBase), 1, 0  // prev Y
    // fp8 = m00x + m02z | m01y + m03
    ps_madd     fp8, fp1, fp7, fp8
    psq_stu     fp13, 4(dstBase), 1, 0  // prev Z
    // fp10 = m10x + m12z | m11y + m13
    ps_madd     fp10, fp3, fp7, fp10
    // fp6 - x y
    psq_lu      fp6, 2(srcBase), 0, 6   // advance to x
    // fp12 = m20x + m22z | m21y + m23
    ps_madd     fp12, fp5, fp7, fp12    // YYY last FP7 usage

    // fp7 - z 1
    psq_lu      fp7, 4(srcBase), 1, 6   // advance to z, will be skipped by next lu
    // Potential FP stall here if psq_lu dispatches the same
    // cycle as the previous ps_madd.

    ps_sum0     fp9, fp8, fp8, fp8      // X ready
    bdnz+       _mloop
    // ------------------- end of loop
    ps_sum0     fp11, fp10, fp10, fp10  // Y ready
    ps_sum0     fp13, fp12, fp12, fp12  // Z ready
    // commit last iteration
    psq_stu     fp9, 4(dstBase), 1, 0
    psq_stu     fp11, 4(dstBase), 1, 0
    psq_stu     fp13, 4(dstBase), 1, 0

_return:
    blr
}
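
/*---------------------------------------------------------------------*
    A plain-C sketch of PSMTXMultS16VecArray (illustrative only;
    MTXMultS16VecArrayC is a hypothetical name, assuming Mtx = f32[3][4]
    row major and S16Vec holding s16 x, y, z). It contrasts with the RO
    version above: the matrix is indexed row first, with no reordering
    step. The pipelined asm also appears to read one vertex past the
    end of srcBase, which this naive loop does not.

        static void MTXMultS16VecArrayC( const Mtx m, const S16Vec *srcBase,
                                         Vec *dstBase, u32 count )
        {
            u32 i;
            for ( i = 0; i < count; i++ )
            {
                f32 x = (f32)srcBase[i].x;  // s16 -> f32, done for free by
                f32 y = (f32)srcBase[i].y;  // the GQR6 loads in the asm
                f32 z = (f32)srcBase[i].z;
                dstBase[i].x = m[0][0]*x + m[0][1]*y + m[0][2]*z + m[0][3];
                dstBase[i].y = m[1][0]*x + m[1][1]*y + m[1][2]*z + m[1][3];
                dstBase[i].z = m[2][0]*x + m[2][1]*y + m[2][2]*z + m[2][3];
            }
        }
 *---------------------------------------------------------------------*/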


#endif // GEKKO


/*===========================================================================*/