1 /*---------------------------------------------------------------------------*
2 Project: Matrix vector Library
3 File: psmtx.c
4
5 Copyright 1998-2007 Nintendo. All rights reserved.
6
7 These coded instructions, statements, and computer programs contain
8 proprietary information of Nintendo of America Inc. and/or Nintendo
9 Company Ltd., and are protected by Federal copyright law. They may
10 not be disclosed to third parties or copied or duplicated in any form,
11 in whole or in part, without the prior written consent of Nintendo.
12
13
14 $Log: psmtx.c,v $
15 Revision 1.3 2007/08/30 10:42:41 hirose
16 Updated PSMTXROMultVecArray to make it Broadway EABI compliant.
17 Removed unsupported functions.
18
19 Revision 1.2 2006/02/20 04:25:42 mitu
20 Changed include path from dolphin/ to revolution/.
21
22 Revision 1.1.1.1 2005/05/12 02:15:49 yasuh-to
23 Ported from dolphin sheath tree.
24 NoKeywords: $
25
26 6 2003/08/21 5:17 Dante
27 Changed GQR1 to GQR6 in PSMTXMultS16VecArray
28
29 5 2002/04/11 13:11 Hirose
30 const type specifier support. (by Hiratsu@IRD)
31
32 4 2001/02/26 11:56p Hirose
33 Avoided use of GQR1, which is reserved by the compiler.
34
35 3 2001/02/22 11:49p Hirose
36 Some functions are moved to another file according to arrangement
37 updates.
38
39 2 2000/07/12 4:41p John
40 Substituted MTXConcat and MTXMultVecArray with their paired-singles
41 equivalent for Gekko non-debug builds.
42
43 1 2000/05/10 1:48p Hirose
44 Moved paired-single matrix stuff into an another source file
45
46 $NoKeywords: $
47 *---------------------------------------------------------------------------*/
48
49 #include <math.h>
50 #include <revolution/mtx.h>
51 #include "mtxAssert.h"
52
53
54 /*---------------------------------------------------------------------*
55 Special purpose Paired-single optimized code
56
57 All paired-single code assumes GQR0 = 0.
58 *---------------------------------------------------------------------*/
59 #ifdef GEKKO
60
61 /*---------------------------------------------------------------------*
62
63 Name: PSMTXReorder
64
65 Description: Creates a reordered (column-major) matrix from a
66 row-major matrix, using paired single operations.
67 Reordered matrices are required for the PSMTXRO*
68 functions, which operate faster than their non-reordered
69 counterparts.
70
71 Performance: ~15 cycles.
72
73 Arguments: src source matrix.
74 dest destination matrix, note type is ROMtx.
75
76 Return : None.
77
78 *---------------------------------------------------------------------*/
79 asm void
PSMTXReorder(const register Mtx src,register ROMtx dest)80 PSMTXReorder(const register Mtx src, register ROMtx dest)
81 {
82 nofralloc
// Register aliases for this routine only (all are #undef'd before the
// closing brace).
//   Sab_Sac : two adjacent source elements, row a, columns b and c.
//   Dab_Dcd : the two elements stored together to dest; element ab sits in
//             ps0 and element cd in ps1.
83 #define S00_S01 fp0
84 #define S02_S03 fp1
85 #define S10_S11 fp2
86 #define S12_S13 fp3
87 #define S20_S21 fp4
88 #define S22_S23 fp5
89 #define D00_D10 fp6
90 #define D11_D21 fp7
91 #define D02_D12 fp8
92 #define D22_D03 fp9
93 #define D13_D23 fp10
94 #define D20_D01 fp12
95
96
// The source is read as six pairs at byte offsets 0..40 (two pairs per
// 16-byte row) and regrouped with the merge instructions:
//   ps_merge00 d, a, b  ->  d = (a.ps0, b.ps0)
//   ps_merge01 d, a, b  ->  d = (a.ps0, b.ps1)
//   ps_merge11 d, a, b  ->  d = (a.ps1, b.ps1)
// All psq_l/psq_st use W = 0 (both halves) and GQR index 0, relying on the
// file-wide assumption that GQR0 = 0.
//
// Resulting dest layout, in element order:
//   m00 m10 m20 | m01 m11 m21 | m02 m12 m22 | m03 m13 m23
//
// Loads, merges and stores are interleaved rather than grouped; presumably
// this is for instruction scheduling, so keep the order when editing.
97 psq_l S00_S01, 0(src), 0, 0
98 psq_l S10_S11, 16(src), 0, 0
99 psq_l S20_S21, 32(src), 0, 0
100 psq_l S02_S03, 8(src), 0, 0
// (m00, m10)
101 ps_merge00 D00_D10, S00_S01, S10_S11
102 psq_l S12_S13, 24(src), 0, 0
// (m20, m01)
103 ps_merge01 D20_D01, S20_S21, S00_S01
104 psq_l S22_S23, 40(src), 0, 0
// (m11, m21)
105 ps_merge11 D11_D21, S10_S11, S20_S21
106 psq_st D00_D10, 0(dest), 0, 0
// (m02, m12)
107 ps_merge00 D02_D12, S02_S03, S12_S13
108 psq_st D20_D01, 8(dest), 0, 0
// (m22, m03)
109 ps_merge01 D22_D03, S22_S23, S02_S03
110 psq_st D11_D21, 16(dest),0, 0
// (m13, m23)
111 ps_merge11 D13_D23, S12_S13, S22_S23
112 psq_st D02_D12, 24(dest),0, 0
113 psq_st D22_D03, 32(dest),0,0
114 psq_st D13_D23, 40(dest),0,0
115
// No frame was set up (nofralloc) and nothing was saved, so return directly.
116 blr
// Release the register aliases so they cannot leak into later code.
117 #undef S00_S01
118 #undef S02_S03
119 #undef S10_S11
120 #undef S12_S13
121 #undef S20_S21
122 #undef S22_S23
123 #undef D00_D10
124 #undef D11_D21
125 #undef D02_D12
126 #undef D22_D03
127 #undef D13_D23
128 #undef D20_D01
129
130 }
131
132 /*---------------------------------------------------------------------*
133
134 Name: PSMTXROMultVecArray
135
136 Description: Multiplies an array of vectors by a reordered matrix,
137 using paired single operations.
138 This function is significantly faster than
139 PSMTXMultVecArray, but requires that you have reordered
140 the matrix in advance with PSMTXReorder.
141 OK if source = destination.
142 NOTE: number of vertices transformed must be greater than
143 2 (i.e. at least 3).
144
145 Note that NO error checking is performed.
146
147 Performance: 9.586 - 9.814 cycles per vertex, where
148 count = 70
149
150 Arguments: m reordered matrix.
151 srcBase start of source vector array.
152 dstBase Start of resultant vector array.
153 count Number of vectors in srcBase, dstBase arrays
154 COUNT MUST BE GREATER THAN 2.
155
156
157 Return: None.
158
159 *---------------------------------------------------------------------*/
160 asm void
PSMTXROMultVecArray(const register ROMtx m,const register Vec * srcBase,register Vec * dstBase,register u32 count)161 PSMTXROMultVecArray
162 (
163 const register ROMtx m, // r3
164 const register Vec *srcBase,// r4
165 register Vec *dstBase,// r5
166 register u32 count // r6
167 )
168 {
169 nofralloc
// Overview: software-pipelined transform, two vectors per pass.
//   prologue : compute vectors 0 and 1 into the W* registers and pre-load
//              the source data for vectors 2 and 3.
//   loop     : store the previous pair's W* results while computing the
//              current pair, and pre-load the pair after that.
//   epilogue : store the last pair (its second vector only if count is even).
//
// Constraints that follow from the code below:
//   * count must be at least 3. The loop body is entered unconditionally and
//     CTR is set to (count - 1) >> 1; a count of 1 or 2 puts 0 in CTR, and
//     bdnz decrements before testing, so the loop would not stop after one
//     pass.
//   * Source data is loaded ahead of use: 4 vectors before the loop plus 2
//     per iteration. The routine therefore READS 2 (even count) or 3 (odd
//     count) vector-sized slots beyond the last source vector. Nothing
//     computed from them is stored.
//   * Stores always trail the loads, which is what makes src == dst usable.
//
// Matrix columns, as loaded from the reordered matrix. "nnn" marks a
// register half whose contents are not used.
170 #define M00_M10 fp0
171 #define M20_nnn fp1
172 #define M01_M11 fp2
173 #define M21_nnn fp3
174 #define M02_M12 fp4
175 #define M22_nnn fp5
176 #define M03_M13 fp6
177 #define M23_nnn fp7
178
179 // Source vectors - 2 3D vectors in 3 PS registers
180 #define SX0_SY0 fp8
181 #define SZ0_SX1 fp9
182 #define SY1_SZ1 fp10
183 // Destination registers - 2 3d vectors in 4 PS registers
184 #define DX0_DY0 fp11
185 #define DZ0_nnn fp12
186 #define DX1_DY1 fp13
187 #define DZ1_nnn fp14
188 // temp registers for writing back values. These registers store the final
189 // results from the PREVIOUS loop
190 #define WX0_WY0 fp15
191 #define WZ0_nnn fp16
192 #define WX1_WY1 fp17
193 #define WZ1_nnn fp18
194
// Open a 96-byte frame and save fp14-fp18 (used above as DZ1_nnn and W*).
// Each register is saved twice: stfd writes its 64-bit value and psq_st
// writes its two paired-single halves, in adjacent 8-byte slots from
// 8(rsp) to 80(rsp). The saves are interleaved with the loop-count setup.
// NOTE(review): rsp is assumed to be the assembler's alias for r1 (the
// frame is released through r1 at the end) - confirm.
// NOTE(review): the double save presumably relates to the 2007 "Broadway
// EABI compliant" change in the file log - confirm.
195 stwu r1, -96(rsp)
196 stfd fp14, 8(rsp)
197 psq_st fp14, 16(rsp), 0, 0
198 // Loop trip count is (count - 1) / 2: the first pair is computed before the
199 // loop and the last pair is stored after it; an odd last vertex gets its own pass.
200 addi r7, count, -1
201 stfd fp15, 24(rsp)
202 psq_st fp15, 32(rsp), 0, 0
203 srwi r7, r7, 1 // 2 at a time
204 stfd fp16, 40(rsp)
205 psq_st fp16, 48(rsp), 0, 0
206 stfd fp17, 56(rsp)
207 psq_st fp17, 64(rsp), 0, 0
208 stfd fp18, 72(rsp)
209 psq_st fp18, 80(rsp), 0, 0
210 mtctr r7
211 // Load matrix
// Byte offsets into m follow the reordered layout
//   m00 m10 m20 | m01 m11 m21 | m02 m12 m22 | m03 m13 m23
// Pair loads (W = 0) fetch the first two entries of a column; single loads
// (W = 1) fetch the third. Only column 0 and the translation column are
// needed for the first multiply-adds, so the others are loaded later,
// between arithmetic instructions.
//
// srcBase and dstBase are pre-biased (-8 and -4) so that the update-form
// accesses, which use displacement 8 for loads and 4 for the first store,
// start at element 0 and then walk forward on their own.
212 psq_l M00_M10, 0(m),0,0
213 addi srcBase, srcBase, -8
214 psq_l M20_nnn, 8(m),1,0
215 addi dstBase, dstBase, -4
216 psq_l M03_M13, 36(m),0,0
217 psq_lu SX0_SY0, 8(srcBase), 0, 0
218 psq_l M23_nnn, 44(m),1,0
219 psq_lu SZ0_SX1, 8(srcBase), 0, 0
220
221 // ------------------------------UNROLLED
// Prologue: full computation for vectors 0 and 1, with the remaining matrix
// loads and the source loads for vectors 2 and 3 placed in between.
// ps_madds0 d,a,c,b -> d = a * c.ps0 + b (both halves of a and b)
// ps_madds1 d,a,c,b -> d = a * c.ps1 + b
222
223 // DX0=M00*SX0+M03, DY0=M10*SX0+M13
224 // DZ0=M20*SX0+M23
225 // DX1=M00*SX1+M03, DY1=M10*SX1+M13
226 // DZ1=M20*SX1+M23
227
228 ps_madds0 DX0_DY0, M00_M10, SX0_SY0, M03_M13
229 psq_l M01_M11, 12(m),0,0
230 ps_madds0 DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
231 psq_l M21_nnn, 20(m),1,0
232 ps_madds1 DX1_DY1, M00_M10, SZ0_SX1, M03_M13
233 psq_lu SY1_SZ1,8(srcBase), 0, 0
234 ps_madds1 DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
235 psq_l M22_nnn, 32(m),1,0
236
237 // DX0=M01*SY0+DX0, DY0=M11*SY0+DY0
238 // DZ0=M21*SY0+DZ0
239 // DX1=M01*SY1+DX1, DY1=M11*SY1+DY1
240 // DZ1=M21*SY1+DZ1
241
242 ps_madds1 DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
243 ps_madds1 DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
244 psq_l M02_M12, 24(m),0,0
245 ps_madds0 DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
// SX0_SY0 is no longer needed for this pair, so reload it for the next one.
246 psq_lu SX0_SY0, 8(srcBase), 0, 0
247 ps_madds0 DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn
248
249 // DX0=M02*SZ0+DX0, DY0=M12*SZ0+DY0
250 // DZ0=M22*SZ0+DZ0
251 // DX1=M02*SZ1+DX1, DY1=M12*SZ1+DY1
252 // DZ1=M22*SZ1+DZ1
253
254 // Write final values to temp registers
255 ps_madds0 WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
256 ps_madds0 WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
257 psq_lu SZ0_SX1, 8(srcBase), 0, 0
258 ps_madds1 WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
259 ps_madds1 WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
260 psq_lu SY1_SZ1,8(srcBase), 0, 0
261
262 // -------------------------- LOOP START
// Each pass stores the W* results of the previous pair and computes the
// current pair into fresh W* values. Per vector, the stores are a pair
// store of x,y (displacement 4) followed by a single store of z
// (displacement 8, W = 1), i.e. 12 bytes per output vector.
263 _mloop:
264 ps_madds0 DX0_DY0, M00_M10, SX0_SY0, M03_M13
265 psq_stu WX0_WY0, 4(dstBase), 0, 0
266 ps_madds0 DZ0_nnn, M20_nnn, SX0_SY0, M23_nnn
267 psq_stu WZ0_nnn, 8(dstBase), 1, 0
268 ps_madds1 DX1_DY1, M00_M10, SZ0_SX1, M03_M13
269 psq_stu WX1_WY1, 4(dstBase), 0, 0
270 ps_madds1 DZ1_nnn, M20_nnn, SZ0_SX1, M23_nnn
271 psq_stu WZ1_nnn, 8(dstBase), 1, 0
272 ps_madds1 DX0_DY0, M01_M11, SX0_SY0, DX0_DY0
273 ps_madds1 DZ0_nnn, M21_nnn, SX0_SY0, DZ0_nnn
274 // YYY LAST SX0_SY0 USAGE FOR THIS ITERATION
275 psq_lu SX0_SY0, 8(srcBase), 0, 0 // NEXT SX0 SY0
276 ps_madds0 DX1_DY1, M01_M11, SY1_SZ1, DX1_DY1
277 ps_madds0 DZ1_nnn, M21_nnn, SY1_SZ1, DZ1_nnn
278
279 // Write final values to temp registers
280 ps_madds0 WX0_WY0, M02_M12, SZ0_SX1, DX0_DY0
281 ps_madds0 WZ0_nnn, M22_nnn, SZ0_SX1, DZ0_nnn
282 // YYY LAST SZ0_SX1 USAGE FOR THIS ITERATION
283 psq_lu SZ0_SX1, 8(srcBase), 0, 0 // NEXT SZ0 SX1
284 ps_madds1 WX1_WY1, M02_M12, SY1_SZ1, DX1_DY1
285 ps_madds1 WZ1_nnn, M22_nnn, SY1_SZ1, DZ1_nnn
286 // YYY LAST SY1_SZ1 USAGE FOR THIS ITERATION
287 psq_lu SY1_SZ1,8(srcBase), 0, 0 // NEXT SY1 SZ1
288
289 bdnz+ _mloop // -------------------------- LOOP END
290
// Epilogue: store the last computed pair. The first vector of the pair is
// always stored. rlwinm. extracts bit 0 of count into r7 and sets CR0, so
// bne skips the second vector's stores when count is odd. In that case the
// second vector was computed from a slot past the end of the source array.
291 psq_stu WX0_WY0, 4(dstBase), 0, 0
292 rlwinm. r7, count, 0, 31, 31 // Check odd
293 psq_stu WZ0_nnn, 8(dstBase), 1, 0
294 bne _return
295 // Skipped if odd number of vectors
296 psq_stu WX1_WY1, 4(dstBase), 0, 0
297 // Can't put anything here
298 psq_stu WZ1_nnn, 8(dstBase), 1, 0
299
300
// Restore fp14-fp18 from the slots written at entry, then release the frame.
// For each register psq_l runs first and lfd second; presumably the
// paired-single image supplies ps1 and the stfd slot then supplies the full
// 64-bit value, so keep this order.
301 _return:
302 psq_l fp14, 16(rsp), 0, 0
303 lfd fp14, 8(rsp)
304 psq_l fp15, 32(rsp), 0, 0
305 lfd fp15, 24(rsp)
306 psq_l fp16, 48(rsp), 0, 0
307 lfd fp16, 40(rsp)
308 psq_l fp17, 64(rsp), 0, 0
309 lfd fp17, 56(rsp)
310 psq_l fp18, 80(rsp), 0, 0
311 lfd fp18, 72(rsp)
312 addi r1, r1, 96
313 blr
314
// Release the register aliases so they cannot leak into later code.
315 #undef M00_M10
316 #undef M20_nnn
317 #undef M01_M11
318 #undef M21_nnn
319 #undef M02_M12
320 #undef M22_nnn
321 #undef M03_M13
322 #undef M23_nnn
323 #undef SX0_SY0
324 #undef SZ0_SX1
325 #undef SY1_SZ1
326 #undef DX0_DY0
327 #undef DZ0_nnn
328 #undef DX1_DY1
329 #undef DZ1_nnn
330 #undef WX0_WY0
331 #undef WZ0_nnn
332 #undef WX1_WY1
333 #undef WZ1_nnn
334
335 }
336
337
338 #endif // GEKKO
339
340
341 /*===========================================================================*/
342