/*---------------------------------------------------------------------------*
  Project:  Horizon
  File:     math_Matrix44.cpp

  Copyright (C)2009-2012 Nintendo Co., Ltd.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.  They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

  $Rev: 46347 $
 *---------------------------------------------------------------------------*/
15
16 #include <nn/math.h>
17
18 #include <cmath>
19 #include <nn/math/math_Matrix44.h>
20
21 #if !defined(NN_MATH_AS_INLINE)
22 #include <nn/math/ARMv6/inline/math_Matrix44.ipp>
23 #endif
24
25
26 namespace nn{
27 namespace math {
28 namespace ARMv6 {
29 #include <nn/hw/ARM/code32.h>
30
31 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44AddAsm(MTX44 *,const MTX44 *,const MTX44 *)32 asm MTX44* MTX44AddAsm(MTX44* , const MTX44* , const MTX44* )
33 {
34 VPUSH {d8-d11} // Save registers
35 VLDMIA r2,{s0-s15} // The entire p2 matrix is put in the [S0-S15] registers
36 VLDMIA r1!,{s16-s23} // Matrix p1 is put into the [S16-S23] registers
37
38 VADD.F32 s0,s16,s0
39 VADD.F32 s1,s17,s1
40 VADD.F32 s2,s18,s2
41 VADD.F32 s3,s19,s3
42
43 VLDMIA r1!,{s16-s19} // Continuation of p1
44
45 VADD.F32 s4,s20,s4
46 VADD.F32 s5,s21,s5
47 VADD.F32 s6,s22,s6
48 VADD.F32 s7,s23,s7
49
50 VLDMIA r1!,{s20-s23} // Continuation of p1
51
52 VADD.F32 s8,s16,s8
53 VADD.F32 s9,s17,s9
54 VADD.F32 s10,s18,s10
55 VADD.F32 s11,s19,s11
56
57 VADD.F32 s12,s20,s12
58 VADD.F32 s13,s21,s13
59 VADD.F32 s14,s22,s14
60 VADD.F32 s15,s23,s15
61
62 VPOP {d8-d11} // Register return
63 VSTMIA r0,{s0-s15} // Store result
64 BX lr // Return
65
66 }
67
68 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44CopyAsm(MTX44 *,const MTX44 *)69 asm MTX44* MTX44CopyAsm(MTX44* , const MTX44* )
70 {
71 CMP r1,r0 // Are p and pOut the same?
72 BXEQ lr // If the same, returns without copying
73 VLDMIA r1,{s0-s15} // All p are put in the [S0-S15] registers
74 VSTMIA r0,{s0-s15} // All pOut are put in the [S0-S15] registers
75 BX lr
76 }
77
78 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultAsm(MTX44 *,const MTX44 *,const MTX44 *)79 asm MTX44* MTX44MultAsm(MTX44* , const MTX44* , const MTX44* )
80 {
81 VPUSH {d8-d12} // Save registers
82
83 VLDMIA r2!,{s16-s23} // Matrix p2 is put into the [S16-S23] registers
84 VLDR.F32 s24,[r1,#16*0+0*4] // p1[0][0]
85 VLDR.F32 s25,[r1,#16*1+0*4] // p1[1][0]
86
87 VMUL.F32 s0,s24,s16
88 VMUL.F32 s1,s24,s17
89 VMUL.F32 s2,s24,s18
90 VMUL.F32 s3,s24,s19
91 VLDR.F32 s24,[r1,#16*2+0*4] // p1[2][0]
92
93 VMUL.F32 s4,s25,s16
94 VMUL.F32 s5,s25,s17
95 VMUL.F32 s6,s25,s18
96 VMUL.F32 s7,s25,s19
97 VLDR.F32 s25,[r1,#16*3+0*4] // p1[3][0]
98
99 VMUL.F32 s8,s24,s16
100 VMUL.F32 s9,s24,s17
101 VMUL.F32 s10,s24,s18
102 VMUL.F32 s11,s24,s19
103 VLDR.F32 s24,[r1,#16*0+1*4] // p1[0][1]
104
105 VMUL.F32 s12,s25,s16
106 VMUL.F32 s13,s25,s17
107 VMUL.F32 s14,s25,s18
108 VMUL.F32 s15,s25,s19
109 VLDR.F32 s25,[r1,#16*1+1*4] // p1[1][1]
110
111 VLDMIA r2!,{s16-s19} // Matrix p2 is put into the [S16-S19] registers
112
113 VMLA.F32 s0,s24,s20
114 VMLA.F32 s1,s24,s21
115 VMLA.F32 s2,s24,s22
116 VMLA.F32 s3,s24,s23
117 VLDR.F32 s24,[r1,#16*2+1*4] // p1[2][1]
118
119 VMLA.F32 s4,s25,s20
120 VMLA.F32 s5,s25,s21
121 VMLA.F32 s6,s25,s22
122 VMLA.F32 s7,s25,s23
123 VLDR.F32 s25,[r1,#16*3+1*4] // p1[3][1]
124
125 VMLA.F32 s8,s24,s20
126 VMLA.F32 s9,s24,s21
127 VMLA.F32 s10,s24,s22
128 VMLA.F32 s11,s24,s23
129 VLDR.F32 s24,[r1,#16*0+2*4] // p1[0][2]
130
131 VMLA.F32 s12,s25,s20
132 VMLA.F32 s13,s25,s21
133 VMLA.F32 s14,s25,s22
134 VMLA.F32 s15,s25,s23
135 VLDR.F32 s25,[r1,#16*1+2*4] // p1[1][2]
136
137 VLDMIA r2,{s20-s23} // Matrix p2 is put into the [S20-S23] registers
138
139 VMLA.F32 s0,s24,s16
140 VMLA.F32 s1,s24,s17
141 VMLA.F32 s2,s24,s18
142 VMLA.F32 s3,s24,s19
143 VLDR.F32 s24,[r1,#16*2+2*4] // p1[2][2]
144
145 VMLA.F32 s4,s25,s16
146 VMLA.F32 s5,s25,s17
147 VMLA.F32 s6,s25,s18
148 VMLA.F32 s7,s25,s19
149 VLDR.F32 s25,[r1,#16*3+2*4] // p1[3][2]
150
151 VMLA.F32 s8,s24,s16
152 VMLA.F32 s9,s24,s17
153 VMLA.F32 s10,s24,s18
154 VMLA.F32 s11,s24,s19
155 VLDR.F32 s24,[r1,#16*0+3*4] // p1[0][3]
156
157 VMLA.F32 s12,s25,s16
158 VMLA.F32 s13,s25,s17
159 VMLA.F32 s14,s25,s18
160 VMLA.F32 s15,s25,s19
161 VLDR.F32 s25,[r1,#16*1+3*4] // p1[1][3]
162
163 VMLA.F32 s0,s24,s20
164 VMLA.F32 s1,s24,s21
165 VMLA.F32 s2,s24,s22
166 VMLA.F32 s3,s24,s23
167 VLDR.F32 s24,[r1,#16*2+3*4] // p1[2][3]
168
169 VMLA.F32 s4,s25,s20
170 VMLA.F32 s5,s25,s21
171 VMLA.F32 s6,s25,s22
172 VMLA.F32 s7,s25,s23
173 VLDR.F32 s25,[r1,#16*3+3*4] // p1[3][3]
174
175 VMLA.F32 s8,s24,s20
176 VMLA.F32 s9,s24,s21
177 VMLA.F32 s10,s24,s22
178 VMLA.F32 s11,s24,s23
179
180 VMLA.F32 s12,s25,s20
181 VMLA.F32 s13,s25,s21
182 VMLA.F32 s14,s25,s22
183 VMLA.F32 s15,s25,s23
184
185 VPOP {d8-d12} // Register return
186 VSTMIA r0,{s0-s15} // Store result
187 BX lr // Return
188
189 }
190
191 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultAsm(MTX44 *,const MTX44 *,f32)192 asm MTX44* MTX44MultAsm(MTX44* , const MTX44* , f32 )
193 {
194 VPUSH {d8} // Save registers
195 VLDMIA r1!,{s16} // Matrix p is put into the [S2-S17] registers
196 VLDMIA r1,{s1-s15} // Matrix p is put into the [S2-S17] registers
197
198 VMUL.F32 s1,s1,s0
199 VMUL.F32 s2,s2,s0
200 VMUL.F32 s3,s3,s0
201
202 VMUL.F32 s4,s4,s0
203 VMUL.F32 s5,s5,s0
204 VMUL.F32 s6,s6,s0
205 VMUL.F32 s7,s7,s0
206
207 VMUL.F32 s8,s8,s0
208 VMUL.F32 s9,s9,s0
209 VMUL.F32 s10,s10,s0
210 VMUL.F32 s11,s11,s0
211
212 VMUL.F32 s12,s12,s0
213 VMUL.F32 s13,s13,s0
214 VMUL.F32 s14,s14,s0
215 VMUL.F32 s15,s15,s0
216
217 VMUL.F32 s0,s16,s0
218 VPOP {d8} // Register return
219 VSTMIA r0,{s0-s15} // Store result
220 BX lr // Return
221 }
222
223 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultScaleAsm(MTX44 *,const MTX44 *,const VEC3 *)224 asm MTX44* MTX44MultScaleAsm(MTX44*, const MTX44*, const VEC3*)
225 {
226 VLDMIA r1,{s0-s11} // Matrix p is put into the [S0-S11] registers
227 VLDMIA r2,{s12-s14} // VEC3 is put into the [S12-S14] registers
228
229 VMUL.F32 s0,s0,s12
230 VMUL.F32 s1,s1,s13
231 VMUL.F32 s2,s2,s14
232
233 VMUL.F32 s4,s4,s12
234 VMUL.F32 s5,s5,s13
235 VMUL.F32 s6,s6,s14
236
237 VMUL.F32 s8,s8,s12
238 VMUL.F32 s9,s9,s13
239 VMUL.F32 s10,s10,s14
240
241 VSTMIA r0,{s0-s11} // Store result
242 BX lr // Return
243 }
244
245 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultScaleAsm(MTX44 *,const VEC3 *,const MTX44 *)246 asm MTX44* MTX44MultScaleAsm(MTX44*, const VEC3*, const MTX44*)
247 {
248 VLDMIA r2,{s0-s11} // Matrix p is put into the [S0-S11] registers
249 VLDMIA r1,{s12-s14} // VEC3 is put into the [S12-S14] registers
250
251 VMUL.F32 s0,s0,s12
252 VMUL.F32 s1,s1,s12
253 VMUL.F32 s2,s2,s12
254 VMUL.F32 s3,s3,s12
255
256 VMUL.F32 s4,s4,s13
257 VMUL.F32 s5,s5,s13
258 VMUL.F32 s6,s6,s13
259 VMUL.F32 s7,s7,s13
260
261 VMUL.F32 s8,s8,s14
262 VMUL.F32 s9,s9,s14
263 VMUL.F32 s10,s10,s14
264 VMUL.F32 s11,s11,s14
265
266 VSTMIA r0,{s0-s11} // Store result
267 BX lr // Return
268 }
269
270 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultTranslateAsm(MTX44 *,const VEC3 *,const MTX44 *)271 asm MTX44* MTX44MultTranslateAsm(MTX44*, const VEC3*, const MTX44*)
272 {
273 VLDMIA r2,{s0-s11} // The entire pM matrix is put in the [S0-S11] registers
274 VLDMIA r1,{s12-s14} // All vectors are put in the [S12-S14] registers
275
276 VADD.F32 s3,s3,s12
277 VADD.F32 s7,s7,s13
278 VADD.F32 s11,s11,s14
279
280 VSTMIA r0,{s0-s11} // Store result
281 BX lr // Return
282 }
283
284 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultTranslateAsm(MTX44 *,const MTX44 *,const VEC3 *)285 asm MTX44* MTX44MultTranslateAsm(MTX44*, const MTX44*, const VEC3*)
286 {
287 VLDMIA r1!,{s0-s11} // Matrix pM is put into the [S0-S11] registers
288 VLDMIA r2,{s12-s14} // All vectors are put in the [S12-S14] registers
289
290 VMLA.F32 s3,s0,s12
291 VMLA.F32 s7,s4,s12
292 VMLA.F32 s11,s8,s12
293
294 VMLA.F32 s3,s1,s13
295 VMLA.F32 s7,s5,s13
296 VMLA.F32 s11,s9,s13
297
298 VMLA.F32 s3,s2,s14
299 VMLA.F32 s7,s6,s14
300 VMLA.F32 s11,s10,s14
301
302 VLDMIA r1!,{s12-s15} // Matrix pM is put into the [S12-S15] registers
303
304 VSTMIA r0,{s0-s15} // Store result
305 BX lr // Return
306
307 }
308
309 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44TransposeAsm(MTX44 *,const MTX44 *)310 asm MTX44* MTX44TransposeAsm(MTX44* , const MTX44*)
311 {
312 VLDR.F32 s0,[r1,#0*16+0*4]
313 VLDR.F32 s1,[r1,#1*16+0*4]
314 VLDR.F32 s2,[r1,#2*16+0*4]
315 VLDR.F32 s3,[r1,#3*16+0*4]
316 VLDR.F32 s4,[r1,#0*16+1*4]
317 VLDR.F32 s5,[r1,#1*16+1*4]
318 VLDR.F32 s6,[r1,#2*16+1*4]
319 VLDR.F32 s7,[r1,#3*16+1*4]
320 VLDR.F32 s8,[r1,#0*16+2*4]
321 VLDR.F32 s9,[r1,#1*16+2*4]
322 VLDR.F32 s10,[r1,#2*16+2*4]
323 VLDR.F32 s11,[r1,#3*16+2*4]
324 VLDR.F32 s12,[r1,#0*16+3*4]
325 VLDR.F32 s13,[r1,#1*16+3*4]
326 VLDR.F32 s14,[r1,#2*16+3*4]
327 VLDR.F32 s15,[r1,#3*16+3*4]
328
329 VSTMIA r0,{s0-s15} // Store result
330 BX lr // Return
331 }
332
333 NN_FUNC_ATTR_PRIVATE_SECTION
VEC3TransformAsm(VEC4 *,const MTX44 *,const VEC3 *)334 asm VEC4* VEC3TransformAsm(VEC4*, const MTX44*, const VEC3*)
335 {
336 VPUSH {d8-d9} // Save registers
337
338 VLDMIA r1,{s0-s15} // The entire pM matrix is put in the [S0-S15] registers
339 VLDMIA r2,{s16-s18} // All vectors are put in the [S16-S18] registers
340
341 VMLA.F32 s3,s0,s16
342 VMLA.F32 s7,s4,s16
343 VMLA.F32 s11,s8,s16
344 VMLA.F32 s15,s12,s16
345
346 VMLA.F32 s3,s1,s17
347 VMLA.F32 s7,s5,s17
348 VMLA.F32 s11,s9,s17
349 VMLA.F32 s15,s13,s17
350
351 VMLA.F32 s3,s2,s18
352 VMLA.F32 s7,s6,s18
353 VMLA.F32 s11,s10,s18
354 VMLA.F32 s15,s14,s18
355
356 VPOP {d8-d9} // Register return
357
358 VSTR.F32 s3,[r0,#0]
359 VSTR.F32 s7,[r0,#4]
360 VSTR.F32 s11,[r0,#8] // Store result
361 VSTR.F32 s15,[r0,#12] // Store result
362
363 BX lr // Return
364
365 }
366
367 #include <nn/hw/ARM/codereset.h>
368
369 } // namespace ARMv6
370 } // namespace math
371 } // namespace nn
372