1 /*---------------------------------------------------------------------------*
2   Project:  Horizon
3   File:     math_Matrix44.cpp
4 
5   Copyright (C)2009-2012 Nintendo Co., Ltd.  All rights reserved.
6 
7   These coded instructions, statements, and computer programs contain
8   proprietary information of Nintendo of America Inc. and/or Nintendo
9   Company Ltd., and are protected by Federal copyright law.  They may
10   not be disclosed to third parties or copied or duplicated in any form,
11   in whole or in part, without the prior written consent of Nintendo.
12 
13   $Rev: 46347 $
14  *---------------------------------------------------------------------------*/
15 
16 #include <nn/math.h>
17 
18 #include <cmath>
19 #include <nn/math/math_Matrix44.h>
20 
21 #if !defined(NN_MATH_AS_INLINE)
22 #include <nn/math/ARMv6/inline/math_Matrix44.ipp>
23 #endif
24 
25 
26 namespace nn{
27 namespace math {
28 namespace ARMv6 {
29 #include <nn/hw/ARM/code32.h>
30 
// Element-wise sum of two 4x4 single-precision matrices using VFP instructions.
//
// Register usage in the body: r0 = destination, r1 = first operand (p1),
// r2 = second operand (p2). The sixteen sums are accumulated in s0-s15 and
// written to [r0] with one store that comes after every load, so the
// destination may be the same matrix as either operand.
// s16-s23 (d8-d11) are used as scratch for p1 and are saved and restored here.
// r0 is never modified, so it still holds the destination pointer on return.
31 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44AddAsm(MTX44 *,const MTX44 *,const MTX44 *)32 asm MTX44* MTX44AddAsm(MTX44* , const MTX44* , const MTX44* )
33 {
34     VPUSH       {d8-d11}            // Save d8-d11 (s16-s23), used below as scratch
35     VLDMIA      r2,{s0-s15}         // s0-s15 <- all 16 elements of p2
36     VLDMIA      r1!,{s16-s23}       // s16-s23 <- p1 elements 0-7; r1 advances past them
37
38     VADD.F32    s0,s16,s0
39     VADD.F32    s1,s17,s1
40     VADD.F32    s2,s18,s2
41     VADD.F32    s3,s19,s3
42
43     VLDMIA      r1!,{s16-s19}       // s16-s19 <- p1 elements 8-11 (previous contents already consumed)
44
45     VADD.F32    s4,s20,s4
46     VADD.F32    s5,s21,s5
47     VADD.F32    s6,s22,s6
48     VADD.F32    s7,s23,s7
49
50     VLDMIA      r1!,{s20-s23}       // s20-s23 <- p1 elements 12-15 (previous contents already consumed)
51
52     VADD.F32    s8,s16,s8
53     VADD.F32    s9,s17,s9
54     VADD.F32    s10,s18,s10
55     VADD.F32    s11,s19,s11
56
57     VADD.F32    s12,s20,s12
58     VADD.F32    s13,s21,s13
59     VADD.F32    s14,s22,s14
60     VADD.F32    s15,s23,s15
61
62     VPOP        {d8-d11}            // Restore d8-d11; the results in s0-s15 are not affected
63     VSTMIA      r0,{s0-s15}         // Write all 16 sums to the destination
64     BX          lr                  // Return (r0 unchanged)
65
66 }
67 
// Copies a 4x4 single-precision matrix (16 floats) from [r1] to [r0].
//
// Returns immediately, without touching memory, when source and destination
// are the same address. Only s0-s15 are used, and r0 is left unmodified, so it
// still holds the destination pointer on return.
68 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44CopyAsm(MTX44 *,const MTX44 *)69 asm MTX44* MTX44CopyAsm(MTX44* , const MTX44* )
70 {
71     CMP         r1,r0           // Is the source the same address as the destination?
72     BXEQ        lr              // If so, return without copying
73     VLDMIA      r1,{s0-s15}     // s0-s15 <- all 16 elements of the source
74     VSTMIA      r0,{s0-s15}     // Write s0-s15 to the destination
75     BX          lr
76 }
77 
// 4x4 matrix product out = p1 * p2 using VFP instructions.
//
// Register usage in the body: r0 = destination, r1 = p1, r2 = p2. Elements are
// addressed as [row*16 + column*4], i.e. four floats per row.
// Output row i is built in s(4i)..s(4i+3) as
//     p1[i][0]*p2row0 + p1[i][1]*p2row1 + p1[i][2]*p2row2 + p1[i][3]*p2row3.
// s16-s19 and s20-s23 alternately hold two rows of p2; s24/s25 hold the p1
// scalar for the current and the next group. Each VLDR of a p1 scalar is
// placed after the last use of the value it replaces, so the instruction order
// must be kept.
// All loads precede the single store, so the destination may be the same
// matrix as p1 or p2. d8-d12 (s16-s25) are saved and restored here. r0 is never
// modified, so it still holds the destination pointer on return.
78 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultAsm(MTX44 *,const MTX44 *,const MTX44 *)79 asm MTX44* MTX44MultAsm(MTX44* , const MTX44* , const MTX44* )
80 {
81     VPUSH       {d8-d12}            // Save d8-d12 (s16-s25), used below as scratch
82
83     VLDMIA      r2!,{s16-s23}       // s16-s19 <- p2 row 0, s20-s23 <- p2 row 1; r2 advances to row 2
84     VLDR.F32    s24,[r1,#16*0+0*4]  // p1[0][0]
85     VLDR.F32    s25,[r1,#16*1+0*4]  // p1[1][0]
86
87     VMUL.F32    s0,s24,s16          // out row 0 = p1[0][0] * p2 row 0
88     VMUL.F32    s1,s24,s17
89     VMUL.F32    s2,s24,s18
90     VMUL.F32    s3,s24,s19
91     VLDR.F32    s24,[r1,#16*2+0*4]  // p1[2][0]
92
93     VMUL.F32    s4,s25,s16          // out row 1 = p1[1][0] * p2 row 0
94     VMUL.F32    s5,s25,s17
95     VMUL.F32    s6,s25,s18
96     VMUL.F32    s7,s25,s19
97     VLDR.F32    s25,[r1,#16*3+0*4]  // p1[3][0]
98
99     VMUL.F32    s8,s24,s16          // out row 2 = p1[2][0] * p2 row 0
100     VMUL.F32    s9,s24,s17
101     VMUL.F32    s10,s24,s18
102     VMUL.F32    s11,s24,s19
103     VLDR.F32    s24,[r1,#16*0+1*4]  // p1[0][1]
104
105     VMUL.F32    s12,s25,s16         // out row 3 = p1[3][0] * p2 row 0
106     VMUL.F32    s13,s25,s17
107     VMUL.F32    s14,s25,s18
108     VMUL.F32    s15,s25,s19
109     VLDR.F32    s25,[r1,#16*1+1*4]  // p1[1][1]
110
111     VLDMIA      r2!,{s16-s19}       // s16-s19 <- p2 row 2 (row 0 is no longer needed); r2 advances to row 3
112
113     VMLA.F32    s0,s24,s20          // out row 0 += p1[0][1] * p2 row 1
114     VMLA.F32    s1,s24,s21
115     VMLA.F32    s2,s24,s22
116     VMLA.F32    s3,s24,s23
117     VLDR.F32    s24,[r1,#16*2+1*4]  // p1[2][1]
118
119     VMLA.F32    s4,s25,s20          // out row 1 += p1[1][1] * p2 row 1
120     VMLA.F32    s5,s25,s21
121     VMLA.F32    s6,s25,s22
122     VMLA.F32    s7,s25,s23
123     VLDR.F32    s25,[r1,#16*3+1*4]  // p1[3][1]
124
125     VMLA.F32    s8,s24,s20          // out row 2 += p1[2][1] * p2 row 1
126     VMLA.F32    s9,s24,s21
127     VMLA.F32    s10,s24,s22
128     VMLA.F32    s11,s24,s23
129     VLDR.F32    s24,[r1,#16*0+2*4]  // p1[0][2]
130
131     VMLA.F32    s12,s25,s20         // out row 3 += p1[3][1] * p2 row 1
132     VMLA.F32    s13,s25,s21
133     VMLA.F32    s14,s25,s22
134     VMLA.F32    s15,s25,s23
135     VLDR.F32    s25,[r1,#16*1+2*4]  // p1[1][2]
136
137     VLDMIA      r2,{s20-s23}        // s20-s23 <- p2 row 3 (row 1 is no longer needed)
138
139     VMLA.F32    s0,s24,s16          // out row 0 += p1[0][2] * p2 row 2
140     VMLA.F32    s1,s24,s17
141     VMLA.F32    s2,s24,s18
142     VMLA.F32    s3,s24,s19
143     VLDR.F32    s24,[r1,#16*2+2*4]  // p1[2][2]
144
145     VMLA.F32    s4,s25,s16          // out row 1 += p1[1][2] * p2 row 2
146     VMLA.F32    s5,s25,s17
147     VMLA.F32    s6,s25,s18
148     VMLA.F32    s7,s25,s19
149     VLDR.F32    s25,[r1,#16*3+2*4]  // p1[3][2]
150
151     VMLA.F32    s8,s24,s16          // out row 2 += p1[2][2] * p2 row 2
152     VMLA.F32    s9,s24,s17
153     VMLA.F32    s10,s24,s18
154     VMLA.F32    s11,s24,s19
155     VLDR.F32    s24,[r1,#16*0+3*4]  // p1[0][3]
156
157     VMLA.F32    s12,s25,s16         // out row 3 += p1[3][2] * p2 row 2
158     VMLA.F32    s13,s25,s17
159     VMLA.F32    s14,s25,s18
160     VMLA.F32    s15,s25,s19
161     VLDR.F32    s25,[r1,#16*1+3*4]  // p1[1][3]
162
163     VMLA.F32    s0,s24,s20          // out row 0 += p1[0][3] * p2 row 3
164     VMLA.F32    s1,s24,s21
165     VMLA.F32    s2,s24,s22
166     VMLA.F32    s3,s24,s23
167     VLDR.F32    s24,[r1,#16*2+3*4]  // p1[2][3]
168
169     VMLA.F32    s4,s25,s20          // out row 1 += p1[1][3] * p2 row 3
170     VMLA.F32    s5,s25,s21
171     VMLA.F32    s6,s25,s22
172     VMLA.F32    s7,s25,s23
173     VLDR.F32    s25,[r1,#16*3+3*4]  // p1[3][3]
174
175     VMLA.F32    s8,s24,s20          // out row 2 += p1[2][3] * p2 row 3
176     VMLA.F32    s9,s24,s21
177     VMLA.F32    s10,s24,s22
178     VMLA.F32    s11,s24,s23
179
180     VMLA.F32    s12,s25,s20         // out row 3 += p1[3][3] * p2 row 3
181     VMLA.F32    s13,s25,s21
182     VMLA.F32    s14,s25,s22
183     VMLA.F32    s15,s25,s23
184
185     VPOP        {d8-d12}            // Restore d8-d12; the results in s0-s15 are not affected
186     VSTMIA      r0,{s0-s15}         // Write the 16-element product to the destination
187     BX          lr                  // Return (r0 unchanged)
188
189 }
190 
// Multiplies every element of a 4x4 single-precision matrix by a scalar.
//
// Register usage in the body: r0 = destination, r1 = source matrix. The scalar
// is read from s0 without being loaded, so it must already be there on entry
// (presumably the f32 argument under a hard-float calling convention - confirm
// against the build settings).
// Because s0 is occupied by the scalar, source element 0 is parked in s16 and
// multiplied last, once no other product still needs the scalar.
// d8 (s16-s17) is saved and restored here. All loads precede the store, so the
// destination may be the source matrix. r0 is never modified.
191 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultAsm(MTX44 *,const MTX44 *,f32)192 asm MTX44* MTX44MultAsm(MTX44* , const MTX44* , f32 )
193 {
194     VPUSH       {d8}               // Save d8 (s16-s17); s16 is used below as scratch
195     VLDMIA      r1!,{s16}          // s16 <- element 0 (kept out of s0, which holds the scalar); r1 advances to element 1
196     VLDMIA      r1,{s1-s15}        // s1-s15 <- elements 1-15
197
198     VMUL.F32    s1,s1,s0
199     VMUL.F32    s2,s2,s0
200     VMUL.F32    s3,s3,s0
201
202     VMUL.F32    s4,s4,s0
203     VMUL.F32    s5,s5,s0
204     VMUL.F32    s6,s6,s0
205     VMUL.F32    s7,s7,s0
206
207     VMUL.F32    s8,s8,s0
208     VMUL.F32    s9,s9,s0
209     VMUL.F32    s10,s10,s0
210     VMUL.F32    s11,s11,s0
211
212     VMUL.F32    s12,s12,s0
213     VMUL.F32    s13,s13,s0
214     VMUL.F32    s14,s14,s0
215     VMUL.F32    s15,s15,s0
216
217     VMUL.F32    s0,s16,s0          // Element 0 last: this overwrites the scalar in s0
218     VPOP        {d8}                // Restore d8; the results in s0-s15 are not affected
219     VSTMIA      r0,{s0-s15}         // Write all 16 products to the destination
220     BX          lr                  // Return (r0 unchanged)
221 }
222 
223 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultScaleAsm(MTX44 *,const MTX44 *,const VEC3 *)224 asm MTX44* MTX44MultScaleAsm(MTX44*, const MTX44*, const VEC3*)
225 {
226     VLDMIA      r1,{s0-s11}        // Matrix p is put into the [S0-S11] registers
227     VLDMIA      r2,{s12-s14}       // VEC3 is put into the [S12-S14] registers
228 
229     VMUL.F32    s0,s0,s12
230     VMUL.F32    s1,s1,s13
231     VMUL.F32    s2,s2,s14
232 
233     VMUL.F32    s4,s4,s12
234     VMUL.F32    s5,s5,s13
235     VMUL.F32    s6,s6,s14
236 
237     VMUL.F32    s8,s8,s12
238     VMUL.F32    s9,s9,s13
239     VMUL.F32    s10,s10,s14
240 
241     VSTMIA      r0,{s0-s11}         // Store result
242     BX          lr                  // Return
243 }
244 
245 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultScaleAsm(MTX44 *,const VEC3 *,const MTX44 *)246 asm MTX44* MTX44MultScaleAsm(MTX44*, const VEC3*, const MTX44*)
247 {
248     VLDMIA      r2,{s0-s11}        // Matrix p is put into the [S0-S11] registers
249     VLDMIA      r1,{s12-s14}       // VEC3 is put into the [S12-S14] registers
250 
251     VMUL.F32    s0,s0,s12
252     VMUL.F32    s1,s1,s12
253     VMUL.F32    s2,s2,s12
254     VMUL.F32    s3,s3,s12
255 
256     VMUL.F32    s4,s4,s13
257     VMUL.F32    s5,s5,s13
258     VMUL.F32    s6,s6,s13
259     VMUL.F32    s7,s7,s13
260 
261     VMUL.F32    s8,s8,s14
262     VMUL.F32    s9,s9,s14
263     VMUL.F32    s10,s10,s14
264     VMUL.F32    s11,s11,s14
265 
266     VSTMIA      r0,{s0-s11}         // Store result
267     BX          lr                  // Return
268 }
269 
270 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultTranslateAsm(MTX44 *,const VEC3 *,const MTX44 *)271 asm MTX44* MTX44MultTranslateAsm(MTX44*, const VEC3*, const MTX44*)
272 {
273     VLDMIA      r2,{s0-s11}         // The entire pM matrix is put in the [S0-S11] registers
274     VLDMIA      r1,{s12-s14}        // All vectors are put in the [S12-S14] registers
275 
276     VADD.F32    s3,s3,s12
277     VADD.F32    s7,s7,s13
278     VADD.F32    s11,s11,s14
279 
280     VSTMIA      r0,{s0-s11}         // Store result
281     BX          lr                  // Return
282 }
283 
// Folds a three-component vector into the last column of rows 0-2 of a 4x4
// single-precision matrix and writes all 16 elements to the destination.
//
// Register usage in the body: r0 = destination, r1 = source matrix,
// r2 = vector (three consecutive floats, presumably x, y, z).
// For each of rows 0-2:
//     [i][3] += [i][0]*v0 + [i][1]*v1 + [i][2]*v2
// All other elements of rows 0-2 are passed through, and row 3 is copied
// unchanged. NOTE(review): copying row 3 verbatim equals a full
// matrix-by-translation product only when the first three elements of row 3
// are zero - confirm against callers.
// Only s0-s15 are used. All loads precede the single store, so the destination
// may be the source matrix. r0 is never modified.
284 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44MultTranslateAsm(MTX44 *,const MTX44 *,const VEC3 *)285 asm MTX44* MTX44MultTranslateAsm(MTX44*, const MTX44*, const VEC3*)
286 {
287     VLDMIA      r1!,{s0-s11}        // s0-s11 <- source rows 0-2; r1 advances to row 3
288     VLDMIA      r2,{s12-s14}        // s12-s14 <- the three vector components
289
290     VMLA.F32    s3,s0,s12           // [i][3] += [i][0] * first component, rows 0-2
291     VMLA.F32    s7,s4,s12
292     VMLA.F32    s11,s8,s12
293
294     VMLA.F32    s3,s1,s13           // [i][3] += [i][1] * second component
295     VMLA.F32    s7,s5,s13
296     VMLA.F32    s11,s9,s13
297
298     VMLA.F32    s3,s2,s14           // [i][3] += [i][2] * third component
299     VMLA.F32    s7,s6,s14
300     VMLA.F32    s11,s10,s14
301
302     VLDMIA      r1!,{s12-s15}       // s12-s15 <- source row 3 (the vector is no longer needed)
303
304     VSTMIA      r0,{s0-s15}         // Write all four rows to the destination
305     BX          lr                  // Return (r0 unchanged)
306
307 }
308 
// Transposes a 4x4 single-precision matrix.
//
// Register usage in the body: r0 = destination, r1 = source. Each offset is
// written as row*16 + column*4. Register s(4a+b) is loaded from source element
// [b][a], so the contiguous store of s0-s15 places source [b][a] at
// destination [a][b].
// Only s0-s15 are used. All 16 loads precede the store, so the destination may
// be the source matrix. r0 is never modified, so it still holds the
// destination pointer on return.
309 NN_FUNC_ATTR_PRIVATE_SECTION
MTX44TransposeAsm(MTX44 *,const MTX44 *)310 asm MTX44* MTX44TransposeAsm(MTX44* , const MTX44*)
311 {
312     VLDR.F32    s0,[r1,#0*16+0*4]   // Destination row 0 <- source column 0
313     VLDR.F32    s1,[r1,#1*16+0*4]
314     VLDR.F32    s2,[r1,#2*16+0*4]
315     VLDR.F32    s3,[r1,#3*16+0*4]
316     VLDR.F32    s4,[r1,#0*16+1*4]   // Destination row 1 <- source column 1
317     VLDR.F32    s5,[r1,#1*16+1*4]
318     VLDR.F32    s6,[r1,#2*16+1*4]
319     VLDR.F32    s7,[r1,#3*16+1*4]
320     VLDR.F32    s8,[r1,#0*16+2*4]   // Destination row 2 <- source column 2
321     VLDR.F32    s9,[r1,#1*16+2*4]
322     VLDR.F32    s10,[r1,#2*16+2*4]
323     VLDR.F32    s11,[r1,#3*16+2*4]
324     VLDR.F32    s12,[r1,#0*16+3*4]  // Destination row 3 <- source column 3
325     VLDR.F32    s13,[r1,#1*16+3*4]
326     VLDR.F32    s14,[r1,#2*16+3*4]
327     VLDR.F32    s15,[r1,#3*16+3*4]
328
329     VSTMIA      r0,{s0-s15}         // Write the transposed matrix to the destination
330     BX          lr                  // Return (r0 unchanged)
331 }
332 
// Transforms a three-component vector by a 4x4 single-precision matrix and
// writes a four-component result.
//
// Register usage in the body: r0 = destination (four floats), r1 = matrix,
// r2 = vector (three consecutive floats, presumably x, y, z).
// For each matrix row i (0-3):
//     out[i] = m[i][0]*v0 + m[i][1]*v1 + m[i][2]*v2 + m[i][3]
// The last column of each row (s3, s7, s11, s15) doubles as the accumulator,
// so the vector is treated as if it had a fourth component of one.
// d8-d9 (s16-s19) are saved and restored here because s16-s18 hold the vector.
// r0 is never modified, so it still holds the destination pointer on return.
333 NN_FUNC_ATTR_PRIVATE_SECTION
VEC3TransformAsm(VEC4 *,const MTX44 *,const VEC3 *)334 asm VEC4* VEC3TransformAsm(VEC4*, const MTX44*, const VEC3*)
335 {
336     VPUSH       {d8-d9}             // Save d8-d9 (s16-s19), used below for the vector
337
338     VLDMIA      r1,{s0-s15}         // s0-s15 <- all 16 matrix elements
339     VLDMIA      r2,{s16-s18}        // s16-s18 <- the three vector components
340
341     VMLA.F32    s3,s0,s16           // m[i][3] += m[i][0] * first component, rows 0-3
342     VMLA.F32    s7,s4,s16
343     VMLA.F32    s11,s8,s16
344     VMLA.F32    s15,s12,s16
345
346     VMLA.F32    s3,s1,s17           // += m[i][1] * second component
347     VMLA.F32    s7,s5,s17
348     VMLA.F32    s11,s9,s17
349     VMLA.F32    s15,s13,s17
350
351     VMLA.F32    s3,s2,s18           // += m[i][2] * third component
352     VMLA.F32    s7,s6,s18
353     VMLA.F32    s11,s10,s18
354     VMLA.F32    s15,s14,s18
355
356     VPOP        {d8-d9}             // Restore d8-d9; the results in s3/s7/s11/s15 are not affected
357
358     VSTR.F32    s3,[r0,#0]          // out[0]
359     VSTR.F32    s7,[r0,#4]          // out[1]
360     VSTR.F32    s11,[r0,#8]         // out[2]
361     VSTR.F32    s15,[r0,#12]        // out[3]
362
363     BX          lr                  // Return (r0 unchanged)
364
365 }
366 
367 #include <nn/hw/ARM/codereset.h>
368 
369 }  // namespace ARMv6
370 }  // namespace math
371 }  // namespace nn
372