/*---------------------------------------------------------------------------*
  Project:  Horizon
  File:     math_Matrix34.cpp

  Copyright (C)2009-2012 Nintendo Co., Ltd.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.  They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

  $Rev: 46347 $
 *---------------------------------------------------------------------------*/

#include <nn/math.h>

#include <cmath>
#include <nn/math/ARMv6/math_Matrix34.h>

#if !defined(NN_MATH_AS_INLINE)
#include <nn/math/ARMv6/inline/math_Matrix34.ipp>
#endif

namespace nn {
namespace math {
namespace ARMv6 {

#include <nn/hw/ARM/code32.h>

31 NN_FUNC_ATTR_PRIVATE_SECTION
32 asm MTX34*
MTX34CopyAsm(MTX34 *,const MTX34 *)33 MTX34CopyAsm(MTX34* , const MTX34*)
34 {
35     CMP         r1,r0           // Are p and pOut the same?
36     BXEQ        lr              // If the same, returns without copying
37     VLDMIA      r1!,{s0-s5}     // Segment and load to shorten stall times due to data hazards
38     MOV         r2, r0          //
39     VLDMIA      r1,{s6-s11}
40     VSTMIA      r2!,{s0-s5}
41     VSTMIA      r2,{s6-s11}
42     BX          lr
43 }
44 
45 NN_FUNC_ATTR_PRIVATE_SECTION
46 asm MTX34*
MTX34MultAsm_ORG(MTX34 *,const MTX34 *,const MTX34 *)47 MTX34MultAsm_ORG(MTX34*, const MTX34*, const MTX34*)
48 {
49     VPUSH       {d8-d15}            // Save registers
50 
51     VLDMIA      r2,{s8-s19}         // The entire p2 matrix is put in the [S8-S19] registers
52     VLDMIA      r1!,{s0-s7}         // Matrix p1 is put into the [S0-S7] registers
53 
54     VMUL.F32    s20,s8,s0
55     VMUL.F32    s21,s9,s0
56     VMUL.F32    s22,s10,s0
57     VMUL.F32    s23,s11,s0
58 
59     VMUL.F32    s24,s8,s4
60     VMUL.F32    s25,s9,s4
61     VLDR.F32    s0,[r1,#0]          // Continuation of p1
62     VMUL.F32    s26,s10,s4
63     VMUL.F32    s27,s11,s4
64 
65     VMUL.F32    s28,s8,s0
66     VMUL.F32    s29,s9,s0
67     VMUL.F32    s30,s10,s0
68     VMUL.F32    s31,s11,s0
69 
70     VMLA.F32    s20,s12,s1
71     VMLA.F32    s21,s13,s1
72     VLDR.F32    s4,[r1,#4]          // Continuation of p1
73     VMLA.F32    s22,s14,s1
74     VMLA.F32    s23,s15,s1
75 
76     VMLA.F32    s24,s12,s5
77     VMLA.F32    s25,s13,s5
78     VMLA.F32    s26,s14,s5
79     VMLA.F32    s27,s15,s5
80 
81     VMLA.F32    s28,s12,s4
82     VMLA.F32    s29,s13,s4
83     VMLA.F32    s30,s14,s4
84     VMLA.F32    s31,s15,s4
85 
86     VLDR.F32    s1,[r1,#8]          // Continuation of p1
87     VMLA.F32    s23,s19,s2
88     VMLA.F32    s20,s16,s2
89     VMLA.F32    s21,s17,s2
90     VMLA.F32    s22,s18,s2
91 
92     VLDR.F32    s5,[r1,#12]         // Continuation of p1
93     VMLA.F32    s27,s19,s6
94     VMLA.F32    s24,s16,s6
95     VMLA.F32    s25,s17,s6
96     VMLA.F32    s26,s18,s6
97 
98     VADD.F32    s23,s23,s3
99 
100     VMLA.F32    s31,s19,s1
101     VMLA.F32    s28,s16,s1
102     VMLA.F32    s29,s17,s1
103     VMLA.F32    s30,s18,s1
104 
105     VADD.F32    s27,s27,s7
106     VADD.F32    s31,s31,s5
107 
108     VSTMIA      r0!,{s20-s23}       // Store result
109     VPOP        {d8-d11}            // Faster to segment POP
110     VSTMIA      r0!,{s24-s27}       // Store result
111     VPOP        {d12-d13}           // Faster to break into small pieces
112     VSTMIA      r0,{s28-s31}        // Store result
113     VPOP        {d14-d15}           // Faster to break into chunks
114     BX          lr                  // Return
115 }
116 
117 NN_FUNC_ATTR_PRIVATE_SECTION
118 asm MTX34*
MTX34MultAsm(MTX34 *,const MTX34 *,const MTX34 *)119 MTX34MultAsm(MTX34*, const MTX34*, const MTX34*)
120 {
121     VPUSH       {d8-d10}            // Save registers
122 
123     VLDR.F32    s3,[r1,#4*4*0+4*3]  // Matrix p1[0][3]
124     VLDR.F32    s7,[r1,#4*4*1+4*3]  // Matrix p1[1][3]
125     VLDR.F32    s11,[r1,#4*4*2+4*3] // Matrix p1[2][3]
126 
127     VLDMIA      r2!,{s12-s15}       // Matrix p2 is put into the [S12-S15] registers
128 
129     VLDR.F32    s20,[r1,#4*4*0+4*0] // Matrix p1[0][0]
130     VLDR.F32    s21,[r1,#4*4*1+4*0] // Matrix p1[1][0]
131     VMUL.F32    s0,s12,s20
132     VMUL.F32    s1,s13,s20
133     VMUL.F32    s2,s14,s20
134     VMLA.F32    s3,s15,s20
135 
136     VLDR.F32    s20,[r1,#4*4*2+4*0] // Matrix p1[2][0]
137     VMUL.F32    s4,s12,s21
138     VMUL.F32    s5,s13,s21
139     VMUL.F32    s6,s14,s21
140     VMLA.F32    s7,s15,s21
141     VLDMIA      r2!,{s16-s19}       // Matrix p2 is put into the [S16-S19] registers
142 
143     VLDR.F32    s21,[r1,#4*4*0+4*1] // Matrix p1[0][1]
144     VMUL.F32    s8,s12,s20
145     VMUL.F32    s9,s13,s20
146     VMUL.F32    s10,s14,s20
147     VMLA.F32    s11,s15,s20
148     VLDMIA      r2,{s12-s15}        // Matrix p2 is put into the [S12-S15] registers
149 
150     VLDR.F32    s20,[r1,#4*4*1+4*1] // Matrix p1[1][1]
151     VMLA.F32    s0,s16,s21
152     VMLA.F32    s1,s17,s21
153     VMLA.F32    s2,s18,s21
154     VMLA.F32    s3,s19,s21
155 
156     VLDR.F32    s21,[r1,#4*4*2+4*1] // Matrix p1[2][1]
157     VMLA.F32    s4,s16,s20
158     VMLA.F32    s5,s17,s20
159     VMLA.F32    s6,s18,s20
160     VMLA.F32    s7,s19,s20
161 
162     VLDR.F32    s20,[r1,#4*4*0+4*2] // Matrix p1[0][2]
163     VMLA.F32    s8,s16,s21
164     VMLA.F32    s9,s17,s21
165     VMLA.F32    s10,s18,s21
166     VMLA.F32    s11,s19,s21
167 
168     VLDR.F32    s21,[r1,#4*4*1+4*2] // Matrix p1[1][2]
169     VMLA.F32    s0,s12,s20
170     VMLA.F32    s1,s13,s20
171     VMLA.F32    s2,s14,s20
172     VMLA.F32    s3,s15,s20
173 
174     VLDR.F32    s20,[r1,#4*4*2+4*2] // Matrix p1[2][2]
175     VMLA.F32    s4,s12,s21
176     VMLA.F32    s5,s13,s21
177     VMLA.F32    s6,s14,s21
178     VMLA.F32    s7,s15,s21
179 
180     VMLA.F32    s8,s12,s20
181     VMLA.F32    s9,s13,s20
182     VMLA.F32    s10,s14,s20
183     VMLA.F32    s11,s15,s20
184 
185     VPOP        {d8-d10}            // POP
186     MOV         r1,r0
187     VSTMIA      r1!,{s0-s3}         // Store result
188     VSTMIA      r1,{s4-s11}         // Store result
189     BX          lr                  // Return
190 }
191 
192 NN_FUNC_ATTR_PRIVATE_SECTION
193 asm MTX34*
MTX34MultAsm(MTX34 *,const MTX34 *,f32)194 MTX34MultAsm(MTX34*, const MTX34*, f32)
195 {
196     VLDMIA      r1,{s2-s13}         // Matrix p is put into the [S1-S12] registers
197 
198     VMUL.F32    s2,s2,s0
199     VMUL.F32    s3,s3,s0
200     VMUL.F32    s4,s4,s0
201     VMUL.F32    s5,s5,s0
202 
203     VMUL.F32    s6,s6,s0
204     VMUL.F32    s7,s7,s0
205     VMUL.F32    s8,s8,s0
206     VMUL.F32    s9,s9,s0
207 
208     VMUL.F32    s10,s10,s0
209     VMUL.F32    s11,s11,s0
210     VMUL.F32    s12,s12,s0
211     VMUL.F32    s13,s13,s0
212 
213     VSTMIA      r0,{s2-s13}         // Store result
214     BX          lr                  // Return
215 }
216 
217 NN_FUNC_ATTR_PRIVATE_SECTION
218 asm MTX34*
MTX34AddAsm(MTX34 *,const MTX34 *,const MTX34 *)219 MTX34AddAsm(MTX34*, const MTX34*, const MTX34*)
220 {
221     VPUSH       {d8-d9}             // Save registers
222     VLDMIA      r2,{s0-s11}         // The entire p2 matrix is put in the [S0-S11] registers
223     VLDMIA      r1!,{s12-s19}       // Matrix p1 is put into the [S12-S19] registers
224 
225     VADD.F32    s0,s12,s0
226     VADD.F32    s1,s13,s1
227     VADD.F32    s2,s14,s2
228 
229     VADD.F32    s3,s15,s3
230     VADD.F32    s4,s16,s4
231     VLDMIA      r1!,{s12-s15}       // Continuation of p1
232     VADD.F32    s5,s17,s5
233 
234     VADD.F32    s6,s18,s6
235     VADD.F32    s7,s19,s7
236     VADD.F32    s8,s12,s8
237 
238     VADD.F32    s9,s13,s9
239     VADD.F32    s10,s14,s10
240     VADD.F32    s11,s15,s11
241 
242     VPOP        {d8-d9}             // Register return
243     VSTMIA      r0,{s0-s11}         // Store result
244     BX          lr                  // Return
245 }
246 
247 NN_FUNC_ATTR_PRIVATE_SECTION
248 asm MTX34*
MTX34MAddAsm(MTX34 *,f32,const MTX34 *,const MTX34 *)249 MTX34MAddAsm(MTX34*, f32, const MTX34*, const MTX34*)
250 {
251     VPUSH       {d8-d10}            // Save registers
252     VLDMIA      r2,{s2-s13}         // The entire p2 matrix is put in the [S2-S13] registers
253     VLDMIA      r1!,{s14-s21}       // Matrix p1 is put into the [S14-S21] registers
254 
255     VMLA.F32    s2,s14,s0
256     VMLA.F32    s3,s15,s0
257     VMLA.F32    s4,s16,s0
258     VMLA.F32    s5,s17,s0
259     VLDMIA      r1,{s14-s17}        // Continuation of p1
260 
261     VMLA.F32    s6,s18,s0
262     VMLA.F32    s7,s19,s0
263     VMLA.F32    s8,s20,s0
264     VMLA.F32    s9,s21,s0
265 
266     VMLA.F32    s10,s14,s0
267     VMLA.F32    s11,s15,s0
268     VMLA.F32    s12,s16,s0
269     VMLA.F32    s13,s17,s0
270 
271     VPOP        {d8-d10}            // Register return
272     VSTMIA      r0,{s2-s13}         // Store result
273     BX          lr                  // Return
274 }
275 
276 NN_FUNC_ATTR_PRIVATE_SECTION
277 asm MTX34*
MTX34MultScaleAsm(MTX34 *,const MTX34 *,const VEC3 *)278 MTX34MultScaleAsm(MTX34* , const MTX34* , const VEC3* )
279 {
280     VLDMIA      r1,{s0-s11}         // Matrix p is put into the [S0-S11] registers
281     VLDMIA      r2,{s12-s14}        // VEC3 is put into the [S12-S14] registers
282 
283     VMUL.F32    s0,s0,s12
284     VMUL.F32    s1,s1,s13
285     VMUL.F32    s2,s2,s14
286 
287     VMUL.F32    s4,s4,s12
288     VMUL.F32    s5,s5,s13
289     VMUL.F32    s6,s6,s14
290 
291     VMUL.F32    s8,s8,s12
292     VMUL.F32    s9,s9,s13
293     VMUL.F32    s10,s10,s14
294 
295     VSTMIA      r0,{s0-s11}         // Store result
296     BX          lr                  // Return
297 }
298 
299 NN_FUNC_ATTR_PRIVATE_SECTION
300 asm MTX34*
MTX34MultScaleAsm(MTX34 *,const VEC3 *,const MTX34 *)301 MTX34MultScaleAsm(MTX34* , const VEC3*, const MTX34* )
302 {
303     VLDMIA      r2,{s0-s11}         // Matrix p is put into the [S0-S11] registers
304     VLDMIA      r1,{s12-s14}        // VEC3 is put into the [S12-S14] registers
305 
306     VMUL.F32    s0,s0,s12
307     VMUL.F32    s1,s1,s12
308     VMUL.F32    s2,s2,s12
309     VMUL.F32    s3,s3,s12
310 
311     VMUL.F32    s4,s4,s13
312     VMUL.F32    s5,s5,s13
313     VMUL.F32    s6,s6,s13
314     VMUL.F32    s7,s7,s13
315 
316     VMUL.F32    s8,s8,s14
317     VMUL.F32    s9,s9,s14
318     VMUL.F32    s10,s10,s14
319     VMUL.F32    s11,s11,s14
320 
321     VSTMIA      r0,{s0-s11}         // Store result
322     BX          lr                  // Return
323 }
324 
325 NN_FUNC_ATTR_PRIVATE_SECTION
326 asm u32
MTX34InverseAsm(MTX34 *,const MTX34 *)327 MTX34InverseAsm(MTX34*, const MTX34* )
328 {
329     VLDMIA      r1,{s0-s2}
330     ADD         r1,#4*4
331     VLDMIA      r1,{s3-s5}
332     ADD         r1,#4*4
333     VLDMIA      r1,{s6-s8}
334 
335     VMUL.F32    s10,s0,s4
336     VMUL.F32    s11,s1,s5
337     VMUL.F32    s12,s2,s3
338     VMUL.F32    s13,s6,s4
339     VMUL.F32    s14,s3,s1
340     VMUL.F32    s15,s0,s7
341 
342     VMUL.F32    s10,s10,s8
343     VMUL.F32    s11,s11,s6
344     VMUL.F32    s12,s12,s7
345 
346     VMLS.F32    s10,s13,s2
347     VMLS.F32    s11,s14,s8
348     VMLS.F32    s12,s15,s5
349 
350     VADD.F32    s10,s10,s11
351     VLDR.F32    s15,=1.0
352     VADD.F32    s10,s10,s12
353 
354     FMRS        r2,s10
355     CMP         r2,#0x80000000
356     CMPNE       r2,#0
357     MOVEQ       r0,#0
358     BXEQ        lr
359 
360     VPUSH       {d8-d12}            // Save registers
361     VDIV.F32    s15,s10
362 
363     VMUL.F32    s16,s4,s8
364     VMUL.F32    s17,s1,s8
365     VMUL.F32    s18,s1,s5
366 
367     VMUL.F32    s19,s3,s8
368     VMUL.F32    s20,s0,s8
369     VMUL.F32    s21,s0,s5
370 
371     VMUL.F32    s22,s3,s7
372     VMUL.F32    s23,s0,s7
373     VMUL.F32    s24,s0,s4
374 
375 
376     VMLS.F32    s16,s7,s5
377     VMLS.F32    s17,s7,s2
378     VMLS.F32    s18,s4,s2
379 
380     VMLS.F32    s19,s6,s5
381     VMLS.F32    s20,s6,s2
382     VMLS.F32    s21,s3,s2
383 
384     VMLS.F32    s22,s6,s4
385     VMLS.F32    s23,s6,s1
386     VMLS.F32    s24,s3,s1
387 
388 
389     VMUL.F32    s0,s16,s15          // m[0][0]
390     VNMUL.F32   s1,s17,s15          // m[0][1]
391     VMUL.F32    s2,s18,s15          // m[0][2]
392 
393     VNMUL.F32   s4,s19,s15          // m[1][0]
394     VMUL.F32    s5,s20,s15          // m[1][1]
395     VNMUL.F32   s6,s21,s15          // m[1][2]
396 
397     VMUL.F32    s8,s22,s15          // m[2][0]
398     VLDR.F32    s12,[r1,#4*3-(4*4*2)]// src[0][3]
399     VNMUL.F32   s9,s23,s15          // m[2][1]
400     VMUL.F32    s10,s24,s15         // m[2][2]
401 
402     VNMUL.F32   s3,s0,s12
403     VLDR.F32    s13,[r1,#4*3-(4*4)] // src[1][3]
404     VNMUL.F32   s7,s4,s12
405     VNMUL.F32   s11,s8,s12
406 
407     VMLS.F32    s3,s1,s13
408     VLDR.F32    s14,[r1,#4*3]       // src[2][3]
409     VMLS.F32    s7,s5,s13
410     VMLS.F32    s11,s9,s13
411 
412     VMLS.F32    s3,s2,s14
413     VMLS.F32    s7,s6,s14
414     VMLS.F32    s11,s10,s14
415 
416     VPOP        {d8-d12}            // Register return
417     VSTMIA      r0,{s0-s11}         // Store result
418     MOV         r0,#1
419     BX          lr                  // Return
420 }
421 
422 NN_FUNC_ATTR_PRIVATE_SECTION
423 asm u32
MTX34InvTransposeAsm(MTX34 *,const MTX34 *)424 MTX34InvTransposeAsm(MTX34*, const MTX34* )
425 {
426     VLDMIA      r1,{s0-s2}
427     ADD         r1,#4*4
428     VLDMIA      r1,{s3-s5}
429     ADD         r1,#4*4
430     VLDMIA      r1,{s6-s8}
431 
432     VMUL.F32    s10,s0,s4
433     VMUL.F32    s11,s1,s5
434     VMUL.F32    s12,s2,s3
435     VMUL.F32    s13,s6,s4
436     VMUL.F32    s14,s3,s1
437     VMUL.F32    s15,s0,s7
438 
439     VMUL.F32    s10,s10,s8
440     VMUL.F32    s11,s11,s6
441     VMUL.F32    s12,s12,s7
442 
443     VMLS.F32    s10,s13,s2
444     VMLS.F32    s11,s14,s8
445     VMLS.F32    s12,s15,s5
446 
447     VADD.F32    s10,s10,s11
448     VLDR.F32    s15,=1.0
449     VADD.F32    s10,s10,s12
450 
451     FMRS        r2,s10
452     CMP         r2,#0x80000000
453     CMPNE       r2,#0
454     MOVEQ       r0,#0
455     BXEQ        lr
456 
457     VPUSH       {d8-d12}            // Save registers
458     VDIV.F32    s15,s10
459 
460     VMUL.F32    s16,s4,s8
461     VMUL.F32    s17,s3,s8
462     VMUL.F32    s18,s3,s7
463 
464     VMUL.F32    s19,s1,s8
465     VMUL.F32    s20,s0,s8
466     VMUL.F32    s21,s0,s7
467 
468     VMUL.F32    s22,s1,s5
469     VMUL.F32    s23,s0,s5
470     VMUL.F32    s24,s0,s4
471 
472 
473     VMLS.F32    s16,s7,s5
474     VMLS.F32    s17,s6,s5
475     VMLS.F32    s18,s6,s4
476 
477     VMLS.F32    s19,s7,s2
478     VMLS.F32    s20,s6,s2
479     VMLS.F32    s21,s6,s1
480 
481     VMLS.F32    s22,s4,s2
482     VMLS.F32    s23,s3,s2
483     VMLS.F32    s24,s3,s1
484 
485 
486     VMUL.F32    s0,s16,s15          // m[0][0]
487     VNMUL.F32   s1,s17,s15          // m[0][1]
488     VMUL.F32    s2,s18,s15          // m[0][2]
489 
490     VNMUL.F32   s4,s19,s15          // m[1][0]
491     VMUL.F32    s5,s20,s15          // m[1][1]
492     VNMUL.F32   s6,s21,s15          // m[1][2]
493 
494     VMUL.F32    s8,s22,s15          // m[2][0]
495     VNMUL.F32   s9,s23,s15          // m[2][1]
496     VMUL.F32    s10,s24,s15         // m[2][2]
497 
498     VLDR.F32    s3,=0.0
499     VLDR.F32    s7,=0.0
500     VLDR.F32    s11,=0.0
501 
502     VPOP        {d8-d12}            // Register return
503     VSTMIA      r0,{s0-s11}         // Store result
504     MOV         r0,#1
505     BX          lr                  // Return
506 
507 }
508 
509 NN_FUNC_ATTR_PRIVATE_SECTION
510 asm MTX34*
MTX34MultTranslateAsm(MTX34 *,const VEC3 *,const MTX34 *)511 MTX34MultTranslateAsm(MTX34*, const VEC3*, const MTX34*)
512 {
513     VLDMIA      r2,{s0-s11}         // The entire pM matrix is put in the [S0-S11] registers
514     VLDMIA      r1,{s12-s14}        // All vectors are put in the [S0-S11] registers
515 
516     VADD.F32    s3,s3,s12
517     VADD.F32    s7,s7,s13
518     VADD.F32    s11,s11,s14
519 
520     VSTMIA      r0,{s0-s11}         // Store result
521     BX          lr                  // Return
522 }
523 
524 NN_FUNC_ATTR_PRIVATE_SECTION
525 asm MTX34*
MTX34MultTranslateAsm(MTX34 *,const MTX34 *,const VEC3 *)526 MTX34MultTranslateAsm(MTX34*, const MTX34*, const VEC3*)
527 {
528     VLDMIA      r1,{s0-s11}         // The entire pM matrix is put in the [S0-S11] registers
529     VLDMIA      r2,{s12-s14}        // All vectors are put in the [S0-S11] registers
530 
531     VMLA.F32    s3,s0,s12
532     VMLA.F32    s7,s4,s12
533     VMLA.F32    s11,s8,s12
534 
535     VMLA.F32    s3,s1,s13
536     VMLA.F32    s7,s5,s13
537     VMLA.F32    s11,s9,s13
538 
539     VMLA.F32    s3,s2,s14
540     VMLA.F32    s7,s6,s14
541     VMLA.F32    s11,s10,s14
542 
543     VSTMIA      r0,{s0-s11}         // Store result
544     BX          lr                  // Return
545 
546 }
547 
548 NN_FUNC_ATTR_PRIVATE_SECTION
VEC3TransformAsm(VEC3 *,const MTX34 *,const VEC3 *)549 asm VEC3* VEC3TransformAsm(VEC3* , const MTX34* , const VEC3* )
550 {
551     VLDMIA      r1,{s0-s11}         // The entire pM matrix is put in the [S0-S11] registers
552     VLDMIA      r2,{s12-s14}        // All vectors are put in the [S0-S11] registers
553 
554     VMLA.F32    s3,s0,s12
555     VMLA.F32    s7,s4,s12
556     VMLA.F32    s11,s8,s12
557 
558     VMLA.F32    s3,s1,s13
559     VMLA.F32    s7,s5,s13
560     VMLA.F32    s11,s9,s13
561 
562     VMLA.F32    s3,s2,s14
563     VMLA.F32    s7,s6,s14
564     VMLA.F32    s11,s10,s14
565 
566     VSTR.F32    s3,[r0,#0]
567     VSTR.F32    s7,[r0,#4]
568     VSTR.F32    s11,[r0,#8]         // Store result
569     BX          lr                  // Return
570 
571 }
572 
573 NN_FUNC_ATTR_PRIVATE_SECTION
574 asm MTX34*
MTX34TransposeAsm(MTX34 *,const MTX34 *)575 MTX34TransposeAsm(MTX34* , const MTX34* )
576 {
577     VLDR.F32    s0,[r1,#0*16+0*4]
578     VLDR.F32    s1,[r1,#1*16+0*4]
579     VLDR.F32    s2,[r1,#2*16+0*4]
580     VLDR.F32    s3,=0.0
581     VLDR.F32    s4,[r1,#0*16+1*4]
582     VLDR.F32    s5,[r1,#1*16+1*4]
583     VLDR.F32    s6,[r1,#2*16+1*4]
584     VLDR.F32    s7,=0.0
585     VLDR.F32    s8,[r1,#0*16+2*4]
586     VLDR.F32    s9,[r1,#1*16+2*4]
587     VLDR.F32    s10,[r1,#2*16+2*4]
588     VLDR.F32    s11,=0.0
589 
590     VSTMIA      r0,{s0-s11}         // Store result
591     BX          lr                  // Return
592 }
593 
594 NN_FUNC_ATTR_PRIVATE_SECTION
595 asm MTX34*
MTX34ScaleAsm(MTX34 *,const VEC3 *)596 MTX34ScaleAsm(MTX34* , const VEC3* )
597 {
598     VLDR.F32    s0,[r1,#0]
599     VLDR.F32    s1,=0.0
600     VLDR.F32    s2,=0.0
601     VLDR.F32    s3,=0.0
602     VLDR.F32    s4,=0.0
603     VLDR.F32    s5,[r1,#4]
604     VLDR.F32    s6,=0.0
605     VLDR.F32    s7,=0.0
606     VLDR.F32    s8,=0.0
607     VLDR.F32    s9,=0.0
608     VLDR.F32    s10,[r1,#8]
609     VLDR.F32    s11,=0.0
610 
611     VSTMIA      r0,{s0-s11}         // Store result
612     BX          lr                  // Return
613 }
614 
#include <nn/hw/ARM/codereset.h>

}  // namespace ARMv6
}  // namespace math
}  // namespace nn