1/*---------------------------------------------------------------------------*
2  Project:  Horizon
3  File:     math_Matrix33.ipp
4
5  Copyright (C)2009-2010 Nintendo Co., Ltd.  All rights reserved.
6
7  These coded instructions, statements, and computer programs contain
8  proprietary information of Nintendo of America Inc. and/or Nintendo
9  Company Ltd., and are protected by Federal copyright law.  They may
10  not be disclosed to third parties or copied or duplicated in any form,
11  in whole or in part, without the prior written consent of Nintendo.
12
13  $Revision: 13623 $
14 *---------------------------------------------------------------------------*/
15#include <nn/math/math_Vector3.h>
16
17namespace nn {
18namespace math {
19namespace ARMv6 {
20
21#include <nn/hw/ARM/code32.h>
22/* Please see man pages for details
23
24
25*/
26
27template<typename TMatrix>
28inline asm TMatrix*
29MTX33MultAsm_ORG(TMatrix*, const TMatrix*, const TMatrix*)
30{
31    VPUSH       {d8-d13}            // Save registers
32
33    //Because a template is being used, calculate an offset value for the matrix...
34    MOV         r3,#__cpp(offsetof(TMatrix,f))
35    ADD         r1,r1,r3
36    ADD         r2,r2,r3
37    MOV         r3,#__cpp(TMatrix::COLUMN_COUNT)*4
38
39    VLDMIA      r2,{s9-s11}         // First line of matrix p2 to registers [S9-S11]
40    VLDMIA      r1,{s0-s2}          // First line of matrix p1 to registers [S0-S2]
41    ADD         r1,r1,r3
42    ADD         r2,r2,r3
43    VLDMIA      r2,{s12-s14}        // Second line of matrix p2 to registers [S12-S14]
44    VLDMIA      r1,{s3-s5}          // Second line of matrix p1 to registers [S3-S5]
45    ADD         r1,r1,r3
46    ADD         r2,r2,r3
47    VLDMIA      r2,{s15-s17}        // Third line of matrix p2 to registers [S15-S17]
48    VLDMIA      r1,{s6-s8}          // Third line of matrix p1 to registers [S6-S8]
49
50    VMUL.F32    s18,s9,s0
51    VMUL.F32    s19,s10,s0
52    VMUL.F32    s20,s11,s0
53
54    VMUL.F32    s21,s9,s3
55    VMUL.F32    s22,s10,s3
56    VMUL.F32    s23,s11,s3
57
58    VMUL.F32    s24,s9,s6
59    VMUL.F32    s25,s10,s6
60    VMUL.F32    s26,s11,s6
61
62    VMLA.F32    s18,s12,s1
63    VMLA.F32    s19,s13,s1
64    VMLA.F32    s20,s14,s1
65
66    VMLA.F32    s21,s12,s4
67    VMLA.F32    s22,s13,s4
68    VMLA.F32    s23,s14,s4
69
70    VMLA.F32    s24,s12,s7
71    VMLA.F32    s25,s13,s7
72    VMLA.F32    s26,s14,s7
73
74    VMLA.F32    s18,s15,s2
75    VMLA.F32    s19,s16,s2
76    VMLA.F32    s20,s17,s2
77
78    VMLA.F32    s21,s15,s5
79    VMLA.F32    s22,s16,s5
80    VMLA.F32    s23,s17,s5
81
82    VMLA.F32    s24,s15,s8
83    VMLA.F32    s25,s16,s8
84    VMLA.F32    s26,s17,s8
85
86    VSTMIA      r0,{s18-s20}        // Store result
87    VPOP        {d8-d9}             // Faster to segment POP
88    ADD         r1,r0,r3
89    VSTMIA      r1,{s21-s23}        // Store result
90    VPOP        {d10-d11}           // Faster to break into small pieces
91    ADD         r1,r1,r3
92    VSTMIA      r1,{s24-s26}        // Store result
93    VPOP        {d12-d13}           // Faster to break into chunks
94    BX          lr                  // Return
95}
96
97template<typename TMatrix>
98inline asm TMatrix*
99MTX33MultAsm(TMatrix*, const TMatrix*, const TMatrix*)
100{
101
102
103    //Because a template is being used, calculate an offset value for the matrix...
104    MOV         r3,#__cpp(offsetof(TMatrix,f))
105    ADD         r1,r1,r3
106    ADD         r2,r2,r3
107    MOV         r3,#__cpp(TMatrix::COLUMN_COUNT)*4
108
109    CMP         r3,#3*4
110    BNE         %FT0
111
112    VPUSH       {d8}                  // Save registers
113    VLDMIA      r2!,{s10-s15}         // First and second line of matrix p2 to registers [S10-S15]
114
115    VLDR.F32    s16,[r1,#3*4*0+4*0]  // Matrix p1[0][0]
116    VLDR.F32    s17,[r1,#3*4*1+4*0]  // Matrix p1[1][0]
117
118    VMUL.F32    s0,s10,s16
119    VMUL.F32    s1,s11,s16
120    VMUL.F32    s2,s12,s16
121    VLDR.F32    s16,[r1,#3*4*2+4*0]  // Matrix p1[2][0]
122
123    VMUL.F32    s3,s10,s17
124    VMUL.F32    s4,s11,s17
125    VMUL.F32    s5,s12,s17
126    VLDR.F32    s17,[r1,#3*4*0+4*1]  // Matrix p1[0][1]
127
128    VMUL.F32    s6,s10,s16
129    VMUL.F32    s7,s11,s16
130    VMUL.F32    s8,s12,s16
131    VLDR.F32    s16,[r1,#3*4*1+4*1]  // Matrix p1[1][1]
132
133    VLDMIA      r2,{s10-s12}         // Third line of matrix p2 to registers [S10-S12]
134    VMLA.F32    s0,s13,s17
135    VMLA.F32    s1,s14,s17
136    VMLA.F32    s2,s15,s17
137    VLDR.F32    s17,[r1,#3*4*2+4*1]  // Matrix p1[2][1]
138
139    VMLA.F32    s3,s13,s16
140    VMLA.F32    s4,s14,s16
141    VMLA.F32    s5,s15,s16
142    VLDR.F32    s16,[r1,#3*4*0+4*2]  // Matrix p1[0][2]
143
144    VMLA.F32    s6,s13,s17
145    VMLA.F32    s7,s14,s17
146    VMLA.F32    s8,s15,s17
147    VLDR.F32    s17,[r1,#3*4*1+4*2]  // Matrix p1[1][2]
148
149    VMLA.F32    s0,s10,s16
150    VMLA.F32    s1,s11,s16
151    VMLA.F32    s2,s12,s16
152    VLDR.F32    s16,[r1,#3*4*2+4*2]  // Matrix p1[2][2]
153
154    VMLA.F32    s3,s10,s17
155    VMLA.F32    s4,s11,s17
156    VMLA.F32    s5,s12,s17
157
158    VMLA.F32    s6,s10,s16
159    VMLA.F32    s7,s11,s16
160    VMLA.F32    s8,s12,s16
161
162    VPOP        {d8}                // POP
163
164    VSTMIA      r0,{s0-s8}          // Store result
165    BX          lr                  // Return
166
1670
168    VPUSH       {d8-d13}            // Save registers
169    VLDMIA      r2,{s9-s11}         // First line of matrix p2 to registers [S9-S11]
170    VLDMIA      r1,{s18-s20}        // First line of matrix p1 to registers [S18-S20]
171    ADD         r1,r1,r3
172    ADD         r2,r2,r3
173    VLDMIA      r2,{s12-s14}        // Second line of matrix p2 to registers [S12-S14]
174    VLDMIA      r1,{s21-s23}        // Second line of matrix p1 to registers [S21-S23]
175    ADD         r1,r1,r3
176    ADD         r2,r2,r3
177    VLDMIA      r2,{s15-s17}        // Third line of matrix p2 to registers [S15-S17]
178    VLDMIA      r1,{s24-s26}        // Third line of matrix p1 to registers [S24-S26]
179
180    VMUL.F32    s0,s9,s18
181    VMUL.F32    s1,s10,s18
182    VMUL.F32    s2,s11,s18
183
184    VMUL.F32    s3,s9,s21
185    VMUL.F32    s4,s10,s21
186    VMUL.F32    s5,s11,s21
187
188    VMUL.F32    s6,s9,s24
189    VMUL.F32    s7,s10,s24
190    VMUL.F32    s8,s11,s24
191
192    VMLA.F32    s0,s12,s19
193    VMLA.F32    s1,s13,s19
194    VMLA.F32    s2,s14,s19
195
196    VMLA.F32    s3,s12,s22
197    VMLA.F32    s4,s13,s22
198    VMLA.F32    s5,s14,s22
199
200    VMLA.F32    s6,s12,s25
201    VMLA.F32    s7,s13,s25
202    VMLA.F32    s8,s14,s25
203
204    VMLA.F32    s0,s15,s20
205    VMLA.F32    s1,s16,s20
206    VMLA.F32    s2,s17,s20
207
208    VMLA.F32    s3,s15,s23
209    VMLA.F32    s4,s16,s23
210    VMLA.F32    s5,s17,s23
211
212    VMLA.F32    s6,s15,s26
213    VMLA.F32    s7,s16,s26
214    VMLA.F32    s8,s17,s26
215
216    VPOP        {d8-d13}            // POP
217
218    ADD         r1,r0,r3
219    ADD         r2,r1,r3
220    VSTMIA      r0,{s0-s2}          // Store result
221    VSTMIA      r1,{s3-s5}          // Store result
222    VSTMIA      r2,{s6-s8}          // Store result
223    BX          lr                  // Return
224
225}
226
227/*
228
229*/
230
231#include <nn/hw/ARM/codereset.h>
232/* Please see man pages for details
233
234
235*/
236
237/*
238
239
240
241
242
243
244
245*/
246NN_MATH_INLINE VEC3*
247VEC3TransformC(VEC3* pOut, const MTX33* pM, const VEC3* pV)
248{
249    NN_NULL_ASSERT(pOut);
250    NN_NULL_ASSERT(pM);
251    NN_NULL_ASSERT(pV);
252
253    VEC3 vTmp;
254    VEC3* pDst = (pOut == pV) ? &vTmp : pOut;
255    pDst->x = pM->f._00 * pV->x + pM->f._01 * pV->y + pM->f._02 * pV->z;
256    pDst->y = pM->f._10 * pV->x + pM->f._11 * pV->y + pM->f._12 * pV->z;
257    pDst->z = pM->f._20 * pV->x + pM->f._21 * pV->y + pM->f._22 * pV->z;
258
259    if (pDst == &vTmp)
260    {
261        pOut->x = pDst->x;
262        pOut->y = pDst->y;
263        pOut->z = pDst->z;
264    }
265
266    return pOut;
267}
268
269/*
270
271*/
272
273/* ------------------------------------------------------------------------
274        MTX33
275   ------------------------------------------------------------------------ */
276
277/* Please see man pages for details
278
279
280*/
281
282/*
283
284
285
286
287
288
289
290*/
/*
  Computes the 3x3 matrix product  *pOut = (*p1) * (*p2)  over the elements
  f._00 .. f._22 of TMatrix.

  pOut may be the same object as p1 and/or p2; the product is then built in a
  local matrix and copied to *pOut once every element has been computed.

  p1 and p2 are deliberately NOT restrict-qualified: because in-place calls
  are supported, the objects read through them may legitimately be written
  through pOut, which a restrict qualifier would forbid.  Only the working
  destination pointer, which never refers to either input, is qualified.

  All three pointers must be non-null.  Returns pOut.
*/
template<typename TMatrix>
inline TMatrix*
MTX33MultC(TMatrix* pOut, const TMatrix* p1, const TMatrix* p2)
{
    NN_NULL_ASSERT( p1 );
    NN_NULL_ASSERT( p2 );
    NN_NULL_ASSERT( pOut );

    // pDst is either the local temporary or an output distinct from both
    // inputs, so writes through it cannot alias reads through p1 / p2.
    TMatrix mTmp;
    TMatrix* __restrict pDst = (pOut == p1 || pOut == p2) ? &mTmp : pOut;

    // Row 0
    pDst->f._00 = p1->f._00 * p2->f._00 + p1->f._01 * p2->f._10 + p1->f._02 * p2->f._20;
    pDst->f._01 = p1->f._00 * p2->f._01 + p1->f._01 * p2->f._11 + p1->f._02 * p2->f._21;
    pDst->f._02 = p1->f._00 * p2->f._02 + p1->f._01 * p2->f._12 + p1->f._02 * p2->f._22;

    // Row 1
    pDst->f._10 = p1->f._10 * p2->f._00 + p1->f._11 * p2->f._10 + p1->f._12 * p2->f._20;
    pDst->f._11 = p1->f._10 * p2->f._01 + p1->f._11 * p2->f._11 + p1->f._12 * p2->f._21;
    pDst->f._12 = p1->f._10 * p2->f._02 + p1->f._11 * p2->f._12 + p1->f._12 * p2->f._22;

    // Row 2
    pDst->f._20 = p1->f._20 * p2->f._00 + p1->f._21 * p2->f._10 + p1->f._22 * p2->f._20;
    pDst->f._21 = p1->f._20 * p2->f._01 + p1->f._21 * p2->f._11 + p1->f._22 * p2->f._21;
    pDst->f._22 = p1->f._20 * p2->f._02 + p1->f._21 * p2->f._12 + p1->f._22 * p2->f._22;

    // In-place call: copy the finished 3x3 block out of the temporary.
    // Only the nine product elements are copied, so any further members of
    // TMatrix are left untouched in *pOut.
    if (pDst == &mTmp)
    {
        pOut->f._00 = pDst->f._00; pOut->f._01 = pDst->f._01; pOut->f._02 = pDst->f._02;
        pOut->f._10 = pDst->f._10; pOut->f._11 = pDst->f._11; pOut->f._12 = pDst->f._12;
        pOut->f._20 = pDst->f._20; pOut->f._21 = pDst->f._21; pOut->f._22 = pDst->f._22;
    }

    return pOut;
}
323
324/*
325
326
327
328
329
330
331*/
332NN_MATH_INLINE MTX33*
333MTX33CopyC(MTX33* pOut, const MTX33* p)
334{
335    if (pOut != p)
336    {
337        *pOut = *p;
338    }
339
340    return pOut;
341}
342
343
344
345/*
346
347
348
349
350
351
352
353
354*/
355NN_MATH_INLINE MTX33*
356MTX33MAddC(MTX33* pOut, f32 t, const MTX33* p1, const MTX33* p2)
357{
358    pOut->f._00 = t * p1->f._00 + p2->f._00;
359    pOut->f._01 = t * p1->f._01 + p2->f._01;
360    pOut->f._02 = t * p1->f._02 + p2->f._02;
361
362    pOut->f._10 = t * p1->f._10 + p2->f._10;
363    pOut->f._11 = t * p1->f._11 + p2->f._11;
364    pOut->f._12 = t * p1->f._12 + p2->f._12;
365
366    pOut->f._20 = t * p1->f._20 + p2->f._20;
367    pOut->f._21 = t * p1->f._21 + p2->f._21;
368    pOut->f._22 = t * p1->f._22 + p2->f._22;
369
370    return pOut;
371}
372NN_MATH_INLINE MTX33*
373MTX33MAddC_FAST(MTX33* pOut, f32 t, const MTX33* p1, const MTX33* p2)
374{
375    register f32 m00, m01, m02, m10, m11, m12, m20, m21, m22;
376
377    m00 = p2->f._00;
378    m01 = p2->f._01;
379    m02 = p2->f._02;
380
381    m10 = p2->f._10;
382    m11 = p2->f._11;
383    m12 = p2->f._12;
384
385    m20 = p2->f._20;
386    m21 = p2->f._21;
387    m22 = p2->f._22;
388
389    m00 += t * p1->f._00;
390    m01 += t * p1->f._01;
391    m02 += t * p1->f._02;
392
393    m10 += t * p1->f._10;
394    m11 += t * p1->f._11;
395    m12 += t * p1->f._12;
396
397    m20 += t * p1->f._20;
398    m21 += t * p1->f._21;
399    m22 += t * p1->f._22;
400
401    pOut->f._00 = m00;
402    pOut->f._01 = m01;
403    pOut->f._02 = m02;
404
405    pOut->f._10 = m10;
406    pOut->f._11 = m11;
407    pOut->f._12 = m12;
408
409    pOut->f._20 = m20;
410    pOut->f._21 = m21;
411    pOut->f._22 = m22;
412
413    return pOut;
414
415}
416
417/*
418
419*/
420}  // namespace ARMv6
421}  // namespace math
422}  // namespace nn
423