1/*---------------------------------------------------------------------------*
2  Project: matrix vector Library
3  File:    mtx44_asm.s
4
5  Copyright 1998-2011 Nintendo.  All rights reserved.
6
7  These coded instructions, statements, and computer programs contain
8  proprietary information of Nintendo of America Inc. and/or Nintendo
9  Company Ltd., and are protected by Federal copyright law.     They may
10  not be disclosed to third parties or copied or duplicated in any form,
11  in whole or in part, without the prior written consent of Nintendo.
12
13 *---------------------------------------------------------------------------*/
14
15        .data
16        .align 2
17CONST_0_0F:     .float        0.0
18CONST_0_5F:     .float        0.5
19CONST_1_0F:     .float        1.0
20CONST_3_0F:     .float        3.0
21
22        .text
23
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44Identity(Mtx44 m)
//
// Fills the 16 floats at m with the identity pattern: 1.0F at byte offsets
// 0, 20, 40 and 60 (elements 0, 5, 10, 15) and 0.0F in every other slot.
//
//   r3        m, destination matrix
//   r4, r5    scratch: addresses of the two constants
//   fp1, fp2  scratch: 1.0F and 0.0F
#define mtx   r3
#define fOne  fp1
#define fZero fp2
        .global ASM_MTX44Identity
ASM_MTX44Identity:
        .type ASM_MTX44Identity, @function

        // fOne <- 1.0F
        lis         r4, CONST_1_0F@h
        ori         r4, r4, CONST_1_0F@l
        lfs         fOne, 0(r4)

        // fZero <- 0.0F
        lis         r5, CONST_0_0F@h
        ori         r5, r5, CONST_0_0F@l
        lfs         fZero, 0(r5)

        // Each psq_st writes a pair of floats, so the four zeros that sit
        // between two diagonal entries take two pair stores.
        // NOTE(review): this needs both halves of fZero to be 0.0F after the
        // scalar lfs above - confirm for the target's paired-single mode.
        stfs        fOne,   0(mtx)              // element  0
        psq_st      fZero,  4(mtx), 0, 0        // elements 1,2
        psq_st      fZero, 12(mtx), 0, 0        // elements 3,4
        stfs        fOne,  20(mtx)              // element  5
        psq_st      fZero, 24(mtx), 0, 0        // elements 6,7
        psq_st      fZero, 32(mtx), 0, 0        // elements 8,9
        stfs        fOne,  40(mtx)              // element  10
        psq_st      fZero, 44(mtx), 0, 0        // elements 11,12
        psq_st      fZero, 52(mtx), 0, 0        // elements 13,14
        stfs        fOne,  60(mtx)              // element  15

        blr
        .size ASM_MTX44Identity,$-ASM_MTX44Identity

#undef mtx
#undef fOne
#undef fZero
60
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44Copy(const Mtx44 src, Mtx44 dst)
//
// Copies 64 bytes (eight float pairs) from src to dst. Each pair is loaded
// and stored straight away through a single staging register.
//
//   r3   src
//   r4   dst
//   fp1  scratch: the pair currently in flight
#define from r3
#define to   r4
#define pair fp1
        .global ASM_MTX44Copy
ASM_MTX44Copy:
        .type ASM_MTX44Copy, @function

        // bytes 0..15
        psq_l       pair,  0(from), 0, 0
        psq_st      pair,  0(to),   0, 0
        psq_l       pair,  8(from), 0, 0
        psq_st      pair,  8(to),   0, 0

        // bytes 16..31
        psq_l       pair, 16(from), 0, 0
        psq_st      pair, 16(to),   0, 0
        psq_l       pair, 24(from), 0, 0
        psq_st      pair, 24(to),   0, 0

        // bytes 32..47
        psq_l       pair, 32(from), 0, 0
        psq_st      pair, 32(to),   0, 0
        psq_l       pair, 40(from), 0, 0
        psq_st      pair, 40(to),   0, 0

        // bytes 48..63
        psq_l       pair, 48(from), 0, 0
        psq_st      pair, 48(to),   0, 0
        psq_l       pair, 56(from), 0, 0
        psq_st      pair, 56(to),   0, 0

        blr
        .size ASM_MTX44Copy,$-ASM_MTX44Copy

#undef from
#undef to
#undef pair
91
92
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44Transpose(const Mtx44 src, Mtx44 xPose)
//
// Writes the transpose of src to xPose (xPose[c][r] = src[r][c]).
// The matrix is handled as four 2x2 tiles. For each tile the two source
// row-pairs are interleaved with ps_merge00 (first elements of both) and
// ps_merge11 (second elements of both), which yields the two row-pairs of
// the transposed tile.
//
// In this instruction order every src pair is loaded before the store that
// writes the same byte offset of xPose.
//
//   r3        src
//   r4        xPose
//   fp0..fp5  scratch
#define in     r3
#define out    r4
#define upperL fp0      // even source row, columns 0-1
#define lowerL fp1      // odd  source row, columns 0-1
#define upperR fp2      // even source row, columns 2-3
#define lowerR fp3      // odd  source row, columns 2-3
#define firsts fp4      // ps_merge00 result
#define seconds fp5     // ps_merge11 result
        .global ASM_MTX44Transpose
ASM_MTX44Transpose:
        .type ASM_MTX44Transpose, @function

        // ---- source rows 0 and 1 ----
        psq_l       upperL,   0(in), 0, 0           // s00,s01
        psq_l       lowerL,  16(in), 0, 0           // s10,s11

        ps_merge00  firsts,  upperL, lowerL         // s00,s10
        psq_l       upperR,   8(in), 0, 0           // s02,s03
        psq_st      firsts,   0(out), 0, 0          // -> t00,t01

        ps_merge11  seconds, upperL, lowerL         // s01,s11
        psq_l       lowerR,  24(in), 0, 0           // s12,s13
        psq_st      seconds, 16(out), 0, 0          // -> t10,t11

        ps_merge00  firsts,  upperR, lowerR         // s02,s12
        psq_l       upperL,  32(in), 0, 0           // s20,s21
        psq_st      firsts,  32(out), 0, 0          // -> t20,t21

        ps_merge11  seconds, upperR, lowerR         // s03,s13
        psq_l       lowerL,  48(in), 0, 0           // s30,s31
        psq_st      seconds, 48(out), 0, 0          // -> t30,t31

        // ---- source rows 2 and 3 ----
        ps_merge00  firsts,  upperL, lowerL         // s20,s30
        psq_l       upperR,  40(in), 0, 0           // s22,s23
        psq_st      firsts,   8(out), 0, 0          // -> t02,t03

        ps_merge11  seconds, upperL, lowerL         // s21,s31
        psq_l       lowerR,  56(in), 0, 0           // s32,s33
        psq_st      seconds, 24(out), 0, 0          // -> t12,t13

        ps_merge00  firsts,  upperR, lowerR         // s22,s32
        psq_st      firsts,  40(out), 0, 0          // -> t22,t23

        ps_merge11  seconds, upperR, lowerR         // s23,s33
        psq_st      seconds, 56(out), 0, 0          // -> t32,t33

        blr
        .size ASM_MTX44Transpose,$-ASM_MTX44Transpose

#undef in
#undef out
#undef upperL
#undef lowerL
#undef upperR
#undef lowerR
#undef firsts
#undef seconds
139
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44Concat(const Mtx44 a, const Mtx44 b, Mtx44 ab)
//
// ab = a * b for 4x4 float matrices. Each output pair is accumulated as
//
//   ab[r][c,c+1] = a[r][0]*b[0][c,c+1] + a[r][1]*b[1][c,c+1]
//                + a[r][2]*b[2][c,c+1] + a[r][3]*b[3][c,c+1]
//
// using one ps_muls0 and three ps_madds0/ps_madds1, where the scalar factor
// is the first (..s0) or second (..s1) half of the a register.
//
// Pass 1 produces columns 0-1 of all four rows, pass 2 produces columns 2-3.
// Nothing is stored until pass 2 is under way, and in this instruction order
// each pair of a and b is read for the last time before the ab store to the
// same byte offset is issued.
//
//   r3         a
//   r4         b
//   r5         ab
//   fp0..fp13  scratch
#define matA  r3
#define matB  r4
#define matAB r5
#define aL    fp0       // a[r][0], a[r][1] of the row being processed
#define aR    fp1       // a[r][2], a[r][3] of the row being processed
#define bRow0 fp2       // b[0][c], b[0][c+1] for the current column pair
#define bRow1 fp3       // b[1][c], b[1][c+1]
#define bRow2 fp4       // b[2][c], b[2][c+1]
#define bRow3 fp5       // b[3][c], b[3][c+1]
#define ab0L  fp6       // ab[0][0..1]
#define ab0R  fp7       // ab[0][2..3]
#define ab1L  fp8       // ab[1][0..1]
#define ab1R  fp9       // ab[1][2..3]
#define ab2L  fp10      // ab[2][0..1]
#define ab2R  fp11      // ab[2][2..3]
#define ab3L  fp12      // ab[3][0..1]
#define ab3R  fp13      // ab[3][2..3]
        .global ASM_MTX44Concat
ASM_MTX44Concat:
        .type ASM_MTX44Concat, @function

        // ================= pass 1: output columns 0-1 =================

        // ---- row 0 ----
        psq_l       aL,      0(matA), 0, 0          // a00,a01
        psq_l       bRow0,   0(matB), 0, 0          // b00,b01
        ps_muls0    ab0L,  bRow0, aL                // a00 * b[0]
        psq_l       bRow1,  16(matB), 0, 0          // b10,b11
        psq_l       bRow2,  32(matB), 0, 0          // b20,b21
        ps_madds1   ab0L,  bRow1, aL, ab0L          // + a01 * b[1]
        psq_l       aR,      8(matA), 0, 0          // a02,a03
        psq_l       bRow3,  48(matB), 0, 0          // b30,b31
        ps_madds0   ab0L,  bRow2, aR, ab0L          // + a02 * b[2]
        psq_l       aL,     16(matA), 0, 0          // a10,a11 (next row)
        ps_madds1   ab0L,  bRow3, aR, ab0L          // + a03 * b[3]

        // ---- row 1 ----
        psq_l       aR,     24(matA), 0, 0          // a12,a13
        ps_muls0    ab1L,  bRow0, aL                // a10 * b[0]
        ps_madds1   ab1L,  bRow1, aL, ab1L          // + a11 * b[1]
        psq_l       aL,     32(matA), 0, 0          // a20,a21 (next row)
        ps_madds0   ab1L,  bRow2, aR, ab1L          // + a12 * b[2]
        ps_madds1   ab1L,  bRow3, aR, ab1L          // + a13 * b[3]

        // ---- row 2 ----
        psq_l       aR,     40(matA), 0, 0          // a22,a23
        ps_muls0    ab2L,  bRow0, aL                // a20 * b[0]
        ps_madds1   ab2L,  bRow1, aL, ab2L          // + a21 * b[1]
        psq_l       aL,     48(matA), 0, 0          // a30,a31 (next row)
        ps_madds0   ab2L,  bRow2, aR, ab2L          // + a22 * b[2]
        ps_madds1   ab2L,  bRow3, aR, ab2L          // + a23 * b[3]

        // ---- row 3; the b registers are refilled with columns 2-3 as soon
        //      as their last pass-1 use has been issued ----
        psq_l       aR,     56(matA), 0, 0          // a32,a33
        ps_muls0    ab3L,  bRow0, aL                // a30 * b[0]
        psq_l       bRow0,   8(matB), 0, 0          // b02,b03
        ps_madds1   ab3L,  bRow1, aL, ab3L          // + a31 * b[1]
        psq_l       aL,      0(matA), 0, 0          // a00,a01 (pass 2, row 0)
        ps_madds0   ab3L,  bRow2, aR, ab3L          // + a32 * b[2]
        psq_l       bRow1,  24(matB), 0, 0          // b12,b13
        ps_madds1   ab3L,  bRow3, aR, ab3L          // + a33 * b[3]

        // ================= pass 2: output columns 2-3 =================

        // ---- row 0 ----
        psq_l       aR,      8(matA), 0, 0          // a02,a03
        ps_muls0    ab0R,  bRow0, aL                // a00 * b[0]
        psq_l       bRow2,  40(matB), 0, 0          // b22,b23
        ps_madds1   ab0R,  bRow1, aL, ab0R          // + a01 * b[1]
        psq_l       bRow3,  56(matB), 0, 0          // b32,b33
        ps_madds0   ab0R,  bRow2, aR, ab0R          // + a02 * b[2]
        psq_l       aL,     16(matA), 0, 0          // a10,a11 (next row)
        ps_madds1   ab0R,  bRow3, aR, ab0R          // + a03 * b[3]

        // ---- row 1, interleaved with the pass-1 stores ----
        psq_l       aR,     24(matA), 0, 0          // a12,a13
        ps_muls0    ab1R,  bRow0, aL                // a10 * b[0]
        psq_st      ab0L,    0(matAB), 0, 0         // ab00,ab01
        ps_madds1   ab1R,  bRow1, aL, ab1R          // + a11 * b[1]
        psq_l       aL,     32(matA), 0, 0          // a20,a21 (next row)
        ps_madds0   ab1R,  bRow2, aR, ab1R          // + a12 * b[2]
        psq_st      ab1L,   16(matAB), 0, 0         // ab10,ab11
        ps_madds1   ab1R,  bRow3, aR, ab1R          // + a13 * b[3]

        // ---- row 2 ----
        psq_l       aR,     40(matA), 0, 0          // a22,a23
        ps_muls0    ab2R,  bRow0, aL                // a20 * b[0]
        psq_st      ab2L,   32(matAB), 0, 0         // ab20,ab21
        ps_madds1   ab2R,  bRow1, aL, ab2R          // + a21 * b[1]
        psq_l       aL,     48(matA), 0, 0          // a30,a31 (next row)
        ps_madds0   ab2R,  bRow2, aR, ab2R          // + a22 * b[2]
        psq_st      ab3L,   48(matAB), 0, 0         // ab30,ab31
        ps_madds1   ab2R,  bRow3, aR, ab2R          // + a23 * b[3]

        // ---- row 3, interleaved with the pass-2 stores ----
        psq_l       aR,     56(matA), 0, 0          // a32,a33
        ps_muls0    ab3R,  bRow0, aL                // a30 * b[0]
        psq_st      ab0R,    8(matAB), 0, 0         // ab02,ab03
        ps_madds1   ab3R,  bRow1, aL, ab3R          // + a31 * b[1]
        psq_st      ab1R,   24(matAB), 0, 0         // ab12,ab13
        ps_madds0   ab3R,  bRow2, aR, ab3R          // + a32 * b[2]
        psq_st      ab2R,   40(matAB), 0, 0         // ab22,ab23
        ps_madds1   ab3R,  bRow3, aR, ab3R          // + a33 * b[3]

        psq_st      ab3R,   56(matAB), 0, 0         // ab32,ab33
        blr
        .size ASM_MTX44Concat,$-ASM_MTX44Concat

#undef matA
#undef matB
#undef matAB
#undef aL
#undef aR
#undef bRow0
#undef bRow1
#undef bRow2
#undef bRow3
#undef ab0L
#undef ab0R
#undef ab1L
#undef ab1R
#undef ab2L
#undef ab2R
#undef ab3L
#undef ab3R
257
258
259
////////////////////////////////////////////////////////////////////////////////
// void _ASM_MTX44RotAxisRadInternal(Mtx44 m, const Vec *axis, f32 sT, f32 cT)
//
// Builds into m the 4x4 rotation about the axis (x,y,z) read from *axis,
// given sT = sin(angle) and cT = cos(angle) supplied by the caller.
//
// The axis is normalised here: 1/sqrt(x*x+y*y+z*z) comes from frsqrte plus
// one Newton-Raphson step. With t = 1 - c and n = normalised axis:
//
//   m00 = t*nx*nx + c      m01 = t*nx*ny - s*nz   m02 = t*nx*nz + s*ny   m03 = 0
//   m10 = t*nx*ny + s*nz   m11 = t*ny*ny + c      m12 = t*ny*nz - s*nx   m13 = 0
//   m20 = t*nx*nz - s*ny   m21 = t*ny*nz + s*nx   m22 = t*nz*nz + c      m23 = 0
//   m30 = 0                m31 = 0                m32 = 0                m33 = 1
//
// Comments use [first][second] for the two halves of a paired register.
// NOTE(review): several steps read the second half of a register that was
// written by a scalar instruction (lfs, fsubs, fnmsubs); this presumes the
// scalar result is present in both halves - confirm for the target core.
// NOTE(review): there is no guard for a zero-length axis.
//
//   r3         m
//   r4         axis
//   fp1, fp2   sT, cT
//   r5         scratch: constant address
//   fp0, fp3..fp13  scratch
        .global _ASM_MTX44RotAxisRadInternal
#define mtx       r3
#define axisV     r4
#define sinT      fp1
#define cosT      fp2
#define oneMinusC fp3
#define fZero     fp4
#define w0        fp5
#define w1        fp6
#define w2        fp7
#define w3        fp8
#define w4        fp9
#define w5        fp10
#define w6        fp11
#define w7        fp12
#define w8        fp13
#define w9        fp0
_ASM_MTX44RotAxisRadInternal:
        .type _ASM_MTX44RotAxisRadInternal, @function

        // w9 <- 0.5F
        lis         r5, CONST_0_5F@h
        ori         r5, r5, CONST_0_5F@l
        lfs         w9, 0(r5)

        // w8 <- 3.0F
        lis         r5, CONST_3_0F@h
        ori         r5, r5, CONST_3_0F@l
        lfs         w8, 0(r5)

        frsp        cosT, cosT                  // round c to single precision
        psq_l       w0, 0(axisV), 0, 0          // w0 = [x][y]
        frsp        sinT, sinT                  // round s to single precision
        lfs         w1, 8(axisV)                // w1 = [z][z]

        // ---- squared length S of the axis, plus the constants 1 and 0 ----
        ps_mul      w2, w0, w0                  // w2 = [x*x][y*y]
        fadds       w7, w9, w9                  // w7 = 0.5 + 0.5 = 1.0
        ps_madd     w3, w1, w1, w2              // w3 = [x*x+z*z][y*y+z*z]
        fsubs       fZero, w9, w9               // fZero = 0.5 - 0.5 = 0.0
        ps_sum0     w4, w3, w1, w2              // w4 = [S = x*x+z*z+y*y][z]

        fsubs       oneMinusC, w7, cosT         // t = 1.0 - c

        // ---- E' = (E/2) * (3 - E*E*S), with E = frsqrte(S) ----
        frsqrte     w5, w4                      // w5 = E, estimate of 1/sqrt(S)
        ps_merge00  w7, fZero, w7               // w7 = [0][1]
        fmuls       w2, w5, w5                  // w2 = E*E
        fmuls       w3, w5, w9                  // w3 = E/2
        psq_st      fZero, 48(mtx), 0, 0        // m30,m31 <- 0,0
        fnmsubs     w2, w2, w4, w8              // w2 = 3 - E*E*S
        fmuls       w5, w2, w3                  // w5 = E'
        psq_st      w7, 56(mtx), 0, 0           // m32,m33 <- 0,1
        ps_merge00  cosT, cosT, cosT            // cosT = [c][c]

        // ---- normalised axis and its products ----
        ps_muls0    w0, w0, w5                  // w0 = [nx][ny]
        ps_muls0    w1, w1, w5                  // w1 = [nz][nz]
        ps_muls0    w4, w0, oneMinusC           // w4 = [t*nx][t*ny]
        ps_muls0    w9, w0, sinT                // w9 = [s*nx][s*ny]
        ps_muls0    w5, w1, oneMinusC           // w5 = [t*nz][t*nz]
        ps_muls1    w3, w4, w0                  // w3 = [t*nx*ny][t*ny*ny]
        ps_muls0    w2, w4, w0                  // w2 = [t*nx*nx][t*ny*nx]
        ps_muls0    w4, w4, w1                  // w4 = [t*nx*nz][t*ny*nz]

        fnmsubs     w6, w1, sinT, w3            // w6 = t*nx*ny - s*nz
        fmadds      w7, w1, sinT, w3            // w7 = t*nx*ny + s*nz

        // ---- assemble the row pairs ----
        ps_neg      w0, w9                      // w0 = [-s*nx][-s*ny]
        ps_sum0     w8, w4, fZero, w9           // w8 = [t*nx*nz+s*ny][0]      = m02,m03
        ps_sum0     w2, w2, w6, cosT            // w2 = [t*nx*nx+c][t*nx*ny-s*nz] = m00,m01
        ps_sum1     w3, cosT, w7, w3            // w3 = [t*nx*ny+s*nz][t*ny*ny+c] = m10,m11
        ps_sum0     w6, w0, fZero, w4           // w6 = [t*ny*nz-s*nx][0]      = m12,m13

        psq_st      w8,  8(mtx), 0, 0           // m02,m03
        ps_sum0     w0, w4, w4, w0              // w0 = [t*nx*nz-s*ny][t*ny*nz]
        psq_st      w2,  0(mtx), 0, 0           // m00,m01
        ps_muls0    w5, w5, w1                  // w5 = [t*nz*nz][t*nz*nz]
        psq_st      w3, 16(mtx), 0, 0           // m10,m11
        ps_sum1     w4, w9, w0, w4              // w4 = [t*nx*nz-s*ny][t*ny*nz+s*nx] = m20,m21
        psq_st      w6, 24(mtx), 0, 0           // m12,m13
        ps_sum0     w5, w5, fZero, cosT         // w5 = [t*nz*nz+c][0]         = m22,m23
        psq_st      w4, 32(mtx), 0, 0           // m20,m21
        psq_st      w5, 40(mtx), 0, 0           // m22,m23

        blr
        .size _ASM_MTX44RotAxisRadInternal,$-_ASM_MTX44RotAxisRadInternal
#undef mtx
#undef axisV
#undef sinT
#undef cosT
#undef oneMinusC
#undef fZero
#undef w0
#undef w1
#undef w2
#undef w3
#undef w4
#undef w5
#undef w6
#undef w7
#undef w8
#undef w9
405
406
407
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44ScaleApply(const Mtx44 src, Mtx44 dst, f32 xS, f32 yS, f32 zS)
//
// dst row 0 = src row 0 * xS
// dst row 1 = src row 1 * yS
// dst row 2 = src row 2 * zS
// dst row 3 = src row 3 (copied unchanged)
//
// Loads, multiplies and stores are interleaved; each src pair is loaded
// before the dst store to the same byte offset.
//
//   r3          src
//   r4          dst
//   fp1..fp3    xS, yS, zS (rounded to single precision in place)
//   fp4..fp11   scratch: the eight row pairs
        .global ASM_MTX44ScaleApply
#define srcM  r3
#define dstM  r4
#define sx    fp1
#define sy    fp2
#define sz    fp3
#define row0L fp4
#define row0R fp5
#define row1L fp6
#define row1R fp7
#define row2L fp8
#define row2R fp9
#define row3L fp10
#define row3R fp11
ASM_MTX44ScaleApply:
        .type ASM_MTX44ScaleApply, @function

        psq_l       row0L,   0(srcM), 0, 0      // src00,src01
        frsp        sx, sx                      // xS as single precision
        psq_l       row0R,   8(srcM), 0, 0      // src02,src03
        frsp        sy, sy                      // yS as single precision
        psq_l       row1L,  16(srcM), 0, 0      // src10,src11
        ps_muls0    row0L,  row0L, sx           // src00*xS,src01*xS
        psq_l       row1R,  24(srcM), 0, 0      // src12,src13
        ps_muls0    row0R,  row0R, sx           // src02*xS,src03*xS
        psq_l       row2L,  32(srcM), 0, 0      // src20,src21
        frsp        sz, sz                      // zS as single precision
        psq_st      row0L,   0(dstM), 0, 0      // dst00,dst01
        ps_muls0    row1L,  row1L, sy           // src10*yS,src11*yS
        psq_l       row2R,  40(srcM), 0, 0      // src22,src23
        psq_st      row0R,   8(dstM), 0, 0      // dst02,dst03
        ps_muls0    row1R,  row1R, sy           // src12*yS,src13*yS
        psq_l       row3L,  48(srcM), 0, 0      // src30,src31
        psq_st      row1L,  16(dstM), 0, 0      // dst10,dst11
        ps_muls0    row2L,  row2L, sz           // src20*zS,src21*zS
        psq_l       row3R,  56(srcM), 0, 0      // src32,src33
        psq_st      row1R,  24(dstM), 0, 0      // dst12,dst13
        ps_muls0    row2R,  row2R, sz           // src22*zS,src23*zS
        psq_st      row2L,  32(dstM), 0, 0      // dst20,dst21
        psq_st      row2R,  40(dstM), 0, 0      // dst22,dst23
        psq_st      row3L,  48(dstM), 0, 0      // dst30,dst31 (unscaled)
        psq_st      row3R,  56(dstM), 0, 0      // dst32,dst33 (unscaled)
        blr
        .size ASM_MTX44ScaleApply,$-ASM_MTX44ScaleApply
#undef srcM
#undef dstM
#undef sx
#undef sy
#undef sz
#undef row0L
#undef row0R
#undef row1L
#undef row1R
#undef row2L
#undef row2R
#undef row3L
#undef row3R
451
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44Trans(Mtx44 m, f32 xT, f32 yT, f32 zT)
//
// Writes a translation matrix into m:
//
//   1 0 0 xT
//   0 1 0 yT
//   0 0 1 zT
//   0 0 0 1
//
// psq_st with third operand 1 stores a single float, with 0 a pair.
// NOTE(review): the pair stores of fZero need 0.0F in both halves after the
// scalar lfs - confirm for the target's paired-single mode.
//
//   r3        m
//   fp1..fp3  xT, yT, zT
//   r4, r5    scratch: constant addresses
//   fp4..fp6  scratch
        .global ASM_MTX44Trans
#define mtx      r3
#define tx       fp1
#define ty       fp2
#define tz       fp3
#define fOne     fp4
#define fZero    fp5
#define fZeroOne fp6
ASM_MTX44Trans:
        .type ASM_MTX44Trans, @function

        // fOne <- 1.0F
        lis         r4, CONST_1_0F@h
        ori         r4, r4, CONST_1_0F@l
        lfs         fOne, 0(r4)

        // fZero <- 0.0F
        lis         r5, CONST_0_0F@h
        ori         r5, r5, CONST_0_0F@l
        lfs         fZero, 0(r5)

        // translation column first, then the fixed 0/1 entries around it
        stfs        tx,       12(mtx)               // m03
        stfs        ty,       28(mtx)               // m13
        ps_merge00  fZeroOne, fZero, fOne           // fZeroOne = 0.0, 1.0
        stfs        tz,       44(mtx)               // m23
        psq_st      fOne,      0(mtx), 1, 0         // m00     = 1
        psq_st      fZero,     4(mtx), 0, 0         // m01,m02 = 0,0
        psq_st      fZeroOne, 16(mtx), 0, 0         // m10,m11 = 0,1
        psq_st      fZero,    24(mtx), 1, 0         // m12     = 0
        psq_st      fZero,    32(mtx), 0, 0         // m20,m21 = 0,0
        psq_st      fOne,     40(mtx), 1, 0         // m22     = 1
        psq_st      fZero,    48(mtx), 0, 0         // m30,m31 = 0,0
        psq_st      fZeroOne, 56(mtx), 0, 0         // m32,m33 = 0,1

        blr
        .size ASM_MTX44Trans,$-ASM_MTX44Trans
#undef mtx
#undef tx
#undef ty
#undef tz
#undef fOne
#undef fZero
#undef fZeroOne
497
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44TransApply(const Mtx44 src, Mtx44 dst, f32 xT, f32 yT, f32 zT)
//
// Copies src to dst while adding the translation to the last column of the
// first three rows:
//
//   dst03 = src03 + xT,  dst13 = src13 + yT,  dst23 = src23 + zT
//
// All other elements, including row 3, are copied unchanged.
// ps_sum1 d, a, c, b yields d = [c.first][a.first + b.second], so with
// c = b = the loaded pair it keeps column 2 and adds the offset to column 3.
//
//   r3        src
//   r4        dst
//   fp1..fp3  xT, yT, zT (rounded to single precision in place)
//   fp4..fp8  scratch
        .global ASM_MTX44TransApply
#define src r3
#define dst r4
#define xT     fp1
#define yT     fp2
#define zT     fp3
ASM_MTX44TransApply:
        .type ASM_MTX44TransApply, @function

        psq_l       fp4, 0(src),     0, 0          // fp4 <- src00,src01
        frsp        xT, xT                         // to make sure xT = single precision
        psq_l       fp5, 8(src),     0, 0          // fp5 <- src02,src03
        frsp        yT, yT                         // to make sure yT = single precision
        psq_l       fp6, 16(src),    0, 0          // fp6 <- src10,src11
        frsp        zT, zT                         // to make sure zT = single precision
        psq_l       fp7, 24(src),    0, 0          // fp7 <- src12,src13
        psq_st      fp4, 0(dst),     0, 0          // dst00,dst01
        ps_sum1     fp5, xT, fp5, fp5              // fp5 <- src02,src03+xT
        psq_l       fp4, 40(src),    0, 0          // fp4 <- src22,src23
        psq_st      fp6, 16(dst),    0, 0          // dst10,dst11
        ps_sum1     fp7, yT, fp7, fp7              // fp7 <- src12,src13+yT
        psq_l       fp8, 32(src),    0, 0          // fp8 <- src20,src21
        psq_st      fp5, 8(dst),     0, 0          // dst02,dst03
        ps_sum1     fp4, zT, fp4, fp4              // fp4 <- src22,src23+zT
        psq_st      fp7, 24(dst),    0, 0          // dst12,dst13
        psq_st      fp8, 32(dst),    0, 0          // dst20,dst21
        psq_l       fp5, 48(src),    0, 0          // fp5 <- src30,src31
        psq_l       fp6, 56(src),    0, 0          // fp6 <- src32,src33
        psq_st      fp4, 40(dst),    0, 0          // dst22,dst23
        psq_st      fp5, 48(dst),    0, 0          // dst30,dst31
        psq_st      fp6, 56(dst),    0, 0          // dst32,dst33

        blr
        .size ASM_MTX44TransApply,$-ASM_MTX44TransApply

// Release exactly the aliases this routine defined, so src/dst do not leak
// into the code that follows.
#undef src
#undef dst
#undef xT
#undef yT
#undef zT
542
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44Scale(Mtx44 m, f32 xS, f32 yS, f32 zS)
//
// Writes a scale matrix into m: xS, yS, zS and 1.0F on the diagonal (byte
// offsets 0, 20, 40, 60) and 0.0F in every other slot.
// NOTE(review): the pair stores of fZero need 0.0F in both halves after the
// scalar lfs - confirm for the target's paired-single mode.
//
//   r3        m
//   fp1..fp3  xS, yS, zS
//   r4, r5    scratch: constant addresses
//   fp4, fp5  scratch: 1.0F and 0.0F
        .global ASM_MTX44Scale
#define mtx   r3
#define sx    fp1
#define sy    fp2
#define sz    fp3
#define fOne  fp4
#define fZero fp5
ASM_MTX44Scale:
        .type ASM_MTX44Scale, @function

        // fOne <- 1.0F
        lis         r4, CONST_1_0F@h
        ori         r4, r4, CONST_1_0F@l
        lfs         fOne, 0(r4)

        // fZero <- 0.0F
        lis         r5, CONST_0_0F@h
        ori         r5, r5, CONST_0_0F@l
        lfs         fZero, 0(r5)

        // walk the 16 floats front to back
        stfs        sx,     0(mtx)              // m00     = xS
        psq_st      fZero,  4(mtx), 0, 0        // m01,m02 = 0,0
        psq_st      fZero, 12(mtx), 0, 0        // m03,m10 = 0,0
        stfs        sy,    20(mtx)              // m11     = yS
        psq_st      fZero, 24(mtx), 0, 0        // m12,m13 = 0,0
        psq_st      fZero, 32(mtx), 0, 0        // m20,m21 = 0,0
        stfs        sz,    40(mtx)              // m22     = zS
        psq_st      fZero, 44(mtx), 0, 0        // m23,m30 = 0,0
        psq_st      fZero, 52(mtx), 0, 0        // m31,m32 = 0,0
        stfs        fOne,  60(mtx)              // m33     = 1

        blr
        .size ASM_MTX44Scale,$-ASM_MTX44Scale

#undef mtx
#undef sx
#undef sy
#undef sz
#undef fOne
#undef fZero
585
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44RotTrig(Mtx44 m, char axis, f32 sinA, f32 cosA)
//
// Writes into m the rotation about one coordinate axis, from a sine/cosine
// pair supplied by the caller. axis is folded to lower case (bit 0x20 set)
// and then compared against 'x', 'y' and 'z'. For any other value the
// routine returns without storing anything to m.
//
//   'x':  1  0  0  0      'y':  c  0  s  0      'z':  c -s  0  0
//         0  c -s  0            0  1  0  0            s  c  0  0
//         0  s  c  0           -s  0  c  0            0  0  1  0
//         0  0  0  1            0  0  0  1            0  0  0  1
//
// psq_st with third operand 1 stores a single float, with 0 a pair.
// NOTE(review): the pair stores of fZero need 0.0F in both halves after the
// scalar lfs - confirm for the target's paired-single mode.
//
//   r3        m
//   r4        axis (modified in place)
//   fp1, fp2  sinA, cosA (rounded to single precision in place)
//   r5, r6    scratch: constant addresses
//   fp3..fp9  scratch
        .global ASM_MTX44RotTrig
#define mtx    r3
#define axisCh r4
#define sn     fp1
#define cs     fp2
#define t0     fp3
#define t1     fp4
#define t2     fp5
#define t3     fp6
#define t4     fp7
#define fOne   fp8
#define fZero  fp9
ASM_MTX44RotTrig:
        .type ASM_MTX44RotTrig, @function

        // fOne <- 1.0F
        lis         r5, CONST_1_0F@h
        ori         r5, r5, CONST_1_0F@l
        lfs         fOne, 0(r5)

        // fZero <- 0.0F
        lis         r6, CONST_0_0F@h
        ori         r6, r6, CONST_0_0F@l
        lfs         fZero, 0(r6)

        frsp        sn, sn                      // sinA as single precision

        // fold the axis letter to lower case
        ori         axisCh, axisCh, 0x20

        frsp        cs, cs                      // cosA as single precision

        // dispatch on the axis letter
        cmplwi      axisCh, 'x'
        beq         _case_x
        cmplwi      axisCh, 'y'
        beq         _case_y
        cmplwi      axisCh, 'z'
        beq         _case_z
        b           _end                        // unknown axis: m untouched

    _case_x:
        psq_st      fOne,    0(mtx), 1, 0       // m00     <= 1.0
        psq_st      fZero,   4(mtx), 0, 0       // m01,m02 <= 0.0,0.0
        ps_neg      t0, sn                      // t0 <= -sinA
        psq_st      fZero,  12(mtx), 0, 0       // m03,m10 <= 0.0,0.0
        ps_merge00  t1, sn, cs                  // t1 <= sinA,cosA
        psq_st      fZero,  28(mtx), 0, 0       // m13,m20 <= 0.0,0.0
        ps_merge00  t0, cs, t0                  // t0 <= cosA,-sinA
        psq_st      fZero,  44(mtx), 0, 0       // m23,m30 <= 0.0,0.0
        psq_st      fZero,  52(mtx), 0, 0       // m31,m32 <= 0.0,0.0
        psq_st      t1,     36(mtx), 0, 0       // m21,m22 <= sinA,cosA
        psq_st      t0,     20(mtx), 0, 0       // m11,m12 <= cosA,-sinA
        psq_st      fOne,   60(mtx), 1, 0       // m33     <= 1.0
        b           _end

    _case_y:
        ps_merge00  t1, cs, fZero               // t1 <= cosA,0.0
        psq_st      fZero,  48(mtx), 0, 0       // m30,m31 <= 0.0,0.0
        ps_neg      t0, sn                      // t0 <= -sinA
        psq_st      fZero,  24(mtx), 0, 0       // m12,m13 <= 0.0,0.0
        ps_merge00  t3, fZero, fOne             // t3 <= 0.0,1.0
        psq_st      t1,      0(mtx), 0, 0       // m00,m01 <= cosA,0.0
        ps_merge00  t4, t0, fZero               // t4 <= -sinA,0.0
        ps_merge00  t2, sn, fZero               // t2 <= sinA,0.0
        psq_st      t3,     16(mtx), 0, 0       // m10,m11 <= 0.0,1.0
        psq_st      t2,      8(mtx), 0, 0       // m02,m03 <= sinA,0.0
        psq_st      t4,     32(mtx), 0, 0       // m20,m21 <= -sinA,0.0
        psq_st      t1,     40(mtx), 0, 0       // m22,m23 <= cosA,0.0
        psq_st      t3,     56(mtx), 0, 0       // m32,m33 <= 0.0,1.0
        b           _end

    _case_z:
        psq_st      fZero,   8(mtx), 0, 0       // m02,m03 <= 0.0,0.0
        ps_neg      t0, sn                      // t0 <= -sinA
        psq_st      fZero,  24(mtx), 0, 0       // m12,m13 <= 0.0,0.0
        ps_merge00  t1, sn, cs                  // t1 <= sinA,cosA
        psq_st      fZero,  32(mtx), 0, 0       // m20,m21 <= 0.0,0.0
        ps_merge00  t2, fOne, fZero             // t2 <= 1.0,0.0
        psq_st      fZero,  48(mtx), 0, 0       // m30,m31 <= 0.0,0.0
        ps_merge00  t3, fZero, fOne             // t3 <= 0.0,1.0
        psq_st      t1,     16(mtx), 0, 0       // m10,m11 <= sinA,cosA
        ps_merge00  t4, cs, t0                  // t4 <= cosA,-sinA
        psq_st      t2,     40(mtx), 0, 0       // m22,m23 <= 1.0,0.0
        psq_st      t3,     56(mtx), 0, 0       // m32,m33 <= 0.0,1.0
        psq_st      t4,      0(mtx), 0, 0       // m00,m01 <= cosA,-sinA

    _end:

        blr
        .size ASM_MTX44RotTrig,$-ASM_MTX44RotTrig

#undef mtx
#undef axisCh
#undef sn
#undef cs
#undef t0
#undef t1
#undef t2
#undef t3
#undef t4
#undef fOne
#undef fZero
691
692
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX34To44( MTX_CONST Mtx src, Mtx44 dst )
//
// Expands a 3x4 matrix to 4x4: the first 48 bytes (twelve floats) of src are
// copied to dst as six pairs, and the fourth row of dst is set to 0, 0, 0, 1.
// NOTE(review): the pair store of c_00 needs 0.0F in both halves after the
// scalar lfs - confirm for the target's paired-single mode.
//
//   r3        src
//   r4        dst
//   r5, r6    scratch: constant addresses
//   fp1..fp4  scratch
        .global ASM_MTX34To44
#define src r3
#define dst r4
#define c_00 fp1
#define c_11 fp2
#define c_01 fp3
#define tmp  fp4
ASM_MTX34To44:
        .type ASM_MTX34To44, @function

        // c_11 = 1.0F;
        lis         r5, CONST_1_0F@h
        ori         r5, r5, CONST_1_0F@l
        lfs         c_11, 0(r5)

        // c_00 = 0.0F;
        lis         r6, CONST_0_0F@h
        ori         r6, r6, CONST_0_0F@l
        lfs         c_00, 0(r6)

        // rows 0..2: straight copy, one pair at a time through tmp
        psq_l       tmp,  0(src), 0, 0
        psq_st      tmp,  0(dst), 0, 0
        psq_l       tmp,  8(src), 0, 0
        psq_st      tmp,  8(dst), 0, 0
        psq_l       tmp, 16(src), 0, 0
        psq_st      tmp, 16(dst), 0, 0
        ps_merge00  c_01, c_00,  c_11           // c_01 <- 0.0, 1.0
        psq_l       tmp, 24(src), 0, 0
        psq_st      tmp, 24(dst), 0, 0
        psq_l       tmp, 32(src), 0, 0
        psq_st      tmp, 32(dst), 0, 0
        psq_l       tmp, 40(src), 0, 0
        psq_st      tmp, 40(dst), 0, 0

        // row 3: 0, 0, 0, 1
        psq_st      c_00, 48(dst), 0, 0         // dst30,dst31 <- 0.0,0.0
        psq_st      c_01, 56(dst), 0, 0         // dst32,dst33 <- 0.0,1.0
        blr
        .size ASM_MTX34To44,$-ASM_MTX34To44

// Release every alias defined above (tmp was previously left defined).
#undef src
#undef dst
#undef c_00
#undef c_11
#undef c_01
#undef tmp
738
739