1/*---------------------------------------------------------------------------*
2  Project: matrix vector Library
3  File:    mtx_asm.s
4
5  Copyright 1998-2011 Nintendo.  All rights reserved.
6
7  These coded instructions, statements, and computer programs contain
8  proprietary information of Nintendo of America Inc. and/or Nintendo
9  Company Ltd., and are protected by Federal copyright law.     They may
10  not be disclosed to third parties or copied or duplicated in any form,
11  in whole or in part, without the prior written consent of Nintendo.
12
13 *---------------------------------------------------------------------------*/
14
// Floating-point constants shared by the matrix routines in this file.
        .data
        .align 2
// Unit01 = { 0.0F, 1.0F }.  ASM_MTXConcat and ASM_MTXConcatArray load it as a
// single paired value with psq_l and use it to add a translation term to the
// second slot of a result pair only.
Unit01: .float        0.0
        .float        1.0

// Scalar constants, loaded with lfs.  CONST_0_0F and CONST_1_0F are read by
// ASM_MTXIdentity, ASM_MTXTranspose and ASM_MTXReflect; CONST_0_5F and
// CONST_3_0F are not referenced in the part of the file visible here.
CONST_0_0F:     .float        0.0
CONST_0_5F:     .float        0.5
CONST_1_0F:     .float        1.0
CONST_3_0F:     .float        3.0
24
25
26        .text
27
28
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXIdentity(Mtx m)
//
// Writes the 3x4 identity matrix to m:
//
//      | 1 0 0 0 |
//      | 0 1 0 0 |
//      | 0 0 1 0 |
//
// In:      r3 = m
// Scratch: r4, r5, f1-f4
#define mtx       r3
#define zeroAddr  r4
#define oneAddr   r5
#define pair_0_1  f1
#define pair_1_0  f2
#define pair_0_0  f3
#define pair_1_1  f4
        .global ASM_MTXIdentity
ASM_MTXIdentity:
        .type ASM_MTXIdentity, @function

        // pair_0_0 <- 0.0F (the merges below rely on the value being present
        // in both slots of the register after lfs)
        lis         zeroAddr, CONST_0_0F@h
        ori         zeroAddr, zeroAddr, CONST_0_0F@l
        lfs         pair_0_0, 0(zeroAddr)

        // row 0, columns 2-3 <- 0, 0
        psq_st      pair_0_0, 8(mtx), 0, 0

        // pair_1_1 <- 1.0F
        lis         oneAddr, CONST_1_0F@h
        ori         oneAddr, oneAddr, CONST_1_0F@l
        lfs         pair_1_1, 0(oneAddr)

        // Build the two mixed pairs while the all-zero pairs are being stored.
        // pair_0_1 = { 0.0F, 1.0F }
        ps_merge01  pair_0_1, pair_0_0, pair_1_1
        // row 1, columns 2-3 <- 0, 0
        psq_st      pair_0_0, 24(mtx), 0, 0
        // pair_1_0 = { 1.0F, 0.0F }
        ps_merge10  pair_1_0, pair_1_1, pair_0_0
        // row 2, columns 0-1 <- 0, 0
        psq_st      pair_0_0, 32(mtx), 0, 0
        // row 1, columns 0-1 <- 0, 1
        psq_st      pair_0_1, 16(mtx), 0, 0
        // row 0, columns 0-1 <- 1, 0
        psq_st      pair_1_0, 0(mtx), 0, 0
        // row 2, columns 2-3 <- 1, 0
        psq_st      pair_1_0, 40(mtx), 0, 0

        blr
       .size ASM_MTXIdentity,$-ASM_MTXIdentity
#undef mtx
#undef zeroAddr
#undef oneAddr
#undef pair_0_1
#undef pair_1_0
#undef pair_0_0
#undef pair_1_1
67
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXCopy(const Mtx src, Mtx dst)
//
// Copies the 48 bytes (twelve floats) of a 3x4 matrix from src to dst, two
// floats at a time.  Each pair is stored before the next one is loaded.
//
// In:      r3 = src, r4 = dst
// Scratch: fp0-fp5
#define srcMtx  r3
#define dstMtx  r4
#define pair0   fp0
#define pair1   fp1
#define pair2   fp2
#define pair3   fp3
#define pair4   fp4
#define pair5   fp5
        .global ASM_MTXCopy
ASM_MTXCopy:
        .type ASM_MTXCopy, @function

        // row 0
        psq_l       pair0, 0(srcMtx), 0, 0       // [0][0], [0][1]
        psq_st      pair0, 0(dstMtx), 0, 0
        psq_l       pair1, 8(srcMtx), 0, 0       // [0][2], [0][3]
        psq_st      pair1, 8(dstMtx), 0, 0

        // row 1
        psq_l       pair2, 16(srcMtx), 0, 0      // [1][0], [1][1]
        psq_st      pair2, 16(dstMtx), 0, 0
        psq_l       pair3, 24(srcMtx), 0, 0      // [1][2], [1][3]
        psq_st      pair3, 24(dstMtx), 0, 0

        // row 2
        psq_l       pair4, 32(srcMtx), 0, 0      // [2][0], [2][1]
        psq_st      pair4, 32(dstMtx), 0, 0
        psq_l       pair5, 40(srcMtx), 0, 0      // [2][2], [2][3]
        psq_st      pair5, 40(dstMtx), 0, 0

        blr
       .size ASM_MTXCopy,$-ASM_MTXCopy

#undef srcMtx
#undef dstMtx
#undef pair0
#undef pair1
#undef pair2
#undef pair3
#undef pair4
#undef pair5
94
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXConcat(const Mtx mA, const Mtx mB, Mtx mAB)
//
// Computes mAB = mA * mB for 3x4 matrices.  For every output row i:
//
//      mAB[i][j] = a[i][0]*b[0][j] + a[i][1]*b[1][j] + a[i][2]*b[2][j]
//      mAB[i][3] additionally gets + a[i][3]
//
// which is the product of two affine matrices with an implied fourth row
// of (0, 0, 0, 1).
//
// In:      r3 = mA, r4 = mB, r5 = mAB
// Scratch: r6, fp0-fp13
// Saved:   fp14, fp15, fp31 (spilled to a 64-byte stack frame and restored)
//
// All twelve input pairs are loaded before the first store to mAB, so the
// result does not depend on mAB overlapping mA or mB.
//
// The instruction order interleaves loads, multiplies and stores on purpose;
// the "YYY LAST TIME ..." notes mark where a register becomes free for reuse.
#define mA  r3
#define mB  r4
#define mAB r5
        .global ASM_MTXConcat
ASM_MTXConcat:
        .type ASM_MTXConcat, @function
// Input pairs of mA (two adjacent columns of one row per register)
#define A00_A01 fp0
#define A02_A03 fp1
#define A10_A11 fp2
#define A12_A13 fp3
#define A20_A21 fp4
#define A22_A23 fp5

// Input pairs of mB
#define B00_B01 fp6
#define B02_B03 fp7
#define B10_B11 fp8
#define B12_B13 fp9
#define B20_B21 fp10
#define B22_B23 fp11

// Result pairs.  D20_D21 and D22_D23 reuse fp2 / fp0 once A10_A11 / A00_A01
// are no longer needed.
#define D00_D01 fp12
#define D02_D03 fp13
#define D10_D11 fp14
#define D12_D13 fp15
#define D20_D21 fp2
#define D22_D23 fp0

// { 0.0F, 1.0F }, used to add a[i][3] to the second slot only
#define UNIT01  fp31

        // don't save LR since we don't make any function calls
        //    mflr    r0
        //    stw     r0, 4(r1)
        // Open a 64-byte frame for the fp14 / fp15 / fp31 spills
        stwu        r1, -64(r1)
        psq_l       A00_A01, 0(mA), 0, 0

        // Save fp14: the paired store writes both slots as two floats and the
        // stfd writes the register as a double; the restore below reloads both.
        psq_st      fp14, 8(r1), 0, 0
        stfd        fp14, 16(r1)

        psq_l       B00_B01, 0(mB), 0, 0
        // r6 = &Unit01 (high half here, low half added below)
        addis       r6, 0, Unit01@ha
        psq_l       B02_B03, 8(mB), 0, 0

        // Save fp15 (same scheme as fp14)
        psq_st      fp15, 24(r1), 0, 0
        stfd        fp15, 32(r1)

        addi        r6, r6, Unit01@l

        // Save fp31 (same scheme as fp14)
        psq_st      fp31, 40(r1), 0, 0
        stfd        fp31, 48(r1)

        psq_l       B10_B11, 16(mB), 0, 0
        // D00_D01 = b00a00 , b01a00
        ps_muls0    D00_D01, B00_B01, A00_A01
        psq_l       A10_A11, 16(mA), 0, 0
        // D02_D03 = b02a00 , b03a00
        ps_muls0    D02_D03, B02_B03, A00_A01
        // UNIT01 = { 0.0F, 1.0F }
        psq_l       UNIT01, 0(r6), 0, 0
        // D10_D11 = a10b00 , a10b01
        ps_muls0    D10_D11, B00_B01, A10_A11
        psq_l       B12_B13, 24(mB), 0, 0
        // D12_D13 = a10b02 , a10b03
        ps_muls0    D12_D13, B02_B03, A10_A11
        psq_l       A02_A03, 8(mA), 0, 0
        // D00_D01 = b10a01 + b00a00 , b11a01 + b01a00
        ps_madds1   D00_D01, B10_B11, A00_A01, D00_D01
        psq_l       A12_A13, 24(mA), 0, 0
        // D10_D11 = a10b00 + a11b10 , a10b01 + a11b11
        ps_madds1   D10_D11, B10_B11, A10_A11, D10_D11
        psq_l       B20_B21, 32(mB), 0, 0
        // D02_D03 = b12a01 + b02a00 , b13a01 + b03a00
        ps_madds1   D02_D03, B12_B13, A00_A01, D02_D03  // YYY LAST TIME FP0 IS USED
        psq_l       B22_B23, 40(mB), 0, 0
        // D12_D13 = a10b02 + a11b12, a10b03+a11b13
        ps_madds1   D12_D13, B12_B13, A10_A11, D12_D13 // YYY LAST TIME FP2 IS USED
        psq_l       A20_A21, 32(mA), 0, 0
        psq_l       A22_A23, 40(mA), 0, 0
        // D00_D01 = b20a02 + b10a01 + b00a00 , b21a02 + b11a01 + b01a00
        ps_madds0   D00_D01, B20_B21, A02_A03, D00_D01 // m00, m01 computed
        // D02_D03 = b12a01 + b02a00 + b22a02 , b13a01 + b03a00 + b23a02
        ps_madds0   D02_D03, B22_B23, A02_A03, D02_D03
        // D10_D11 = a10b00 + a11b10 +a12b20, a10b01 + a11b11 + a12b21
        ps_madds0   D10_D11, B20_B21, A12_A13, D10_D11 // m10, m11 computed
        // D12_D13 = a10b02 + a11b12 + a12b22, a10b03 + a11b13 + a12b23
        // (a13 is added further down)
        ps_madds0   D12_D13, B22_B23, A12_A13, D12_D13

        // store m00m01 (first write to mAB; every input pair is loaded by now)
        psq_st      D00_D01, 0(mAB), 0, 0 // YYY LAST TIME FP12 IS USED

        // D20_D21 = a20b00, a20b01
        ps_muls0    D20_D21, B00_B01, A20_A21 // YYY LAST TIME FP6 IS USED
        // get a03 from A02_A03 (fp1) and add it to the second slot of D02_D03
        ps_madds1   D02_D03, UNIT01, A02_A03, D02_D03 // m02, m03 computed
        // YYY LAST TIME FP1 IS USED
        // D22_D23 = a20b02, a20b03
        ps_muls0    D22_D23, B02_B03, A20_A21 // YYY LAST TIME FP7 IS USED
        // store m10m11
        psq_st      D10_D11, 16(mAB), 0, 0
        // get a13 from A12_A13 (fp3) and add it to the second slot of D12_D13
        ps_madds1   D12_D13, UNIT01, A12_A13, D12_D13 // m12, m13 computed
        // store m02m03
        psq_st      D02_D03, 8(mAB), 0, 0 // YYY LAST TIME D02_D03 IS USED

        // D20_D21 = a20b00 + a21b10, a20b01 + a21b11
        ps_madds1   D20_D21, B10_B11, A20_A21, D20_D21 // YYY LAST TIME FP8 IS USED
        // D22_D23 = a20b02 + a21b12, a20b03 + a21b13
        ps_madds1   D22_D23, B12_B13, A20_A21, D22_D23
        // D20_D21 = a20b00 + a21b10 + a22b20, a20b01 + a21b11 + a22b21
        ps_madds0   D20_D21, B20_B21, A22_A23, D20_D21

        // Restore fp14 (D10_D11 was already stored above)
        psq_l       fp14, 8(r1), 0, 0
        lfd         fp14, 16(r1)  // D10_D11

        // store m12m13
        psq_st      D12_D13, 24(mAB), 0, 0
        // D22_D23 = a20b02 + a21b12 + a22b22, a20b03 + a21b13 + a22b23
        // (a23 is added by the next ps_madds1)
        ps_madds0   D22_D23, B22_B23, A22_A23, D22_D23
        // store m20m21
        psq_st      D20_D21, 32(mAB), 0, 0
        // get a23 from A22_A23 (fp5) and add it to the second slot of D22_D23
        ps_madds1   D22_D23, UNIT01, A22_A23, D22_D23

        // Restore fp15 (D12_D13 was stored just above)
        psq_l       fp15, 24(r1), 0, 0
        lfd         fp15, 32(r1) // D12_D13

        // store m22m23
        psq_st      D22_D23, 40(mAB), 0, 0

        // Restore fp31 (UNIT01 is no longer needed)
        psq_l       fp31, 40(r1), 0, 0
        lfd         fp31, 48(r1)

        // Release the stack frame
        addi        r1, r1, 64

        blr
       .size ASM_MTXConcat,$-ASM_MTXConcat

#undef mA
#undef mB
#undef mAB
#undef A00_A01
#undef A02_A03
#undef A10_A11
#undef A12_A13
#undef A20_A21
#undef A22_A23
#undef B00_B01
#undef B02_B03
#undef B10_B11
#undef B12_B13
#undef B20_B21
#undef B22_B23
#undef D00_D01
#undef D02_D03
#undef D10_D11
#undef D12_D13
#undef D20_D21
#undef D22_D23

#undef UNIT01
257
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXConcatArray (const Mtx a, const Mtx* srcBase, Mtx* dstBase, u32 count)
//
// For i in [0, count): dstBase[i] = a * srcBase[i], using the same 3x4 affine
// product as ASM_MTXConcat (a[r][3] is added to column 3 of every result row).
//
// The work is software-pipelined: the code in front of the loop starts matrix
// 0, every loop trip finishes matrix i and starts matrix i+1, and the code
// after the loop finishes the last matrix.  The loop therefore runs count-1
// times.
//
// count == 0 returns immediately and count == 1 skips the loop.  Without these
// guards the loop body always ran at least once (touching srcBase[1] and
// dstBase[1]) and bdnz decremented CTR past zero, so any count < 2 ran away.
// Behaviour for count >= 2 is unchanged.
//
// In:      r3 = a, r4 = srcBase, r5 = dstBase, r6 = count
// Scratch: r0, r7, cr0, ctr, f0-f13
// Saved:   f14-f18 (spilled to an 88-byte stack frame and restored)
#define a       r3
#define srcBase r4
#define dstBase r5
#define count   r6
        .global ASM_MTXConcatArray
ASM_MTXConcatArray:
        .type ASM_MTXConcatArray, @function
// va0..va5: the six pairs of the constant left-hand matrix a
#define va0     f0
#define va1     f1
#define va2     f2
#define va3     f3
#define va4     f4
#define va5     f5
// vb0..vb5: the six pairs of the current source matrix
#define vb0     f6
#define vb1     f7
#define vb2     f8
#define vb3     f9
#define vb4     f10
#define vb5     f11
// vd0..vd5: the six pairs of the current result matrix
#define vd0     f12
#define vd1     f13
#define vd2     f14
#define vd3     f15
#define vd4     f16
#define vd5     f17
// u01 = { 0.0F, 1.0F }
#define u01     f18
#define u01Ptr  r7
#define sizeof_Mtx 48

        // Empty array: nothing to read or write.  Checked before the stack
        // frame is created so a plain conditional return is enough.
        cmplwi      count, 0
        beqlr

        // LR is saved in the caller's frame although this routine makes no
        // calls; kept as in the original prologue.
        mflr        r0
        stwu        r1, -88(r1)
        stw         r0, 92(r1)

        // Save f14-f18: each register is written both as a float pair (psq_st)
        // and as a double (stfd); the epilogue reloads both forms.
        psq_st      f14, 8(r1), 0, 0
        stfd        f14, 16(r1)
        psq_st      f15, 24(r1), 0, 0
        stfd        f15, 32(r1)
        psq_st      f16, 40(r1), 0, 0
        stfd        f16, 48(r1)
        psq_st      f17, 56(r1), 0, 0
        stfd        f17, 64(r1)
        psq_st      f18, 72(r1), 0, 0
        stfd        f18, 80(r1)

        // u01Ptr = &Unit01
        lis         u01Ptr, Unit01@h
        ori         u01Ptr, u01Ptr, Unit01@l

        // [a00][a01]
        psq_l       va0, 0(a), 0, 0
        // [a02][a03]
        psq_l       va1, 8(a), 0, 0
        // [a10][a11]
        psq_l       va2, 16(a), 0, 0
        // [a12][a13]
        psq_l       va3, 24(a), 0, 0
        // count-- : number of loop trips (matrix 0 is started before the loop)
        subi        count, count, 1
        // [a20][a21]
        psq_l       va4, 32(a), 0, 0
        // [a22][a23]
        psq_l       va5, 40(a), 0, 0
        // Loop count
        mtctr       count
        // [0][1]
        psq_l       u01, 0(u01Ptr), 0, 0

        //---------------------------------
        // Start of matrix 0
        // [b00][b01]
        psq_l       vb0, 0(srcBase), 0, 0
        // [b10][b11]
        psq_l       vb2, 16(srcBase), 0, 0

        // [a00*b00][a00*b01]
        ps_muls0    vd0, vb0, va0
        // [a10*b00][a10*b01]
        ps_muls0    vd2, vb0, va2
        // [a20*b00][a20*b01]
        ps_muls0    vd4, vb0, va4

        // [b20][b21]
        psq_l       vb4, 32(srcBase), 0, 0

        // [a00*b00 + a01*b10][a00*b01 + a01*b11]
        ps_madds1   vd0, vb2, va0, vd0
        // [a10*b00 + a11*b10][a10*b01 + a11*b11]
        ps_madds1   vd2, vb2, va2, vd2
        // [a20*b00 + a21*b10][a20*b01 + a21*b11]
        ps_madds1   vd4, vb2, va4, vd4

        // [b02][b03]
        psq_l       vb1, 8(srcBase), 0, 0

        // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21]
        ps_madds0   vd0, vb4, va1, vd0
        // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21]
        ps_madds0   vd2, vb4, va3, vd2
        // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21]
        ps_madds0   vd4, vb4, va5, vd4

        // [b12][b13]
        psq_l       vb3, 24(srcBase), 0, 0
        // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21]
        psq_st      vd0, 0(dstBase), 0, 0

        // [a00*b02][a00*b03]
        ps_muls0    vd1, vb1, va0
        // [a10*b02][a10*b03]
        ps_muls0    vd3, vb1, va2
        // [a20*b02][a20*b03]
        ps_muls0    vd5, vb1, va4

        // [b22][b23]
        psq_l       vb5, 40(srcBase), 0, 0
        // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21]
        psq_st      vd2, 16(dstBase), 0, 0

        // [a00*b02 + a01*b12][a00*b03 + a01*b13]
        ps_madds1   vd1, vb3, va0, vd1
        // [a10*b02 + a11*b12][a10*b03 + a11*b13]
        ps_madds1   vd3, vb3, va2, vd3
        // [a20*b02 + a21*b12][a20*b03 + a21*b13]
        ps_madds1   vd5, vb3, va4, vd5

        // count now holds the number of loop trips.  With a single matrix
        // there is no "next" matrix to start, so go straight to the code that
        // finishes the matrix in flight.
        cmplwi      count, 0
        beq         _ASM_MTXConcatArray_tail

_ASM_MTXConcatArray_loop:

        // ++srcBase
        addi        srcBase, srcBase, sizeof_Mtx

        // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23]
        ps_madds0   vd1, vb5, va1, vd1
        // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23]
        ps_madds0   vd3, vb5, va3, vd3
        // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23]
        ps_madds0   vd5, vb5, va5, vd5

        // [b00][b01]
        psq_l       vb0, 0(srcBase), 0, 0
        // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21]
        psq_st      vd4, 32(dstBase), 0, 0

        // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03]
        ps_madd     vd1, u01, va1, vd1
        // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13]
        ps_madd     vd3, u01, va3, vd3
        // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23]
        ps_madd     vd5, u01, va5, vd5

        // [b10][b11]
        psq_l       vb2, 16(srcBase), 0, 0
        // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03]
        psq_st      vd1, 8(dstBase), 0, 0

        // [a00*b00][a00*b01]
        ps_muls0    vd0, vb0, va0
        // [a10*b00][a10*b01]
        ps_muls0    vd2, vb0, va2
        // [a20*b00][a20*b01]
        ps_muls0    vd4, vb0, va4

        // [b20][b21]
        psq_l       vb4, 32(srcBase), 0, 0
        // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13]
        psq_st      vd3, 24(dstBase), 0, 0

        // [a00*b00 + a01*b10][a00*b01 + a01*b11]
        ps_madds1   vd0, vb2, va0, vd0
        // [a10*b00 + a11*b10][a10*b01 + a11*b11]
        ps_madds1   vd2, vb2, va2, vd2
        // [a20*b00 + a21*b10][a20*b01 + a21*b11]
        ps_madds1   vd4, vb2, va4, vd4

        // [b02][b03]
        psq_l       vb1, 8(srcBase), 0, 0
        // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23]
        psq_st      vd5, 40(dstBase), 0, 0
        // ++dstBase
        addi        dstBase, dstBase, sizeof_Mtx

        // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21]
        ps_madds0   vd0, vb4, va1, vd0
        // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21]
        ps_madds0   vd2, vb4, va3, vd2
        // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21]
        ps_madds0   vd4, vb4, va5, vd4

        // [b12][b13]
        psq_l       vb3, 24(srcBase), 0, 0
        // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21]
        psq_st      vd0, 0(dstBase), 0, 0

        // [a00*b02][a00*b03]
        ps_muls0    vd1, vb1, va0
        // [a10*b02][a10*b03]
        ps_muls0    vd3, vb1, va2
        // [a20*b02][a20*b03]
        ps_muls0    vd5, vb1, va4

        // [b22][b23]
        psq_l       vb5, 40(srcBase), 0, 0
        // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21]
        psq_st      vd2, 16(dstBase), 0, 0

        // [a00*b02 + a01*b12][a00*b03 + a01*b13]
        ps_madds1   vd1, vb3, va0, vd1
        // [a10*b02 + a11*b12][a10*b03 + a11*b13]
        ps_madds1   vd3, vb3, va2, vd3
        // [a20*b02 + a21*b12][a20*b03 + a21*b13]
        ps_madds1   vd5, vb3, va4, vd5

        // LOOP
        bdnz        _ASM_MTXConcatArray_loop

_ASM_MTXConcatArray_tail:

        // Finish the last matrix in flight.
        // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21]
        psq_st      vd4, 32(dstBase), 0, 0

        // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23]
        ps_madds0   vd1, vb5, va1, vd1
        // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23]
        ps_madds0   vd3, vb5, va3, vd3
        // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23]
        ps_madds0   vd5, vb5, va5, vd5

        // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03]
        ps_madd     vd1, u01, va1, vd1
        // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13]
        ps_madd     vd3, u01, va3, vd3
        // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23]
        ps_madd     vd5, u01, va5, vd5

        // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03]
        psq_st      vd1, 8(dstBase), 0, 0
        // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13]
        psq_st      vd3, 24(dstBase), 0, 0
        // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23]
        psq_st      vd5, 40(dstBase), 0, 0

        // Restore f14-f18
        psq_l       f14, 8(r1), 0, 0
        lfd         f14, 16(r1)
        psq_l       f15, 24(r1), 0, 0
        lfd         f15, 32(r1)
        psq_l       f16, 40(r1), 0, 0
        lfd         f16, 48(r1)
        psq_l       f17, 56(r1), 0, 0
        lfd         f17, 64(r1)
        psq_l       f18, 72(r1), 0, 0
        lfd         f18, 80(r1)

        // Restore LR and release the stack frame
        lwz         r0, 92(r1)
        mtlr        r0
        addi        r1, r1, 88
        blr

       .size ASM_MTXConcatArray,$-ASM_MTXConcatArray

#undef a
#undef srcBase
#undef dstBase
#undef count
#undef va0
#undef va1
#undef va2
#undef va3
#undef va4
#undef va5
#undef vb0
#undef vb1
#undef vb2
#undef vb3
#undef vb4
#undef vb5
#undef vd0
#undef vd1
#undef vd2
#undef vd3
#undef vd4
#undef vd5
#undef u01
#undef u01Ptr
#undef sizeof_Mtx
538
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXTranspose ( const Mtx src, Mtx xPose )
//
// Writes the transpose of the upper-left 3x3 part of src to xPose and sets
// column 3 of xPose ([0][3], [1][3], [2][3]) to zero.  src column 3 is never
// read.
//
// In:      r3 = src, r4 = xPose
// Scratch: r5, fp1-fp8
//
// Every source element is loaded before the destination word at the same
// offset is stored, so the result is the same when xPose is src.
#define srcMtx    r3
#define dstMtx    r4
#define zeroAddr  r5
#define zeroPS    fp1
#define xyA       fp2
#define xyB       fp3
#define zA        fp4
#define zB        fp5
#define col0      fp6
#define col1      fp7
#define col2      fp8
        .global ASM_MTXTranspose
ASM_MTXTranspose:
        .type ASM_MTXTranspose, @function

        // zeroPS <- 0.0F
        lis         zeroAddr, CONST_0_0F@h
        ori         zeroAddr, zeroAddr, CONST_0_0F@l
        lfs         zeroPS, 0(zeroAddr)
        // xyA = src[0][0], src[0][1]
        psq_l       xyA, 0(srcMtx), 0, 0
        // xPose[2][3] <- 0
        stfs        zeroPS, 44(dstMtx)
        // xyB = src[1][0], src[1][1]
        psq_l       xyB, 16(srcMtx), 0, 0
        // col0 = src[0][0], src[1][0]
        ps_merge00  col0, xyA, xyB
        // zA = src[0][2], 1
        psq_l       zA, 8(srcMtx), 1, 0
        // col1 = src[0][1], src[1][1]
        ps_merge11  col1, xyA, xyB
        // zB = src[1][2], 1
        psq_l       zB, 24(srcMtx), 1, 0
        // xPose[0][0], xPose[0][1] <- src[0][0], src[1][0]
        psq_st      col0, 0(dstMtx), 0, 0
        // xyA is reused: src[2][0], src[2][1]
        psq_l       xyA, 32(srcMtx), 0, 0
        // col2 = src[0][2], src[1][2]
        ps_merge00  col2, zA, zB
        // xPose[1][0], xPose[1][1] <- src[0][1], src[1][1]
        psq_st      col1, 16(dstMtx), 0, 0
        // col0 is reused: src[2][0], 0
        ps_merge00  col0, xyA, zeroPS
        // xPose[2][0], xPose[2][1] <- src[0][2], src[1][2]
        psq_st      col2, 32(dstMtx), 0, 0
        // col1 is reused: src[2][1], 0
        ps_merge10  col1, xyA, zeroPS
        // xPose[0][2], xPose[0][3] <- src[2][0], 0
        psq_st      col0, 8(dstMtx), 0, 0
        // zA is reused: src[2][2]
        lfs         zA, 40(srcMtx)
        // xPose[1][2], xPose[1][3] <- src[2][1], 0
        psq_st      col1, 24(dstMtx), 0, 0
        // xPose[2][2] <- src[2][2]
        stfs        zA, 40(dstMtx)

        blr

       .size ASM_MTXTranspose,$-ASM_MTXTranspose

#undef srcMtx
#undef dstMtx
#undef zeroAddr
#undef zeroPS
#undef xyA
#undef xyB
#undef zA
#undef zB
#undef col0
#undef col1
#undef col2
592
////////////////////////////////////////////////////////////////////////////////
// u32 ASM_MTXInverse(const Mtx src, Mtx inv)
//
// Inverts a 3x4 affine matrix.  The 3x3 part is inverted through its cofactors
// divided by the determinant; column 3 of the result is
//      inv[i][3] = -( inv[i][0]*src[0][3] + inv[i][1]*src[1][3] + inv[i][2]*src[2][3] )
//
// Returns (r3): 0 when the determinant compares equal to 0.0 (inv is not
//               written in that case), 1 otherwise.
//
// In:      r3 = src, r4 = inv
// Scratch: cr0, fp0-fp13
//
// 1/det is obtained from the fres estimate refined by one Newton-Raphson
// step, so results are approximate rather than correctly rounded.
// All loads from src happen before the first store to inv.
//
// Register contents are written as [ slot0 ][ slot1 ]; "ij" abbreviates
// src[i][j] and "iij" abbreviates inv[i][j].
#define src r3
#define inv r4
        .global ASM_MTXInverse
ASM_MTXInverse:
        .type ASM_MTXInverse, @function

        // fp0 [ 00 ][ 1.0F ] : Load
        psq_l      fp0, 0( src ), 1, 0
        // fp1 [ 01 ][ 02 ]   : Load
        psq_l      fp1, 4( src ), 0, 0
        // fp2 [ 10 ][ 1.0F ] : Load
        psq_l      fp2, 16( src ), 1, 0
        // fp6 [ 02 ][ 00 ]
        ps_merge10 fp6, fp1, fp0
        // fp3 [ 11 ][ 12 ]   : Load
        psq_l      fp3, 20( src ), 0, 0
        // fp4 [ 20 ][ 1.0F ] : Load
        psq_l      fp4, 32( src ), 1, 0
        // fp7 [ 12 ][ 10 ]
        ps_merge10 fp7, fp3, fp2
        // fp5 [ 21 ][ 22 ]   : Load
        psq_l      fp5, 36( src ), 0, 0
        // fp11[ 11*02 ][ 00*12 ]
        ps_mul     fp11, fp3, fp6
        // fp8 [ 22 ][ 20 ]
        ps_merge10 fp8, fp5, fp4
        // fp13[ 21*12 ][ 10*22 ]
        ps_mul     fp13, fp5, fp7
        // fp11[ 01*12 - 11*02 ][ 10*02 - 00*12 ]
        ps_msub    fp11, fp1, fp7, fp11
        // fp12[ 01*22 ][ 20*02 ]
        ps_mul     fp12, fp1, fp8
        // fp13[ 11*22 - 21*12 ][ 20*12 - 10*22 ]
        ps_msub    fp13, fp3, fp8, fp13
        // fp10[ 20*11 ][ N/A ]
        ps_mul     fp10, fp3, fp4
        // fp12[ 21*02 - 01*22 ][ 00*22 - 20*02 ]
        ps_msub    fp12, fp5, fp6, fp12
        // fp7 [ 00*(11*22-21*12) ][ N/A ]
        ps_mul     fp7, fp0, fp13
        // fp9 [ 00*21 ][ N/A ]
        ps_mul     fp9, fp0, fp5
        // fp8 [ 10*01 ][ N/A ]
        ps_mul     fp8, fp1, fp2
        // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) ][ N/A ]
        ps_madd    fp7, fp2, fp12, fp7
        // fp6 [ 0.0F ][ 0.0F ] (fp6 - fp6; used as the zero to compare det against)
        ps_sub     fp6, fp6, fp6
        // fp10[ 10*21 - 20*11 ][ N/A ]
        ps_msub    fp10, fp2, fp5, fp10
        // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) + 20*(01*12-11*02) ][ N/A ] : det
        ps_madd    fp7, fp4, fp11, fp7
        // fp9 [ 20*01 - 00*21 ][ N/A ]
        ps_msub    fp9, fp1, fp4, fp9
        // fp8 [ 00*11 - 10*01 ][ N/A ]
        ps_msub    fp8, fp0, fp3, fp8

        // ( det == 0 ) ?  Only slot 0 is compared.
        ps_cmpo0   cr0, fp7, fp6
        bne        _ASM_MTXInverse_regular

        // return value (singular): r3 = 0, inv untouched
        addi       r3, 0, 0

        blr

_ASM_MTXInverse_regular:

        // fp0 [ 1/det ][ N/A ] : reciprocal estimate
        fres       fp0, fp7

        // Newton's approximation
        // Refinement : ( E = est. of 1/K ) -> ( E' = ( 2 - K * E ) * E )
        // fp6 = 2E, fp5 = K*E, fp0 = 2E - E*(K*E)
        ps_add     fp6, fp0, fp0
        ps_mul     fp5, fp7, fp0
        ps_nmsub   fp0, fp0, fp5, fp6

        // fp1 [ 03 ][ 03 ] : Load
        lfs        fp1, 12(src)
        // fp13[ ( 11*22 - 21*12 ) * rdet ][ ( 20*12 - 10*22 ) * rdet ] : i[0][0], i[1][0]
        ps_muls0   fp13, fp13, fp0
        // fp2 [ 13 ][ 13 ] : Load
        lfs        fp2, 28(src)
        // fp12[ ( 21*02 - 01*22 ) * rdet ][ ( 00*22 - 20*02 ) * rdet ] : i[0][1], i[1][1]
        ps_muls0   fp12, fp12, fp0
        // fp3 [ 23 ][ 23 ] : Load (last read of src)
        lfs        fp3, 44(src)
        // fp11[ ( 01*12 - 11*02 ) * rdet ][ ( 10*02 - 00*12 ) * rdet ] : i[0][2], i[1][2]
        ps_muls0   fp11, fp11, fp0
        // fp5 [ i00 ][ i01 ]
        ps_merge00 fp5, fp13, fp12
        // fp4 [ i10 ][ i11 ]
        ps_merge11 fp4, fp13, fp12
        // fp6 [ i00*03 ][ i10*03 ]
        ps_mul     fp6, fp13, fp1
        // [ i00 ][ i01 ] : Store fp5   -> free(fp5[ i00 ][ i01 ])
        psq_st     fp5,  0(inv), 0, 0
        // [ i10 ][ i11 ] : Store fp4   -> free(fp4[ i10 ][ i11 ])
        psq_st     fp4,  16(inv), 0, 0
        // fp10[ ( 10*21 - 20*11 ) * rdet ] : i[2][0]
        ps_muls0   fp10, fp10, fp0
        // fp9 [ ( 20*01 - 00*21 ) * rdet ] : i[2][1]
        ps_muls0   fp9,  fp9,  fp0
        // fp6 [ i00*03+i01*13 ][ i10*03+i11*13 ]
        ps_madd    fp6, fp12, fp2, fp6
        // [ i20 ] : Store fp10 (single float)
        psq_st     fp10, 32(inv), 1, 0
        // fp8 [ ( 00*11 - 10*01 ) * rdet ] : i[2][2]
        ps_muls0   fp8,  fp8,  fp0
        // fp6 [ -i00*03-i01*13-i02*23 ][ -i10*03-i11*13-i12*23 ] : i[0][3], i[1][3]
        ps_nmadd   fp6, fp11, fp3, fp6
        // [ i21 ] : Store fp9 (single float)
        psq_st     fp9,  36(inv), 1, 0
        // fp7 [ i20*03 ][ N/A ]
        ps_mul     fp7, fp10, fp1
        // fp5 [ i02 ][ i03 ]
        ps_merge00 fp5, fp11, fp6
        // [ i22 ] : Store fp8 (single float)
        psq_st     fp8,  40(inv), 1, 0
        // fp7 [ i20*03+i21*13 ][ N/A ]
        ps_madd    fp7, fp9,  fp2, fp7
        // fp4 [ i12 ][ i13 ]
        ps_merge11 fp4, fp11, fp6
        // [ i02 ][ i03 ] : Store fp5
        psq_st     fp5,  8(inv), 0, 0
        // fp7 [ -i20*03-i21*13-i22*23 ][ N/A ] : i[2][3]
        ps_nmadd   fp7, fp8,  fp3, fp7
        // [ i12 ][ i13 ] : Store fp4
        psq_st     fp4,  24(inv), 0, 0
        // [ i23 ] : Store fp7 (single float)
        psq_st     fp7,  44(inv), 1, 0

        // return value (regular): r3 = 1
        addi       r3, 0, 1

        blr
       .size ASM_MTXInverse,$-ASM_MTXInverse

#undef src
#undef inv
735
736
737
////////////////////////////////////////////////////////////////////////////////
// u32 ASM_MTXInvXpose(const Mtx src, Mtx invX)
//
// Writes the transpose of the inverse of the 3x3 part of src to invX and sets
// column 3 of invX ([0][3], [1][3], [2][3]) to zero.  Column 3 of src is
// never read.
//
// Returns (r3): 0 when the determinant compares equal to 0.0 (invX is not
//               written in that case), 1 otherwise.
//
// In:      r3 = src, r4 = invX
// Scratch: cr0, fp0-fp13
//
// The cofactor and determinant computation is the same instruction sequence
// as in ASM_MTXInverse; only the scaling and store phase differs.  1/det is
// the fres estimate refined by one Newton-Raphson step.
// All loads from src happen before the first store to invX.
//
// Register contents are written as [ slot0 ][ slot1 ]; "ij" abbreviates
// src[i][j], "i[r][c]" is an element of the inverse and "ix[r][c]" an element
// of the stored inverse-transpose.
        .global ASM_MTXInvXpose
#define src  r3
#define invX r4
ASM_MTXInvXpose:
        .type ASM_MTXInvXpose, @function

        // fp0 [ 00 ][ 1.0F ] : Load
        psq_l      fp0, 0( src ), 1, 0
        // fp1 [ 01 ][ 02 ]   : Load
        psq_l      fp1, 4( src ), 0, 0
        // fp2 [ 10 ][ 1.0F ] : Load
        psq_l      fp2, 16( src ), 1, 0
        // fp6 [ 02 ][ 00 ]
        ps_merge10 fp6, fp1, fp0
        // fp3 [ 11 ][ 12 ]   : Load
        psq_l      fp3, 20( src ), 0, 0
        // fp4 [ 20 ][ 1.0F ] : Load
        psq_l      fp4, 32( src ), 1, 0
        // fp7 [ 12 ][ 10 ]
        ps_merge10 fp7, fp3, fp2
        // fp5 [ 21 ][ 22 ]   : Load
        psq_l      fp5, 36( src ), 0, 0
        // fp11[ 11*02 ][ 00*12 ]
        ps_mul     fp11, fp3, fp6
        // fp8 [ 22 ][ 20 ]
        ps_merge10 fp8, fp5, fp4
        // fp13[ 21*12 ][ 10*22 ]
        ps_mul     fp13, fp5, fp7
        // fp11[ 01*12 - 11*02 ][ 10*02 - 00*12 ]
        ps_msub    fp11, fp1, fp7, fp11
        // fp12[ 01*22 ][ 20*02 ]
        ps_mul     fp12, fp1, fp8
        // fp13[ 11*22 - 21*12 ][ 20*12 - 10*22 ]
        ps_msub    fp13, fp3, fp8, fp13
        // fp10[ 20*11 ][ N/A ]
        ps_mul     fp10, fp3, fp4
        // fp12[ 21*02 - 01*22 ][ 00*22 - 20*02 ]
        ps_msub    fp12, fp5, fp6, fp12
        // fp7 [ 00*(11*22-21*12) ][ N/A ]
        ps_mul     fp7, fp0, fp13
        // fp9 [ 00*21 ][ N/A ]
        ps_mul     fp9, fp0, fp5
        // fp8 [ 10*01 ][ N/A ]
        ps_mul     fp8, fp1, fp2
        // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) ][ N/A ]
        ps_madd    fp7, fp2, fp12, fp7
        // fp6 [ 0.0F ][ 0.0F ] (fp6 - fp6; compared against det and later
        // stored into column 3)
        ps_sub     fp6, fp6, fp6
        // fp10[ 10*21 - 20*11 ][ N/A ]
        ps_msub    fp10, fp2, fp5, fp10
        // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) + 20*(01*12-11*02) ][ N/A ] : det
        ps_madd    fp7, fp4, fp11, fp7
        // fp9 [ 20*01 - 00*21 ][ N/A ]
        ps_msub    fp9, fp1, fp4, fp9
        // fp8 [ 00*11 - 10*01 ][ N/A ]
        ps_msub    fp8, fp0, fp3, fp8

        // ( det == 0 ) ?  Only slot 0 is compared.
        ps_cmpo0   cr0, fp7, fp6
        bne        _ASM_MTXInvXpose_regular

        // return value (singular): r3 = 0, invX untouched
        addi       r3, 0, 0

        blr

_ASM_MTXInvXpose_regular:

        // fp0 [ 1/det ][ N/A ] : reciprocal estimate
        fres       fp0, fp7

        // ix[0][3] <- 0
        psq_st     fp6, 12(invX),1, 0

        // Newton's approximation
        // Refinement : ( E = est. of 1/K ) -> ( E' = ( 2 - K * E ) * E )
        // fp4 = 2E (fp6 must stay zero for the column-3 stores), fp5 = K*E,
        // fp0 = 2E - E*(K*E)
        ps_add     fp4, fp0, fp0
        ps_mul     fp5, fp7, fp0
        // ix[1][3] <- 0
        psq_st     fp6, 28(invX),1, 0
        ps_nmsub   fp0, fp0, fp5, fp4
        // ix[2][3] <- 0
        psq_st     fp6, 44(invX),1, 0

        // fp13[ ( 11*22 - 21*12 ) * rdet ][ ( 20*12 - 10*22 ) * rdet ] : ix[0][0], ix[0][1]
        ps_muls0   fp13, fp13, fp0
        // fp12[ ( 21*02 - 01*22 ) * rdet ][ ( 00*22 - 20*02 ) * rdet ] : ix[1][0], ix[1][1]
        ps_muls0   fp12, fp12, fp0
        // [ ix00 ][ ix01 ] : Store fp13
        psq_st     fp13, 0( invX ), 0, 0
        // fp11[ ( 01*12 - 11*02 ) * rdet ][ ( 10*02 - 00*12 ) * rdet ] : ix[2][0], ix[2][1]
        ps_muls0   fp11, fp11, fp0
        // [ ix10 ][ ix11 ] : Store fp12
        psq_st     fp12, 16( invX ), 0, 0
        // fp10[ ( 10*21 - 20*11 ) * rdet ] : i[2][0], stored as ix[0][2]
        ps_muls0   fp10, fp10, fp0
        // [ ix20 ][ ix21 ] : Store fp11
        psq_st     fp11, 32( invX ), 0, 0
        // fp9 [ ( 20*01 - 00*21 ) * rdet ] : i[2][1], stored as ix[1][2]
        ps_muls0   fp9, fp9, fp0
        // [ ix02 ]         : Store fp10 (single float)
        psq_st     fp10, 8( invX ), 1, 0
        // fp8 [ ( 00*11 - 10*01 ) * rdet ] : i[2][2]
        ps_muls0   fp8, fp8, fp0
        // [ ix12 ]         : Store fp9 (single float)
        psq_st     fp9, 24( invX ), 1, 0
        // [ ix22 ]         : Store fp8 (single float)
        psq_st     fp8, 40( invX ), 1, 0

        // return value (regular): r3 = 1
        addi       r3, 0, 1

        blr
       .size ASM_MTXInvXpose,$-ASM_MTXInvXpose

#undef src
#undef invX
855
856
857
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXReflect(Mtx m, const Vec *p, const Vec *n)
//
// Builds the 3x4 matrix
//
//      m[i][j] = delta(i,j) - 2 * n[i] * n[j]       (i, j in 0..2)
//      m[i][3] = 2 * (p . n) * n[i]
//
// from a point p and a vector n, each read as three consecutive floats
// (x at offset 0, y at 4, z at 8).
// NOTE(review): this is the reflection about the plane through p with normal
// n only if n has unit length - the code does not normalise n; confirm that
// callers do.
//
// In:      r3 = m, r4 = p, r5 = n
// Scratch: r6, fp0-fp13
//
// Operand order reminder for the sum instructions used below:
//      ps_sum0 D, A, C, B :  D = [ A.slot0 + B.slot1 ][ C.slot1 ]
//      ps_sum1 D, A, C, B :  D = [ C.slot0 ][ A.slot0 + B.slot1 ]
#define m  r3
#define p  r4
#define n  r5
        .global ASM_MTXReflect
ASM_MTXReflect:
        .type ASM_MTXReflect, @function
#define c_one   fp1
#define vn_xy   fp2
#define vn_z1   fp3
#define n2vn_xy fp4
#define n2vn_z1 fp5
#define pdotn   fp6
#define tmp0    fp7
#define tmp1    fp8
#define tmp2    fp9
#define tmp3    fp10
#define tmp4    fp11
#define tmp5    fp12
#define tmp6    fp13
#define tmp7    fp0

        // c_one = 1.0F
        lis         r6, CONST_1_0F@h
        ori         r6, r6, CONST_1_0F@l
        lfs         c_one, 0(r6)

        // vn_z1 = [nz][1.0F] : LOAD
        psq_l       vn_z1, 8(n), 1, 0
        // vn_xy = [nx][ny]   : LOAD
        psq_l       vn_xy, 0(n), 0, 0

        // tmp0 = [px][py]   : LOAD
        psq_l       tmp0,  0(p), 0, 0
        // n2vn_z1 = [-2nz][-2.0F]   ( -(v*1 + v) )
        ps_nmadd    n2vn_z1, vn_z1, c_one, vn_z1
        // tmp1 = [pz][1.0F] : LOAD
        psq_l       tmp1,  8(p), 1, 0
        // n2vn_xy = [-2nx][-2ny]
        ps_nmadd    n2vn_xy, vn_xy, c_one, vn_xy

        // tmp4 = [-2nx*nz][-2ny*nz]   : [m20][m21]
        ps_muls0    tmp4, vn_xy, n2vn_z1
        // pdotn = [-2(px*nx)][-2(py*ny)]
        ps_mul      pdotn, n2vn_xy, tmp0
        // tmp2 = [-2nx*nx][-2nx*ny]
        ps_muls0    tmp2, vn_xy, n2vn_xy
        // pdotn = [-2(px*nx+py*ny)][?]
        ps_sum0     pdotn, pdotn, pdotn, pdotn
        // tmp3 = [-2nx*ny][-2ny*ny]
        ps_muls1    tmp3, vn_xy, n2vn_xy
        // tmp4 = [m20][m21] : STORE
        psq_st      tmp4, 32(m), 0, 0
        // tmp2 = [1-2nx*nx][-2nx*ny]  : [m00][m01]
        ps_sum0     tmp2, tmp2, tmp2, c_one
        // pdotn = [2(px*nx+py*ny+pz*nz)][?]   ( -( -2nz*pz + pdotn ) )
        ps_nmadd    pdotn, n2vn_z1, tmp1, pdotn
        // tmp3 = [-2nx*ny][1-2ny*ny]  : [m10][m11]
        ps_sum1     tmp3, c_one, tmp3, tmp3
        // tmp2 = [m00][m01] : STORE
        psq_st      tmp2,  0(m), 0, 0
        // tmp5 = [pdotn*nx][pdotn*ny]   (pdotn already carries the factor 2)
        ps_muls0    tmp5, vn_xy, pdotn
        // tmp6 = [-2nz][pdotn]
        ps_merge00  tmp6, n2vn_z1, pdotn
        // tmp3 = [m10][m11] : STORE
        psq_st      tmp3, 16(m), 0, 0

        // tmp7 = [-2nx*nz][pdotn*nx]  : [m02][m03]
        ps_merge00  tmp7, tmp4, tmp5
        // tmp6 = [-2nz*nz][pdotn*nz]
        ps_muls0    tmp6, tmp6, vn_z1
        // tmp5 = [-2ny*nz][pdotn*ny]  : [m12][m13]
        ps_merge11  tmp5, tmp4, tmp5
        // tmp7 = [m02][m03] : STORE
        psq_st      tmp7,  8(m), 0, 0
        // tmp6 = [1-2nz*nz][pdotn*nz] : [m22][m23]
        ps_sum0     tmp6, tmp6, tmp6, c_one
        // tmp5 = [m12][m13] : STORE
        psq_st      tmp5, 24(m), 0, 0
        // tmp6 = [m22][m23] : STORE
        psq_st      tmp6, 40(m), 0, 0

        blr
       .size ASM_MTXReflect,$-ASM_MTXReflect

#undef m
#undef p
#undef n
#undef c_one
#undef vn_xy
#undef vn_z1
#undef n2vn_xy
#undef n2vn_z1
#undef pdotn
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef tmp5
#undef tmp6
#undef tmp7
962
963
964
965////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXScaleApply (const Mtx src, Mtx dst, f32 xS, f32 yS, f32 zS)
//
// dst = src with every element of row 0 multiplied by xS, row 1 by yS and
// row 2 by zS. The fourth (translation) element of each row is scaled too.
//
// In:       r3 = src, r4 = dst, fp1 = xS, fp2 = yS, fp3 = zS
// Clobbers: fp1-fp9
#define src  r3
#define dst  r4
#define xS   fp1
#define yS   fp2
#define zS   fp3
#define sa_r0xy fp4         // src[0][0], src[0][1]
#define sa_r0zw fp5         // src[0][2], src[0][3]
#define sa_r1xy fp6         // src[1][0], src[1][1]
#define sa_r1zw fp7         // src[1][2], src[1][3]
#define sa_r2xy fp8         // src[2][0], src[2][1]
#define sa_r2zw fp9         // src[2][2], src[2][3]
        .global ASM_MTXScaleApply
ASM_MTXScaleApply:
        .type ASM_MTXScaleApply, @function

        // Round the three factors to single precision before they are used
        // as paired-single multiplicands.
        frsp        xS, xS
        frsp        yS, yS
        frsp        zS, zS

        // Loads, multiplies and stores are software-pipelined; the order of
        // the memory accesses matches the original routine.
        psq_l       sa_r0xy, 0(src),  0, 0
        psq_l       sa_r0zw, 8(src),  0, 0
        ps_muls0    sa_r0xy, sa_r0xy, xS
        psq_l       sa_r1xy, 16(src), 0, 0
        ps_muls0    sa_r0zw, sa_r0zw, xS
        psq_l       sa_r1zw, 24(src), 0, 0
        ps_muls0    sa_r1xy, sa_r1xy, yS
        psq_l       sa_r2xy, 32(src), 0, 0
        psq_st      sa_r0xy, 0(dst),  0, 0
        ps_muls0    sa_r1zw, sa_r1zw, yS
        psq_l       sa_r2zw, 40(src), 0, 0
        psq_st      sa_r0zw, 8(dst),  0, 0
        ps_muls0    sa_r2xy, sa_r2xy, zS
        psq_st      sa_r1xy, 16(dst), 0, 0
        ps_muls0    sa_r2zw, sa_r2zw, zS
        psq_st      sa_r1zw, 24(dst), 0, 0
        psq_st      sa_r2xy, 32(dst), 0, 0
        psq_st      sa_r2zw, 40(dst), 0, 0
        blr
        .size ASM_MTXScaleApply,$-ASM_MTXScaleApply
#undef src
#undef dst
#undef xS
#undef yS
#undef zS
#undef sa_r0xy
#undef sa_r0zw
#undef sa_r1xy
#undef sa_r1zw
#undef sa_r2xy
#undef sa_r2zw
1003
1004
1005
1006////////////////////////////////////////////////////////////////////////////////
// void _ASM_MTXRotAxisRadInternal(Mtx m, const Vec *axis, f32 sT, f32 cT)
//
// Builds in m the 3x4 rotation matrix about the given axis from a
// precomputed sine (sT) and cosine (cT). With s = sT, c = cT, t = 1 - c
// and (nx, ny, nz) = axis scaled by 1/sqrt(x*x + y*y + z*z):
//
//      | t*nx*nx+c     t*nx*ny-s*nz  t*nx*nz+s*ny  0 |
//  m = | t*nx*ny+s*nz  t*ny*ny+c     t*ny*nz-s*nx  0 |
//      | t*nx*nz-s*ny  t*ny*nz+s*nx  t*nz*nz+c     0 |
//
// The reciprocal square root comes from frsqrte refined by one
// Newton-Raphson step. There is no check for a zero-length axis.
//
// In:       r3 = m, r4 = axis, fp1 = sT, fp2 = cT
// Clobbers: r5, r6, fp0-fp13
//
// This is a leaf routine that only touches volatile registers, so it needs
// no stack frame and no LR or FPR save/restore.
//
// Comment notation: [a][b] = [slot 0][slot 1] of a paired-single register.
        .global _ASM_MTXRotAxisRadInternal
#define m    r3
#define axis r4
#define sT   fp1
#define cT   fp2
_ASM_MTXRotAxisRadInternal:
        .type _ASM_MTXRotAxisRadInternal, @function
#define tT   fp3
#define fc0  fp4
#define tmp0 fp5
#define tmp1 fp6
#define tmp2 fp7
#define tmp3 fp8
#define tmp4 fp9
#define tmp5 fp10
#define tmp6 fp11
#define tmp7 fp12
#define tmp8 fp13
#define tmp9 fp0

        // tmp8 = 3.0F;
        lis         r6, CONST_3_0F@h
        ori         r6, r6, CONST_3_0F@l
        lfs         tmp8, 0(r6)

        // tmp9 = 0.5F;
        lis         r5, CONST_0_5F@h
        ori         r5, r5, CONST_0_5F@l
        lfs         tmp9, 0(r5)

        // to make sure cT = (single precision float value)
        frsp        cT, cT
        // tmp0 = [x][y] : LOAD
        psq_l       tmp0, 0(axis), 0, 0
        // to make sure sT = (single precision float value)
        frsp        sT, sT
        // tmp1 = [z][z] : LOAD
        lfs         tmp1, 8(axis)

        // tmp2 = [x*x][y*y]
        ps_mul      tmp2, tmp0, tmp0
        // tmp7 = [1.0F]  (0.5 + 0.5)
        fadds       tmp7, tmp9, tmp9
        // tmp3 = [x*x+z*z][y*y+z*z]
        ps_madd     tmp3, tmp1, tmp1, tmp2
        // fc0 = [0.0F]   (0.5 - 0.5); its slot 1 supplies m03, m13 and m23
        fsubs       fc0, tmp9, tmp9
        // tmp4 = [S = x*x+y*y+z*z][z]
        ps_sum0     tmp4, tmp3, tmp1, tmp2

        // tT = 1.0F - cT
        fsubs       tT, tmp7, cT

        // tmp5 = [1.0/sqrt(S)] : estimation [E]
        frsqrte     tmp5, tmp4
        // Newton-Raphson refinement step
        // E' = (E/2)(3.0 - E*E*S)
        fmuls       tmp2, tmp5, tmp5            // E*E
        fmuls       tmp3, tmp5, tmp9            // E/2
        fnmsubs     tmp2, tmp2, tmp4, tmp8      // (3-E*E*S)
        fmuls       tmp5, tmp2, tmp3            // (E/2)(3-E*E*S)

        // cT = [c][c]
        ps_merge00  cT, cT, cT

        // tmp0 = [nx = x/sqrt(S)][ny = y/sqrt(S)]
        ps_muls0    tmp0, tmp0, tmp5
        // tmp1 = [nz = z/sqrt(S)][nz = z/sqrt(S)]
        ps_muls0    tmp1, tmp1, tmp5

        // tmp4 = [t*nx][t*ny]
        ps_muls0    tmp4, tmp0, tT
        // tmp9 = [s*nx][s*ny]  (the 0.5F constant is no longer needed)
        ps_muls0    tmp9, tmp0, sT
        // tmp5 = [t*nz][t*nz]
        ps_muls0    tmp5, tmp1, tT

        // tmp3 = [t*nx*ny][t*ny*ny]
        ps_muls1    tmp3, tmp4, tmp0
        // tmp2 = [t*nx*nx][t*ny*nx]
        ps_muls0    tmp2, tmp4, tmp0
        // tmp4 = [t*nx*nz][t*ny*nz]
        ps_muls0    tmp4, tmp4, tmp1

        // tmp6 = t*nx*ny-s*nz  (scalar op; slot 1 is read for m01 below)
        fnmsubs     tmp6, tmp1, sT, tmp3
        // tmp7 = t*nx*ny+s*nz  (scalar op; only slot 0 is read below)
        fmadds      tmp7, tmp1, sT, tmp3

        // tmp0 = [-s*nx][-s*ny]
        ps_neg      tmp0, tmp9
        // tmp8 = [t*nx*nz+s*ny][0] == [m02][m03]
        ps_sum0     tmp8, tmp4, fc0, tmp9
        // tmp2 = [t*nx*nx+c][t*nx*ny-s*nz] == [m00][m01]
        ps_sum0     tmp2, tmp2, tmp6, cT
        // tmp3 = [t*nx*ny+s*nz][t*ny*ny+c] == [m10][m11]
        ps_sum1     tmp3, cT, tmp7, tmp3
        // tmp6 = [t*ny*nz-s*nx][0] == [m12][m13]
        ps_sum0     tmp6, tmp0, fc0, tmp4

        // tmp8 [m02][m03] : STORE
        psq_st      tmp8, 8(m), 0, 0
        // tmp0 = [t*nx*nz-s*ny][t*ny*nz]
        ps_sum0     tmp0, tmp4, tmp4, tmp0
        // tmp2 [m00][m01] : STORE
        psq_st      tmp2, 0(m), 0, 0
        // tmp5 = [t*nz*nz][t*nz*nz]
        ps_muls0    tmp5, tmp5, tmp1
        // tmp3 [m10][m11] : STORE
        psq_st      tmp3, 16(m), 0, 0
        // tmp4 = [t*nx*nz-s*ny][t*ny*nz+s*nx] == [m20][m21]
        ps_sum1     tmp4, tmp9, tmp0, tmp4
        // tmp6 [m12][m13] : STORE
        psq_st      tmp6, 24(m), 0, 0
        // tmp5 = [t*nz*nz+c][0]   == [m22][m23]
        ps_sum0     tmp5, tmp5, fc0, cT
        // tmp4 [m20][m21] : STORE
        psq_st      tmp4, 32(m), 0, 0
        // tmp5 [m22][m23] : STORE
        psq_st      tmp5, 40(m), 0, 0

        blr
        .size _ASM_MTXRotAxisRadInternal,$-_ASM_MTXRotAxisRadInternal
#undef m
#undef axis
#undef sT
#undef cT
#undef tT
#undef fc0
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef tmp5
#undef tmp6
#undef tmp7
#undef tmp8
#undef tmp9
1161
1162
1163
1164////////////////////////////////////////////////////////////////////////////////
1165
// void ASM_MTXTrans(Mtx m, f32 xT, f32 yT, f32 zT)
//
// Writes a pure translation matrix into m:
//
//      | 1  0  0  xT |
//  m = | 0  1  0  yT |
//      | 0  0  1  zT |
//
// In:       r3 = m, fp1 = xT, fp2 = yT, fp3 = zT
// Clobbers: r4, r5, fp4, fp5
//
// Element m[r][c] lives at byte offset 16*r + 4*c.
#define m  r3
#define xT fp1
#define yT fp2
#define zT fp3
#define tr_zero fp4
#define tr_one  fp5
        .global ASM_MTXTrans
ASM_MTXTrans:
        .type ASM_MTXTrans, @function

        // tr_zero = 0.0F
        lis         r4, CONST_0_0F@h
        ori         r4, r4, CONST_0_0F@l
        lfs         tr_zero, 0(r4)

        // tr_one = 1.0F
        lis         r5, CONST_1_0F@h
        ori         r5, r5, CONST_1_0F@l
        lfs         tr_one, 0(r5)

        // All twelve elements are written, in ascending address order.
        // Row 0
        stfs        tr_one,   0(m)              // m[0][0]
        psq_st      tr_zero,  4(m), 0, 0        // m[0][1], m[0][2]
        stfs        xT,      12(m)              // m[0][3]
        // Row 1
        stfs        tr_zero, 16(m)              // m[1][0]
        stfs        tr_one,  20(m)              // m[1][1]
        stfs        tr_zero, 24(m)              // m[1][2]
        stfs        yT,      28(m)              // m[1][3]
        // Row 2
        psq_st      tr_zero, 32(m), 0, 0        // m[2][0], m[2][1]
        stfs        tr_one,  40(m)              // m[2][2]
        stfs        zT,      44(m)              // m[2][3]

        blr
        .size ASM_MTXTrans,$-ASM_MTXTrans
#undef m
#undef xT
#undef yT
#undef zT
#undef tr_zero
#undef tr_one
1206
1207
1208////////////////////////////////////////////////////////////////////////////////
1209
// void ASM_MTXTransApply(const Mtx src, Mtx dst, f32 xT, f32 yT, f32 zT )
//
// dst = src with xT, yT and zT added to the fourth element of rows 0, 1
// and 2 respectively. The other nine elements are copied unchanged.
//
// In:       r3 = src, r4 = dst, fp1 = xT, fp2 = yT, fp3 = zT
// Clobbers: fp1-fp9
#define src r3
#define dst r4
#define xT fp1
#define yT fp2
#define zT fp3
#define ta_r0xy fp4         // src[0][0], src[0][1]
#define ta_r0zw fp5         // src[0][2], src[0][3]
#define ta_r1xy fp6         // src[1][0], src[1][1]
#define ta_r1zw fp7         // src[1][2], src[1][3]
#define ta_r2zw fp8         // src[2][2], src[2][3]
#define ta_r2xy fp9         // src[2][0], src[2][1]
        .global ASM_MTXTransApply
ASM_MTXTransApply:
        .type ASM_MTXTransApply, @function

        // Round the offsets to single precision before the paired adds.
        frsp        xT, xT
        frsp        yT, yT
        frsp        zT, zT

        // The order of the memory accesses matches the original routine.
        psq_l       ta_r0xy, 0(src),  0, 0
        psq_l       ta_r0zw, 8(src),  0, 0
        psq_l       ta_r1zw, 24(src), 0, 0
        psq_l       ta_r2zw, 40(src), 0, 0
        psq_st      ta_r0xy, 0(dst),  0, 0
        // ps_sum1 keeps slot 0 and writes offset + slot 1 into slot 1.
        ps_sum1     ta_r0zw, xT, ta_r0zw, ta_r0zw       // [m02][m03 + xT]
        psq_l       ta_r1xy, 16(src), 0, 0
        psq_st      ta_r0zw, 8(dst),  0, 0
        ps_sum1     ta_r1zw, yT, ta_r1zw, ta_r1zw       // [m12][m13 + yT]
        psq_l       ta_r2xy, 32(src), 0, 0
        psq_st      ta_r1xy, 16(dst), 0, 0
        ps_sum1     ta_r2zw, zT, ta_r2zw, ta_r2zw       // [m22][m23 + zT]
        psq_st      ta_r1zw, 24(dst), 0, 0
        psq_st      ta_r2xy, 32(dst), 0, 0
        psq_st      ta_r2zw, 40(dst), 0, 0

        blr
        .size ASM_MTXTransApply,$-ASM_MTXTransApply
#undef src
#undef dst
#undef xT
#undef yT
#undef zT
#undef ta_r0xy
#undef ta_r0zw
#undef ta_r1xy
#undef ta_r1zw
#undef ta_r2zw
#undef ta_r2xy
1246
1247////////////////////////////////////////////////////////////////////////////////
1248
// void ASM_MTXScale(Mtx m, f32 xS, f32 yS, f32 zS)
//
// Writes a pure scale matrix into m:
//
//      | xS  0   0   0 |
//  m = | 0   yS  0   0 |
//      | 0   0   zS  0 |
//
// In:       r3 = m, fp1 = xS, fp2 = yS, fp3 = zS
// Clobbers: r4, fp4
//
// Element m[r][c] lives at byte offset 16*r + 4*c.
#define m  r3
#define xS     fp1
#define yS     fp2
#define zS     fp3
#define c_zero fp4
        .global ASM_MTXScale
ASM_MTXScale:
        .type ASM_MTXScale, @function

        // c_zero = 0.0F;
        lis         r4, CONST_0_0F@h
        ori         r4, r4, CONST_0_0F@l
        lfs         c_zero, 0(r4)

        // The diagonal is written with scalar stores; the paired stores of
        // c_zero clear the two elements between consecutive diagonal entries.
        stfs        xS,     0(m)                // m[0][0]
        psq_st      c_zero, 4(m), 0, 0          // m[0][1], m[0][2]
        psq_st      c_zero, 12(m), 0, 0         // m[0][3], m[1][0]
        stfs        yS,     20(m)               // m[1][1]
        psq_st      c_zero, 24(m), 0, 0         // m[1][2], m[1][3]
        psq_st      c_zero, 32(m), 0, 0         // m[2][0], m[2][1]
        stfs        zS,     40(m)               // m[2][2]
        stfs        c_zero, 44(m)               // m[2][3]

        blr
        .size ASM_MTXScale,$-ASM_MTXScale
#undef m
#undef xS
#undef yS
#undef zS
#undef c_zero
1279
1280
1281////////////////////////////////////////////////////////////////////////////////
1282
// void ASM_MTXRotTrig(Mtx m, char axis, f32 sinA, f32 cosA);
//
// Writes into m the rotation about one coordinate axis, built from a
// precomputed sine (s) and cosine (c). The axis letter is matched without
// regard to case:
//
//   x:  | 1  0  0  0 |    y:  |  c  0  s  0 |    z:  | c -s  0  0 |
//       | 0  c -s  0 |        |  0  1  0  0 |        | s  c  0  0 |
//       | 0  s  c  0 |        | -s  0  c  0 |        | 0  0  1  0 |
//
// For any other axis value the routine returns without writing to m.
//
// In:       r3 = m, r4 = axis, fp1 = sinA, fp2 = cosA
// Clobbers: r4, r5, r6, cr0, fp1-fp9
//
// Comment notation: [a][b] = [slot 0][slot 1] of a paired-single register.
#define m  r3
#define axis   r4
#define sinA   fp1
#define cosA   fp2
#define rt_zero fp3
#define rt_one  fp4
#define rt_nsin fp5
#define rt_w0   fp6
#define rt_w1   fp7
#define rt_w2   fp8
#define rt_w3   fp9

        .global ASM_MTXRotTrig
ASM_MTXRotTrig:
        .type ASM_MTXRotTrig, @function

        // Round both inputs to single precision.
        frsp        sinA, sinA
        frsp        cosA, cosA

        // rt_zero = 0.0F
        lis         r5, CONST_0_0F@h
        ori         r5, r5, CONST_0_0F@l
        lfs         rt_zero, 0(r5)

        // rt_one = 1.0F
        lis         r6, CONST_1_0F@h
        ori         r6, r6, CONST_1_0F@l
        lfs         rt_one, 0(r6)

        // Setting bit 0x20 folds an upper-case ASCII letter to lower case.
        ori         axis, axis, 0x20
        ps_neg      rt_nsin, sinA

        // Dispatch on the axis letter; every path ends in its own blr.
        cmplwi      axis, 'x'
        beq         _rottrig_x
        cmplwi      axis, 'y'
        beq         _rottrig_y
        cmplwi      axis, 'z'
        beq         _rottrig_z
        blr                                     // unknown axis: m not written

    _rottrig_x:
        ps_merge00  rt_w0, sinA, cosA           // [s][c]
        ps_merge00  rt_w1, cosA, rt_nsin        // [c][-s]
        psq_st      rt_one,   0(m), 1, 0        // m[0][0]          (one element)
        psq_st      rt_zero,  4(m), 0, 0        // m[0][1], m[0][2]
        psq_st      rt_zero, 12(m), 0, 0        // m[0][3], m[1][0]
        psq_st      rt_w1,   20(m), 0, 0        // m[1][1], m[1][2]
        psq_st      rt_zero, 28(m), 0, 0        // m[1][3], m[2][0]
        psq_st      rt_w0,   36(m), 0, 0        // m[2][1], m[2][2]
        psq_st      rt_zero, 44(m), 1, 0        // m[2][3]          (one element)
        blr

    _rottrig_y:
        ps_merge00  rt_w0, cosA, rt_zero        // [c][0]
        ps_merge00  rt_w1, rt_zero, rt_one      // [0][1]
        ps_merge00  rt_w2, rt_nsin, rt_zero     // [-s][0]
        ps_merge00  rt_w3, sinA, rt_zero        // [s][0]
        psq_st      rt_w0,    0(m), 0, 0        // m[0][0], m[0][1]
        psq_st      rt_w3,    8(m), 0, 0        // m[0][2], m[0][3]
        psq_st      rt_w1,   16(m), 0, 0        // m[1][0], m[1][1]
        psq_st      rt_zero, 24(m), 0, 0        // m[1][2], m[1][3]
        psq_st      rt_w2,   32(m), 0, 0        // m[2][0], m[2][1]
        psq_st      rt_w0,   40(m), 0, 0        // m[2][2], m[2][3]
        blr

    _rottrig_z:
        ps_merge00  rt_w0, sinA, cosA           // [s][c]
        ps_merge00  rt_w2, cosA, rt_nsin        // [c][-s]
        ps_merge00  rt_w1, rt_one, rt_zero      // [1][0]
        psq_st      rt_w2,    0(m), 0, 0        // m[0][0], m[0][1]
        psq_st      rt_zero,  8(m), 0, 0        // m[0][2], m[0][3]
        psq_st      rt_w0,   16(m), 0, 0        // m[1][0], m[1][1]
        psq_st      rt_zero, 24(m), 0, 0        // m[1][2], m[1][3]
        psq_st      rt_zero, 32(m), 0, 0        // m[2][0], m[2][1]
        psq_st      rt_w1,   40(m), 0, 0        // m[2][2], m[2][3]
        blr
        .size ASM_MTXRotTrig,$-ASM_MTXRotTrig
#undef m
#undef axis
#undef sinA
#undef cosA
#undef rt_zero
#undef rt_one
#undef rt_nsin
#undef rt_w0
#undef rt_w1
#undef rt_w2
#undef rt_w3
1377
1378////////////////////////////////////////////////////////////////////////////////
1379
// void ASM_MTXReorder(const Mtx src, ROMtx dest)
//
// Copies the twelve floats of src into dest in column order. Writing
// sRC for src[R][C], dest receives, as consecutive floats:
//
//   s00 s10 s20  s01 s11 s21  s02 s12 s22  s03 s13 s23
//
// All six loads are issued before the first store.
//
// In:       r3 = src, r4 = dest
// Clobbers: fp1-fp12
//
// Comment notation: [a][b] = [slot 0][slot 1] of a paired-single register.
#define src  r3
#define dest r4
        .global ASM_MTXReorder
#define in_r0_01   fp1      // [s00][s01]
#define in_r0_23   fp2      // [s02][s03]
#define in_r1_01   fp3      // [s10][s11]
#define in_r1_23   fp4      // [s12][s13]
#define in_r2_01   fp5      // [s20][s21]
#define in_r2_23   fp6      // [s22][s23]
#define out_0_1    fp7      // dest floats 0, 1
#define out_4_5    fp8      // dest floats 4, 5
#define out_6_7    fp9      // dest floats 6, 7
#define out_8_9    fp10     // dest floats 8, 9
#define out_10_11  fp11     // dest floats 10, 11
#define out_2_3    fp12     // dest floats 2, 3

ASM_MTXReorder:
        .type ASM_MTXReorder, @function

        // Read the whole source matrix.
        psq_l       in_r0_01, 0(src),  0, 0
        psq_l       in_r0_23, 8(src),  0, 0
        psq_l       in_r1_01, 16(src), 0, 0
        psq_l       in_r1_23, 24(src), 0, 0
        psq_l       in_r2_01, 32(src), 0, 0
        psq_l       in_r2_23, 40(src), 0, 0

        // Regroup the pairs by column and write them out.
        ps_merge00  out_0_1,   in_r0_01, in_r1_01       // [s00][s10]
        ps_merge01  out_2_3,   in_r2_01, in_r0_01       // [s20][s01]
        ps_merge11  out_4_5,   in_r1_01, in_r2_01       // [s11][s21]
        psq_st      out_0_1,   0(dest),  0, 0
        ps_merge00  out_6_7,   in_r0_23, in_r1_23       // [s02][s12]
        psq_st      out_2_3,   8(dest),  0, 0
        ps_merge01  out_8_9,   in_r2_23, in_r0_23       // [s22][s03]
        psq_st      out_4_5,   16(dest), 0, 0
        ps_merge11  out_10_11, in_r1_23, in_r2_23       // [s13][s23]
        psq_st      out_6_7,   24(dest), 0, 0
        psq_st      out_8_9,   32(dest), 0, 0
        psq_st      out_10_11, 40(dest), 0, 0

        blr
        .size ASM_MTXReorder,$-ASM_MTXReorder
#undef src
#undef dest
#undef in_r0_01
#undef in_r0_23
#undef in_r1_01
#undef in_r1_23
#undef in_r2_01
#undef in_r2_23
#undef out_0_1
#undef out_4_5
#undef out_6_7
#undef out_8_9
#undef out_10_11
#undef out_2_3
1435
1436