1/*---------------------------------------------------------------------------*
2  Project: matrix vector Library
3  File:    mtxVec_asm.s
4
5  Copyright 1998-2011 Nintendo.  All rights reserved.
6
7  These coded instructions, statements, and computer programs contain
8  proprietary information of Nintendo of America Inc. and/or Nintendo
9  Company Ltd., and are protected by Federal copyright law.     They may
10  not be disclosed to third parties or copied or duplicated in any form,
11  in whole or in part, without the prior written consent of Nintendo.
12
13 *---------------------------------------------------------------------------*/
14
        .data
        .align 2
// Float constants shared by the frsqrte + Newton-Raphson refinement paths.
CONST_0_0F:     .float        0.0
CONST_0_5F:     .float        0.5
CONST_3_0F:     .float        3.0

        .text

// vec library definitions
// Symbolic FPR names used by the simple VEC add/subtract routines below.
// All of these are caller-saved (volatile) FPRs, so no save/restore needed.
#define RET_REG fp1
#define V1_XY   fp2
#define V1_Z    fp3
#define V2_XY   fp4
#define V2_Z    fp5
#define D1_XY   fp6
#define D1_Z    fp7
#define D2_XY   fp8
#define D2_Z    fp9
#define W1_XY   fp10
#define W1_Z    fp11
#define W2_XY   fp12
#define W2_Z    fp13
37
////////////////////////////////////////////////////////////////////////////////
// void ASM_VECCrossProduct(const Vec* vec1, const Vec* vec2, Vec* dst)
//
// dst = vec1 x vec2, computed with paired singles (GQR0 assumed 0: plain
// f32 pairs):
//   x = a.y*b.z - a.z*b.y
//   y = a.z*b.x - a.x*b.z
//   z = a.x*b.y - a.y*b.x
// All loads are issued before the first store, so dst may safely alias
// vec1 or vec2.  Clobbers fp0-fp10 (volatile FPRs only).
#define vec1  r3
#define vec2  r4
#define dst   r5
        .global ASM_VECCrossProduct
ASM_VECCrossProduct:
        .type ASM_VECCrossProduct, @function

        // fp1 = [BX | BY]
        psq_l       fp1, 0(vec2), 0, 0
        // fp2 = [AZ | AZ] (lfs replicates the scalar into both slots)
        lfs         fp2, 8(vec1)
        // fp0 = [AX | AY]
        psq_l       fp0, 0(vec1), 0, 0
        // fp6 = [BY | BX]
        ps_merge10  fp6, fp1, fp1
        // fp3 = [BZ | BZ]
        lfs         fp3, 8(vec2)

        // fp4 = [BX*AZ | BY*AZ]
        ps_mul      fp4, fp1, fp2
        // fp7 = [BX*AX | BY*AX]
        ps_muls0    fp7, fp1, fp0
        // fp5 = [AX*BZ-BX*AZ | AY*BZ-BY*AZ]
        ps_msub     fp5, fp0, fp3, fp4
        // fp8 = [AX*BY-BX*AX | AY*BX-BY*AX]
        ps_msub     fp8, fp0, fp6, fp7

        // fp9 = [AY*BZ-AZ*BY | AY*BZ-AZ*BY]  -> cross.x in both slots
        ps_merge11  fp9, fp5, fp5
        // fp10 = [AX*BZ-AZ*BX | AY*BX-AX*BY] -> negated cross.y / cross.z
        ps_merge01  fp10, fp5, fp8

        // store dst->x
        psq_st      fp9, 0(dst), 1, 0

        // fp10 = [AZ*BX-AX*BZ | AX*BY-AY*BX] = [cross.y | cross.z]
        ps_neg      fp10, fp10

        // store dst->y and dst->z as one pair
        psq_st      fp10, 4(dst), 0, 0

        blr
        .size ASM_VECCrossProduct,$-ASM_VECCrossProduct
#undef vec1
#undef vec2
#undef dst
88
89
90
////////////////////////////////////////////////////////////////////////////////
// void ASM_VECAdd(const Vec* vec1, const Vec* vec2, Vec* dst)
//
// dst = vec1 + vec2, component-wise.  X/Y are added as one paired-single
// operation, Z as a scalar.  The XY store (bytes 0-7) does not overlap
// the later Z loads (byte offset 8), so in-place use is safe.
// Clobbers V1_XY/V2_XY/D1_XY (fp2/fp4/fp6) and V1_Z/V2_Z/D1_Z (fp3/fp5/fp7).
        .global ASM_VECAdd
#define vec1 r3
#define vec2 r4
#define dst  r5
ASM_VECAdd:
        .type ASM_VECAdd, @function

        // load [X|Y] of both sources
        psq_l     V1_XY,  0(vec1), 0, 0;
        psq_l     V2_XY,  0(vec2), 0, 0;
        // [X1+X2 | Y1+Y2]
        ps_add  D1_XY, V1_XY, V2_XY;
        // store result XY
        psq_st    D1_XY,  0(dst), 0, 0;
        // load Z of both sources (w=1 loads [Z | 1.0])
        psq_l     V1_Z,   8(vec1), 1, 0;
        psq_l     V2_Z,   8(vec2), 1, 0;
        // [Z1+Z2 | 2.0] -- only ps0 is stored below
        ps_add  D1_Z, V1_Z, V2_Z;
        // store result Z
        psq_st    D1_Z,   8(dst), 1, 0;

        blr
        .size ASM_VECAdd,$-ASM_VECAdd

#undef vec1
#undef vec2
#undef dst
121
122
////////////////////////////////////////////////////////////////////////////////
// void ASM_VECSubtract(const Vec* vec1, const Vec* vec2, Vec* dst)
//
// dst = vec1 - vec2, component-wise.  Same structure as ASM_VECAdd:
// X/Y as one paired-single op, Z as a scalar; in-place use is safe
// (the XY store does not overlap the later Z loads).
// Clobbers V1_XY/V2_XY/D1_XY (fp2/fp4/fp6) and V1_Z/V2_Z/D1_Z (fp3/fp5/fp7).
        .global ASM_VECSubtract
#define vec1 r3
#define vec2 r4
#define dst  r5
ASM_VECSubtract:
        .type ASM_VECSubtract, @function

        // load [X|Y] of both sources
        psq_l     V1_XY,  0(vec1), 0, 0;
        psq_l     V2_XY,  0(vec2), 0, 0;
        // [X1-X2 | Y1-Y2]
        ps_sub    D1_XY, V1_XY, V2_XY;
        // store result XY
        psq_st    D1_XY, 0(dst), 0, 0;

        // load Z of both sources (w=1 loads [Z | 1.0])
        psq_l     V1_Z,   8(vec1), 1, 0;
        psq_l     V2_Z,   8(vec2), 1, 0;
        // [Z1-Z2 | 0.0] -- only ps0 is stored below
        ps_sub    D1_Z, V1_Z, V2_Z;
        // store result Z
        psq_st    D1_Z,  8(dst), 1, 0;

        blr
        .size ASM_VECSubtract,$-ASM_VECSubtract

#undef vec1
#undef vec2
#undef dst
154
155
////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECSquareMag(const Vec* vec1)
//
// Returns x*x + y*y + z*z in f1 (ps0; ps1 holds an unused partial sum).
// Clobbers f1, f2, f4.
        .global ASM_VECSquareMag
#define vec1  r3
#define sqmag f1
#define vxy   f2
#define vzz   f4
ASM_VECSquareMag:
        .type ASM_VECSquareMag, @function

        // vxy = [X | Y]
        psq_l       vxy, 0(vec1), 0, 0
        // vxy = [X*X | Y*Y]
        ps_mul      vxy, vxy, vxy
        // vzz = [Z | Z] (lfs replicates into both slots)
        lfs         vzz, 8(vec1)
        // sqmag = [X*X+Z*Z | Y*Y+Z*Z]
        ps_madd     sqmag, vzz, vzz, vxy
        // sqmag.ps0 = (X*X+Z*Z) + Y*Y  -- the scalar return value
        ps_sum0     sqmag, sqmag, vxy, vxy

        blr
        .size ASM_VECSquareMag,$-ASM_VECSquareMag

#undef vec1
#undef vxy
#undef vzz
#undef sqmag
183
184
185
////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECSquareDistance(const Vec* a, const Vec* b)
//
// Returns (a-b).(a-b) = dX*dX + dY*dY + dZ*dZ in f1 (ps0).
// Clobbers f1-f7.
// Fix: the cleanup below originally read "#undef vlyz" (a typo), which
// left the v1yz macro defined past this block; corrected to v1yz.
        .global ASM_VECSquareDistance
#define a      r3
#define b      r4
#define v0yz   f2
#define v1yz   f3
#define v0xy   f4
#define v1xy   f5
#define dyz    f6
#define dxy    f7
#define sqdist f1
ASM_VECSquareDistance:
        .type ASM_VECSquareDistance, @function

        psq_l    v0yz, 4(a), 0, 0           // [Y0][Z0]
        psq_l    v1yz, 4(b), 0, 0           // [Y1][Z1]
        ps_sub   dyz, v0yz, v1yz            // [Y0-Y1][Z0-Z1]

        psq_l    v0xy, 0(a), 0, 0           // [X0][Y0]
        psq_l    v1xy, 0(b), 0, 0           // [X1][Y1]
        ps_mul   dyz, dyz, dyz              // [dYdY][dZdZ]
        ps_sub   dxy, v0xy, v1xy            // [X0-X1][Y0-Y1]

        ps_madd  sqdist, dxy, dxy, dyz      // [dXdX+dYdY][dYdY+dZdZ]
        ps_sum0  sqdist, sqdist, dyz, dyz   // ps0 = dXdX+dYdY+dZdZ

        blr
        .size ASM_VECSquareDistance,$-ASM_VECSquareDistance

#undef a
#undef b
#undef v0yz
#undef v1yz
#undef v0xy
#undef v1xy
#undef dyz
#undef dxy
#undef sqdist
225
226
////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECMag(const Vec* v)
//
// Returns sqrt(x*x + y*y + z*z) in f1.  Uses frsqrte plus one
// Newton-Raphson refinement step, then X * 1/sqrt(X) = sqrt(X).
// A zero-length vector returns 0.0 directly, skipping the estimate
// (frsqrte of 0 would not produce a finite result).
// Clobbers r4, f1-f9, cr0.
        .global ASM_VECMag
#define v       r3
#define vxy     f2
#define vzz     f3
#define sqmag   f1
#define rmag    f4
#define nwork0  f5
#define nwork1  f6
#define c_three f7
#define c_half  f8
#define c_zero  f9
ASM_VECMag:
        .type ASM_VECMag, @function

        // c_half = 0.5F;
        lis         r4, CONST_0_5F@h
        ori         r4, r4, CONST_0_5F@l
        lfs         c_half, 0(r4)

        // Square mag calculation
        psq_l       vxy, 0(v), 0, 0
        ps_mul      vxy, vxy, vxy
        lfs         vzz, 8(v)
        // c_zero = 0.5 - 0.5: materialize 0.0 without another memory load
        fsubs       c_zero, c_half, c_half
        ps_madd     sqmag, vzz, vzz, vxy

        // sqmag.ps0 = X*X + Z*Z + Y*Y
        ps_sum0     sqmag, sqmag, vxy, vxy

        // Zero check: 0.0 is already in the return register
        fcmpu       cr0, sqmag, c_zero
        beq-        _ASM_VECMag_exit

        // 1.0/sqrt : estimation[E]
        frsqrte     rmag, sqmag

        // c_three = 3.0F;
        lis         r4, CONST_3_0F@h
        ori         r4, r4, CONST_3_0F@l
        lfs         c_three, 0(r4)

        // Refinement x 1 : E' = (E/2)(3 - X*E*E)
        fmuls       nwork0, rmag, rmag
        fmuls       nwork1, rmag, c_half
        fnmsubs     nwork0, nwork0, sqmag, c_three
        fmuls       rmag, nwork0, nwork1

        // 1/sqrt(X) * X = sqrt(X)
        fmuls       sqmag, sqmag, rmag

_ASM_VECMag_exit:
        blr
        .size ASM_VECMag,$-ASM_VECMag

#undef v
#undef vxy
#undef vzz
#undef sqmag
#undef rmag
#undef nwork0
#undef nwork1
#undef c_three
#undef c_half
#undef c_zero
293
294
295
////////////////////////////////////////////////////////////////////////////////
// void ASM_VECScale(const Vec *src, Vec *dst, f32 mult)
//
// dst = src * mult, component-wise.  mult arrives in f1 (first FP arg);
// ps_muls0 broadcasts its ps0 slot across each pair.  Both loads happen
// before the first store, so dst may alias src.
// Clobbers f2-f5.
        .global ASM_VECScale
#define src  r3
#define dst  r4
#define mult f1
#define vxy  f2
#define vz   f3
#define rxy  f4
#define rz   f5
ASM_VECScale:
        .type ASM_VECScale, @function

        // vxy = [X | Y]
        psq_l     vxy, 0(src), 0, 0
        // vz = [Z | 1.0] (w=1 load)
        psq_l     vz,  8(src), 1, 0
        // rxy = [X*mult | Y*mult]
        ps_muls0  rxy, vxy, mult
        // store result XY
        psq_st    rxy, 0(dst), 0, 0
        // rz = [Z*mult | mult] -- only ps0 is stored
        ps_muls0  rz,  vz,  mult
        // store result Z
        psq_st    rz,  8(dst), 1, 0

        blr
        .size ASM_VECScale,$-ASM_VECScale

#undef src
#undef dst
#undef mult
#undef vxy
#undef vz
#undef rxy
#undef rz
332
333
334
////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECDistance(const Vec *a, const Vec *b)
//
// Returns |a - b| in f1.  Squared distance is computed with paired
// singles, then sqrt via frsqrte plus one Newton-Raphson refinement
// (X * 1/sqrt(X) = sqrt(X)).  Coincident points return 0.0 directly.
// Clobbers r5, f1-f13, cr0.
//
// Fix: the original also loaded CONST_0_0F into c_zero here, but that
// value was unconditionally overwritten by the fsubs below before any
// use, so the dead lis/ori/lfs triple has been removed.
        .global ASM_VECDistance
#define a       r3
#define b       r4
#define sqdist  f1
#define v0yz    f2
#define v1yz    f3
#define v0xy    f4
#define v1xy    f5
#define dyz     f6
#define dxy     f7
#define rdist   f8
#define nwork0  f9
#define nwork1  f10
#define c_half  f11
#define c_three f12
#define c_zero  f13
ASM_VECDistance:
        .type ASM_VECDistance, @function

        psq_l       v0yz, 4(a), 0, 0           // [Y0][Z0]
        psq_l       v1yz, 4(b), 0, 0           // [Y1][Z1]
        ps_sub      dyz, v0yz, v1yz            // [Y0-Y1][Z0-Z1]

        psq_l       v0xy, 0(a), 0, 0           // [X0][Y0]
        psq_l       v1xy, 0(b), 0, 0           // [X1][Y1]
        ps_mul      dyz, dyz, dyz              // [dYdY][dZdZ]
        ps_sub      dxy, v0xy, v1xy            // [X0-X1][Y0-Y1]

        // c_half = 0.5F;
        lis         r5, CONST_0_5F@h
        ori         r5, r5, CONST_0_5F@l
        lfs         c_half, 0(r5)

        ps_madd     sqdist, dxy, dxy, dyz      // [dXdX+dYdY][dYdY+dZdZ]
        // c_zero = 0.5 - 0.5: materialize 0.0 without a memory load
        fsubs       c_zero, c_half, c_half
        ps_sum0     sqdist, sqdist, dyz, dyz   // ps0 = dXdX+dYdY+dZdZ

        // Zero check: 0.0 is already in the return register
        fcmpu       cr0, c_zero, sqdist
        beq-        _ASM_VECDistance_exit

        // c_three = 3.0F;
        lis         r5, CONST_3_0F@h
        ori         r5, r5, CONST_3_0F@l
        lfs         c_three, 0(r5)

        // 1.0/sqrt : estimation[E]
        frsqrte     rdist, sqdist
        // Refinement x 1 : E' = (E/2)(3 - X*E*E)
        fmuls       nwork0, rdist, rdist
        fmuls       nwork1, rdist, c_half
        fnmsubs     nwork0, nwork0, sqdist, c_three
        fmuls       rdist, nwork0, nwork1

        // 1/sqrt(X) * X = sqrt(X)
        fmuls       sqdist, sqdist, rdist

_ASM_VECDistance_exit:
        blr
        .size ASM_VECDistance,$-ASM_VECDistance

#undef a
#undef b
#undef sqdist
#undef v0yz
#undef v1yz
#undef v0xy
#undef v1xy
#undef dyz
#undef dxy
#undef rdist
#undef nwork0
#undef nwork1
#undef c_half
#undef c_three
#undef c_zero
418
////////////////////////////////////////////////////////////////////////////////
// void ASM_VECNormalize(const Vec *vec1, Vec *dst)
//
// dst = vec1 / |vec1| (unit-length copy).  1/sqrt(len2) comes from
// frsqrte plus one Newton-Raphson refinement step.
// NOTE(review): unlike ASM_VECMag there is no zero-length guard here;
// a null input produces a non-finite result -- confirm callers avoid it.
// The source is fully loaded before any store, so dst may alias vec1.
// Clobbers r5, f1-f10.
        .global ASM_VECNormalize
#define vec1    r3
#define dst     r4
#define rsqrt   f1
#define v1_xy   f2
#define v1_z    f3
#define xx_yy   f4
#define xx_zz   f5
#define sqsum   f6
#define nwork0  f7
#define nwork1  f8
#define c_half  f9
#define c_three f10
ASM_VECNormalize:
        .type ASM_VECNormalize, @function

        // c_half = 0.5F;
        lis         r5, CONST_0_5F@h
        ori         r5, r5, CONST_0_5F@l
        lfs         c_half, 0(r5)

        // v1_xy = [X | Y]
        psq_l       v1_xy, 0(vec1), 0, 0;
        // xx_yy = [X*X | Y*Y]
        ps_mul      xx_yy, v1_xy, v1_xy;
        // v1_z = [Z | 1.0] (w=1 load)
        psq_l       v1_z, 8(vec1), 1, 0;
        // xx_zz = [X*X+Z*Z | Y*Y+1.0]
        ps_madd     xx_zz, v1_z, v1_z, xx_yy;
        // sqsum.ps0 = X*X+Z*Z+Y*Y  (squared length; ps1 unused)
        ps_sum0     sqsum, xx_zz, v1_z, xx_yy;

        // c_three = 3.0F;
        lis         r5, CONST_3_0F@h
        ori         r5, r5, CONST_3_0F@l
        lfs         c_three, 0(r5)

        // 1.0/sqrt : estimation[E]
        frsqrte     rsqrt, sqsum;
        // Newton's refinement x 1
        // E' = (E/2)(3 - sqsum * E * E)
        fmuls       nwork0, rsqrt, rsqrt;
        fmuls       nwork1, rsqrt, c_half;
        fnmsubs     nwork0, nwork0, sqsum, c_three;
        fmuls       rsqrt, nwork0, nwork1;

        // [X/len | Y/len]
        ps_muls0    v1_xy, v1_xy, rsqrt;
        psq_st      v1_xy, 0(dst), 0, 0;

        // Z/len (only ps0 is stored)
        ps_muls0    v1_z, v1_z, rsqrt;
        psq_st      v1_z, 8(dst), 1, 0;

        blr
        .size ASM_VECNormalize,$-ASM_VECNormalize

#undef vec1
#undef dst
#undef sqsum
#undef v1_xy
#undef v1_z
#undef xx_yy
#undef xx_zz
#undef rsqrt
#undef nwork0
#undef nwork1
#undef c_half
#undef c_three
490
////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECDotProduct(const Vec *a, const Vec *b)
//
// Returns a.x*b.x + a.y*b.y + a.z*b.z in f1 (ps0).
// Clobbers f1-f5.
        .global ASM_VECDotProduct
#define a    r3
#define b    r4
ASM_VECDotProduct:
        .type ASM_VECDotProduct, @function

        // fp2 = [aY | aZ], fp3 = [bY | bZ]
        psq_l    fp2, 4(a), 0, 0;
        psq_l    fp3, 4(b), 0, 0;

        // fp2 = [aY*bY | aZ*bZ]
        ps_mul   fp2, fp2, fp3;

        // fp5 = [aX | aY], fp4 = [bX | bY]
        psq_l    fp5, 0(a), 0, 0;
        psq_l    fp4, 0(b), 0, 0;

        // fp3 = [aX*bX+aY*bY | aY*bY+aZ*bZ]
        ps_madd  fp3, fp5, fp4, fp2;
        // fp1.ps0 = (aX*bX+aY*bY) + aZ*bZ  -- the scalar return value
        ps_sum0  fp1, fp3, fp2, fp2;

        blr
        .size ASM_VECDotProduct,$-ASM_VECDotProduct

#undef a
#undef b
515
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVec(const Mtx m, const Vec *src, Vec *dst)
//
// dst = m * (src.x, src.y, src.z, 1): full 3x4 transform, including the
// translation column m[i][3].  src is fully loaded before the first
// store, so dst may alias src.  Clobbers f0-f6, f8-f12.
// (The original comments for rows 1 and 2 were copy-pasted from row 0;
// corrected below.)
#define m   r3
#define src r4
#define dst r5
        .global ASM_MTXMultVec
ASM_MTXMultVec:
        .type ASM_MTXMultVec, @function

        // fp0 = [v0 | v1]
        psq_l       fp0, 0(src), 0, 0
        // fp2 = [m00 | m01]
        psq_l       fp2, 0(m), 0, 0
        // fp1 = [v2 | 1.0]
        psq_l       fp1, 8(src), 1, 0
        // fp4 = [m00*v0 | m01*v1]
        ps_mul      fp4, fp2, fp0
        // fp3 = [m02 | m03]
        psq_l       fp3, 8(m), 0, 0
        // fp5 = [m00*v0+m02*v2 | m01*v1+m03]
        ps_madd     fp5, fp3, fp1, fp4
        // fp8 = [m10 | m11]
        psq_l       fp8, 16(m), 0, 0
        // fp6.ps0 = m00*v0+m02*v2+m01*v1+m03  (row 0; ps1 is junk)
        ps_sum0     fp6, fp5, fp6, fp5
        // fp9 = [m12 | m13]
        psq_l       fp9, 24(m), 0, 0
        // fp10 = [m10*v0 | m11*v1]
        ps_mul      fp10, fp8, fp0
        // store dst->x
        psq_st      fp6, 0(dst), 1, 0
        // fp11 = [m10*v0+m12*v2 | m11*v1+m13]
        ps_madd     fp11, fp9, fp1, fp10
        // fp2 = [m20 | m21]
        psq_l       fp2, 32(m), 0, 0
        // fp12.ps0 = m10*v0+m12*v2+m11*v1+m13  (row 1; ps1 is junk)
        ps_sum0     fp12, fp11, fp12, fp11
        // fp3 = [m22 | m23]
        psq_l       fp3, 40(m), 0, 0
        // fp4 = [m20*v0 | m21*v1]
        ps_mul      fp4, fp2, fp0
        // store dst->y
        psq_st      fp12, 4(dst), 1, 0
        // fp5 = [m20*v0+m22*v2 | m21*v1+m23]
        ps_madd     fp5, fp3, fp1, fp4
        // fp6.ps0 = m20*v0+m22*v2+m21*v1+m23  (row 2; ps1 is junk)
        ps_sum0     fp6, fp5, fp6, fp5
        // store dst->z
        psq_st      fp6, 8(dst), 1, 0

        blr
        .size ASM_MTXMultVec,$-ASM_MTXMultVec
#undef m
#undef src
#undef dst
571
572
573
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVecArray (const Mtx m, const Vec *srcBase, Vec *dstBase, u32 count )
//
// dst[i] = m * (src[i], 1.0) for i in [0, count): full 3x4 transform
// including translation.  The matrix rows are merged into column pairs
// up front; the main loop is software-pipelined (each pass stores vector
// i while starting vector i+1).  In-place (dstBase == srcBase) is safe:
// src[i+1] is loaded before dst[i] is stored.
// NOTE(review): count is decremented before mtctr, so count must be
// >= 2 (count == 1 would run CTR through zero) -- confirm with callers.
// Clobbers r4-r6 (pointers/count are updated), f0-f13, CTR.
#define m       r3
#define srcBase r4
#define dstBase r5
#define count   r6
        .global ASM_MTXMultVecArray
ASM_MTXMultVecArray:
        .type ASM_MTXMultVecArray, @function

        // fp13 = [m00 | m01]
        psq_l       fp13,  0(m), 0, 0
        // fp12 = [m10 | m11]
        psq_l       fp12, 16(m), 0, 0
        // loop runs count-1 times; first/last vectors are split around it
        subi        count, count, 1
        // fp11 = [m02 | m03]
        psq_l       fp11,  8(m), 0, 0
        // fp0 = [m00 | m10]  (column pair)
        ps_merge00  fp0, fp13, fp12
        // pre-bias dst so psq_stu's +4/+8 pre-increments land on x and z
        subi        dstBase, dstBase, 4
        // fp10 = [m12 | m13]
        psq_l       fp10, 24(m), 0, 0
        // fp1 = [m01 | m11]
        ps_merge11  fp1, fp13, fp12
        // loop counter
        mtctr       count
        // fp4 = [m20 | m21]
        psq_l       fp4,  32(m), 0, 0
        // fp2 = [m02 | m12]
        ps_merge00  fp2, fp11, fp10
        // fp5 = [m22 | m23]
        psq_l       fp5,  40(m), 0, 0
        // fp3 = [m03 | m13]
        ps_merge11  fp3, fp11, fp10

        // ---- prologue: start vector 0 ----
        // fp6 = [v0 | v1]
        psq_l       fp6,  0(srcBase), 0, 0
        // fp7 = [v2 | 1.0F]
        psq_lu      fp7,  8(srcBase), 1, 0
        // fp8 = [m00*v0+m03 | m10*v0+m13]
        ps_madds0   fp8, fp0, fp6, fp3
        // fp9 = [m20*v0 | m21*v1]
        ps_mul      fp9, fp4, fp6
        // fp8 = [m00*v0+m01*v1+m03 | m10*v0+m11*v1+m13]
        ps_madds1   fp8, fp1, fp6, fp8
        // fp10 = [m20*v0+m22*v2 | m21*v1+m23*1.0F]
        ps_madd     fp10, fp5, fp7, fp9

_ASM_MTXMultVecArray_mloop:
    //-------- pipelined loop body: store vector i, start vector i+1 --------

        // fp6 = next [v0 | v1]
        psq_lu      fp6,  4(srcBase), 0, 0
        // fp12 = [x' | y'] of current vector
        ps_madds0   fp12, fp2, fp7, fp8
        // fp7 = next [v2 | 1.0F]
        psq_lu      fp7,  8(srcBase), 1, 0
        // fp13.ps0 = z' of current vector = m20*v0+m22*v2+m21*v1+m23 (ps1 junk)
        ps_sum0     fp13, fp10, fp9, fp10
        // fp8 = next [m00*v0+m03 | m10*v0+m13]
        ps_madds0   fp8, fp0, fp6, fp3
        // fp9 = next [m20*v0 | m21*v1]
        ps_mul      fp9, fp4, fp6
        // store current [x' | y']
        psq_stu     fp12,  4(dstBase), 0, 0
        // fp8 = next [m00*v0+m01*v1+m03 | m10*v0+m11*v1+m13]
        ps_madds1   fp8, fp1, fp6, fp8
        // store current z'
        psq_stu     fp13,  8(dstBase), 1, 0
        // fp10 = next [m20*v0+m22*v2 | m21*v1+m23*1.0F]
        ps_madd     fp10, fp5, fp7, fp9

        // LOOP
        bdnz        _ASM_MTXMultVecArray_mloop

        // ---- epilogue: finish and store the last vector ----
        // fp12 = [x' | y']
        ps_madds0   fp12, fp2, fp7, fp8
        // fp13.ps0 = z' (ps1 junk)
        ps_sum0     fp13, fp10, fp9, fp10
        // store [x' | y']
        psq_stu     fp12,  4(dstBase), 0, 0
        // store z'
        psq_stu     fp13,  8(dstBase), 1, 0

        blr
        .size ASM_MTXMultVecArray,$-ASM_MTXMultVecArray

#undef m
#undef srcBase
#undef dstBase
#undef count
668
669
670
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVecSR(const Mtx m, const Vec *src, Vec *dst)
//
// dst = (3x3 part of m) * src: scale/rotation only.  The translation
// column m[i][3] is loaded as part of each row pair, but its product
// lands in the ps1 slot, which is never stored (all stores use w=1),
// so translation never contributes.  src is fully read before the
// first store, so dst may alias src.  Clobbers f0-f13.
        .global ASM_MTXMultVecSR
#define m   r3
#define src r4
#define dst r5
ASM_MTXMultVecSR:
        .type ASM_MTXMultVecSR, @function

        psq_l   fp0, 0(m), 0, 0    // [m00 | m01], GQR0 = 0 (plain f32 pairs)

        // fp6 = [x | y]
        psq_l   fp6, 0(src), 0, 0

        psq_l   fp2, 16(m), 0, 0   // [m10 | m11]


        // fp8 = [m00*x | m01*y]   (becomes dst X)
        ps_mul  fp8, fp0, fp6
        psq_l   fp4, 32(m), 0, 0   // [m20 | m21]

        // fp10 = [m10*x | m11*y]  (becomes dst Y)
        ps_mul  fp10, fp2, fp6
        psq_l   fp7, 8(src), 1, 0   // fp7 = [z | 1.0]

        // fp12 = [m20*x | m21*y]  (becomes dst Z; last fp6 use)
        ps_mul  fp12, fp4, fp6
        psq_l   fp3, 24(m), 0, 0   // [m12 | m13]

        // fp8.ps0 = m00*x + m01*y
        ps_sum0 fp8, fp8, fp8, fp8
        psq_l   fp5, 40(m), 0, 0   // [m22 | m23]

        // fp10.ps0 = m10*x + m11*y
        ps_sum0 fp10, fp10, fp10, fp10
        psq_l   fp1,  8(m), 0, 0    // [m02 | m03]

        // fp12.ps0 = m20*x + m21*y
        ps_sum0 fp12, fp12, fp12, fp12
        // fp9.ps0 = m02*z + m00*x + m01*y (ps1 junk, not stored)
        ps_madd fp9, fp1, fp7, fp8
        psq_st  fp9,  0(dst), 1, 0      // store X

        // fp11.ps0 = m12*z + m10*x + m11*y
        ps_madd fp11, fp3, fp7, fp10
        psq_st  fp11, 4(dst), 1, 0      // store Y

        // fp13.ps0 = m22*z + m20*x + m21*y
        ps_madd fp13, fp5, fp7, fp12
        psq_st  fp13, 8(dst), 1, 0      // store Z

        blr
        .size ASM_MTXMultVecSR,$-ASM_MTXMultVecSR

#undef m
#undef src
#undef dst
722
723
724
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVecArraySR(const Mtx m, const Vec *srcBase, Vec *dstBase, u32 count)
//
// dst[i] = (3x3 part of m) * src[i] for i in [0, count): scale/rotation
// only, no translation.  Software-pipelined like ASM_MTXMultVecArray.
// In-place (dstBase == srcBase) is safe: src[i+1] is loaded before
// dst[i] is stored.
// NOTE(review): count is decremented before mtctr, so count must be
// >= 2 (count == 1 would run CTR through zero) -- confirm with callers.
// Clobbers r4-r6 (pointers/count are updated), f0-f4, f6-f13, CTR.
#define m       r3
#define srcBase r4
#define dstBase r5
#define count   r6
        .global ASM_MTXMultVecArraySR
ASM_MTXMultVecArraySR:
        .type ASM_MTXMultVecArraySR, @function

        // fp13 = [m00 | m01]
        psq_l       fp13,  0(m), 0, 0
        // fp12 = [m10 | m11]
        psq_l       fp12, 16(m), 0, 0
        // loop runs count-1 times; first/last vectors are split around it
        subi        count, count, 1
        // fp11 = [m02 | 1.0F] (w=1 load skips the translation element)
        psq_l       fp11,  8(m), 1, 0
        // fp0 = [m00 | m10]  (column pair)
        ps_merge00  fp0, fp13, fp12
        // pre-bias dst so psq_stu's +4/+8 pre-increments land on x and z
        subi        dstBase, dstBase, 4
        // fp10 = [m12 | 1.0F]
        psq_l       fp10, 24(m), 1, 0
        // fp1 = [m01 | m11]
        ps_merge11  fp1, fp13, fp12
        // loop counter
        mtctr       count
        // fp3 = [m20 | m21]
        psq_l       fp3,  32(m), 0, 0
        // fp2 = [m02 | m12]
        ps_merge00  fp2, fp11, fp10
        // fp4 = [m22 | 1.0F]
        psq_l       fp4,  40(m), 1, 0


        // ---- prologue: start vector 0 ----
        // fp6 = [v0 | v1]
        psq_l       fp6,  0(srcBase), 0, 0
        // fp7 = [v2 | 1.0F]
        psq_lu      fp7,  8(srcBase), 1, 0
        // fp8 = [m00*v0 | m10*v0]
        ps_muls0    fp8, fp0, fp6
        // fp9 = [m20*v0 | m21*v1]
        ps_mul      fp9, fp3, fp6
        // fp8 = [m00*v0+m01*v1 | m10*v0+m11*v1]
        ps_madds1   fp8, fp1, fp6, fp8
        // fp10 = [m20*v0+m22*v2 | 1.0+m21*v1] (ps1 not used directly)
        ps_madd     fp10, fp4, fp7, fp9

_ASM_MTXMultVecArraySR_mloop:
        //-------- pipelined loop body: store vector i, start vector i+1 --------

        // fp6 = next [v0 | v1]
        psq_lu      fp6,  4(srcBase), 0, 0
        // fp12 = [x' | y'] of current vector
        ps_madds0   fp12, fp2, fp7, fp8
        // fp7 = next [v2 | 1.0F]
        psq_lu      fp7,  8(srcBase), 1, 0
        // fp13.ps0 = z' = m20*v0+m22*v2 + m21*v1 (fp9.ps1 avoids the +1.0 in fp10.ps1)
        ps_sum0     fp13, fp10, fp9, fp9
        // fp8 = next [m00*v0 | m10*v0]
        ps_muls0    fp8, fp0, fp6
        // fp9 = next [m20*v0 | m21*v1]
        ps_mul      fp9, fp3, fp6
        // store current [x' | y']
        psq_stu     fp12,  4(dstBase), 0, 0
        // fp8 = next [m00*v0+m01*v1 | m10*v0+m11*v1]
        ps_madds1   fp8, fp1, fp6, fp8
        // store current z'
        psq_stu     fp13,  8(dstBase), 1, 0
        // fp10 = next [m20*v0+m22*v2 | 1.0+m21*v1]
        ps_madd     fp10, fp4, fp7, fp9

        // LOOP
        bdnz        _ASM_MTXMultVecArraySR_mloop

        // ---- epilogue: finish and store the last vector ----
        // fp12 = [x' | y']
        ps_madds0   fp12, fp2, fp7, fp8
        // fp13.ps0 = z'
        ps_sum0     fp13, fp10, fp9, fp9
        // store [x' | y']
        psq_stu     fp12,  4(dstBase), 0, 0
        // store z'
        psq_stu     fp13,  8(dstBase), 1, 0

        blr
        .size ASM_MTXMultVecArraySR,$-ASM_MTXMultVecArraySR

#undef m
#undef srcBase
#undef dstBase
#undef count
818