1/*---------------------------------------------------------------------------*
2  Project: matrix vector Library
3  File:    mtxQuat_asm.s
4
5  Copyright 1998-2011 Nintendo.  All rights reserved.
6
7  These coded instructions, statements, and computer programs contain
8  proprietary information of Nintendo of America Inc. and/or Nintendo
9  Company Ltd., and are protected by Federal copyright law.     They may
10  not be disclosed to third parties or copied or duplicated in any form,
11  in whole or in part, without the prior written consent of Nintendo.
12
13 *---------------------------------------------------------------------------*/
14
15        .data
16        .align 2
17CONST_0_5F:     .float        0.5
18CONST_1_0F:     .float        1.0
19CONST_3_0F:     .float        3.0
20CONST_EPSILON:  .float        0.00001
21
22
23        .text
24
25
26////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATAdd(const Quaternion *p, const Quaternion *q, Quaternion *r)
//
// Component-wise sum: *r = *p + *q.
// Each quaternion is handled as two paired-single halves, the first at
// byte offset 0 and the second at byte offset 8 from its pointer.
// The first half of the result is stored before the second halves of the
// inputs are loaded.
//
// In:   r3 = p, r4 = q, r5 = r
// Uses: fp1-fp6
        .global ASM_QUATAdd
#define p r3
#define q r4
#define r r5
#define lhsXY fp1
#define rhsXY fp2
#define sumXY fp3
#define lhsZW fp4
#define rhsZW fp5
#define sumZW fp6
ASM_QUATAdd:
        .type ASM_QUATAdd, @function
        // ---- first half (offset 0) ----
        psq_l     lhsXY, 0(p), 0, 0
        psq_l     rhsXY, 0(q), 0, 0
        ps_add    sumXY, lhsXY, rhsXY
        psq_st    sumXY, 0(r), 0, 0

        // ---- second half (offset 8) ----
        psq_l     lhsZW, 8(p), 0, 0
        psq_l     rhsZW, 8(q), 0, 0
        ps_add    sumZW, lhsZW, rhsZW
        psq_st    sumZW, 8(r), 0, 0
        blr
        .size ASM_QUATAdd,$-ASM_QUATAdd
#undef sumZW
#undef rhsZW
#undef lhsZW
#undef sumXY
#undef rhsXY
#undef lhsXY
#undef r
#undef q
#undef p
60
61
62
63////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATSubtract(const Quaternion *p, const Quaternion *q, Quaternion *r)
//
// Component-wise difference: *r = *p - *q.
// Each quaternion is handled as two paired-single halves, the first at
// byte offset 0 and the second at byte offset 8 from its pointer.
// The first half of the result is stored before the second halves of the
// inputs are loaded.
//
// In:   r3 = p, r4 = q, r5 = r
// Uses: fp1-fp6
#define p r3
#define q r4
#define r r5
#define lhsXY  fp1
#define rhsXY  fp2
#define diffXY fp3
#define lhsZW  fp4
#define rhsZW  fp5
#define diffZW fp6
        .global ASM_QUATSubtract
ASM_QUATSubtract:
        .type ASM_QUATSubtract, @function
        // ---- first half (offset 0) ----
        psq_l     lhsXY,  0(p), 0, 0
        psq_l     rhsXY,  0(q), 0, 0
        ps_sub    diffXY, lhsXY, rhsXY
        psq_st    diffXY, 0(r), 0, 0

        // ---- second half (offset 8) ----
        psq_l     lhsZW,  8(p), 0, 0
        psq_l     rhsZW,  8(q), 0, 0
        ps_sub    diffZW, lhsZW, rhsZW
        psq_st    diffZW, 8(r), 0, 0
        blr
        .size ASM_QUATSubtract,$-ASM_QUATSubtract
#undef diffZW
#undef rhsZW
#undef lhsZW
#undef diffXY
#undef rhsXY
#undef lhsXY
#undef r
#undef q
#undef p
97
98
99
100////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATMultiply(const Quaternion *p, const Quaternion *q, Quaternion *pq)
//
// Quaternion product: *pq = (*p) * (*q).
// Both operands are fully loaded before either half of the result is
// stored.
//
// In:   r3 = p, r4 = q, r5 = pq
// Uses: fp1-fp12
//
// Comment notation: [a][b] is the [ps0][ps1] content of the destination
// register after the instruction that follows the comment.
#define p  r3
#define q  r4
#define pq r5
        .global ASM_QUATMultiply
ASM_QUATMultiply:
        .type ASM_QUATMultiply, @function
// operands as loaded
#define pXY   fp1
#define pZW   fp2
#define qXY   fp3
#define qZW   fp4
// p halves with only ps0 negated
#define mixXY fp5
#define mixZW fp6
// p halves with both lanes negated
#define negXY fp7
#define negZW fp8
// running sums that become the result halves
#define accXY fp9
#define accZW fp10
// second set of partial sums, folded into acc* at the end
#define auxXY fp11
#define auxZW fp12

        // pXY = [px][py]
        psq_l       pXY, 0(p), 0, 0
        // pZW = [pz][pw]
        psq_l       pZW, 8(p), 0, 0

        // qXY = [qx][qy]
        psq_l       qXY, 0(q), 0, 0
        // negXY = [-px][-py]
        ps_neg      negXY, pXY
        // qZW = [qz][qw]
        psq_l       qZW, 8(q), 0, 0
        // negZW = [-pz][-pw]
        ps_neg      negZW, pZW

        // mixXY = [-px][py]
        ps_merge01  mixXY, negXY, pXY

        // accXY = [pz*qx][pw*qx]
        ps_muls0    accXY, pZW, qXY
        // accZW = [-px*qx][-py*qx]
        ps_muls0    accZW, negXY, qXY

        // mixZW = [-pz][pw]
        ps_merge01  mixZW, negZW, pZW

        // auxZW = [-px*qy][py*qy]
        ps_muls1    auxZW, mixXY, qXY
        // accXY = [pz*qx - px*qz][pw*qx + py*qz]
        ps_madds0   accXY, mixXY, qZW, accXY
        // auxXY = [-pz*qy][pw*qy]
        ps_muls1    auxXY, mixZW, qXY
        // accZW = [-px*qx - pz*qz][-py*qx + pw*qz]
        ps_madds0   accZW, mixZW, qZW, accZW
        // auxZW = [-px*qy - pz*qw][py*qy - pw*qw]
        ps_madds1   auxZW, negZW, qZW, auxZW
        // swap lanes: accXY = [pw*qx + py*qz][pz*qx - px*qz]
        ps_merge10  accXY, accXY, accXY
        // auxXY = [-pz*qy + px*qw][pw*qy + py*qw]
        ps_madds1   auxXY, pXY, qZW, auxXY
        // swap lanes: accZW = [-py*qx + pw*qz][-px*qx - pz*qz]
        ps_merge10  accZW, accZW, accZW

        // accXY = [pqx][pqy]
        //   pqx = pw*qx + py*qz - pz*qy + px*qw
        //   pqy = pz*qx - px*qz + pw*qy + py*qw
        ps_add      accXY, accXY, auxXY
        psq_st      accXY, 0(pq), 0, 0
        // accZW = [pqz][pqw]
        //   pqz = -py*qx + pw*qz + px*qy + pz*qw
        //   pqw = -px*qx - pz*qz - py*qy + pw*qw
        ps_sub      accZW, accZW, auxZW
        psq_st      accZW, 8(pq), 0, 0

        blr
        .size ASM_QUATMultiply,$-ASM_QUATMultiply
#undef auxZW
#undef auxXY
#undef accZW
#undef accXY
#undef negZW
#undef negXY
#undef mixZW
#undef mixXY
#undef qZW
#undef qXY
#undef pZW
#undef pXY
#undef pq
#undef q
#undef p
188
189
190
191////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATInverse(const Quaternion *src, Quaternion *inv)
//
// Writes to *inv the conjugate of *src scaled by 1 / (x*x + y*y + z*z + w*w):
//   inv = [-x*n][-y*n][-z*n][w*n],   n = 1 / |src|^2
// When the squared magnitude compares equal to 0.0 the scale n is taken
// as 1.0 instead, so no reciprocal of zero is computed.
// The reciprocal is an fres estimate refined by one Newton-Raphson step.
//
// In:   r3 = src, r4 = inv
// Uses: r5 (address of CONST_1_0F), cr0, fp1-fp5, fp7-fp12
        .global ASM_QUATInverse
#define src     r3
#define inv     r4
ASM_QUATInverse:
        .type ASM_QUATInverse, @function
#define sxy     fp1
#define szw     fp2
#define izz     fp3
#define iww     fp4
#define mag     fp5
#define norminv fp7
#define nninv   fp8
#define nwork0  fp9
#define c_zero  fp10
#define c_one   fp11
#define c_two   fp12
        // c_one = 1.0F;
        lis         r5, CONST_1_0F@h
        ori         r5, r5, CONST_1_0F@l
        lfs         c_one, 0(r5)

        // load xy
        psq_l       sxy, 0(src), 0, 0

        // mag = [x*x][y*y]
        ps_mul      mag, sxy, sxy
        // c_zero = [0.0F]
        ps_sub      c_zero, c_one, c_one

        // load zw
        psq_l       szw, 8(src), 0, 0

        // mag = [x*x+z*z][y*y+w*w]
        ps_madd     mag, szw, szw, mag
        // c_two = [2.0F]
        ps_add      c_two, c_one, c_one
        // mag = [x*x+y*y+z*z+w*w][N/A]
        ps_sum0     mag, mag, mag, mag

        // zero check: skip the reciprocal when mag == 0.0
        fcmpu       cr0, mag, c_zero
        beq-        _ASM_QUATInverse_zero

        // norminv = 1.0F / mag (estimate)
        fres        norminv, mag
        // Newton-Raphson refinement (x1) : E' = 2E-X*E*E = E*(2-X*E)
        ps_nmsub    nwork0, mag, norminv, c_two
        ps_mul      norminv, norminv, nwork0
        b           _ASM_QUATInverse_mulnorm

_ASM_QUATInverse_zero:
        // zero-magnitude input: use a scale of 1.0
        fmr         norminv, c_one

_ASM_QUATInverse_mulnorm:
        // nninv = [ -norminv ]
        ps_neg      nninv, norminv

        // iww = [ w*norminv ][ N/A ]
        ps_muls1    iww, norminv, szw
        // sxy = [ -x*norminv ][ -y*norminv ]
        ps_muls0    sxy, sxy, nninv

        // store w (single float, ps0 only)
        psq_st      iww, 12(inv), 1, 0

        // izz = [ -z*norminv ][ N/A ]
        ps_muls0    izz, szw, nninv

        // store xy
        psq_st      sxy, 0(inv), 0, 0
        // store z (single float, ps0 only)
        psq_st      izz, 8(inv), 1, 0

        blr
        .size ASM_QUATInverse,$-ASM_QUATInverse
#undef src
#undef inv
#undef sxy
#undef szw
#undef izz
#undef iww
#undef mag
#undef norminv
#undef nninv
#undef nwork0
#undef c_zero
#undef c_one
#undef c_two
285
286
287////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATScale(const Quaternion *q, Quaternion *r, f32 scale)
//
// Multiplies every component of *q by scale and writes the result to *r.
// Both halves of *q are loaded before anything is stored to *r.
//
// In:   r3 = q, r4 = r, fp1 = scale (only ps0 is used as the multiplier)
// Uses: fp2, fp3
        .global ASM_QUATScale
#define q     r3
#define r     r4
#define scale fp1
#define vecXY fp2
#define vecZW fp3
ASM_QUATScale:
        .type ASM_QUATScale, @function
        // fetch both halves of the source
        psq_l       vecXY, 0(q), 0, 0
        psq_l       vecZW, 8(q), 0, 0
        // first half: scale and write back
        ps_muls0    vecXY, vecXY, scale
        psq_st      vecXY, 0(r), 0, 0
        // second half: scale and write back
        ps_muls0    vecZW, vecZW, scale
        psq_st      vecZW, 8(r), 0, 0
        blr
        .size ASM_QUATScale,$-ASM_QUATScale
#undef vecZW
#undef vecXY
#undef scale
#undef r
#undef q
310
311
312
313////////////////////////////////////////////////////////////////////////////////
// f32 ASM_QUATDotProduct(const Quaternion *p, const Quaternion *q)
//
// Returns px*qx + py*qy + pz*qz + pw*qw.
//
// In:   r3 = p, r4 = q
// Out:  fp1 (ps0) = dot product
// Uses: fp1-fp5
#define p r3
#define q r4
        .global ASM_QUATDotProduct
ASM_QUATDotProduct:
        .type ASM_QUATDotProduct, @function
#define pxy fp2
#define pzw fp3
#define qxy fp4
#define qzw fp5
#define dp  fp1
        psq_l       pxy, 0(p), 0, 0
        psq_l       qxy, 0(q), 0, 0
        // dp = [px*qx][py*qy]
        ps_mul      dp, pxy, qxy

        psq_l       pzw, 8(p), 0, 0
        psq_l       qzw, 8(q), 0, 0
        // dp = [px*qx+pz*qz][py*qy+pw*qw]
        ps_madd     dp, pzw, qzw, dp

        // dp = [px*qx+py*qy+pz*qz+pw*qw][N/A]
        ps_sum0     dp, dp, dp, dp

        blr
        .size ASM_QUATDotProduct,$-ASM_QUATDotProduct
// Release the pointer aliases as well, so that p and q do not stay
// defined for the code that follows this routine.
#undef p
#undef q
#undef pxy
#undef pzw
#undef qxy
#undef qzw
#undef dp
342
343
344
345////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATNormalize(const Quaternion *src, Quaternion *unit)
//
// Scales *src by 1/sqrt(x*x + y*y + z*z + w*w) and writes it to *unit.
// The reciprocal square root is an frsqrte estimate refined by one
// Newton-Raphson step. When the squared length minus CONST_EPSILON is
// negative, the scale factor is replaced by 0, so *unit becomes all zeros.
//
// In:   r3 = src, r4 = unit
// Uses: r5 (constant addresses), fp1-fp11
#define src  r3
#define unit r4
        .global ASM_QUATNormalize
// input halves, scaled in place
#define inXY   fp1
#define inZW   fp2
// sum of squares, then its reciprocal square root
#define lenSq  fp3
#define invLen fp4
// lenSq - kEps, used as the ps_sel selector
#define margin fp5
#define kZero  fp6
// Newton-Raphson temporaries
#define nrSq   fp7
#define nrHalf fp8
// constants fetched from the data section
#define kEps   fp9
#define kHalf  fp10
#define kThree fp11
ASM_QUATNormalize:
        .type ASM_QUATNormalize, @function

        // kEps = CONST_EPSILON
        lis         r5, CONST_EPSILON@h
        ori         r5, r5, CONST_EPSILON@l
        lfs         kEps, 0(r5)

        // kHalf = 0.5F
        lis         r5, CONST_0_5F@h
        ori         r5, r5, CONST_0_5F@l
        lfs         kHalf, 0(r5)

        // kThree = 3.0F
        lis         r5, CONST_3_0F@h
        ori         r5, r5, CONST_3_0F@l
        lfs         kThree, 0(r5)

        // inXY = [x][y]
        psq_l       inXY, 0(src), 0, 0

        // lenSq = [x*x][y*y]
        ps_mul      lenSq, inXY, inXY

        // inZW = [z][w]
        psq_l       inZW, 8(src), 0, 0

        // kZero = [0.0F], derived as kEps - kEps
        ps_sub      kZero, kEps, kEps
        // lenSq = [x*x+z*z][y*y+w*w]
        ps_madd     lenSq, inZW, inZW, lenSq
        // lenSq = [x*x+y*y+z*z+w*w][N/A]
        ps_sum0     lenSq, lenSq, lenSq, lenSq

        // invLen = estimate of 1.0F / sqrtf(lenSq)
        frsqrte     invLen, lenSq
        // margin = lenSq - kEps
        ps_sub      margin, lenSq, kEps
        // One Newton-Raphson step : E' = (E/2)(3 - X * E * E)
        fmul        nrSq, invLen, invLen
        fmul        nrHalf, invLen, kHalf
        fnmsub      nrSq, nrSq, lenSq, kThree
        fmul        invLen, nrSq, nrHalf

        // invLen = ( lenSq >= kEps ) ? invLen : 0
        ps_sel      invLen, margin, invLen, kZero
        // inXY = [x*invLen][y*invLen]
        ps_muls0    inXY, inXY, invLen
        // inZW = [z*invLen][w*invLen]
        ps_muls0    inZW, inZW, invLen

        psq_st      inXY, 0(unit), 0, 0
        psq_st      inZW, 8(unit), 0, 0

        blr
        .size ASM_QUATNormalize,$-ASM_QUATNormalize

#undef kThree
#undef kHalf
#undef kEps
#undef nrHalf
#undef nrSq
#undef kZero
#undef margin
#undef invLen
#undef lenSq
#undef inZW
#undef inXY
#undef unit
#undef src
429
430////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXQuat(Mtx m, const Quaternion *q)
//
// Fills the twelve floats at byte offsets 0..44 from m with the rotation
// matrix built from quaternion *q. With s = 2 / (qx*qx+qy*qy+qz*qz+qw*qw):
//   m00 = 1-s(qy*qy+qz*qz)  m01 = s(qx*qy-qz*qw)    m02 = s(qx*qz+qy*qw)    m03 = 0
//   m10 = s(qx*qy+qz*qw)    m11 = 1-s(qx*qx+qz*qz)  m12 = s(qy*qz-qx*qw)    m13 = 0
//   m20 = s(qx*qz-qy*qw)    m21 = s(qy*qz+qx*qw)    m22 = 1-s(qx*qx+qy*qy)  m23 = 0
// The reciprocal is an fres estimate refined by one Newton-Raphson step.
// There is no guard for a zero-length quaternion: fres is applied to the
// sum of squares directly.
//
// In:    r3 = m, r4 = q
// Uses:  r0, r5, fp1-fp14
// Stack: 24-byte frame. fp14 (tmp9) is saved in it twice, as a paired-single
//        pair at 8(r1) and as a double at 16(r1), and reloaded from both
//        slots before returning. LR is stored at 28(r1), i.e. 4 bytes above
//        the incoming stack pointer, and reloaded on exit although the
//        routine makes no calls.
//        NOTE(review): fp14 is presumably non-volatile in the target ABI,
//        which would be why it is the only FP register preserved - confirm.
#define m  r3
#define q  r4
#define c_zero fp1
#define c_one  fp2
#define c_two  fp3
#define scale  fp4
#define tmp0   fp5
#define tmp1   fp6
#define tmp2   fp7
#define tmp3   fp8
#define tmp4   fp9
#define tmp5   fp10
#define tmp6   fp11
#define tmp7   fp12
#define tmp8   fp13
#define tmp9   fp14

        .global ASM_MTXQuat
ASM_MTXQuat:
        .type ASM_MTXQuat, @function

        // prologue: open the frame, save LR and fp14
        mflr        r0
        stwu        r1, -24(r1)
        stw         r0, 28(r1)

        psq_st      fp14, 8(r1), 0, 0
        stfd        fp14, 16(r1)

        // c_one = 1.0F;
        lis         r5, CONST_1_0F@h
        ori         r5, r5, CONST_1_0F@l
        lfs         c_one, 0(r5)

        // tmp0 = [qx][qy] : LOAD
        psq_l       tmp0, 0(q), 0, 0
        // tmp1 = [qz][qw] : LOAD
        psq_l       tmp1, 8(q), 0, 0
        // c_zero = [0.0F][0.0F]
        fsubs       c_zero, c_one, c_one
        // c_two  = [2.0F][2.0F]
        fadds       c_two, c_one, c_one
        // tmp2 = [qx*qx][qy*qy]
        ps_mul      tmp2, tmp0, tmp0
        // tmp5 = [qy][qx]
        ps_merge10  tmp5, tmp0, tmp0
        // tmp4 = [qx*qx+qz*qz][qy*qy+qw*qw]
        ps_madd     tmp4, tmp1, tmp1, tmp2
        // tmp3 = [qz*qz][qw*qw]
        ps_mul      tmp3, tmp1, tmp1
        // scale = [qx*qx+qy*qy+qz*qz+qw*qw][?]
        ps_sum0     scale, tmp4, tmp4, tmp4
        // tmp7 = [qy*qw][qx*qw]
        ps_muls1    tmp7, tmp5, tmp1
        // Newton-Raphson refinement (1/X) : E' = 2E-X*E*E
        // tmp9 = [E = Est.(1/X)]
        fres        tmp9, scale
        // tmp4 = [qx*qx+qz*qz][qy*qy+qz*qz]
        ps_sum1     tmp4, tmp3, tmp4, tmp2
        // scale = [2-X*E]
        ps_nmsub    scale, scale, tmp9, c_two
        // tmp6 = [qz*qw][?]
        ps_muls1    tmp6, tmp1, tmp1
        // scale = [E(2-X*E) = E']
        ps_mul      scale, tmp9, scale
        // tmp2 = [qx*qx+qy*qy]
        ps_sum0     tmp2, tmp2, tmp2, tmp2
        // scale = [s = 2E' = 2.0F/(qx*qx+qy*qy+qz*qz+qw*qw)]
        fmuls       scale, scale, c_two
        // tmp8 = [qx*qy+qz*qw][?]
        ps_madd     tmp8, tmp0, tmp5, tmp6
        // tmp6 = [qx*qy-qz*qw][?]
        ps_msub     tmp6, tmp0, tmp5, tmp6
        // c_zero [m03] : STORE
        psq_st      c_zero, 12(m), 1, 0
        // tmp2 = [1-s(qx*qx+qy*qy)]   : [m22]
        ps_nmsub    tmp2, tmp2, scale, c_one
        // tmp4 = [1-s(qx*qx+qz*qz)][1-s(qy*qy+qz*qz)] : [m11][m00]
        ps_nmsub    tmp4, tmp4, scale, c_one
        // c_zero [m23] : STORE
        psq_st      c_zero, 44(m), 1, 0
        // tmp8 = [s(qx*qy+qz*qw)][?]  : [m10]
        ps_mul      tmp8, tmp8, scale
        // tmp6 = [s(qx*qy-qz*qw)][?]  : [m01]
        ps_mul      tmp6, tmp6, scale
        // tmp2 [m22] : STORE
        psq_st      tmp2, 40(m), 1, 0
        // tmp5 = [qx*qz+qy*qw][qy*qz+qx*qw]
        ps_madds0   tmp5, tmp0, tmp1, tmp7
        // tmp1 = [m10][m11]
        ps_merge00  tmp1, tmp8, tmp4
        // tmp7 = tmp5 - 2*tmp7 = [qx*qz-qy*qw][qy*qz-qx*qw]
        ps_nmsub    tmp7, tmp7, c_two, tmp5
        // tmp0 = [m00][m01]
        ps_merge10  tmp0, tmp4, tmp6
        // tmp1 [m10][m11] : STORE
        psq_st      tmp1, 16(m), 0, 0
        // tmp5 = [s(qx*qz+qy*qw)][s(qy*qz+qx*qw)] : [m02][m21]
        ps_mul      tmp5, tmp5, scale
        // tmp7 = [s(qx*qz-qy*qw)][s(qy*qz-qx*qw)] : [m20][m12]
        ps_mul      tmp7, tmp7, scale
        // tmp0 [m00][m01] : STORE
        psq_st      tmp0,  0(m), 0, 0
        // tmp5 [m02] : STORE
        psq_st      tmp5,  8(m), 1, 0
        // tmp3 = [m12][m13]
        ps_merge10  tmp3, tmp7, c_zero
        // tmp9 = [m20][m21]
        ps_merge01  tmp9, tmp7, tmp5
        // tmp3 [m12][m13] : STORE
        psq_st      tmp3, 24(m), 0, 0
        // tmp9 [m20][m21] : STORE
        psq_st      tmp9, 32(m), 0, 0

        // epilogue: reload fp14 (same register spelling as the prologue),
        // reload LR and release the frame
        psq_l       fp14, 8(r1), 0, 0
        lfd         fp14, 16(r1)

        lwz         r0, 28(r1)
        mtlr        r0
        addi        r1, r1, 24

        blr
        .size ASM_MTXQuat,$-ASM_MTXQuat
#undef m
#undef q
#undef c_zero
#undef c_one
#undef c_two
#undef scale
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef tmp5
#undef tmp6
#undef tmp7
#undef tmp8
#undef tmp9
570
571