/*---------------------------------------------------------------------------*
  Project:  matrix vector Library
  File:     mtxQuat_asm.s

  Copyright (C) Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.  They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

 *---------------------------------------------------------------------------*/

// Conventions used throughout this file:
//   - PowerPC paired-single code; the source is run through the C
//     preprocessor so that registers can be given per-function names.
//     Every name is defined next to the function that uses it and is
//     undefined again right after that function.
//   - A Quaternion is accessed as four consecutive floats: [x][y] at byte
//     offset 0 and [z][w] at byte offset 8.
//   - All psq_l / psq_st instructions select GQR0 (last operand 0).
//     NOTE(review): this presumes GQR0 is configured for plain,
//     unscaled float data - confirm against the platform setup.
//   - In comments, [a][b] denotes the ps0 and ps1 halves of one register.

    .data
    .align 2
CONST_0_5F:     .float 0.5
CONST_1_0F:     .float 1.0
CONST_3_0F:     .float 3.0
CONST_EPSILON:  .float 0.00001


    .text


////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATAdd(const Quaternion *p, const Quaternion *q, Quaternion *r)
//
// Component-wise sum: r = p + q.
//   In : r3 = p, r4 = q, r5 = r
//   Out: the four floats at r
//   Uses fp1-fp6.  Leaf routine, no stack frame.
    .global ASM_QUATAdd
#define p       r3
#define q       r4
#define r       r5
ASM_QUATAdd:
    .type ASM_QUATAdd, @function
#define pxy     fp1
#define qxy     fp2
#define rxy     fp3
#define pzw     fp4
#define qzw     fp5
#define rzw     fp6
    // [x][y] half
    psq_l       pxy, 0(p), 0, 0
    psq_l       qxy, 0(q), 0, 0
    ps_add      rxy, pxy, qxy
    psq_st      rxy, 0(r), 0, 0

    // [z][w] half
    psq_l       pzw, 8(p), 0, 0
    psq_l       qzw, 8(q), 0, 0
    ps_add      rzw, pzw, qzw
    psq_st      rzw, 8(r), 0, 0
    blr
    .size ASM_QUATAdd,$-ASM_QUATAdd
#undef p
#undef q
#undef r
#undef pxy
#undef qxy
#undef rxy
#undef pzw
#undef qzw
#undef rzw



////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATSubtract(const Quaternion *p, const Quaternion *q, Quaternion *r)
//
// Component-wise difference: r = p - q.
//   In : r3 = p, r4 = q, r5 = r
//   Out: the four floats at r
//   Uses fp1-fp6.  Leaf routine, no stack frame.
#define p       r3
#define q       r4
#define r       r5
    .global ASM_QUATSubtract
ASM_QUATSubtract:
    .type ASM_QUATSubtract, @function
#define pxy     fp1
#define qxy     fp2
#define rxy     fp3
#define pzw     fp4
#define qzw     fp5
#define rzw     fp6
    // [x][y] half
    psq_l       pxy, 0(p), 0, 0
    psq_l       qxy, 0(q), 0, 0
    ps_sub      rxy, pxy, qxy
    psq_st      rxy, 0(r), 0, 0

    // [z][w] half
    psq_l       pzw, 8(p), 0, 0
    psq_l       qzw, 8(q), 0, 0
    ps_sub      rzw, pzw, qzw
    psq_st      rzw, 8(r), 0, 0
    blr
    .size ASM_QUATSubtract,$-ASM_QUATSubtract
#undef p
#undef q
#undef r
#undef pxy
#undef qxy
#undef rxy
#undef pzw
#undef qzw
#undef rzw



////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATMultiply(const Quaternion *p, const Quaternion *q, Quaternion *pq)
//
// Quaternion product.  The per-instruction comments below spell out the
// exact expression held in each register; the final result is
//   pqx = pw*qx + py*qz - pz*qy + px*qw
//   pqy = pz*qx - px*qz + pw*qy + py*qw
//   pqz = -py*qx + pw*qz + px*qy + pz*qw
//   pqw = -px*qx - pz*qz - py*qy + pw*qw
//   In : r3 = p, r4 = q, r5 = pq
//   Out: the four floats at pq
//   Uses fp1-fp12.  Leaf routine, no stack frame.
// Both operands are fully loaded before the first store to pq.
#define p       r3
#define q       r4
#define pq      r5
    .global ASM_QUATMultiply
ASM_QUATMultiply:
    .type ASM_QUATMultiply, @function
#define pxy     fp1
#define pzw     fp2
#define qxy     fp3
#define qzw     fp4
#define pnxy    fp5
#define pnzw    fp6
#define pnxny   fp7
#define pnznw   fp8
#define rxy     fp9
#define rzw     fp10
#define sxy     fp11
#define szw     fp12
    // [px][py] : Load
    psq_l       pxy, 0(p), 0, 0
    // [pz][pw] : Load
    psq_l       pzw, 8(p), 0, 0

    // [qx][qy] : Load
    psq_l       qxy, 0(q), 0, 0
    // [-px][-py]
    ps_neg      pnxny, pxy
    // [qz][qw] : Load
    psq_l       qzw, 8(q), 0, 0
    // [-pz][-pw]
    ps_neg      pnznw, pzw

    // [-px][py]
    ps_merge01  pnxy, pnxny, pxy

    // [pz*qx][pw*qx]
    ps_muls0    rxy, pzw, qxy
    // [-px*qx][-py*qx]
    ps_muls0    rzw, pnxny, qxy

    // [-pz][pw]
    ps_merge01  pnzw, pnznw, pzw

    // [-px*qy][py*qy]
    ps_muls1    szw, pnxy, qxy
    // [pz*qx-px*qz][pw*qx+py*qz]
    ps_madds0   rxy, pnxy, qzw, rxy
    // [-pz*qy][pw*qy]
    ps_muls1    sxy, pnzw, qxy
    // [-px*qx-pz*qz][-py*qx+pw*qz]
    ps_madds0   rzw, pnzw, qzw, rzw
    // [-px*qy-pz*qw][py*qy-pw*qw]
    ps_madds1   szw, pnznw, qzw, szw
    // [pw*qx+py*qz][pz*qx-px*qz]
    ps_merge10  rxy, rxy, rxy
    // [-pz*qy+px*qw][pw*qy+py*qw]
    ps_madds1   sxy, pxy, qzw, sxy
    // [-py*qx+pw*qz][-px*qx-pz*qz]
    ps_merge10  rzw, rzw, rzw

    // [pw*qx+py*qz-pz*qy+px*qw][pz*qx-px*qz+pw*qy+py*qw] : [pqx][pqy]
    ps_add      rxy, rxy, sxy
    // [pqx][pqy] : Store
    psq_st      rxy, 0(pq), 0, 0
    // [-py*qx+pw*qz+px*qy+pz*qw][-px*qx-pz*qz-py*qy+pw*qw] : [pqz][pqw]
    ps_sub      rzw, rzw, szw
    // [pqz][pqw] : Store
    psq_st      rzw, 8(pq), 0, 0

    blr
    .size ASM_QUATMultiply,$-ASM_QUATMultiply
#undef p
#undef q
#undef pq
#undef pxy
#undef pzw
#undef qxy
#undef qzw
#undef pnxy
#undef pnzw
#undef pnxny
#undef pnznw
#undef rxy
#undef rzw
#undef sxy
#undef szw



////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATInverse(const Quaternion *src, Quaternion *inv)
//
// Writes inv = [-x*n][-y*n][-z*n][w*n] where
//   mag = x*x + y*y + z*z + w*w
//   n   = 1/mag, taken from the fres estimate plus one Newton-Raphson step
// If mag compares equal to zero, n is forced to 1.0 instead of dividing.
//   In : r3 = src, r4 = inv
//   Out: the four floats at inv
//   Uses r5 (address of the 1.0 constant), cr0, and fp1-fp5, fp7-fp12.
//   Leaf routine, no stack frame.
// Both halves of src are loaded before the first store to inv.
    .global ASM_QUATInverse
#define src     r3
#define inv     r4
ASM_QUATInverse:
    .type ASM_QUATInverse, @function
#define sxy     fp1
#define szw     fp2
#define izz     fp3
#define iww     fp4
#define mag     fp5
#define norminv fp7
#define nninv   fp8
#define nwork0  fp9
#define c_zero  fp10
#define c_one   fp11
#define c_two   fp12
    // c_one = 1.0F;
    lis         r5, CONST_1_0F@h
    ori         r5, r5, CONST_1_0F@l
    lfs         c_one, 0(r5)

    // load xy
    psq_l       sxy, 0(src), 0, 0

    // mag = [x*x][y*y]
    ps_mul      mag, sxy, sxy
    // c_zero = [0.0F]
    ps_sub      c_zero, c_one, c_one

    // load zw
    psq_l       szw, 8(src), 0, 0

    // mag = [x*x+z*z][y*y+w*w]
    ps_madd     mag, szw, szw, mag
    // c_two = [2.0F]
    ps_add      c_two, c_one, c_one
    // mag = [x*x+y*y+z*z+w*w][N/A]
    ps_sum0     mag, mag, mag, mag

    // zero check : skip the reciprocal when mag == 0
    fcmpu       cr0, mag, c_zero
    beq-        _ASM_QUATInverse_zero

    // norminv = 1.0F / mag : estimation
    fres        norminv, mag
    // Newton-Raphson refinement (x1) : E' = 2E-X*E*E
    //   nwork0  = 2 - mag*E
    //   norminv = E * nwork0
    ps_nmsub    nwork0, mag, norminv, c_two
    ps_mul      norminv, norminv, nwork0
    b           _ASM_QUATInverse_mulnorm

_ASM_QUATInverse_zero:
    // mag == 0 : use a factor of 1.0
    fmr         norminv, c_one

_ASM_QUATInverse_mulnorm:
    // nninv = [ -norminv ]
    ps_neg      nninv, norminv

    // iww = [ w*norminv ][ N/A ]
    ps_muls1    iww, norminv, szw
    // sxy = [ -x*norminv ][ -y*norminv ]
    ps_muls0    sxy, sxy, nninv

    // store w (single float, W=1)
    psq_st      iww, 12(inv), 1, 0

    // izz = [ -z*norminv ][ N/A ]
    ps_muls0    izz, szw, nninv

    // store xy
    psq_st      sxy, 0(inv), 0, 0
    // store z (single float, W=1)
    psq_st      izz, 8(inv), 1, 0

    blr
    .size ASM_QUATInverse,$-ASM_QUATInverse
#undef src
#undef inv
#undef sxy
#undef szw
#undef izz
#undef iww
#undef mag
#undef norminv
#undef nninv
#undef nwork0
#undef c_zero
#undef c_one
#undef c_two


////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATScale(const Quaternion *q, Quaternion *r, f32 scale)
//
// Multiplies every component of q by scale and writes the result to r.
//   In : r3 = q, r4 = r, fp1 (ps0) = scale
//   Out: the four floats at r
//   Uses fp2, fp3.  Leaf routine, no stack frame.
// Both halves of q are loaded before the first store to r.
    .global ASM_QUATScale
#define q       r3
#define r       r4
#define scale   fp1
ASM_QUATScale:
    .type ASM_QUATScale, @function
#define rxy     fp2
#define rzw     fp3
    psq_l       rxy, 0(q), 0, 0
    psq_l       rzw, 8(q), 0, 0
    // [x*scale][y*scale]
    ps_muls0    rxy, rxy, scale
    psq_st      rxy, 0(r), 0, 0
    // [z*scale][w*scale]
    ps_muls0    rzw, rzw, scale
    psq_st      rzw, 8(r), 0, 0
    blr
    .size ASM_QUATScale,$-ASM_QUATScale
#undef q
#undef r
#undef scale
#undef rxy
#undef rzw



////////////////////////////////////////////////////////////////////////////////
// f32 ASM_QUATDotProduct(const Quaternion *p, const Quaternion *q)
//
// Four-component dot product px*qx + py*qy + pz*qz + pw*qw.
//   In : r3 = p, r4 = q
//   Out: fp1 (ps0) = dot product
//   Uses fp1-fp5.  Leaf routine, no stack frame, no memory writes.
#define p       r3
#define q       r4
    .global ASM_QUATDotProduct
ASM_QUATDotProduct:
    .type ASM_QUATDotProduct, @function
#define pxy     fp2
#define pzw     fp3
#define qxy     fp4
#define qzw     fp5
#define dp      fp1
    psq_l       pxy, 0(p), 0, 0
    psq_l       qxy, 0(q), 0, 0
    // dp = [px*qx][py*qy]
    ps_mul      dp, pxy, qxy

    psq_l       pzw, 8(p), 0, 0
    psq_l       qzw, 8(q), 0, 0
    // dp = [px*qx+pz*qz][py*qy+pw*qw]
    ps_madd     dp, pzw, qzw, dp

    // dp = [px*qx+py*qy+pz*qz+pw*qw][N/A]
    ps_sum0     dp, dp, dp, dp

    blr
    .size ASM_QUATDotProduct,$-ASM_QUATDotProduct
#undef p
#undef q
#undef pxy
#undef pzw
#undef qxy
#undef qzw
#undef dp



////////////////////////////////////////////////////////////////////////////////
// void ASM_QUATNormalize(const Quaternion *src, Quaternion *unit)
//
// Scales src by 1/sqrt(mag), where mag = x*x + y*y + z*z + w*w.  The
// reciprocal square root is the frsqrte estimate refined by one
// Newton-Raphson step.  When mag - epsilon is not >= 0 (epsilon is
// CONST_EPSILON) the scale factor is replaced by 0, so all four outputs
// are the inputs multiplied by zero.
//   In : r3 = src, r4 = unit
//   Out: the four floats at unit
//   Uses r5 (constant addresses) and fp1-fp11.
//   Leaf routine, no stack frame.
// Both halves of src are loaded before the first store to unit.
#define src     r3
#define unit    r4
    .global ASM_QUATNormalize
#define sxy     fp1
#define szw     fp2
#define mag     fp3
#define rsqmag  fp4
#define diff    fp5
#define c_zero  fp6
#define nwork0  fp7
#define nwork1  fp8
#define epsilon fp9
#define c_half  fp10
#define c_three fp11
ASM_QUATNormalize:
    .type ASM_QUATNormalize, @function

    // epsilon = QUAT_EPSILON;
    lis         r5, CONST_EPSILON@h
    ori         r5, r5, CONST_EPSILON@l
    lfs         epsilon, 0(r5)

    // c_half = 0.5F;
    lis         r5, CONST_0_5F@h
    ori         r5, r5, CONST_0_5F@l
    lfs         c_half, 0(r5)

    // c_three = 3.0F;
    lis         r5, CONST_3_0F@h
    ori         r5, r5, CONST_3_0F@l
    lfs         c_three, 0(r5)

    psq_l       sxy, 0(src), 0, 0

    // mag = [x*x][y*y]
    ps_mul      mag, sxy, sxy

    psq_l       szw, 8(src), 0, 0

    // c_zero = [0.0F]
    ps_sub      c_zero, epsilon, epsilon
    // mag = [x*x+z*z][y*y+w*w]
    ps_madd     mag, szw, szw, mag
    // mag = [x*x+y*y+z*z+w*w][N/A]
    ps_sum0     mag, mag, mag, mag

    // rsqmag = 1.0F / sqrtf(mag) : estimation
    frsqrte     rsqmag, mag
    // diff = mag - epsilon
    ps_sub      diff, mag, epsilon
    // Newton-Raphson refinement (x1) : E' = (E/2)(3 - X * E * E)
    fmul        nwork0, rsqmag, rsqmag
    fmul        nwork1, rsqmag, c_half
    fnmsub      nwork0, nwork0, mag, c_three
    fmul        rsqmag, nwork0, nwork1

    // rsqmag = ( mag >= epsilon ) ? rsqmag : 0
    ps_sel      rsqmag, diff, rsqmag, c_zero
    // sxy = [x*rsqmag][y*rsqmag]
    ps_muls0    sxy, sxy, rsqmag
    // szw = [z*rsqmag][w*rsqmag]
    ps_muls0    szw, szw, rsqmag

    psq_st      sxy, 0(unit), 0, 0
    psq_st      szw, 8(unit), 0, 0

    blr
    .size ASM_QUATNormalize,$-ASM_QUATNormalize

#undef src
#undef unit
#undef sxy
#undef szw
#undef mag
#undef rsqmag
#undef diff
#undef c_zero
#undef nwork0
#undef nwork1
#undef epsilon
#undef c_half
#undef c_three

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXQuat(Mtx m, const Quaternion *q)
//
// Builds a rotation matrix from q.  With
//   s = 2.0F / (qx*qx + qy*qy + qz*qz + qw*qw)
// (reciprocal from the fres estimate plus one Newton-Raphson step) the
// routine stores twelve floats, three rows of four at byte offsets
// 0, 16 and 32 from m:
//   m00 = 1-s(qy*qy+qz*qz)  m01 = s(qx*qy-qz*qw)    m02 = s(qx*qz+qy*qw)  m03 = 0
//   m10 = s(qx*qy+qz*qw)    m11 = 1-s(qx*qx+qz*qz)  m12 = s(qy*qz-qx*qw)  m13 = 0
//   m20 = s(qx*qz-qy*qw)    m21 = s(qy*qz+qx*qw)    m22 = 1-s(qx*qx+qy*qy) m23 = 0
// No check for a zero-length quaternion is made here (unlike
// ASM_QUATInverse).
//   In : r3 = m, r4 = q
//   Out: the twelve floats at m
//   Uses r0, r5 and fp1-fp14.  fp14 is saved in the 24-byte stack frame on
//   entry and restored on exit (f14-f31 are nonvolatile in the PowerPC
//   ABI); fp1-fp13 are not preserved.
// q is fully loaded before the first store to m.
#define m       r3
#define q       r4
#define c_zero  fp1
#define c_one   fp2
#define c_two   fp3
#define scale   fp4
#define tmp0    fp5
#define tmp1    fp6
#define tmp2    fp7
#define tmp3    fp8
#define tmp4    fp9
#define tmp5    fp10
#define tmp6    fp11
#define tmp7    fp12
#define tmp8    fp13
#define tmp9    fp14

    .global ASM_MTXQuat
ASM_MTXQuat:
    .type ASM_MTXQuat, @function

    // Prologue : 24-byte frame, LR kept in the LR save word of the caller
    mflr        r0
    stwu        r1, -24(r1)
    stw         r0, 28(r1)

    // Save fp14 twice : as a pair of singles and as a double.
    // NOTE(review): presumably so that both the paired-single and the
    // double-precision contents survive - confirm.
    psq_st      fp14, 8(r1), 0, 0
    stfd        fp14, 16(r1)

    // c_one = 1.0F;
    lis         r5, CONST_1_0F@h
    ori         r5, r5, CONST_1_0F@l
    lfs         c_one, 0(r5)

    // tmp0 = [qx][qy] : LOAD
    psq_l       tmp0, 0(q), 0, 0
    // tmp1 = [qz][qw] : LOAD
    psq_l       tmp1, 8(q), 0, 0
    // c_zero = [0.0F][0.0F]
    fsubs       c_zero, c_one, c_one
    // c_two = [2.0F][2.0F]
    fadds       c_two, c_one, c_one
    // tmp2 = [qx*qx][qy*qy]
    ps_mul      tmp2, tmp0, tmp0
    // tmp5 = [qy][qx]
    ps_merge10  tmp5, tmp0, tmp0
    // tmp4 = [qx*qx+qz*qz][qy*qy+qw*qw]
    ps_madd     tmp4, tmp1, tmp1, tmp2
    // tmp3 = [qz*qz][qw*qw]
    ps_mul      tmp3, tmp1, tmp1
    // scale = [qx*qx+qy*qy+qz*qz+qw*qw][?]
    ps_sum0     scale, tmp4, tmp4, tmp4
    // tmp7 = [qy*qw][qx*qw]
    ps_muls1    tmp7, tmp5, tmp1
    // Newton-Raphson refinement (1/X) : E' = 2E-X*E*E
    // tmp9 = [E = Est.(1/X)]
    fres        tmp9, scale
    // tmp4 = [qx*qx+qz*qz][qy*qy+qz*qz]
    ps_sum1     tmp4, tmp3, tmp4, tmp2
    // scale = [2-X*E]
    ps_nmsub    scale, scale, tmp9, c_two
    // tmp6 = [qz*qw][?]
    ps_muls1    tmp6, tmp1, tmp1
    // scale = [E(2-scale*E) = E']
    ps_mul      scale, tmp9, scale
    // tmp2 = [qx*qx+qy*qy]
    ps_sum0     tmp2, tmp2, tmp2, tmp2
    // scale = [s = 2E' = 2.0F/(qx*qx+qy*qy+qz*qz+qw*qw)]
    fmuls       scale, scale, c_two
    // tmp8 = [qx*qy+qz*qw][?]
    ps_madd     tmp8, tmp0, tmp5, tmp6
    // tmp6 = [qx*qy-qz*qw][?]
    ps_msub     tmp6, tmp0, tmp5, tmp6
    // c_zero [m03] : STORE
    psq_st      c_zero, 12(m), 1, 0
    // tmp2 = [1-s(qx*qx+qy*qy)] : [m22]
    ps_nmsub    tmp2, tmp2, scale, c_one
    // tmp4 = [1-s(qx*qx+qz*qz)][1-s(qy*qy+qz*qz)] : [m11][m00]
    ps_nmsub    tmp4, tmp4, scale, c_one
    // c_zero [m23] : STORE
    psq_st      c_zero, 44(m), 1, 0
    // tmp8 = [s(qx*qy+qz*qw)][?] : [m10]
    ps_mul      tmp8, tmp8, scale
    // tmp6 = [s(qx*qy-qz*qw)][?] : [m01]
    ps_mul      tmp6, tmp6, scale
    // tmp2 [m22] : STORE
    psq_st      tmp2, 40(m), 1, 0
    // tmp5 = [qx*qz+qy*qw][qy*qz+qx*qw]
    ps_madds0   tmp5, tmp0, tmp1, tmp7
    // tmp1 = [m10][m11]
    ps_merge00  tmp1, tmp8, tmp4
    // tmp7 = [qx*qz-qy*qw][qy*qz-qx*qw]
    ps_nmsub    tmp7, tmp7, c_two, tmp5
    // tmp0 = [m00][m01]
    ps_merge10  tmp0, tmp4, tmp6
    // tmp1 [m10][m11] : STORE
    psq_st      tmp1, 16(m), 0, 0
    // tmp5 = [s(qx*qz+qy*qw)][s(qy*qz+qx*qw)] : [m02][m21]
    ps_mul      tmp5, tmp5, scale
    // tmp7 = [s(qx*qz-qy*qw)][s(qy*qz-qx*qw)] : [m20][m12]
    ps_mul      tmp7, tmp7, scale
    // tmp0 [m00][m01] : STORE
    psq_st      tmp0, 0(m), 0, 0
    // tmp5 [m02] : STORE
    psq_st      tmp5, 8(m), 1, 0
    // tmp3 = [m12][m13]
    ps_merge10  tmp3, tmp7, c_zero
    // tmp9 = [m20][m21]
    ps_merge01  tmp9, tmp7, tmp5
    // tmp3 [m12][m13] : STORE
    psq_st      tmp3, 24(m), 0, 0
    // tmp9 [m20][m21] : STORE
    psq_st      tmp9, 32(m), 0, 0

    // Restore fp14 : pair of singles first, then the double image
    psq_l       fp14, 8(r1), 0, 0
    lfd         fp14, 16(r1)

    // Epilogue : reload LR and pop the 24-byte frame
    lwz         r0, 28(r1)
    mtlr        r0
    addi        r1, r1, 24

    blr
    .size ASM_MTXQuat,$-ASM_MTXQuat
#undef m
#undef q
#undef c_zero
#undef c_one
#undef c_two
#undef scale
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef tmp5
#undef tmp6
#undef tmp7
#undef tmp8
#undef tmp9
