/*---------------------------------------------------------------------------*
  Project:  Matrix Vector Library
  File:     mtxVec_asm.s

  Copyright 1998-2011 Nintendo. All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law. They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

 *---------------------------------------------------------------------------*/

    .data
    .align 2
CONST_0_0F: .float 0.0
CONST_0_5F: .float 0.5
CONST_3_0F: .float 3.0

    .text

// vec library definitions
#define RET_REG fp1
#define V1_XY   fp2
#define V1_Z    fp3
#define V2_XY   fp4
#define V2_Z    fp5
#define D1_XY   fp6
#define D1_Z    fp7
#define D2_XY   fp8
#define D2_Z    fp9
#define W1_XY   fp10
#define W1_Z    fp11
#define W2_XY   fp12
#define W2_Z    fp13

////////////////////////////////////////////////////////////////////////////////
// void ASM_VECCrossProduct(const Vec* vec1, const Vec* vec2, Vec* dst)
#define vec1 r3
#define vec2 r4
#define dst  r5
    .global ASM_VECCrossProduct
ASM_VECCrossProduct:
    .type ASM_VECCrossProduct, @function

    //x = a.n[VY]*b.n[VZ] - a.n[VZ]*b.n[VY];
    //y = a.n[VZ]*b.n[VX] - a.n[VX]*b.n[VZ];
    //z = a.n[VX]*b.n[VY] - a.n[VY]*b.n[VX];

    // BX | BY
    psq_l fp1, 0(vec2), 0, 0
    // AZ | AZ
    lfs fp2, 8(vec1)
    // AX | AY
    psq_l fp0, 0(vec1), 0, 0
    // BY | BX
    ps_merge10 fp6, fp1, fp1
    // BZ | BZ
    lfs fp3, 8(vec2)

    // BX*AZ | BY*AZ
    ps_mul fp4, fp1, fp2
    // BX*AX | BY*AX
    ps_muls0 fp7, fp1, fp0
    // AX*BZ-BX*AZ | AY*BZ-BY*AZ
    ps_msub fp5, fp0, fp3, fp4
    // AX*BY-BX*AX | AY*BX-BY*AX
    ps_msub fp8, fp0, fp6, fp7

    // AY*BZ-AZ*BY | AY*BZ-AZ*BY
    ps_merge11 fp9, fp5, fp5
    // AX*BZ-AZ*BX | AY*BX-AX*BY
    ps_merge01 fp10, fp5, fp8

    psq_st fp9, 0(dst), 1, 0

    // AZ*BX-AX*BZ | AX*BY-AY*BX
    ps_neg fp10, fp10

    psq_st fp10, 4(dst), 0, 0

    blr
    .size ASM_VECCrossProduct,$-ASM_VECCrossProduct
#undef vec1
#undef vec2
#undef dst
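// Reference: the sequence above is the standard cross product. A C sketch of
// the same math (Vec fields x/y/z, as elsewhere in this library; all inputs
// are read before the first store, so dst may alias vec1 or vec2):
//
//   void CrossProduct_C(const Vec* a, const Vec* b, Vec* dst)
//   {
//       f32 x = a->y * b->z - a->z * b->y;
//       f32 y = a->z * b->x - a->x * b->z;
//       f32 z = a->x * b->y - a->y * b->x;
//       dst->x = x; dst->y = y; dst->z = z;
//   }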
////////////////////////////////////////////////////////////////////////////////
// void ASM_VECAdd(const Vec* vec1, const Vec* vec2, Vec* dst)
    .global ASM_VECAdd
#define vec1 r3
#define vec2 r4
#define dst  r5
ASM_VECAdd:
    .type ASM_VECAdd, @function

    //load vectors XY
    psq_l V1_XY, 0(vec1), 0, 0;
    psq_l V2_XY, 0(vec2), 0, 0;
    //add vectors XY
    ps_add D1_XY, V1_XY, V2_XY;
    //store result XY
    psq_st D1_XY, 0(dst), 0, 0;
    //load vectors Z
    psq_l V1_Z, 8(vec1), 1, 0;
    psq_l V2_Z, 8(vec2), 1, 0;
    //add vectors Z
    ps_add D1_Z, V1_Z, V2_Z;
    //store result Z
    psq_st D1_Z, 8(dst), 1, 0;

    blr
    .size ASM_VECAdd,$-ASM_VECAdd

#undef vec1
#undef vec2
#undef dst


////////////////////////////////////////////////////////////////////////////////
// void ASM_VECSubtract(const Vec* vec1, const Vec* vec2, Vec* dst)
    .global ASM_VECSubtract
#define vec1 r3
#define vec2 r4
#define dst  r5
ASM_VECSubtract:
    .type ASM_VECSubtract, @function

    //load vectors XY
    psq_l V1_XY, 0(vec1), 0, 0;
    psq_l V2_XY, 0(vec2), 0, 0;
    //subtract vectors XY
    ps_sub D1_XY, V1_XY, V2_XY;
    //store result XY
    psq_st D1_XY, 0(dst), 0, 0;

    //load vectors Z
    psq_l V1_Z, 8(vec1), 1, 0;
    psq_l V2_Z, 8(vec2), 1, 0;
    //subtract vectors Z
    ps_sub D1_Z, V1_Z, V2_Z;
    //store result Z
    psq_st D1_Z, 8(dst), 1, 0;

    blr
    .size ASM_VECSubtract,$-ASM_VECSubtract

#undef vec1
#undef vec2
#undef dst


////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECSquareMag(const Vec* vec1)
    .global ASM_VECSquareMag
#define vec1  r3
#define sqmag f1
#define vxy   f2
#define vzz   f4
ASM_VECSquareMag:
    .type ASM_VECSquareMag, @function

    // load X | Y
    psq_l vxy, 0(vec1), 0, 0
    // XX | YY
    ps_mul vxy, vxy, vxy
    // load Z | Z
    lfs vzz, 8(vec1)
    // XX + ZZ | YY + ZZ
    ps_madd sqmag, vzz, vzz, vxy
    // XX + YY + ZZ | N/A
    ps_sum0 sqmag, sqmag, vxy, vxy

    blr
    .size ASM_VECSquareMag,$-ASM_VECSquareMag

#undef vec1
#undef vxy
#undef vzz
#undef sqmag


////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECSquareDistance(const Vec* a, const Vec* b)
    .global ASM_VECSquareDistance
#define a      r3
#define b      r4
#define v0yz   f2
#define v1yz   f3
#define v0xy   f4
#define v1xy   f5
#define dyz    f6
#define dxy    f7
#define sqdist f1
ASM_VECSquareDistance:
    .type ASM_VECSquareDistance, @function

    psq_l v0yz, 4(a), 0, 0           // [Y0][Z0]
    psq_l v1yz, 4(b), 0, 0           // [Y1][Z1]
    ps_sub dyz, v0yz, v1yz           // [Y0-Y1][Z0-Z1]

    psq_l v0xy, 0(a), 0, 0           // [X0][Y0]
    psq_l v1xy, 0(b), 0, 0           // [X1][Y1]
    ps_mul dyz, dyz, dyz             // [dYdY][dZdZ]
    ps_sub dxy, v0xy, v1xy           // [X0-X1][Y0-Y1]

    ps_madd sqdist, dxy, dxy, dyz    // [dXdX+dYdY][dYdY+dZdZ]
    ps_sum0 sqdist, sqdist, dyz, dyz // [dXdX+dYdY+dZdZ][N/A]

    blr
    .size ASM_VECSquareDistance,$-ASM_VECSquareDistance

#undef a
#undef b
#undef v0yz
#undef v1yz
#undef v0xy
#undef v1xy
#undef dyz
#undef dxy
#undef sqdist


////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECMag(const Vec* v)
    .global ASM_VECMag
#define v       r3
#define vxy     f2
#define vzz     f3
#define sqmag   f1
#define rmag    f4
#define nwork0  f5
#define nwork1  f6
#define c_three f7
#define c_half  f8
#define c_zero  f9
ASM_VECMag:
    .type ASM_VECMag, @function

    // c_half = 0.5F;
    lis r4, CONST_0_5F@h
    ori r4, r4, CONST_0_5F@l
    lfs c_half, 0(r4)

    // Square mag calculation
    psq_l vxy, 0(v), 0, 0
    ps_mul vxy, vxy, vxy
    lfs vzz, 8(v)
    fsubs c_zero, c_half, c_half
    ps_madd sqmag, vzz, vzz, vxy

    // Square mag
    ps_sum0 sqmag, sqmag, vxy, vxy

    // Zero check
    fcmpu cr0, sqmag, c_zero
    beq- _ASM_VECMag_exit

    // 1.0/sqrt : estimation[E]
    frsqrte rmag, sqmag

    // c_three = 3.0F;
    lis r4, CONST_3_0F@h
    ori r4, r4, CONST_3_0F@l
    lfs c_three, 0(r4)

    // Refinement x 1 : E' = (E/2)(3 - X*E*E)
    fmuls nwork0, rmag, rmag
    fmuls nwork1, rmag, c_half
    fnmsubs nwork0, nwork0, sqmag, c_three
    fmuls rmag, nwork0, nwork1

    // 1/sqrt(X) * X = sqrt(X)
    fmuls sqmag, sqmag, rmag

_ASM_VECMag_exit:
    blr
    .size ASM_VECMag,$-ASM_VECMag

#undef v
#undef vxy
#undef vzz
#undef sqmag
#undef rmag
#undef nwork0
#undef nwork1
#undef c_three
#undef c_half
#undef c_zero
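// Reference: frsqrte delivers only an estimate of 1/sqrt(X) (roughly 5 bits
// of precision), so one Newton-Raphson step is applied before multiplying
// back by X. The same math as a C sketch (__frsqrte stands in for the
// hardware estimate instruction):
//
//   f32 e = __frsqrte(sq);                  // estimate of 1/sqrt(sq)
//   e = (e * 0.5f) * (3.0f - sq * e * e);   // refinement: E' = (E/2)(3 - X*E*E)
//   return sq * e;                          // X * 1/sqrt(X) = sqrt(X)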
////////////////////////////////////////////////////////////////////////////////
// void ASM_VECScale(const Vec *src, Vec *dst, f32 mult)
    .global ASM_VECScale
#define src  r3
#define dst  r4
#define mult f1
#define vxy  f2
#define vz   f3
#define rxy  f4
#define rz   f5
ASM_VECScale:
    .type ASM_VECScale, @function

    //load vector XY
    psq_l vxy, 0(src), 0, 0
    //load vector Z
    psq_l vz, 8(src), 1, 0
    //multiply vector XY
    ps_muls0 rxy, vxy, mult
    //store result XY
    psq_st rxy, 0(dst), 0, 0
    //multiply vector Z
    ps_muls0 rz, vz, mult
    //store result Z
    psq_st rz, 8(dst), 1, 0

    blr
    .size ASM_VECScale,$-ASM_VECScale

#undef src
#undef dst
#undef mult
#undef vxy
#undef vz
#undef rxy
#undef rz


////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECDistance(const Vec *a, const Vec *b)
    .global ASM_VECDistance
#define a       r3
#define b       r4
#define sqdist  f1
#define v0yz    f2
#define v1yz    f3
#define v0xy    f4
#define v1xy    f5
#define dyz     f6
#define dxy     f7
#define rdist   f8
#define nwork0  f9
#define nwork1  f10
#define c_half  f11
#define c_three f12
#define c_zero  f13
ASM_VECDistance:
    .type ASM_VECDistance, @function

    psq_l v0yz, 4(a), 0, 0           // [Y0][Z0]
    psq_l v1yz, 4(b), 0, 0           // [Y1][Z1]
    ps_sub dyz, v0yz, v1yz           // [Y0-Y1][Z0-Z1]

    psq_l v0xy, 0(a), 0, 0           // [X0][Y0]
    psq_l v1xy, 0(b), 0, 0           // [X1][Y1]
    ps_mul dyz, dyz, dyz             // [dYdY][dZdZ]
    ps_sub dxy, v0xy, v1xy           // [X0-X1][Y0-Y1]

    // c_half = 0.5F;
    lis r5, CONST_0_5F@h
    ori r5, r5, CONST_0_5F@l
    lfs c_half, 0(r5)

    // c_zero = 0.0F;
    lis r5, CONST_0_0F@h
    ori r5, r5, CONST_0_0F@l
    lfs c_zero, 0(r5)

    ps_madd sqdist, dxy, dxy, dyz    // [dXdX+dYdY][dYdY+dZdZ]
    fsubs c_zero, c_half, c_half
    ps_sum0 sqdist, sqdist, dyz, dyz // [dXdX+dYdY+dZdZ][N/A]

    // Zero check
    fcmpu cr0, c_zero, sqdist
    beq- _ASM_VECDistance_exit

    // c_three = 3.0F;
    lis r5, CONST_3_0F@h
    ori r5, r5, CONST_3_0F@l
    lfs c_three, 0(r5)

    // 1.0/sqrt : estimation[E]
    frsqrte rdist, sqdist
    // Refinement x 1 : E' = (E/2)(3 - X*E*E)
    fmuls nwork0, rdist, rdist
    fmuls nwork1, rdist, c_half
    fnmsubs nwork0, nwork0, sqdist, c_three
    fmuls rdist, nwork0, nwork1

    // 1/sqrt(X) * X = sqrt(X)
    fmuls sqdist, sqdist, rdist

_ASM_VECDistance_exit:
    blr
    .size ASM_VECDistance,$-ASM_VECDistance

#undef a
#undef b
#undef sqdist
#undef v0yz
#undef v1yz
#undef v0xy
#undef v1xy
#undef dyz
#undef dxy
#undef rdist
#undef nwork0
#undef nwork1
#undef c_half
#undef c_three
#undef c_zero
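// Reference: ASM_VECDistance is ASM_VECSquareDistance followed by the same
// estimate-and-refine square root used in ASM_VECMag. A C sketch of the whole
// routine (refine_rsqrt stands for the frsqrte + one-step sequence above):
//
//   f32 dx = a->x - b->x, dy = a->y - b->y, dz = a->z - b->z;
//   f32 sq = dx*dx + dy*dy + dz*dz;
//   return (sq == 0.0f) ? 0.0f : sq * refine_rsqrt(sq);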
////////////////////////////////////////////////////////////////////////////////
// void ASM_VECNormalize(const Vec *vec1, Vec *dst)
    .global ASM_VECNormalize
#define vec1    r3
#define dst     r4
#define rsqrt   f1
#define v1_xy   f2
#define v1_z    f3
#define xx_yy   f4
#define xx_zz   f5
#define sqsum   f6
#define nwork0  f7
#define nwork1  f8
#define c_half  f9
#define c_three f10
ASM_VECNormalize:
    .type ASM_VECNormalize, @function

    // c_half = 0.5F;
    lis r5, CONST_0_5F@h
    ori r5, r5, CONST_0_5F@l
    lfs c_half, 0(r5)

    // X | Y
    psq_l v1_xy, 0(vec1), 0, 0;
    // X*X | Y*Y
    ps_mul xx_yy, v1_xy, v1_xy;
    // Z | 1
    psq_l v1_z, 8(vec1), 1, 0;
    // X*X+Z*Z | Y*Y+1
    ps_madd xx_zz, v1_z, v1_z, xx_yy;
    // X*X+Z*Z+Y*Y | 1
    ps_sum0 sqsum, xx_zz, v1_z, xx_yy;

    // c_three = 3.0F;
    lis r5, CONST_3_0F@h
    ori r5, r5, CONST_3_0F@l
    lfs c_three, 0(r5)

    // 1.0/sqrt : estimation[E]
    frsqrte rsqrt, sqsum;
    // Newton's refinement x 1
    // E' = (E/2)(3 - sqsum * E * E)
    fmuls nwork0, rsqrt, rsqrt;
    fmuls nwork1, rsqrt, c_half;
    fnmsubs nwork0, nwork0, sqsum, c_three;
    fmuls rsqrt, nwork0, nwork1;

    // X * 1/mag | Y * 1/mag
    ps_muls0 v1_xy, v1_xy, rsqrt;
    psq_st v1_xy, 0(dst), 0, 0;

    // Z * 1/mag
    ps_muls0 v1_z, v1_z, rsqrt;
    psq_st v1_z, 8(dst), 1, 0;

    blr
    .size ASM_VECNormalize,$-ASM_VECNormalize

#undef vec1
#undef dst
#undef sqsum
#undef v1_xy
#undef v1_z
#undef xx_yy
#undef xx_zz
#undef rsqrt
#undef nwork0
#undef nwork1
#undef c_half
#undef c_three


////////////////////////////////////////////////////////////////////////////////
// f32 ASM_VECDotProduct(const Vec *a, const Vec *b)
    .global ASM_VECDotProduct
#define a r3
#define b r4
ASM_VECDotProduct:
    .type ASM_VECDotProduct, @function

    // AY | AZ
    psq_l fp2, 4(a), 0, 0;
    // BY | BZ
    psq_l fp3, 4(b), 0, 0;

    // AY*BY | AZ*BZ
    ps_mul fp2, fp2, fp3;

    // AX | AY
    psq_l fp5, 0(a), 0, 0;
    // BX | BY
    psq_l fp4, 0(b), 0, 0;

    // AX*BX+AY*BY | AY*BY+AZ*BZ
    ps_madd fp3, fp5, fp4, fp2;
    // AX*BX+AY*BY+AZ*BZ | N/A
    ps_sum0 fp1, fp3, fp2, fp2;

    blr
    .size ASM_VECDotProduct,$-ASM_VECDotProduct

#undef a
#undef b
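// Reference (C equivalent): loading at offset 4 pairs [Y][Z] so a single
// ps_mul covers two of the three products, and ps_sum0 folds in the
// cross-slot add:
//
//   f32 DotProduct_C(const Vec* a, const Vec* b)
//   {
//       return a->x * b->x + a->y * b->y + a->z * b->z;
//   }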
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVec(const Mtx m, const Vec *src, Vec *dst)
#define m   r3
#define src r4
#define dst r5
    .global ASM_MTXMultVec
ASM_MTXMultVec:
    .type ASM_MTXMultVec, @function

    // load v[0], v[1]
    psq_l fp0, 0(src), 0, 0
    // load m[0][0], m[0][1]
    psq_l fp2, 0(m), 0, 0
    // load v[2], 1
    psq_l fp1, 8(src), 1, 0
    // m[0][0]*v[0], m[0][1]*v[1]
    ps_mul fp4, fp2, fp0
    // load m[0][2], m[0][3]
    psq_l fp3, 8(m), 0, 0
    // m[0][0]*v[0]+m[0][2]*v[2], m[0][1]*v[1]+m[0][3]
    ps_madd fp5, fp3, fp1, fp4
    // load m[1][0], m[1][1]
    psq_l fp8, 16(m), 0, 0
    // m[0][0]*v[0]+m[0][2]*v[2]+m[0][1]*v[1]+m[0][3], ???
    ps_sum0 fp6, fp5, fp6, fp5
    // load m[1][2], m[1][3]
    psq_l fp9, 24(m), 0, 0
    // m[1][0]*v[0], m[1][1]*v[1]
    ps_mul fp10, fp8, fp0
    // store dst[0]
    psq_st fp6, 0(dst), 1, 0
    // m[1][0]*v[0]+m[1][2]*v[2], m[1][1]*v[1]+m[1][3]
    ps_madd fp11, fp9, fp1, fp10
    // load m[2][0], m[2][1]
    psq_l fp2, 32(m), 0, 0
    // m[1][0]*v[0]+m[1][2]*v[2]+m[1][1]*v[1]+m[1][3], ???
    ps_sum0 fp12, fp11, fp12, fp11
    // load m[2][2], m[2][3]
    psq_l fp3, 40(m), 0, 0
    // m[2][0]*v[0], m[2][1]*v[1]
    ps_mul fp4, fp2, fp0
    // store dst[1]
    psq_st fp12, 4(dst), 1, 0
    // m[2][0]*v[0]+m[2][2]*v[2], m[2][1]*v[1]+m[2][3]
    ps_madd fp5, fp3, fp1, fp4
    // m[2][0]*v[0]+m[2][2]*v[2]+m[2][1]*v[1]+m[2][3], ???
    ps_sum0 fp6, fp5, fp6, fp5
    // store dst[2]
    psq_st fp6, 8(dst), 1, 0

    blr
    .size ASM_MTXMultVec,$-ASM_MTXMultVec
#undef m
#undef src
#undef dst
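// Reference (C equivalent): each result component is one row of the 3x4
// matrix dotted with [x y z 1]; src is fully loaded before the first store,
// so dst may alias src:
//
//   dst->x = m[0][0]*x + m[0][1]*y + m[0][2]*z + m[0][3];
//   dst->y = m[1][0]*x + m[1][1]*y + m[1][2]*z + m[1][3];
//   dst->z = m[2][0]*x + m[2][1]*y + m[2][2]*z + m[2][3];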
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVecArray(const Mtx m, const Vec *srcBase, Vec *dstBase, u32 count)
#define m       r3
#define srcBase r4
#define dstBase r5
#define count   r6
    .global ASM_MTXMultVecArray
ASM_MTXMultVecArray:
    .type ASM_MTXMultVecArray, @function

    // fp13 [m00][m01] : LOAD
    psq_l fp13, 0(m), 0, 0
    // fp12 [m10][m11] : LOAD
    psq_l fp12, 16(m), 0, 0
    // decrement loop count due to unrolling
    subi count, count, 1
    // fp11 [m02][m03] : LOAD
    psq_l fp11, 8(m), 0, 0
    // fp0 [m00][m10]
    ps_merge00 fp0, fp13, fp12
    // base pointer adjustment
    subi dstBase, dstBase, 4
    // fp10 [m12][m13] : LOAD
    psq_l fp10, 24(m), 0, 0
    // fp1 [m01][m11]
    ps_merge11 fp1, fp13, fp12
    // loop counter
    mtctr count
    // fp4 [m20][m21] : LOAD
    psq_l fp4, 32(m), 0, 0
    // fp2 [m02][m12]
    ps_merge00 fp2, fp11, fp10
    // fp5 [m22][m23] : LOAD
    psq_l fp5, 40(m), 0, 0
    // fp3 [m03][m13]
    ps_merge11 fp3, fp11, fp10

    // fp6 [v0][v1] : LOAD
    psq_l fp6, 0(srcBase), 0, 0
    // fp7 [v2][1.0F] : LOAD
    psq_lu fp7, 8(srcBase), 1, 0
    // fp8 [m00*v0+m03][m10*v0+m13]
    ps_madds0 fp8, fp0, fp6, fp3
    // fp9 [m20*v0][m21*v1]
    ps_mul fp9, fp4, fp6
    // fp8 [m00*v0+m01*v1+m03][m10*v0+m11*v1+m13]
    ps_madds1 fp8, fp1, fp6, fp8
    // fp10 [m20*v0+m22*v2][m21*v1+m23*1.0F]
    ps_madd fp10, fp5, fp7, fp9

_ASM_MTXMultVecArray_mloop:
    //-------- Unrolled loop --------

    // fp6 [v0][v1] : LOAD
    psq_lu fp6, 4(srcBase), 0, 0
    // fp12 [m00*v0+m01*v1+m02*v2+m03][m10*v0+m11*v1+m12*v2+m13]
    ps_madds0 fp12, fp2, fp7, fp8
    // fp7 [v2][1.0F] : LOAD
    psq_lu fp7, 8(srcBase), 1, 0
    // fp13 [m20*v0+m21*v1+m22*v2+m23][?]
    ps_sum0 fp13, fp10, fp9, fp10
    // fp8 [m00*v0+m03][m10*v0+m13]
    ps_madds0 fp8, fp0, fp6, fp3
    // fp9 [m20*v0][m21*v1]
    ps_mul fp9, fp4, fp6
    // fp12 [v0'][v1'] : STORE
    psq_stu fp12, 4(dstBase), 0, 0
    // fp8 [m00*v0+m01*v1+m03][m10*v0+m11*v1+m13]
    ps_madds1 fp8, fp1, fp6, fp8
    // fp13 [v2'][ ? ] : STORE
    psq_stu fp13, 8(dstBase), 1, 0
    // fp10 [m20*v0+m22*v2][m21*v1+m23*1.0F]
    ps_madd fp10, fp5, fp7, fp9

    // LOOP
    bdnz _ASM_MTXMultVecArray_mloop

    // fp12 [m00*v0+m01*v1+m02*v2+m03][m10*v0+m11*v1+m12*v2+m13]
    ps_madds0 fp12, fp2, fp7, fp8
    // fp13 [m20*v0+m21*v1+m22*v2+m23][?]
    ps_sum0 fp13, fp10, fp9, fp10
    // fp12 [v0'][v1'] : STORE
    psq_stu fp12, 4(dstBase), 0, 0
    // fp13 [v2'][ ? ] : STORE
    psq_stu fp13, 8(dstBase), 1, 0

    blr
    .size ASM_MTXMultVecArray,$-ASM_MTXMultVecArray

#undef m
#undef srcBase
#undef dstBase
#undef count


////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVecSR(const Mtx m, const Vec *src, Vec *dst)
    .global ASM_MTXMultVecSR
#define m   r3
#define src r4
#define dst r5
ASM_MTXMultVecSR:
    .type ASM_MTXMultVecSR, @function

    psq_l fp0, 0(m), 0, 0       // m[0][0], m[0][1] GQR0 = 0

    // fp6 - x y
    psq_l fp6, 0(src), 0, 0

    psq_l fp2, 16(m), 0, 0      // m[1][0], m[1][1]

    // fp8 = m00x m01y // next X
    ps_mul fp8, fp0, fp6
    psq_l fp4, 32(m), 0, 0      // m[2][0], m[2][1]

    // fp10 = m10x m11y // next Y
    ps_mul fp10, fp2, fp6
    psq_l fp7, 8(src), 1, 0     // fp7 - z,1.0

    // fp12 = m20x m21y // next Z
    ps_mul fp12, fp4, fp6       // last fp6 usage
    psq_l fp3, 24(m), 0, 0      // m[1][2], m[1][3]

    ps_sum0 fp8, fp8, fp8, fp8
    psq_l fp5, 40(m), 0, 0      // m[2][2], m[2][3]

    ps_sum0 fp10, fp10, fp10, fp10
    psq_l fp1, 8(m), 0, 0       // m[0][2], m[0][3]

    ps_sum0 fp12, fp12, fp12, fp12
    ps_madd fp9, fp1, fp7, fp8
    psq_st fp9, 0(dst), 1, 0    // store X

    ps_madd fp11, fp3, fp7, fp10
    psq_st fp11, 4(dst), 1, 0   // store Y

    ps_madd fp13, fp5, fp7, fp12
    psq_st fp13, 8(dst), 1, 0   // store Z

    blr
    .size ASM_MTXMultVecSR,$-ASM_MTXMultVecSR

#undef m
#undef src
#undef dst
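// Reference (C equivalent): the SR ("scale/rotation") variant transforms by
// the upper-left 3x3 only; the translation column m[i][3] is loaded alongside
// but never reaches the stored ps0 results:
//
//   dst->x = m[0][0]*x + m[0][1]*y + m[0][2]*z;
//   dst->y = m[1][0]*x + m[1][1]*y + m[1][2]*z;
//   dst->z = m[2][0]*x + m[2][1]*y + m[2][2]*z;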
////////////////////////////////////////////////////////////////////////////////
// void ASM_MTXMultVecArraySR(const Mtx m, const Vec *srcBase, Vec *dstBase, u32 count)
#define m       r3
#define srcBase r4
#define dstBase r5
#define count   r6
    .global ASM_MTXMultVecArraySR
ASM_MTXMultVecArraySR:
    .type ASM_MTXMultVecArraySR, @function

    // fp13 [m00][m01] : LOAD
    psq_l fp13, 0(m), 0, 0
    // fp12 [m10][m11] : LOAD
    psq_l fp12, 16(m), 0, 0
    // decrement loop count due to unrolling
    subi count, count, 1
    // fp11 [m02][1.0F] : LOAD
    psq_l fp11, 8(m), 1, 0
    // fp0 [m00][m10]
    ps_merge00 fp0, fp13, fp12
    // base pointer adjustment
    subi dstBase, dstBase, 4
    // fp10 [m12][1.0F] : LOAD
    psq_l fp10, 24(m), 1, 0
    // fp1 [m01][m11]
    ps_merge11 fp1, fp13, fp12
    // loop counter
    mtctr count
    // fp3 [m20][m21] : LOAD
    psq_l fp3, 32(m), 0, 0
    // fp2 [m02][m12]
    ps_merge00 fp2, fp11, fp10
    // fp4 [m22][1.0F] : LOAD
    psq_l fp4, 40(m), 1, 0

    // fp6 [v0][v1] : LOAD
    psq_l fp6, 0(srcBase), 0, 0
    // fp7 [v2][1.0F] : LOAD
    psq_lu fp7, 8(srcBase), 1, 0
    // fp8 [m00*v0][m10*v0]
    ps_muls0 fp8, fp0, fp6
    // fp9 [m20*v0][m21*v1]
    ps_mul fp9, fp3, fp6
    // fp8 [m00*v0+m01*v1][m10*v0+m11*v1]
    ps_madds1 fp8, fp1, fp6, fp8
    // fp10 [m20*v0+m22*v2][?]
    ps_madd fp10, fp4, fp7, fp9

_ASM_MTXMultVecArraySR_mloop:
    //-------- Unrolled loop --------

    // fp6 [v0][v1] : LOAD
    psq_lu fp6, 4(srcBase), 0, 0
    // fp12 [m00*v0+m01*v1+m02*v2][m10*v0+m11*v1+m12*v2]
    ps_madds0 fp12, fp2, fp7, fp8
    // fp7 [v2][1.0F] : LOAD
    psq_lu fp7, 8(srcBase), 1, 0
    // fp13 [m20*v0+m21*v1+m22*v2][?]
    ps_sum0 fp13, fp10, fp9, fp9
    // fp8 [m00*v0][m10*v0]
    ps_muls0 fp8, fp0, fp6
    // fp9 [m20*v0][m21*v1]
    ps_mul fp9, fp3, fp6
    // fp12 [v0'][v1'] : STORE
    psq_stu fp12, 4(dstBase), 0, 0
    // fp8 [m00*v0+m01*v1][m10*v0+m11*v1]
    ps_madds1 fp8, fp1, fp6, fp8
    // fp13 [v2'][ ? ] : STORE
    psq_stu fp13, 8(dstBase), 1, 0
    // fp10 [m20*v0+m22*v2][?]
    ps_madd fp10, fp4, fp7, fp9

    // LOOP
    bdnz _ASM_MTXMultVecArraySR_mloop

    // fp12 [m00*v0+m01*v1+m02*v2][m10*v0+m11*v1+m12*v2]
    ps_madds0 fp12, fp2, fp7, fp8
    // fp13 [m20*v0+m21*v1+m22*v2][?]
    ps_sum0 fp13, fp10, fp9, fp9
    // fp12 [v0'][v1'] : STORE
    psq_stu fp12, 4(dstBase), 0, 0
    // fp13 [v2'][ ? ] : STORE
    psq_stu fp13, 8(dstBase), 1, 0

    blr
    .size ASM_MTXMultVecArraySR,$-ASM_MTXMultVecArraySR

#undef m
#undef srcBase
#undef dstBase
#undef count
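// Note on the *Array routines: the first vector's work is issued before the
// loop and the last result is stored after it, with "subi count, count, 1"
// feeding mtctr, so a count of 0 or 1 would wrap the CTR. Callers should pass
// count >= 2 and use the single-vector routines otherwise. In C terms the
// intent is simply:
//
//   for (u32 i = 0; i < count; i++)
//       ASM_MTXMultVec(m, &srcBase[i], &dstBase[i]);    // or ASM_MTXMultVecSR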