1/*---------------------------------------------------------------------------* 2 Project: matrix vector Library 3 File: mtx_asm.s 4 5 Copyright 1998-2011 Nintendo. All rights reserved. 6 7 These coded instructions, statements, and computer programs contain 8 proprietary information of Nintendo of America Inc. and/or Nintendo 9 Company Ltd., and are protected by Federal copyright law. They may 10 not be disclosed to third parties or copied or duplicated in any form, 11 in whole or in part, without the prior written consent of Nintendo. 12 13 *---------------------------------------------------------------------------*/ 14 15 .data 16 .align 2 17Unit01: .float 0.0 18 .float 1.0 19 20CONST_0_0F: .float 0.0 21CONST_0_5F: .float 0.5 22CONST_1_0F: .float 1.0 23CONST_3_0F: .float 3.0 24 25 26 .text 27 28 29//////////////////////////////////////////////////////////////////////////////// 30// void ASM_MTXIdentity(Mtx m) 31#define m r3 32#define c_01 f1 33#define c_10 f2 34#define c_zero f3 35#define c_one f4 36 .global ASM_MTXIdentity 37ASM_MTXIdentity: 38 .type ASM_MTXIdentity, @function 39 40 // c_zero = 0.0F; 41 lis r4, CONST_0_0F@h 42 ori r4, r4, CONST_0_0F@l 43 lfs c_zero, 0(r4) 44 45 psq_st c_zero, 8(m), 0, 0 // m[0][2], m[0][3] 46 47 // c_one = 1.0F; 48 lis r5, CONST_1_0F@h 49 ori r5, r5, CONST_1_0F@l 50 lfs c_one, 0(r5) 51 52 ps_merge01 c_01, c_zero, c_one // { 0.1F, 1.0F } 53 psq_st c_zero, 24(m), 0, 0 // m[1][2], m[1][3] 54 ps_merge10 c_10, c_one, c_zero // fp2 = { 1.0F, 0.0F } 55 psq_st c_zero, 32(m), 0, 0 // m[2][0], m[2][1] 56 psq_st c_01, 16(m), 0, 0 // m[1][0], m[1][1] 57 psq_st c_10, 0(m), 0, 0 // m[0][0], m[0][1] 58 psq_st c_10, 40(m), 0, 0 // m[2][2], m[2][3] 59 60 blr 61 .size ASM_MTXIdentity,$-ASM_MTXIdentity 62#undef m 63#undef c_01 64#undef c_10 65#undef c_zero 66#undef c_one 67 68//////////////////////////////////////////////////////////////////////////////// 69// void ASM_MTXCopy(const Mtx src, Mtx dst) 70#define src r3 71#define dst r4 72 .global 
ASM_MTXCopy 73ASM_MTXCopy: 74 .type ASM_MTXCopy, @function 75 76 psq_l fp0, 0(src), 0, 0 77 psq_st fp0, 0(dst), 0, 0 78 psq_l fp1, 8(src), 0, 0 79 psq_st fp1, 8(dst), 0, 0 80 psq_l fp2, 16(src), 0, 0 81 psq_st fp2, 16(dst), 0, 0 82 psq_l fp3, 24(src), 0, 0 83 psq_st fp3, 24(dst), 0, 0 84 psq_l fp4, 32(src), 0, 0 85 psq_st fp4, 32(dst), 0, 0 86 psq_l fp5, 40(src), 0, 0 87 psq_st fp5, 40(dst), 0, 0 88 89 blr 90 .size ASM_MTXCopy,$-ASM_MTXCopy 91 92#undef src 93#undef dst 94 95//////////////////////////////////////////////////////////////////////////////// 96// void ASM_MTXConcat(const Mtx mA, const Mtx mB, Mtx mAB) 97#define mA r3 98#define mB r4 99#define mAB r5 100 .global ASM_MTXConcat 101ASM_MTXConcat: 102 .type ASM_MTXConcat, @function 103#define A00_A01 fp0 104#define A02_A03 fp1 105#define A10_A11 fp2 106#define A12_A13 fp3 107#define A20_A21 fp4 108#define A22_A23 fp5 109 110#define B00_B01 fp6 111#define B02_B03 fp7 112#define B10_B11 fp8 113#define B12_B13 fp9 114#define B20_B21 fp10 115#define B22_B23 fp11 116 117#define D00_D01 fp12 118#define D02_D03 fp13 119#define D10_D11 fp14 120#define D12_D13 fp15 121#define D20_D21 fp2 122#define D22_D23 fp0 123 124#define UNIT01 fp31 125 126 // don't save LR since we don't make any function calls 127 // mflr r0 128 // stw r0, 4(r1) 129 stwu r1, -64(r1) 130 psq_l A00_A01, 0(mA), 0, 0 131 132 psq_st fp14, 8(r1), 0, 0 133 stfd fp14, 16(r1) 134 135 psq_l B00_B01, 0(mB), 0, 0 136 addis r6, 0, Unit01@ha 137 psq_l B02_B03, 8(mB), 0, 0 138 139 psq_st fp15, 24(r1), 0, 0 140 stfd fp15, 32(r1) 141 142 addi r6, r6, Unit01@l 143 144 psq_st fp31, 40(r1), 0, 0 145 stfd fp31, 48(r1) 146 147 psq_l B10_B11, 16(mB), 0, 0 148 // D00_D01 = b00a00 , b01a00 149 ps_muls0 D00_D01, B00_B01, A00_A01 150 psq_l A10_A11, 16(mA), 0, 0 151 // D02_D03 = b02a00 , b03a00 152 ps_muls0 D02_D03, B02_B03, A00_A01 153 psq_l UNIT01, 0(r6), 0, 0 154 // D10_D11 = a10b00 , a10b01 155 ps_muls0 D10_D11, B00_B01, A10_A11 156 psq_l B12_B13, 24(mB), 0, 0 157 // 
D12_D13 = a10b02 , a10b03 158 ps_muls0 D12_D13, B02_B03, A10_A11 159 psq_l A02_A03, 8(mA), 0, 0 160 // fp12 = b10a01 + b00a00 , b11a01 + b01a00 161 ps_madds1 D00_D01, B10_B11, A00_A01, D00_D01 162 psq_l A12_A13, 24(mA), 0, 0 163 // D10_D11 = a10b00 + a11b10 , a10b01 + a11b11 164 ps_madds1 D10_D11, B10_B11, A10_A11, D10_D11 165 psq_l B20_B21, 32(mB), 0, 0 166 // D02_D03 = b12a01 + b02a00 , b13a01 + b03a00 167 ps_madds1 D02_D03, B12_B13, A00_A01, D02_D03 // YYY LAST TIME FP0 IS USED 168 psq_l B22_B23, 40(mB), 0, 0 169 // D12_D13 = a10b02 + a11b12, a10b03+a11b13 170 ps_madds1 D12_D13, B12_B13, A10_A11, D12_D13 // YYY LAST TIME FP2 IS USED 171 psq_l A20_A21, 32(mA), 0, 0 172 psq_l A22_A23, 40(mA), 0, 0 173 // D00_D01 = b20a02 + b10a01 + b00a00 , b21a02 + b11a01 + b01a00 174 ps_madds0 D00_D01, B20_B21, A02_A03, D00_D01 // m00, m01 computed 175 // D02_D03 = b12a01 + b02a00 + b22a02 , b13a01 + b03a00 + b23a02 176 ps_madds0 D02_D03, B22_B23, A02_A03, D02_D03 177 // D10_D11 = a10b00 + a11b10 +a12b20, a10b01 + a11b11 + a12b21 178 ps_madds0 D10_D11, B20_B21, A12_A13, D10_D11 // m10, m11 computed 179 // D12_D13 = a10b02 + a11b12 + a12b22, a10b03+a11b13 + a12b23 + a13 180 ps_madds0 D12_D13, B22_B23, A12_A13, D12_D13 181 182 // store m00m01 183 psq_st D00_D01, 0(mAB), 0, 0 // YYY LAST TIME FP12 IS USED 184 185 // D20_D21 = a20b00, a20b01 186 ps_muls0 D20_D21, B00_B01, A20_A21 // YYY LAST TIME FP6 IS USED 187 // get a03 from fp1 and add to D02_D03 188 ps_madds1 D02_D03, UNIT01, A02_A03, D02_D03 // m02, m03 computed 189 // YYY LAST TIME FP1 IS USED 190 // D22_D23 = a20b02, a20b03 191 ps_muls0 D22_D23, B02_B03, A20_A21 // YYY LAST TIME FP7 IS USED 192 // store m10m11 193 psq_st D10_D11, 16(mAB), 0, 0 194 // get a13 from fp3 and add to D12_D13 195 ps_madds1 D12_D13, UNIT01, A12_A13, D12_D13 // m12, m13 computed 196 // store m02m03 197 psq_st D02_D03, 8(mAB), 0, 0 // YYY LAST TIME D02_D03 IS USED 198 199 // D20_D21 = a20b00 + a21b10, a20b01 + a21b11 200 ps_madds1 D20_D21, B10_B11, 
A20_A21, D20_D21 // YYY LAST TIME FP8 IS USED 201 // D22_D23 = a20b02 + a21b12, a20b03 + a21b13 202 ps_madds1 D22_D23, B12_B13, A20_A21, D22_D23 203 // D20_D21 = a20b00 + a21b10 + a22b20, a20b01 + a21b11 + a22b21 204 ps_madds0 D20_D21, B20_B21, A22_A23, D20_D21 205 206 // Restore fp14 207 psq_l fp14, 8(r1), 0, 0 208 lfd fp14, 16(r1) // D10_D11 209 210 // store m12m13 211 psq_st D12_D13, 24(mAB), 0, 0 212 // D22_D23 = a20b02 + a21b12 + a22b22, a20b03 + a21b13 + a22b23 + a23 213 ps_madds0 D22_D23, B22_B23, A22_A23, D22_D23 214 // store m20m21 215 psq_st D20_D21, 32(mAB), 0, 0 216 // get a23 from fp5 and add to fp17 217 ps_madds1 D22_D23, UNIT01, A22_A23, D22_D23 218 219 // restore stack frame 220 psq_l fp15, 24(r1), 0, 0 221 lfd fp15, 32(r1) // D12_D13 222 223 // store m22m23 224 psq_st D22_D23, 40(mAB), 0, 0 225 226 psq_l fp31, 40(r1), 0, 0 227 lfd fp31, 48(r1) 228 229 addi r1, r1, 64 230 231 blr 232 .size ASM_MTXConcat,$-ASM_MTXConcat 233 234#undef mA 235#undef mB 236#undef mAB 237#undef A00_A01 238#undef A02_A03 239#undef A10_A11 240#undef A12_A13 241#undef A20_A21 242#undef A22_A23 243#undef B00_B01 244#undef B02_B03 245#undef B10_B11 246#undef B12_B13 247#undef B20_B21 248#undef B22_B23 249#undef D00_D01 250#undef D02_D03 251#undef D10_D11 252#undef D12_D13 253#undef D20_D21 254#undef D22_D23 255 256#undef UNIT01 257 258//////////////////////////////////////////////////////////////////////////////// 259// void ASM_MTXConcatArray (const Mtx a, const Mtx* srcBase, Mtx* dstBase, u32 count) 260#define a r3 261#define srcBase r4 262#define dstBase r5 263#define count r6 264 .global ASM_MTXConcatArray 265ASM_MTXConcatArray: 266 .type ASM_MTXConcatArray, @function 267#define va0 f0 268#define va1 f1 269#define va2 f2 270#define va3 f3 271#define va4 f4 272#define va5 f5 273#define vb0 f6 274#define vb1 f7 275#define vb2 f8 276#define vb3 f9 277#define vb4 f10 278#define vb5 f11 279#define vd0 f12 280#define vd1 f13 281#define vd2 f14 282#define vd3 f15 283#define vd4 
f16 284#define vd5 f17 285#define u01 f18 286#define u01Ptr r7 287#define sizeof_Mtx 48 288 289 mflr r0 290 stwu r1, -88(r1) 291 stw r0, 92(r1) 292 293 psq_st f14, 8(r1), 0, 0 294 stfd f14, 16(r1) 295 psq_st f15, 24(r1), 0, 0 296 stfd f15, 32(r1) 297 psq_st f16, 40(r1), 0, 0 298 stfd f16, 48(r1) 299 psq_st f17, 56(r1), 0, 0 300 stfd f17, 64(r1) 301 psq_st f18, 72(r1), 0, 0 302 stfd f18, 80(r1) 303 304 lis u01Ptr, Unit01@h 305 ori u01Ptr, u01Ptr, Unit01@l 306 307 // [a00][a01] 308 psq_l va0, 0(a), 0, 0 309 // [a02][a03] 310 psq_l va1, 8(a), 0, 0 311 // [a10][a11] 312 psq_l va2, 16(a), 0, 0 313 // [a12][a13] 314 psq_l va3, 24(a), 0, 0 315 // count-- 316 subi count, count, 1 317 // [a20][a21] 318 psq_l va4, 32(a), 0, 0 319 // [a22][a23] 320 psq_l va5, 40(a), 0, 0 321 // Loop count 322 mtctr count 323 // [0][1] 324 psq_l u01, 0(u01Ptr), 0, 0 325 326 //--------------------------------- 327 // [b00][b01] 328 psq_l vb0, 0(srcBase), 0, 0 329 // [b10][b11] 330 psq_l vb2, 16(srcBase), 0, 0 331 332 // [a00*b00][a00*b01] 333 ps_muls0 vd0, vb0, va0 334 // [a10*b00][a10*b01] 335 ps_muls0 vd2, vb0, va2 336 // [a20*b00][a20*b01] 337 ps_muls0 vd4, vb0, va4 338 339 // [b20][b21] 340 psq_l vb4, 32(srcBase), 0, 0 341 342 // [a00*b00 + a01*b10][a00*b01 + a01*b11] 343 ps_madds1 vd0, vb2, va0, vd0 344 // [a10*b00 + a11*b10][a10*b01 + a11*b11] 345 ps_madds1 vd2, vb2, va2, vd2 346 // [a20*b00 + a21*b10][a20*b01 + a21*b11] 347 ps_madds1 vd4, vb2, va4, vd4 348 349 // [b02][b03] 350 psq_l vb1, 8(srcBase), 0, 0 351 352 // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21] 353 ps_madds0 vd0, vb4, va1, vd0 354 // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21] 355 ps_madds0 vd2, vb4, va3, vd2 356 // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21] 357 ps_madds0 vd4, vb4, va5, vd4 358 359 // [b12][b13] 360 psq_l vb3, 24(srcBase), 0, 0 361 // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21] 362 psq_st vd0, 0(dstBase), 0, 0 363 364 // [a00*b02][a00*b03] 365 
ps_muls0 vd1, vb1, va0 366 // [a10*b02][a10*b03] 367 ps_muls0 vd3, vb1, va2 368 // [a20*b02][a20*b03] 369 ps_muls0 vd5, vb1, va4 370 371 // [b22][b23] 372 psq_l vb5, 40(srcBase), 0, 0 373 // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21] 374 psq_st vd2, 16(dstBase), 0, 0 375 376 // [a00*b02 + a01*b12][a00*b03 + a01*b13] 377 ps_madds1 vd1, vb3, va0, vd1 378 // [a10*b02 + a11*b12][a10*b03 + a11*b13] 379 ps_madds1 vd3, vb3, va2, vd3 380 // [a20*b02 + a21*b12][a20*b03 + a21*b13] 381 ps_madds1 vd5, vb3, va4, vd5 382 383_ASM_MTXConcatArray_loop: 384 385 // ++srcBase 386 addi srcBase, srcBase, sizeof_Mtx 387 388 // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23] 389 ps_madds0 vd1, vb5, va1, vd1 390 // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23] 391 ps_madds0 vd3, vb5, va3, vd3 392 // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23] 393 ps_madds0 vd5, vb5, va5, vd5 394 395 // [b00][b01] 396 psq_l vb0, 0(srcBase), 0, 0 397 // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21] 398 psq_st vd4, 32(dstBase), 0, 0 399 400 // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03] 401 ps_madd vd1, u01, va1, vd1 402 // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13] 403 ps_madd vd3, u01, va3, vd3 404 // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23] 405 ps_madd vd5, u01, va5, vd5 406 407 // [b10][b11] 408 psq_l vb2, 16(srcBase), 0, 0 409 // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03] 410 psq_st vd1, 8(dstBase), 0, 0 411 412 // [a00*b00][a00*b01] 413 ps_muls0 vd0, vb0, va0 414 // [a10*b00][a10*b01] 415 ps_muls0 vd2, vb0, va2 416 // [a20*b00][a20*b01] 417 ps_muls0 vd4, vb0, va4 418 419 // [b20][b21] 420 psq_l vb4, 32(srcBase), 0, 0 421 // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13] 422 psq_st vd3, 24(dstBase), 0, 0 423 424 // [a00*b00 + a01*b10][a00*b01 + a01*b11] 425 ps_madds1 vd0, vb2, va0, vd0 426 // [a10*b00 + a11*b10][a10*b01 + a11*b11] 
427 ps_madds1 vd2, vb2, va2, vd2 428 // [a20*b00 + a21*b10][a20*b01 + a21*b11] 429 ps_madds1 vd4, vb2, va4, vd4 430 431 // [b02][b03] 432 psq_l vb1, 8(srcBase), 0, 0 433 // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23] 434 psq_st vd5, 40(dstBase), 0, 0 435 // ++dstBase 436 addi dstBase, dstBase, sizeof_Mtx 437 438 // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21] 439 ps_madds0 vd0, vb4, va1, vd0 440 // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21] 441 ps_madds0 vd2, vb4, va3, vd2 442 // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21] 443 ps_madds0 vd4, vb4, va5, vd4 444 445 // [b12][b13] 446 psq_l vb3, 24(srcBase), 0, 0 447 // [a00*b00 + a01*b10 + a02*b20][a00*b01 + a01*b11 + a02*b21] 448 psq_st vd0, 0(dstBase), 0, 0 449 450 // [a00*b02][a00*b03] 451 ps_muls0 vd1, vb1, va0 452 // [a10*b02][a10*b03] 453 ps_muls0 vd3, vb1, va2 454 // [a20*b02][a20*b03] 455 ps_muls0 vd5, vb1, va4 456 457 // [b22][b23] 458 psq_l vb5, 40(srcBase), 0, 0 459 // [a10*b00 + a11*b10 + a12*b20][a10*b01 + a11*b11 + a12*b21] 460 psq_st vd2, 16(dstBase), 0, 0 461 462 // [a00*b02 + a01*b12][a00*b03 + a01*b13] 463 ps_madds1 vd1, vb3, va0, vd1 464 // [a10*b02 + a11*b12][a10*b03 + a11*b13] 465 ps_madds1 vd3, vb3, va2, vd3 466 // [a20*b02 + a21*b12][a20*b03 + a21*b13] 467 ps_madds1 vd5, vb3, va4, vd5 468 469 // LOOP 470 bdnz _ASM_MTXConcatArray_loop 471 472 // [a20*b00 + a21*b10 + a22*b20][a20*b01 + a21*b11 + a22*b21] 473 psq_st vd4, 32(dstBase), 0, 0 474 475 // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23] 476 ps_madds0 vd1, vb5, va1, vd1 477 // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23] 478 ps_madds0 vd3, vb5, va3, vd3 479 // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23] 480 ps_madds0 vd5, vb5, va5, vd5 481 482 // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03] 483 ps_madd vd1, u01, va1, vd1 484 // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13] 485 ps_madd vd3, u01, 
va3, vd3 486 // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23] 487 ps_madd vd5, u01, va5, vd5 488 489 // [a00*b02 + a01*b12 + a02*b22][a00*b03 + a01*b13 + a02*b23 + a03] 490 psq_st vd1, 8(dstBase), 0, 0 491 // [a10*b02 + a11*b12 + a12*b22][a10*b03 + a11*b13 + a12*b23 + a13] 492 psq_st vd3, 24(dstBase), 0, 0 493 // [a20*b02 + a21*b12 + a22*b22][a20*b03 + a21*b13 + a22*b23 + a23] 494 psq_st vd5, 40(dstBase), 0, 0 495 496 psq_l f14, 8(r1), 0, 0 497 lfd f14, 16(r1) 498 psq_l f15, 24(r1), 0, 0 499 lfd f15, 32(r1) 500 psq_l f16, 40(r1), 0, 0 501 lfd f16, 48(r1) 502 psq_l f17, 56(r1), 0, 0 503 lfd f17, 64(r1) 504 psq_l f18, 72(r1), 0, 0 505 lfd f18, 80(r1) 506 507 lwz r0, 92(r1) 508 mtlr r0 509 addi r1, r1, 88 510 blr 511 512 .size ASM_MTXConcatArray,$-ASM_MTXConcatArray 513 514#undef a 515#undef srcBase 516#undef dstBase 517#undef count 518#undef va0 519#undef va1 520#undef va2 521#undef va3 522#undef va4 523#undef va5 524#undef vb0 525#undef vb1 526#undef vb2 527#undef vb3 528#undef vb4 529#undef vb5 530#undef vd0 531#undef vd1 532#undef vd2 533#undef vd3 534#undef vd4 535#undef vd5 536#undef u01 537#undef u01Ptr 538 539//////////////////////////////////////////////////////////////////////////////// 540// void ASM_MTXTranspose ( const Mtx src, Mtx xPose ) { 541#define src r3 542#define xPose r4 543#define c_zero fp1 544#define row0a fp2 545#define row1a fp3 546#define row0b fp4 547#define row1b fp5 548#define trns0 fp6 549#define trns1 fp7 550#define trns2 fp8 551 .global ASM_MTXTranspose 552ASM_MTXTranspose: 553 .type ASM_MTXTranspose, @function 554 555 // c_zero = 0.0F; 556 lis r5, CONST_0_0F@h 557 ori r5, r5, CONST_0_0F@l 558 lfs c_zero, 0(r5) 559 psq_l row0a, 0(src), 0, 0 // [0][0], [0][1] 560 stfs c_zero, 44(xPose) // 0 -> [2][3] 561 psq_l row1a, 16(src), 0, 0 // [1][0], [1][1] 562 ps_merge00 trns0, row0a, row1a // [0][0], [1][0] 563 psq_l row0b, 8(src), 1, 0 // [0][2], 1 564 ps_merge11 trns1, row0a, row1a // [0][1], [1][1] 565 psq_l row1b, 
24(src), 1, 0 // [1][2], 1 566 psq_st trns0, 0(xPose), 0, 0 // [0][0], [1][0] -> [0][0], [0][1] 567 psq_l row0a, 32(src), 0, 0 // [2][0], [2][1] 568 ps_merge00 trns2, row0b, row1b // [0][2], [1][2] 569 psq_st trns1, 16(xPose), 0, 0 // [0][1], [1][1] -> [1][0], [1][1] 570 ps_merge00 trns0, row0a, c_zero // [2][0], 0 571 psq_st trns2, 32(xPose), 0, 0 // [0][2], [1][2] -> [2][0], [2][1] 572 ps_merge10 trns1, row0a, c_zero // [2][1], 0 573 psq_st trns0, 8(xPose), 0, 0 // [2][0], 0 -> [0][2], [0][3] 574 lfs row0b, 40(src) // [2][2] 575 psq_st trns1, 24(xPose), 0, 0 // [2][1], 0 -> [1][2], [1][3] 576 stfs row0b, 40(xPose) // [2][2] -> [2][2] 577 578 blr 579 580 .size ASM_MTXTranspose,$-ASM_MTXTranspose 581 582#undef src 583#undef xPose 584#undef c_zero 585#undef row0a 586#undef row1a 587#undef row0b 588#undef row1b 589#undef trns0 590#undef trns1 591#undef trns2 592 593//////////////////////////////////////////////////////////////////////////////// 594// u32 ASM_MTXInverse(const Mtx src, Mtx inv) { 595#define src r3 596#define inv r4 597 .global ASM_MTXInverse 598ASM_MTXInverse: 599 .type ASM_MTXInverse, @function 600 601 // fp0 [ 00 ][ 1.0F ] : Load 602 psq_l fp0, 0( src ), 1, 0 603 // fp1 [ 01 ][ 02 ] : Load 604 psq_l fp1, 4( src ), 0, 0 605 // fp2 [ 10 ][ 1.0F ] : Load 606 psq_l fp2, 16( src ), 1, 0 607 // fp6 [ 02 ][ 00 ] 608 ps_merge10 fp6, fp1, fp0 609 // fp3 [ 11 ][ 12 ] : Load 610 psq_l fp3, 20( src ), 0, 0 611 // fp4 [ 20 ][ 1.0F ] : Load 612 psq_l fp4, 32( src ), 1, 0 613 // fp7 [ 12 ][ 10 ] 614 ps_merge10 fp7, fp3, fp2 615 // fp5 [ 21 ][ 22 ] : Load 616 psq_l fp5, 36( src ), 0, 0 617 // fp11[ 11*02 ][ 00*12 ] 618 ps_mul fp11, fp3, fp6 619 // fp8 [ 22 ][ 20 ] 620 ps_merge10 fp8, fp5, fp4 621 // fp13[ 21*12 ][ 10*22 ] 622 ps_mul fp13, fp5, fp7 623 // fp11[ 01*12 - 11*02 ][ 10*02 - 00*12 ] 624 ps_msub fp11, fp1, fp7, fp11 625 // fp12[ 01*22 ][ 20*02 ] 626 ps_mul fp12, fp1, fp8 627 // fp13[ 11*22 - 21*12 ][ 20*12 - 10*22 ] 628 ps_msub fp13, fp3, fp8, fp13 629 // 
fp10[ 20*11 ][ N/A ] 630 ps_mul fp10, fp3, fp4 631 // fp12[ 21*02 - 01*22 ][ 00*22 - 20*02 ] 632 ps_msub fp12, fp5, fp6, fp12 633 // fp7 [ 00*(11*22-21*12) ][ N/A ] 634 ps_mul fp7, fp0, fp13 635 // fp9 [ 00*21 ][ N/A ] 636 ps_mul fp9, fp0, fp5 637 // fp8 [ 10*01 ][ N/A ] 638 ps_mul fp8, fp1, fp2 639 // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) ][ N/A ] 640 ps_madd fp7, fp2, fp12, fp7 641 // fp6 [ 0.0F ][ 0.0F ] 642 ps_sub fp6, fp6, fp6 643 // fp10[ 10*21 - 20*11 ][ N/A ] 644 ps_msub fp10, fp2, fp5, fp10 645 // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) + 20*(01*12-11*02) ][ N/A ] : det 646 ps_madd fp7, fp4, fp11, fp7 647 // fp9 [ 20*01 - 00*21 ][ N/A ] 648 ps_msub fp9, fp1, fp4, fp9 649 // fp8 [ 00*11 - 10*01 ][ N/A ] 650 ps_msub fp8, fp0, fp3, fp8 651 652 // ( det == 0 ) ? 653 ps_cmpo0 cr0, fp7, fp6 654 bne _ASM_MTXInverse_regular 655 656 // return value (singular) 657 addi r3, 0, 0 658 659 blr 660 661_ASM_MTXInverse_regular: 662 663 // fp0 [ 1/det ][ N/A ] 664 fres fp0, fp7 665 666 // Newton's approximation 667 // Refinement : ( E = est. 
of 1/K ) -> ( E' = ( 2 - K * E ) * E ) 668 ps_add fp6, fp0, fp0 669 ps_mul fp5, fp7, fp0 670 ps_nmsub fp0, fp0, fp5, fp6 671 672 // fp1 [ 03 ][ 03 ] : Load 673 lfs fp1, 12(src) 674 // fp13[ ( 11*22 - 21*12 ) * rdet ][ ( 20*12 - 10*22 ) * rdet ] : i[0][0], i[1][0] 675 ps_muls0 fp13, fp13, fp0 676 // fp2 [ 13 ][ 13 ] : Load 677 lfs fp2, 28(src) 678 // fp12[ ( 21*02 - 01*22 ) * rdet ][ ( 00*22 - 20*02 ) * rdet ] : i[0][1], i[1][1] 679 ps_muls0 fp12, fp12, fp0 680 // fp3 [ 23 ][ 23 ] : Load 681 lfs fp3, 44(src) 682 // fp11[ ( 01*12 - 11*02 ) * rdet ][ ( 10*02 - 00*12 ) * rdet ] : i[0][2], i[1][2] 683 ps_muls0 fp11, fp11, fp0 684 // fp5 [ i00 ][ i01 ] 685 ps_merge00 fp5, fp13, fp12 686 // fp4 [ i10 ][ i11 ] 687 ps_merge11 fp4, fp13, fp12 688 // fp6 [ i00*03 ][ i10*03 ] 689 ps_mul fp6, fp13, fp1 690 // [ i00 ][ i01 ] : Store fp5 -> free(fp5[ i00 ][ i01 ]) 691 psq_st fp5, 0(inv), 0, 0 692 // [ i10 ][ i11 ] : Store fp4 -> free(fp4[ i10 ][ i11 ]) 693 psq_st fp4, 16(inv), 0, 0 694 // fp10[ ( 10*21 - 20*11 ) * rdet ] : i[2][0] 695 ps_muls0 fp10, fp10, fp0 696 // fp9 [ ( 20*01 - 00*21 ) * rdet ] : i[2][1] 697 ps_muls0 fp9, fp9, fp0 698 // fp6 [ i00*03+i01*13 ][ i10*03+i11*13 ] 699 ps_madd fp6, fp12, fp2, fp6 700 // [ i20 ] : Store fp10 701 psq_st fp10, 32(inv), 1, 0 702 // fp8 [ ( 00*11 - 10*01 ) * rdet ] : i[2][2] 703 ps_muls0 fp8, fp8, fp0 704 // fp6 [ -i00*03-i01*13-i02*23 ][ -i10*03-i11*13-i12*23 ] : i[0][3], i[1][3] 705 ps_nmadd fp6, fp11, fp3, fp6 706 // [ i21 ] : Store fp9 707 psq_st fp9, 36(inv), 1, 0 708 // fp7 [ i20*03 ][ N/A ] 709 ps_mul fp7, fp10, fp1 710 // fp5 [ i02 ][ i03 ] 711 ps_merge00 fp5, fp11, fp6 712 // [ i22 ] : Store fp8 713 psq_st fp8, 40(inv), 1, 0 714 // fp7 [ i20*03+i21*13 ][ N/A ] 715 ps_madd fp7, fp9, fp2, fp7 716 // fp4 [ i12 ][ i13 ] 717 ps_merge11 fp4, fp11, fp6 718 // [ i02 ][ i03 ] : Store fp5 719 psq_st fp5, 8(inv), 0, 0 720 // fp7 [ -i20*03-i21*13-i22*23 ][ N/A ] : i[2][3] 721 ps_nmadd fp7, fp8, fp3, fp7 722 // [ i12 ][ i13 ] : Store fp4 
723 psq_st fp4, 24(inv), 0, 0 724 // [ i23 ] : Store fp7 725 psq_st fp7, 44(inv), 1, 0 726 727 // return value (regular) 728 addi r3, 0, 1 729 730 blr 731 .size ASM_MTXInverse,$-ASM_MTXInverse 732 733#undef src 734#undef inv 735 736 737 738//////////////////////////////////////////////////////////////////////////////// 739// u32 ASM_MTXInvXpose(const Mtx src, Mtx invX) 740 .global ASM_MTXInvXpose 741#define src r3 742#define invX r4 743ASM_MTXInvXpose: 744 .type ASM_MTXInvXpose, @function 745 746 // fp0 [ 00 ][ 1.0F ] : Load 747 psq_l fp0, 0( src ), 1, 0 748 // fp1 [ 01 ][ 02 ] : Load 749 psq_l fp1, 4( src ), 0, 0 750 // fp2 [ 10 ][ 1.0F ] : Load 751 psq_l fp2, 16( src ), 1, 0 752 // fp6 [ 02 ][ 00 ] 753 ps_merge10 fp6, fp1, fp0 754 // fp3 [ 11 ][ 12 ] : Load 755 psq_l fp3, 20( src ), 0, 0 756 // fp4 [ 20 ][ 1.0F ] : Load 757 psq_l fp4, 32( src ), 1, 0 758 // fp7 [ 12 ][ 10 ] 759 ps_merge10 fp7, fp3, fp2 760 // fp5 [ 21 ][ 22 ] : Load 761 psq_l fp5, 36( src ), 0, 0 762 // fp11[ 11*02 ][ 00*12 ] 763 ps_mul fp11, fp3, fp6 764 // fp8 [ 22 ][ 20 ] 765 ps_merge10 fp8, fp5, fp4 766 // fp13[ 21*12 ][ 10*22 ] 767 ps_mul fp13, fp5, fp7 768 // fp11[ 01*12 - 11*02 ][ 10*02 - 00*12 ] 769 ps_msub fp11, fp1, fp7, fp11 770 // fp12[ 01*22 ][ 20*02 ] 771 ps_mul fp12, fp1, fp8 772 // fp13[ 11*22 - 21*12 ][ 20*12 - 10*22 ] 773 ps_msub fp13, fp3, fp8, fp13 774 // fp10[ 20*11 ][ N/A ] 775 ps_mul fp10, fp3, fp4 776 // fp12[ 21*02 - 01*22 ][ 00*22 - 20*02 ] 777 ps_msub fp12, fp5, fp6, fp12 778 // fp7 [ 00*(11*22-21*12) ][ N/A ] 779 ps_mul fp7, fp0, fp13 780 // fp9 [ 00*21 ][ N/A ] 781 ps_mul fp9, fp0, fp5 782 // fp8 [ 10*01 ][ N/A ] 783 ps_mul fp8, fp1, fp2 784 // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) ][ N/A ] 785 ps_madd fp7, fp2, fp12, fp7 786 // fp6 [ 0.0F ][ 0.0F ] 787 ps_sub fp6, fp6, fp6 788 // fp10[ 10*21 - 20*11 ][ N/A ] 789 ps_msub fp10, fp2, fp5, fp10 790 // fp7 [ 00*(11*22-21*12) + 10*(21*02-01*22) + 20*(01*12-11*02) ][ N/A ] : det 791 ps_madd fp7, fp4, fp11, fp7 792 // 
fp9 [ 20*01 - 00*21 ][ N/A ] 793 ps_msub fp9, fp1, fp4, fp9 794 // fp8 [ 00*11 - 10*01 ][ N/A ] 795 ps_msub fp8, fp0, fp3, fp8 796 797 // ( det == 0 ) ? 798 ps_cmpo0 cr0, fp7, fp6 799 //bne _regular 800 bne _ASM_MTXInvXpose_regular 801 802 // return value (singular) 803 addi r3, 0, 0 804 805 blr 806 807_ASM_MTXInvXpose_regular: 808 809 // fp0 [ 1/det ][ N/A ] 810 fres fp0, fp7 811 812 psq_st fp6, 12(invX),1, 0 813 814 // Newton's approximation 815 // Refinement : ( E = est. of 1/K ) -> ( E' = ( 2 - K * E ) * E ) 816 ps_add fp4, fp0, fp0 817 ps_mul fp5, fp7, fp0 818 psq_st fp6, 28(invX),1, 0 819 ps_nmsub fp0, fp0, fp5, fp4 820 psq_st fp6, 44(invX),1, 0 821 822 // fp13[ ( 11*22 - 21*12 ) * rdet ][ ( 20*12 - 10*22 ) * rdet ] : ix[0][0], ix[0][1] 823 ps_muls0 fp13, fp13, fp0 824 // fp12[ ( 21*02 - 01*22 ) * rdet ][ ( 00*22 - 20*02 ) * rdet ] : ix[1][0], ix[1][1] 825 ps_muls0 fp12, fp12, fp0 826 // [ ix00 ][ ix01 ] : Store fp13 827 psq_st fp13, 0( invX ), 0, 0 828 // fp11[ ( 01*12 - 11*02 ) * rdet ][ ( 10*02 - 00*12 ) * rdet ] : ix[2][0], ix[2][1] 829 ps_muls0 fp11, fp11, fp0 830 // [ ix10 ][ ix11 ] : Store fp12 831 psq_st fp12, 16( invX ), 0, 0 832 // fp10[ ( 10*21 - 20*11 ) * rdet ] : i[0][2] 833 ps_muls0 fp10, fp10, fp0 834 // [ ix20 ][ ix21 ] : Store fp11 835 psq_st fp11, 32( invX ), 0, 0 836 // fp9 [ ( 20*01 - 00*21 ) * rdet ] : i[1][2] 837 ps_muls0 fp9, fp9, fp0 838 // [ ix02 ] : Store fp10 839 psq_st fp10, 8( invX ), 1, 0 840 // fp8 [ ( 00*11 - 10*01 ) * rdet ] : i[2][2] 841 ps_muls0 fp8, fp8, fp0 842 // [ ix12 ] : Store fp9 843 psq_st fp9, 24( invX ), 1, 0 844 // [ ix22 ] : Store fp8 845 psq_st fp8, 40( invX ), 1, 0 846 847 // return value (regular) 848 addi r3, 0, 1 849 850 blr 851 .size ASM_MTXInvXpose,$-ASM_MTXInvXpose 852 853#undef src 854#undef invX 855 856 857 858//////////////////////////////////////////////////////////////////////////////// 859// void ASM_MTXReflect(Mtx m, const Vec *p, const Vec *n) 860#define m r3 861#define p r4 862#define n r5 863 
.global ASM_MTXReflect 864ASM_MTXReflect: 865 .type ASM_MTXReflect, @function 866#define c_one fp1 867#define vn_xy fp2 868#define vn_z1 fp3 869#define n2vn_xy fp4 870#define n2vn_z1 fp5 871#define pdotn fp6 872#define tmp0 fp7 873#define tmp1 fp8 874#define tmp2 fp9 875#define tmp3 fp10 876#define tmp4 fp11 877#define tmp5 fp12 878#define tmp6 fp13 879#define tmp7 fp0 880 881 // c_one = 1.0F 882 lis r6, CONST_1_0F@h 883 ori r6, r6, CONST_1_0F@l 884 lfs c_one, 0(r6) 885 886 // vn_z1 = [nz][1.0F] : LOAD 887 psq_l vn_z1, 8(n), 1, 0 888 // vn_xy = [nx][ny] : LOAD 889 psq_l vn_xy, 0(n), 0, 0 890 891 // tmp0 = [px][py] : LOAD 892 psq_l tmp0, 0(p), 0, 0 893 // n2vn_z1 = [-2nz][-2.0F] 894 ps_nmadd n2vn_z1, vn_z1, c_one, vn_z1 895 // tmp1 = [pz][1.0F] : LOAD 896 psq_l tmp1, 8(p), 1, 0 897 // n2vn_xy = [-2nx][-2ny] 898 ps_nmadd n2vn_xy, vn_xy, c_one, vn_xy 899 900 // tmp4 = [-2nx*nz][-2ny*nz] : [m20][m21] 901 ps_muls0 tmp4, vn_xy, n2vn_z1 902 // pdotn = [-2(px*nx)][-2(py*ny)] 903 ps_mul pdotn, n2vn_xy, tmp0 904 // tmp2 = [-2nx*nx][-2nx*ny] 905 ps_muls0 tmp2, vn_xy, n2vn_xy 906 // pdotn = [-2(px*nx+py*ny)][?] 907 ps_sum0 pdotn, pdotn, pdotn, pdotn 908 // tmp3 = [-2nx*ny][-2ny*ny] 909 ps_muls1 tmp3, vn_xy, n2vn_xy 910 // tmp4 = [m20][m21] : STORE 911 psq_st tmp4, 32(m), 0, 0 912 // tmp2 = [1-2nx*nx][-2nx*ny] : [m00][m01] 913 ps_sum0 tmp2, tmp2, tmp2, c_one 914 // pdotn = [2(px*nx+py*ny+pz*nz)][?] 
915 ps_nmadd pdotn, n2vn_z1, tmp1, pdotn 916 // tmp3 = [-2nx*ny][1-2ny*ny] : [m10][m11] 917 ps_sum1 tmp3, c_one, tmp3, tmp3 918 // tmp2 = [m00][m01] : STORE 919 psq_st tmp2, 0(m), 0, 0 920 // tmp5 = [pdotn*nx][pdotn*ny] 921 ps_muls0 tmp5, vn_xy, pdotn 922 // tmp6 = [-2nz][pdotn] 923 ps_merge00 tmp6, n2vn_z1, pdotn 924 // tmp3 = [m10][m11] : STORE 925 psq_st tmp3, 16(m), 0, 0 926 927 // tmp7 = [-2nx*nz][pdotn*nx] : [m02][m03] 928 ps_merge00 tmp7, tmp4, tmp5 929 // tmp6 = [-2nz*nz][pdotn*nz] 930 ps_muls0 tmp6, tmp6, vn_z1 931 // tmp5 = [-2ny*nz][pdotn*ny] : [m12][m13] 932 ps_merge11 tmp5, tmp4, tmp5 933 // tmp7 = [m02][m03] : STORE 934 psq_st tmp7, 8(m), 0, 0 935 // tmp6 = [1-2nz*nz][pdotn*nz] : [m22][m23] 936 ps_sum0 tmp6, tmp6, tmp6, c_one 937 // tmp5 = [m12][m13] : STORE 938 psq_st tmp5, 24(m), 0, 0 939 // tmp6 = [m22][m23] : STORE 940 psq_st tmp6, 40(m), 0, 0 941 942 blr 943 .size ASM_MTXReflect,$-ASM_MTXReflect 944 945#undef m 946#undef p 947#undef n 948#undef c_one 949#undef vn_xy 950#undef vn_z1 951#undef n2vn_xy 952#undef n2vn_z1 953#undef pdotn 954#undef tmp0 955#undef tmp1 956#undef tmp2 957#undef tmp3 958#undef tmp4 959#undef tmp5 960#undef tmp6 961#undef tmp7 962 963 964 965//////////////////////////////////////////////////////////////////////////////// 966// void ASM_MTXScaleApply (const Mtx src, Mtx dst, f32 xS, f32 yS, f32 zS) 967#define src r3 968#define dst r4 969#define xS fp1 970#define yS fp2 971#define zS fp3 972 .global ASM_MTXScaleApply 973ASM_MTXScaleApply: 974 .type ASM_MTXScaleApply, @function 975 frsp xS, xS // to make sure xS = single precision 976 psq_l fp4, 0(src), 0, 0 977 frsp yS, yS // to make sure yS = single precision 978 psq_l fp5, 8(src), 0, 0 979 frsp zS, zS // to make sure zS = single precision 980 ps_muls0 fp4, fp4, xS 981 psq_l fp6, 16(src), 0, 0 982 ps_muls0 fp5, fp5, xS 983 psq_l fp7, 24(src), 0, 0 984 ps_muls0 fp6, fp6, yS 985 psq_l fp8, 32(src), 0, 0 986 psq_st fp4, 0(dst), 0, 0 987 ps_muls0 fp7, fp7, yS 988 psq_l fp2, 
40(src), 0, 0 989 psq_st fp5, 8(dst), 0, 0 990 ps_muls0 fp8, fp8, zS 991 psq_st fp6, 16(dst), 0, 0 992 ps_muls0 fp2, fp2, zS 993 psq_st fp7, 24(dst), 0, 0 994 psq_st fp8, 32(dst), 0, 0 995 psq_st fp2, 40(dst), 0, 0 996 blr 997 .size ASM_MTXScaleApply,$-ASM_MTXScaleApply 998#undef src 999#undef dst 1000#undef xS 1001#undef yS 1002#undef zS 1003 1004 1005 1006//////////////////////////////////////////////////////////////////////////////// 1007// void _ASM_MTXRotAxisRadInternal(Mtx m, const Vec *axis, f32 sT, f32 cT) 1008 .global _ASM_MTXRotAxisRadInternal 1009#define m r3 1010#define axis r4 1011#define sT fp1 1012#define cT fp2 1013_ASM_MTXRotAxisRadInternal: 1014 .type _ASM_MTXRotAxisRadInternal, @function 1015#define tT fp3 1016#define fc0 fp4 1017#define tmp0 fp5 1018#define tmp1 fp6 1019#define tmp2 fp7 1020#define tmp3 fp8 1021#define tmp4 fp9 1022#define tmp5 fp10 1023#define tmp6 fp11 1024#define tmp7 fp12 1025#define tmp8 fp13 1026#define tmp9 fp14 1027 1028 mflr r0 1029 stwu r1, -24(r1) 1030 stw r0, 28(r1) 1031 1032 psq_st fp14, 8(r1), 0, 0 1033 stfd fp14, 16(r1) 1034 1035 // tmp8 = 3.0F; 1036 lis r6, CONST_3_0F@h 1037 ori r6, r6, CONST_3_0F@l 1038 lfs tmp8, 0(r6) 1039 1040 // tmp9 = 0.5F; 1041 lis r5, CONST_0_5F@h 1042 ori r5, r5, CONST_0_5F@l 1043 lfs tmp9, 0(r5) 1044 1045 // to make sure cT = (single precision float value) 1046 frsp cT, cT 1047 // tmp0 = [x][y] : LOAD 1048 psq_l tmp0, 0(axis), 0, 0 1049 // to make sure sT = (single precision float value) 1050 frsp sT, sT 1051 // tmp1 = [z][z] : LOAD 1052 lfs tmp1, 8(axis) 1053 1054 // tmp2 = [x*x][y*y] 1055 ps_mul tmp2, tmp0, tmp0 1056 // tmp7 = [1.0F] 1057 fadds tmp7, tmp9, tmp9 1058 // tmp3 = [x*x+z*z][y*y+z*z] 1059 ps_madd tmp3, tmp1, tmp1, tmp2 1060 // fc0 = [0.0F] 1061 fsubs fc0, tmp9, tmp9 1062 // tmp4 = [S = x*x+y*y+z*z][z] 1063 ps_sum0 tmp4, tmp3, tmp1, tmp2 1064 1065 // tT = 1.0F - cT 1066 fsubs tT, tmp7, cT 1067 1068 // tmp5 = [1.0/sqrt(S)] :estimation[E] 1069 frsqrte tmp5, tmp4 1070 // 
Newton-Rapson refinement step 1071 // E' = E/2(3.0 - E*E*S) 1072 fmuls tmp2, tmp5, tmp5 // E*E 1073 fmuls tmp3, tmp5, tmp9 // E/2 1074 fnmsubs tmp2, tmp2, tmp4, tmp8 // (3-E*E*S) 1075 fmuls tmp5, tmp2, tmp3 // (E/2)(3-E*E*S) 1076 1077 // cT = [c][c] 1078 ps_merge00 cT, cT, cT 1079 1080 // tmp0 = [nx = x/sqrt(S)][ny = y/sqrt(S)] 1081 ps_muls0 tmp0, tmp0, tmp5 1082 // tmp1 = [nz = z/sqrt(S)][nz = z/sqrt(S)] 1083 ps_muls0 tmp1, tmp1, tmp5 1084 1085 // tmp4 = [t*nx][t*ny] 1086 ps_muls0 tmp4, tmp0, tT 1087 // tmp9 = [s*nx][s*ny] 1088 ps_muls0 tmp9, tmp0, sT 1089 // tmp5 = [t*nz][t*nz] 1090 ps_muls0 tmp5, tmp1, tT 1091 1092 // tmp3 = [t*nx*ny][t*ny*ny] 1093 ps_muls1 tmp3, tmp4, tmp0 1094 // tmp2 = [t*nx*nx][t*ny*nx] 1095 ps_muls0 tmp2, tmp4, tmp0 1096 // tmp4 = [t*nx*nz][t*ny*nz] 1097 ps_muls0 tmp4, tmp4, tmp1 1098 1099 // tmp6 = [t*nx*ny-s*nz][t*nx*ny-s*nz] 1100 fnmsubs tmp6, tmp1, sT, tmp3 1101 // tmp7 = [t*nx*ny+s*nz][t*ny*ny+s*nz] 1102 fmadds tmp7, tmp1, sT, tmp3 1103 1104 // tmp0 = [-s*nx][-s*ny] 1105 ps_neg tmp0, tmp9 1106 // tmp8 = [t*nx*nz+s*ny][0] == [m02][m03] 1107 ps_sum0 tmp8, tmp4, fc0, tmp9 1108 // tmp2 = [t*nx*nx+c][t*nx*ny-s*nz] == [m00][m01] 1109 ps_sum0 tmp2, tmp2, tmp6, cT 1110 // tmp3 = [t*nx*ny+s*nz][t*ny*ny+c] == [m10][m11] 1111 ps_sum1 tmp3, cT, tmp7, tmp3 1112 // tmp6 = [t*ny*nz-s*nx][0] == [m12][m13] 1113 ps_sum0 tmp6, tmp0, fc0 ,tmp4 1114 1115 // tmp8 [m02][m03] : STORE 1116 psq_st tmp8, 8(m), 0, 0 1117 // tmp0 = [t*nx*nz-s*ny][t*ny*nz] 1118 ps_sum0 tmp0, tmp4, tmp4, tmp0 1119 // tmp2 [m00][m01] : STORE 1120 psq_st tmp2, 0(m), 0, 0 1121 // tmp5 = [t*nz*nz][t*nz*nz] 1122 ps_muls0 tmp5, tmp5, tmp1 1123 // tmp3 [m10][m11] : STORE 1124 psq_st tmp3, 16(m), 0, 0 1125 // tmp4 = [t*nx*nz-s*ny][t*ny*nz+s*nx] == [m20][m21] 1126 ps_sum1 tmp4, tmp9, tmp0, tmp4 1127 // tmp6 [m12][m13] : STORE 1128 psq_st tmp6, 24(m), 0, 0 1129 // tmp5 = [t*nz*nz+c][0] == [m22][m23] 1130 ps_sum0 tmp5, tmp5, fc0, cT 1131 // tmp4 [m20][m21] : STORE 1132 psq_st tmp4, 32(m), 0, 0 
// Tail of _ASM_MTXRotAxisRadInternal (the routine begins before this point).
    // tmp5 [m22][m23] : STORE
    psq_st      tmp5, 40(m), 0, 0

    // Reload fp14 from the stack slots at 8(r1) and 16(r1)
    psq_l       fp14, 8(r1), 0, 0
    lfd         fp14, 16(r1)

    // Reload LR from 28(r1), then release the 24 bytes of stack
    lwz         r0, 28(r1)
    mtlr        r0
    addi        r1, r1, 24

    blr
    .size _ASM_MTXRotAxisRadInternal,$-_ASM_MTXRotAxisRadInternal
#undef m
#undef axis
#undef sT
#undef cT
#undef tT
#undef fc0
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef tmp5
#undef tmp6
#undef tmp7
#undef tmp8
#undef tmp9



////////////////////////////////////////////////////////////////////////////////

// void ASM_MTXTrans(Mtx m, f32 xT, f32 yT, f32 zT)
//
// Overwrites all 12 elements of m with a translation matrix:
//
//      | 1  0  0  xT |
//      | 0  1  0  yT |
//      | 0  0  1  zT |
//
// In:      r3 = m, fp1 = xT, fp2 = yT, fp3 = zT
// Scratch: r4, r5, fp4, fp5
// Element m[r][c] lives at byte offset (r*4 + c)*4 from m.
#define m       r3
#define xT      fp1
#define yT      fp2
#define zT      fp3
#define c_zero  fp4
#define c_one   fp5
    .global ASM_MTXTrans
ASM_MTXTrans:
    .type ASM_MTXTrans, @function

    // c_zero = 0.0F;
    lis         r4, CONST_0_0F@h
    ori         r4, r4, CONST_0_0F@l
    lfs         c_zero, 0(r4)

    // c_one = 1.0F;
    lis         r5, CONST_1_0F@h
    ori         r5, r5, CONST_1_0F@l
    lfs         c_one, 0(r5)

    // The paired stores write both halves of c_zero; the code relies on
    // both halves holding 0.0F after the lfs above.
    stfs        xT,     12(m)           // m[0][3] = xT
    stfs        yT,     28(m)           // m[1][3] = yT
    psq_st      c_zero,  4(m), 0, 0     // m[0][1], m[0][2] = 0
    psq_st      c_zero, 32(m), 0, 0     // m[2][0], m[2][1] = 0
    stfs        c_zero, 16(m)           // m[1][0] = 0
    stfs        c_one,  20(m)           // m[1][1] = 1
    stfs        c_zero, 24(m)           // m[1][2] = 0
    stfs        c_one,  40(m)           // m[2][2] = 1
    stfs        zT,     44(m)           // m[2][3] = zT
    stfs        c_one,   0(m)           // m[0][0] = 1

    blr
    .size ASM_MTXTrans,$-ASM_MTXTrans
#undef m
#undef xT
#undef yT
#undef zT
#undef c_zero
#undef c_one


////////////////////////////////////////////////////////////////////////////////

// void ASM_MTXTransApply(const Mtx src, Mtx dst, f32 xT, f32 yT, f32 zT )
//
// Copies src to dst while adding (xT, yT, zT) to the fourth column:
//      dst[0][3] = src[0][3] + xT
//      dst[1][3] = src[1][3] + yT
//      dst[2][3] = src[2][3] + zT
// All other elements are copied unchanged.
//
// In:      r3 = src, r4 = dst, fp1 = xT, fp2 = yT, fp3 = zT
// Scratch: fp1-fp3 (rounded in place), fp4-fp9
// Every pair is loaded from src before the same offset is stored to dst,
// so calling with src == dst is safe.
#define src     r3
#define dst     r4
#define xT      fp1
#define yT      fp2
#define zT      fp3
    .global ASM_MTXTransApply
ASM_MTXTransApply:
    .type ASM_MTXTransApply, @function

    psq_l       fp4,  0(src), 0, 0      // [m00][m01]
    frsp        xT, xT                  // to make sure xT = single precision
    psq_l       fp5,  8(src), 0, 0      // [m02][m03]
    frsp        yT, yT                  // to make sure yT = single precision
    psq_l       fp7, 24(src), 0, 0      // [m12][m13]
    frsp        zT, zT                  // to make sure zT = single precision
    psq_l       fp8, 40(src), 0, 0      // [m22][m23]
    psq_st      fp4,  0(dst), 0, 0
    // fp5 = [m02][xT + m03]
    ps_sum1     fp5, xT, fp5, fp5
    psq_l       fp6, 16(src), 0, 0      // [m10][m11]
    psq_st      fp5,  8(dst), 0, 0
    // fp7 = [m12][yT + m13]
    ps_sum1     fp7, yT, fp7, fp7
    psq_l       fp9, 32(src), 0, 0      // [m20][m21]
    psq_st      fp6, 16(dst), 0, 0
    // fp8 = [m22][zT + m23]
    ps_sum1     fp8, zT, fp8, fp8
    psq_st      fp7, 24(dst), 0, 0
    psq_st      fp9, 32(dst), 0, 0
    psq_st      fp8, 40(dst), 0, 0

    blr
    .size ASM_MTXTransApply,$-ASM_MTXTransApply
#undef src
#undef dst
#undef xT
#undef yT
#undef zT

////////////////////////////////////////////////////////////////////////////////

// void ASM_MTXScale(Mtx m, f32 xS, f32 yS, f32 zS)
//
// Overwrites all 12 elements of m with a scale matrix:
//
//      | xS  0   0   0 |
//      | 0   yS  0   0 |
//      | 0   0   zS  0 |
//
// In:      r3 = m, fp1 = xS, fp2 = yS, fp3 = zS
// Scratch: r4, fp4
#define m       r3
#define xS      fp1
#define yS      fp2
#define zS      fp3
#define c_zero  fp4
    .global ASM_MTXScale
ASM_MTXScale:
    .type ASM_MTXScale, @function

    // c_zero = 0.0F;
    lis         r4, CONST_0_0F@h
    ori         r4, r4, CONST_0_0F@l
    lfs         c_zero, 0(r4)

    // The paired stores write both halves of c_zero; the code relies on
    // both halves holding 0.0F after the lfs above.
    stfs        xS,      0(m)           // m[0][0] = xS
    psq_st      c_zero,  4(m), 0, 0     // m[0][1], m[0][2] = 0
    psq_st      c_zero, 12(m), 0, 0     // m[0][3], m[1][0] = 0
    stfs        yS,     20(m)           // m[1][1] = yS
    psq_st      c_zero, 24(m), 0, 0     // m[1][2], m[1][3] = 0
    psq_st      c_zero, 32(m), 0, 0     // m[2][0], m[2][1] = 0
    stfs        zS,     40(m)           // m[2][2] = zS
    stfs        c_zero, 44(m)           // m[2][3] = 0

    blr
    .size ASM_MTXScale,$-ASM_MTXScale
#undef m
#undef xS
#undef yS
#undef zS
// Release the register alias as well, so it cannot leak into later routines
// (the other routines in this file that define c_zero also undefine it).
#undef c_zero


////////////////////////////////////////////////////////////////////////////////

// void ASM_MTXRotTrig(Mtx m, char axis, f32 sinA, f32 cosA);
//
// Overwrites m with a rotation about one principal axis, given the sine
// and cosine of the angle. With s = sinA and c = cosA:
//
//   axis 'x'/'X'        axis 'y'/'Y'        axis 'z'/'Z'
//   | 1  0  0  0 |      |  c  0  s  0 |     | c -s  0  0 |
//   | 0  c -s  0 |      |  0  1  0  0 |     | s  c  0  0 |
//   | 0  s  c  0 |      | -s  0  c  0 |     | 0  0  1  0 |
//
// Any other axis value falls through to the return and leaves m untouched.
//
// In:      r3 = m, r4 = axis, fp1 = sinA, fp2 = cosA
// Scratch: r4 (lower-cased in place), r5, r6, fp1-fp9
#define m       r3
#define axis    r4
#define sinA    fp1
#define cosA    fp2
#define fc0     fp3
#define fc1     fp4
#define nsinA   fp5
#define fw0     fp6
#define fw1     fp7
#define fw2     fp8
#define fw3     fp9

    .global ASM_MTXRotTrig
ASM_MTXRotTrig:
    .type ASM_MTXRotTrig, @function

    frsp        sinA, sinA              // to make sure sinA = single precision
    frsp        cosA, cosA              // to make sure cosA = single precision

    // fc0 = 0.0F;
    lis         r5, CONST_0_0F@h
    ori         r5, r5, CONST_0_0F@l
    lfs         fc0, 0(r5)

    // fc1 = 1.0F;
    lis         r6, CONST_1_0F@h
    ori         r6, r6, CONST_1_0F@l
    lfs         fc1, 0(r6)

    // always lower case: setting bit 0x20 maps 'X'/'Y'/'Z' onto 'x'/'y'/'z'
    ori         axis, axis, 0x20
    ps_neg      nsinA, sinA             // nsinA = -sinA

    // branches
    cmplwi      axis, 'x'
    beq         _case_x
    cmplwi      axis, 'y'
    beq         _case_y
    cmplwi      axis, 'z'
    beq         _case_z
    b           _end                    // unknown axis: write nothing

_case_x:
    psq_st      fc1,  0(m), 1, 0        // m[0][0] = 1 (W=1: one element only)
    psq_st      fc0,  4(m), 0, 0        // m[0][1], m[0][2] = 0
    ps_merge00  fw0, sinA, cosA         // fw0 = [s][c]
    psq_st      fc0, 12(m), 0, 0        // m[0][3], m[1][0] = 0
    ps_merge00  fw1, cosA, nsinA        // fw1 = [c][-s]
    psq_st      fc0, 28(m), 0, 0        // m[1][3], m[2][0] = 0
    psq_st      fc0, 44(m), 1, 0        // m[2][3] = 0 (W=1: one element only)
    psq_st      fw0, 36(m), 0, 0        // m[2][1] = s, m[2][2] = c
    psq_st      fw1, 20(m), 0, 0        // m[1][1] = c, m[1][2] = -s
    b           _end

_case_y:
    ps_merge00  fw0, cosA, fc0          // fw0 = [c][0]
    ps_merge00  fw1, fc0, fc1           // fw1 = [0][1]
    psq_st      fc0, 24(m), 0, 0        // m[1][2], m[1][3] = 0
    psq_st      fw0,  0(m), 0, 0        // m[0][0] = c, m[0][1] = 0
    ps_merge00  fw2, nsinA, fc0         // fw2 = [-s][0]
    ps_merge00  fw3, sinA, fc0          // fw3 = [s][0]
    psq_st      fw0, 40(m), 0, 0        // m[2][2] = c, m[2][3] = 0
    psq_st      fw1, 16(m), 0, 0        // m[1][0] = 0, m[1][1] = 1
    psq_st      fw3,  8(m), 0, 0        // m[0][2] = s, m[0][3] = 0
    psq_st      fw2, 32(m), 0, 0        // m[2][0] = -s, m[2][1] = 0
    b           _end

_case_z:
    psq_st      fc0,  8(m), 0, 0        // m[0][2], m[0][3] = 0
    ps_merge00  fw0, sinA, cosA         // fw0 = [s][c]
    ps_merge00  fw2, cosA, nsinA        // fw2 = [c][-s]
    psq_st      fc0, 24(m), 0, 0        // m[1][2], m[1][3] = 0
    psq_st      fc0, 32(m), 0, 0        // m[2][0], m[2][1] = 0
    ps_merge00  fw1, fc1, fc0           // fw1 = [1][0]
    psq_st      fw0, 16(m), 0, 0        // m[1][0] = s, m[1][1] = c
    psq_st      fw2,  0(m), 0, 0        // m[0][0] = c, m[0][1] = -s
    psq_st      fw1, 40(m), 0, 0        // m[2][2] = 1, m[2][3] = 0

_end:

    blr
    .size ASM_MTXRotTrig,$-ASM_MTXRotTrig
#undef m
#undef axis
#undef sinA
#undef cosA
#undef fc0
#undef fc1
#undef nsinA
#undef fw0
#undef fw1
#undef fw2
#undef fw3

////////////////////////////////////////////////////////////////////////////////

// void ASM_MTXReorder(const Mtx src, ROMtx dest)
//
// Rewrites the 12 elements of src into dest in column order. With sRC
// denoting src[R][C], the 12 floats written to dest are, in memory order:
//
//      s00 s10 s20   s01 s11 s21   s02 s12 s22   s03 s13 s23
//
// In:      r3 = src, r4 = dest
// Scratch: fp1-fp12
// All six source pairs are loaded before the first store to dest.
#define src     r3
#define dest    r4
    .global ASM_MTXReorder
#define S00_S01 fp1
#define S02_S03 fp2
#define S10_S11 fp3
#define S12_S13 fp4
#define S20_S21 fp5
#define S22_S23 fp6
#define D00_D10 fp7
#define D11_D21 fp8
#define D02_D12 fp9
#define D22_D03 fp10
#define D13_D23 fp11
#define D20_D01 fp12

ASM_MTXReorder:
    .type ASM_MTXReorder, @function

    psq_l       S00_S01,  0(src), 0, 0
    psq_l       S10_S11, 16(src), 0, 0
    psq_l       S20_S21, 32(src), 0, 0
    psq_l       S02_S03,  8(src), 0, 0
    // D00_D10 = [s00][s10]
    ps_merge00  D00_D10, S00_S01, S10_S11
    psq_l       S12_S13, 24(src), 0, 0
    // D20_D01 = [s20][s01]
    ps_merge01  D20_D01, S20_S21, S00_S01
    psq_l       S22_S23, 40(src), 0, 0
    // D11_D21 = [s11][s21]
    ps_merge11  D11_D21, S10_S11, S20_S21
    psq_st      D00_D10,  0(dest), 0, 0
    // D02_D12 = [s02][s12]
    ps_merge00  D02_D12, S02_S03, S12_S13
    psq_st      D20_D01,  8(dest), 0, 0
    // D22_D03 = [s22][s03]
    ps_merge01  D22_D03, S22_S23, S02_S03
    psq_st      D11_D21, 16(dest), 0, 0
    // D13_D23 = [s13][s23]
    ps_merge11  D13_D23, S12_S13, S22_S23
    psq_st      D02_D12, 24(dest), 0, 0
    psq_st      D22_D03, 32(dest), 0, 0
    psq_st      D13_D23, 40(dest), 0, 0

    blr
    .size ASM_MTXReorder,$-ASM_MTXReorder
#undef src
#undef dest
#undef S00_S01
#undef S02_S03
#undef S10_S11
#undef S12_S13
#undef S20_S21
#undef S22_S23
#undef D00_D10
#undef D11_D21
#undef D02_D12
#undef D22_D03
#undef D13_D23
#undef D20_D01
