/*---------------------------------------------------------------------------*
  Project:  matrix vector Library
  File:     mtx44_asm.s

  Copyright 1998-2011 Nintendo.  All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law.  They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

 *---------------------------------------------------------------------------*/

// Conventions used throughout this file:
//  - Integer/pointer arguments arrive in r3, r4, r5; float arguments in
//    fp1, fp2, fp3.  Register roles are named with preprocessor defines that
//    are undefined again after each routine.
//  - A 4x4 matrix is addressed as 16 consecutive 4-byte floats, row by row
//    (element [r][c] lives at byte offset 16*r + 4*c).
//  - NOTE(review): every psq_l / psq_st below passes 0 as the GQR index;
//    presumably GQR0 is configured for unquantized floats -- confirm against
//    the platform start-up code.
//  - NOTE(review): several routines rely on lfs / scalar single-precision
//    results being visible in both paired-single slots -- confirm that
//    paired-single mode is always enabled when these are called.

    .data
    .align 2
CONST_0_0F: .float 0.0
CONST_0_5F: .float 0.5
CONST_1_0F: .float 1.0
CONST_3_0F: .float 3.0

    .text

////////////////////////////////////////////////////////////////////////////////
//void ASM_MTX44Identity(Mtx44 m)
//
// Fills the 64 bytes at m with 1.0 on the diagonal (offsets 0, 20, 40, 60)
// and 0.0 everywhere else.
// In:       r3 = m
// Clobbers: r4, r5, fp1, fp2
#define m   r3
#define c1  fp1
#define c0  fp2
    .global ASM_MTX44Identity
ASM_MTX44Identity:
    .type ASM_MTX44Identity, @function

    // c1 = 1.0F;
    lis         r4, CONST_1_0F@h
    ori         r4, r4, CONST_1_0F@l
    lfs         c1, 0(r4)

    // c0 = 0.0F;
    lis         r5, CONST_0_0F@h
    ori         r5, r5, CONST_0_0F@l
    lfs         c0, 0(r5)

    stfs        c1,  0(m)               // m00 <= 1.0
    psq_st      c0,  4(m), 0, 0         // m01,m02 <= 0.0,0.0
    psq_st      c0, 12(m), 0, 0         // m03,m10 <= 0.0,0.0
    stfs        c1, 20(m)               // m11 <= 1.0
    psq_st      c0, 24(m), 0, 0         // m12,m13 <= 0.0,0.0
    psq_st      c0, 32(m), 0, 0         // m20,m21 <= 0.0,0.0
    stfs        c1, 40(m)               // m22 <= 1.0
    psq_st      c0, 44(m), 0, 0         // m23,m30 <= 0.0,0.0
    psq_st      c0, 52(m), 0, 0         // m31,m32 <= 0.0,0.0
    stfs        c1, 60(m)               // m33 <= 1.0

    blr
    .size ASM_MTX44Identity,$-ASM_MTX44Identity

#undef m
#undef c1
#undef c0

////////////////////////////////////////////////////////////////////////////////
//void ASM_MTX44Copy(const Mtx44 src, Mtx44 dst)
//
// Copies 64 bytes from src to dst, one float pair at a time.
// In:       r3 = src, r4 = dst
// Clobbers: fp1
#define src r3
#define dst r4
    .global ASM_MTX44Copy
ASM_MTX44Copy:
    .type ASM_MTX44Copy, @function

    psq_l       fp1,  0(src), 0, 0
    psq_st      fp1,  0(dst), 0, 0
    psq_l       fp1,  8(src), 0, 0
    psq_st      fp1,  8(dst), 0, 0
    psq_l       fp1, 16(src), 0, 0
    psq_st      fp1, 16(dst), 0, 0
    psq_l       fp1, 24(src), 0, 0
    psq_st      fp1, 24(dst), 0, 0
    psq_l       fp1, 32(src), 0, 0
    psq_st      fp1, 32(dst), 0, 0
    psq_l       fp1, 40(src), 0, 0
    psq_st      fp1, 40(dst), 0, 0
    psq_l       fp1, 48(src), 0, 0
    psq_st      fp1, 48(dst), 0, 0
    psq_l       fp1, 56(src), 0, 0
    psq_st      fp1, 56(dst), 0, 0

    blr
    .size ASM_MTX44Copy,$-ASM_MTX44Copy

#undef src
#undef dst


////////////////////////////////////////////////////////////////////////////////
//void ASM_MTX44Transpose(const Mtx44 src, Mtx44 xPose)
//
// Writes the transpose of src to xPose.  Lane comments use source indices
// (sRC = src row R, column C); each merged pair is stored to the xPose row
// whose number equals the source column.
// In program order every source pair is loaded before the store that would
// overwrite it, so src and xPose may be the same matrix.
// In:       r3 = src, r4 = xPose
// Clobbers: fp0-fp5
#define src     r3
#define xPose   r4
    .global ASM_MTX44Transpose
ASM_MTX44Transpose:
    .type ASM_MTX44Transpose, @function

    psq_l       fp0,  0(src), 0, 0      // fp0 <= s00,s01
    psq_l       fp1, 16(src), 0, 0      // fp1 <= s10,s11

    ps_merge00  fp4, fp0, fp1           // fp4 <= s00,s10
    psq_l       fp2,  8(src), 0, 0      // fp2 <= s02,s03
    psq_st      fp4,  0(xPose), 0, 0    // xPose row 0, cols 0-1

    ps_merge11  fp5, fp0, fp1           // fp5 <= s01,s11
    psq_l       fp3, 24(src), 0, 0      // fp3 <= s12,s13
    psq_st      fp5, 16(xPose), 0, 0    // xPose row 1, cols 0-1

    ps_merge00  fp4, fp2, fp3           // fp4 <= s02,s12
    psq_l       fp0, 32(src), 0, 0      // fp0 <= s20,s21
    psq_st      fp4, 32(xPose), 0, 0    // xPose row 2, cols 0-1

    ps_merge11  fp5, fp2, fp3           // fp5 <= s03,s13
    psq_l       fp1, 48(src), 0, 0      // fp1 <= s30,s31
    psq_st      fp5, 48(xPose), 0, 0    // xPose row 3, cols 0-1

    ps_merge00  fp4, fp0, fp1           // fp4 <= s20,s30
    psq_l       fp2, 40(src), 0, 0      // fp2 <= s22,s23
    psq_st      fp4,  8(xPose), 0, 0    // xPose row 0, cols 2-3

    ps_merge11  fp5, fp0, fp1           // fp5 <= s21,s31
    psq_l       fp3, 56(src), 0, 0      // fp3 <= s32,s33
    psq_st      fp5, 24(xPose), 0, 0    // xPose row 1, cols 2-3

    ps_merge00  fp4, fp2, fp3           // fp4 <= s22,s32
    psq_st      fp4, 40(xPose), 0, 0    // xPose row 2, cols 2-3

    ps_merge11  fp5, fp2, fp3           // fp5 <= s23,s33
    psq_st      fp5, 56(xPose), 0, 0    // xPose row 3, cols 2-3

    blr
    .size ASM_MTX44Transpose,$-ASM_MTX44Transpose

#undef src
#undef xPose

////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44Concat(const Mtx44 a, const Mtx44 b, Mtx44 ab)
//
// Computes ab = a * b.  The left column pair of every result row is built
// first (fp6, fp8, fp10, fp12), then the right column pair (fp7, fp9, fp11,
// fp13); stores are interleaved with the second pass.
// In program order all loads of b precede the first store, and each pair of
// a is last loaded before the store to the same offset of ab, so ab may be
// the same matrix as a or b.
// In:       r3 = a, r4 = b, r5 = ab
// Clobbers: fp0-fp13
#define a   r3
#define b   r4
#define ab  r5
    .global ASM_MTX44Concat
ASM_MTX44Concat:
    .type ASM_MTX44Concat, @function

    psq_l       fp0 ,  0(a), 0, 0       // a00,a01
    psq_l       fp2 ,  0(b), 0, 0       // b00,b01
    ps_muls0    fp6 , fp2, fp0          // b00a00,b01a00
    psq_l       fp3 , 16(b), 0, 0       // b10,b11
    psq_l       fp4 , 32(b), 0, 0       // b20,b21
    ps_madds1   fp6 , fp3, fp0, fp6     // b00a00+b10a01,b01a00+b11a01
    psq_l       fp1 ,  8(a), 0, 0       // a02,a03
    psq_l       fp5 , 48(b), 0, 0       // b30,b31

    // b00a00+b10a01+b20a02,b01a00+b11a01+b21a02
    ps_madds0   fp6 , fp4, fp1, fp6
    psq_l       fp0 , 16(a), 0, 0       // a10,a11

    // b00a00+b10a01+b20a02+b30a03,b01a00+b11a01+b21a02+b31a03
    ps_madds1   fp6 , fp5, fp1, fp6
    psq_l       fp1 , 24(a), 0, 0       // a12,a13
    ps_muls0    fp8 , fp2, fp0          // b00a10,b01a10
    ps_madds1   fp8 , fp3, fp0, fp8     // b00a10+b10a11,b01a10+b11a11
    psq_l       fp0 , 32(a), 0, 0       // a20,a21

    // b00a10+b10a11+b20a12,b01a10+b11a11+b21a12
    ps_madds0   fp8 , fp4, fp1, fp8

    // b00a10+b10a11+b20a12+b30a13,b01a10+b11a11+b21a12+b31a13
    ps_madds1   fp8 , fp5, fp1, fp8
    psq_l       fp1 , 40(a), 0, 0       // a22,a23
    ps_muls0    fp10, fp2, fp0          // b00a20,b01a20
    ps_madds1   fp10, fp3, fp0, fp10    // b00a20+b10a21,b01a20+b11a21
    psq_l       fp0 , 48(a), 0, 0       // a30,a31

    // b00a20+b10a21+b20a22,b01a20+b11a21+b21a22
    ps_madds0   fp10, fp4, fp1, fp10

    // b00a20+b10a21+b20a22+b30a23,b01a20+b11a21+b21a22+b31a23
    ps_madds1   fp10, fp5, fp1, fp10
    psq_l       fp1 , 56(a), 0, 0       // a32,a33

    ps_muls0    fp12, fp2, fp0          // b00a30,b01a30
    psq_l       fp2 ,  8(b), 0, 0       // b02,b03
    ps_madds1   fp12, fp3, fp0, fp12    // b00a30+b10a31,b01a30+b11a31
    psq_l       fp0 ,  0(a), 0, 0       // a00,a01

    // b00a30+b10a31+b20a32,b01a30+b11a31+b21a32
    ps_madds0   fp12, fp4, fp1, fp12
    psq_l       fp3 , 24(b), 0, 0       // b12,b13

    // b00a30+b10a31+b20a32+b30a33,b01a30+b11a31+b21a32+b31a33
    ps_madds1   fp12, fp5, fp1, fp12
    psq_l       fp1 ,  8(a), 0, 0       // a02,a03

    ps_muls0    fp7 , fp2, fp0          // b02a00,b03a00
    psq_l       fp4 , 40(b), 0, 0       // b22,b23
    ps_madds1   fp7 , fp3, fp0, fp7     // b02a00+b12a01,b03a00+b13a01
    psq_l       fp5 , 56(b), 0, 0       // b32,b33

    // b02a00+b12a01+b22a02,b03a00+b13a01+b23a02
    ps_madds0   fp7 , fp4, fp1, fp7

    psq_l       fp0 , 16(a), 0, 0       // a10,a11

    // b02a00+b12a01+b22a02+b32a03,b03a00+b13a01+b23a02+b33a03
    ps_madds1   fp7 , fp5, fp1, fp7
    psq_l       fp1 , 24(a), 0, 0       // a12,a13

    ps_muls0    fp9 , fp2, fp0          // b02a10,b03a10
    psq_st      fp6 ,  0(ab), 0, 0      // ab00,ab01
    ps_madds1   fp9 , fp3, fp0, fp9     // b02a10+b12a11,b03a10+b13a11
    psq_l       fp0 , 32(a), 0, 0       // a20,a21

    // b02a10+b12a11+b22a12,b03a10+b13a11+b23a12
    ps_madds0   fp9, fp4, fp1, fp9
    psq_st      fp8 ,16(ab), 0, 0       // ab10,ab11

    // b02a10+b12a11+b22a12+b32a13,b03a10+b13a11+b23a12+b33a13
    ps_madds1   fp9 , fp5, fp1, fp9
    psq_l       fp1 , 40(a), 0, 0       // a22,a23
    ps_muls0    fp11, fp2, fp0          // b02a20,b03a20
    psq_st      fp10,32(ab), 0, 0       // ab20,ab21
    ps_madds1   fp11, fp3, fp0, fp11    // b02a20+b12a21,b03a20+b13a21
    psq_l       fp0 , 48(a), 0, 0       // a30,a31

    // b02a20+b12a21+b22a22,b03a20+b13a21+b23a22
    ps_madds0   fp11, fp4, fp1, fp11
    psq_st      fp12,48(ab), 0, 0       // ab30,ab31

    // b02a20+b12a21+b22a22+b32a23,b03a20+b13a21+b23a22+b33a23
    ps_madds1   fp11, fp5, fp1, fp11

    psq_l       fp1, 56(a), 0, 0        // a32,a33
    ps_muls0    fp13, fp2, fp0          // b02a30,b03a30
    psq_st      fp7 , 8(ab), 0, 0       // ab02,ab03
    ps_madds1   fp13, fp3, fp0, fp13    // b02a30+b12a31,b03a30+b13a31
    psq_st      fp9 ,24(ab), 0, 0       // ab12,ab13

    // b02a30+b12a31+b22a32,b03a30+b13a31+b23a32
    ps_madds0   fp13, fp4, fp1, fp13
    psq_st      fp11,40(ab), 0, 0       // ab22,ab23

    // b02a30+b12a31+b22a32+b32a33,b03a30+b13a31+b23a32+b33a33
    ps_madds1   fp13, fp5, fp1, fp13

    psq_st      fp13,56(ab), 0, 0       // ab32,ab33
    blr
    .size ASM_MTX44Concat,$-ASM_MTX44Concat

#undef a
#undef b
#undef ab



////////////////////////////////////////////////////////////////////////////////
// void _ASM_MTX44RotAxisRadInternal(Mtx44 m, const Vec *axis, f32 sT, f32 cT)
//
// Builds a rotation matrix about an arbitrary axis from a precomputed sine
// (sT) and cosine (cT).  The axis is normalized here with a reciprocal
// square-root estimate refined by one Newton-Raphson step; there is no
// special handling for a zero-length axis.
// Row 3 is written as 0,0,0,1 and column 3 of rows 0-2 as 0.
// In:       r3 = m, r4 = axis (three floats x,y,z), fp1 = sT, fp2 = cT
// Clobbers: r5, fp0-fp13
    .global _ASM_MTX44RotAxisRadInternal
#define m       r3
#define axis    r4
#define sT      fp1
#define cT      fp2
_ASM_MTX44RotAxisRadInternal:
    .type _ASM_MTX44RotAxisRadInternal, @function

#define tT      fp3
#define fc0     fp4
#define tmp0    fp5
#define tmp1    fp6
#define tmp2    fp7
#define tmp3    fp8
#define tmp4    fp9
#define tmp5    fp10
#define tmp6    fp11
#define tmp7    fp12
#define tmp8    fp13
#define tmp9    fp0

    // tmp9 = 0.5F;
    lis         r5, CONST_0_5F@h
    ori         r5, r5, CONST_0_5F@l
    lfs         tmp9, 0(r5)

    // tmp8 = 3.0F;
    lis         r5, CONST_3_0F@h
    ori         r5, r5, CONST_3_0F@l
    lfs         tmp8, 0(r5)

    // to make sure cT = (single precision float value)
    frsp        cT, cT
    // tmp0 = [x][y] : LOAD
    psq_l       tmp0, 0(axis), 0, 0
    // to make sure sT = (single precision float value)
    frsp        sT, sT
    // tmp1 = [z][z] : LOAD
    lfs         tmp1, 8(axis)

    // tmp2 = [x*x][y*y]
    ps_mul      tmp2, tmp0, tmp0
    // tmp7 = [1.0F]  (0.5 + 0.5)
    fadds       tmp7, tmp9, tmp9
    // tmp3 = [x*x+z*z][y*y+z*z]
    ps_madd     tmp3, tmp1, tmp1, tmp2
    // fc0 = [0.0F]  (0.5 - 0.5)
    fsubs       fc0, tmp9, tmp9
    // tmp4 = [S = x*x+y*y+z*z][z]
    ps_sum0     tmp4, tmp3, tmp1, tmp2

    // tT = 1.0F - cT
    fsubs       tT, tmp7, cT

    // tmp5 = [1.0/sqrt(S)] :estimation[E]
    frsqrte     tmp5, tmp4
    // tmp7 = [0][1]
    ps_merge00  tmp7, fc0, tmp7
    // Newton-Raphson refinement step
    // E_next = (E/2)(3.0 - E*E*S)
    fmuls       tmp2, tmp5, tmp5            // E*E
    fmuls       tmp3, tmp5, tmp9            // E/2
    // fc0 [m30=0][m31=0] : STORE
    psq_st      fc0, 48(m), 0, 0
    fnmsubs     tmp2, tmp2, tmp4, tmp8      // (3-E*E*S)
    fmuls       tmp5, tmp2, tmp3            // (E/2)(3-E*E*S)
    // tmp7 [m32=0][m33=1] : STORE
    psq_st      tmp7, 56(m), 0, 0
    // cT = [c][c]
    ps_merge00  cT, cT, cT

    // tmp0 = [nx = x/sqrt(S)][ny = y/sqrt(S)]
    ps_muls0    tmp0, tmp0, tmp5
    // tmp1 = [nz = z/sqrt(S)][nz = z/sqrt(S)]
    ps_muls0    tmp1, tmp1, tmp5
    // tmp4 = [t*nx][t*ny]
    ps_muls0    tmp4, tmp0, tT
    // tmp9 = [s*nx][s*ny]
    ps_muls0    tmp9, tmp0, sT
    // tmp5 = [t*nz][t*nz]
    ps_muls0    tmp5, tmp1, tT
    // tmp3 = [t*nx*ny][t*ny*ny]
    ps_muls1    tmp3, tmp4, tmp0
    // tmp2 = [t*nx*nx][t*ny*nx]
    ps_muls0    tmp2, tmp4, tmp0
    // tmp4 = [t*nx*nz][t*ny*nz]
    ps_muls0    tmp4, tmp4, tmp1

    // tmp6 = [t*nx*ny-s*nz][t*nx*ny-s*nz]
    fnmsubs     tmp6, tmp1, sT, tmp3
    // tmp7 = [t*nx*ny+s*nz][t*ny*ny+s*nz]
    fmadds      tmp7, tmp1, sT, tmp3

    // tmp0 = [-s*nx][-s*ny]
    ps_neg      tmp0, tmp9
    // tmp8 = [t*nx*nz+s*ny][0] == [m02][m03]
    ps_sum0     tmp8, tmp4, fc0, tmp9
    // tmp2 = [t*nx*nx+c][t*nx*ny-s*nz] == [m00][m01]
    ps_sum0     tmp2, tmp2, tmp6, cT
    // tmp3 = [t*nx*ny+s*nz][t*ny*ny+c] == [m10][m11]
    ps_sum1     tmp3, cT, tmp7, tmp3
    // tmp6 = [t*ny*nz-s*nx][0] == [m12][m13]
    ps_sum0     tmp6, tmp0, fc0 ,tmp4

    // tmp8 [m02][m03] : STORE
    psq_st      tmp8, 8(m), 0, 0
    // tmp0 = [t*nx*nz-s*ny][t*ny*nz]
    ps_sum0     tmp0, tmp4, tmp4, tmp0
    // tmp2 [m00][m01] : STORE
    psq_st      tmp2, 0(m), 0, 0
    // tmp5 = [t*nz*nz][t*nz*nz]
    ps_muls0    tmp5, tmp5, tmp1
    // tmp3 [m10][m11] : STORE
    psq_st      tmp3, 16(m), 0, 0
    // tmp4 = [t*nx*nz-s*ny][t*ny*nz+s*nx] == [m20][m21]
    ps_sum1     tmp4, tmp9, tmp0, tmp4
    // tmp6 [m12][m13] : STORE
    psq_st      tmp6, 24(m), 0, 0
    // tmp5 = [t*nz*nz+c][0] == [m22][m23]
    ps_sum0     tmp5, tmp5, fc0, cT
    // tmp4 [m20][m21] : STORE
    psq_st      tmp4, 32(m), 0, 0
    // tmp5 [m22][m23] : STORE
    psq_st      tmp5, 40(m), 0, 0

    blr
    .size _ASM_MTX44RotAxisRadInternal,$-_ASM_MTX44RotAxisRadInternal
#undef m
#undef axis
#undef sT
#undef cT
#undef tT
#undef fc0
#undef tmp0
#undef tmp1
#undef tmp2
#undef tmp3
#undef tmp4
#undef tmp5
#undef tmp6
#undef tmp7
#undef tmp8
#undef tmp9



////////////////////////////////////////////////////////////////////////////////
// void ASM_MTX44ScaleApply(const Mtx44 src, Mtx44 dst, f32 xS, f32 yS, f32 zS)
//
// dst row 0 = src row 0 * xS, row 1 = src row 1 * yS, row 2 = src row 2 * zS;
// row 3 is copied unchanged.
// In:       r3 = src, r4 = dst, fp1 = xS, fp2 = yS, fp3 = zS
// Clobbers: fp1-fp11
    .global ASM_MTX44ScaleApply
#define src r3
#define dst r4
#define xS  fp1
#define yS  fp2
#define zS  fp3
ASM_MTX44ScaleApply:
    .type ASM_MTX44ScaleApply, @function

    psq_l       fp4,   0(src), 0, 0     // fp4 <- src00,src01
    frsp        xS, xS                  // to make sure xS = single precision
    psq_l       fp5,   8(src), 0, 0     // fp5 <- src02,src03
    frsp        yS, yS                  // to make sure yS = single precision
    psq_l       fp6,  16(src), 0, 0     // fp6 <- src10,src11
    ps_muls0    fp4, fp4, xS            // fp4 <- src00*xS,src01*xS
    psq_l       fp7,  24(src), 0, 0     // fp7 <- src12,src13
    ps_muls0    fp5, fp5, xS            // fp5 <- src02*xS,src03*xS
    psq_l       fp8,  32(src), 0, 0     // fp8 <- src20,src21
    frsp        zS, zS                  // to make sure zS = single precision
    psq_st      fp4,   0(dst), 0, 0     // dst00,dst01
    ps_muls0    fp6, fp6, yS            // fp6 <- src10*yS,src11*yS
    psq_l       fp9,  40(src), 0, 0     // fp9 <- src22,src23
    psq_st      fp5,   8(dst), 0, 0     // dst02,dst03
    ps_muls0    fp7, fp7, yS            // fp7 <- src12*yS,src13*yS
    psq_l       fp10, 48(src), 0, 0     // fp10 <- src30,src31
    psq_st      fp6,  16(dst), 0, 0     // dst10,dst11
    ps_muls0    fp8, fp8, zS            // fp8 <- src20*zS,src21*zS
    psq_l       fp11, 56(src), 0, 0     // fp11 <- src32,src33
    psq_st      fp7,  24(dst), 0, 0     // dst12,dst13
    ps_muls0    fp9, fp9, zS            // fp9 <- src22*zS,src23*zS
    psq_st      fp8,  32(dst), 0, 0     // dst20,dst21
    psq_st      fp9,  40(dst), 0, 0     // dst22,dst23
    psq_st      fp10, 48(dst), 0, 0     // dst30,dst31 (unscaled)
    psq_st      fp11, 56(dst), 0, 0     // dst32,dst33 (unscaled)
    blr
    .size ASM_MTX44ScaleApply,$-ASM_MTX44ScaleApply
#undef src
#undef dst
#undef xS
#undef yS
#undef zS

////////////////////////////////////////////////////////////////////////////////
//void ASM_MTX44Trans(Mtx44 m, f32 xT, f32 yT, f32 zT)
//
// Writes a translation matrix to m: identity, with xT, yT, zT stored in
// column 3 of rows 0, 1, 2 (offsets 12, 28, 44).
// In:       r3 = m, fp1 = xT, fp2 = yT, fp3 = zT
// Clobbers: r4, r5, fp4-fp6
    .global ASM_MTX44Trans
#define m       r3
#define xT      fp1
#define yT      fp2
#define zT      fp3
#define c_one   fp4
#define c_zero  fp5
#define c_01    fp6
ASM_MTX44Trans:
    .type ASM_MTX44Trans, @function

    // c_one = 1.0F;
    lis         r4, CONST_1_0F@h
    ori         r4, r4, CONST_1_0F@l
    lfs         c_one, 0(r4)

    // c_zero = 0.0F;
    lis         r5, CONST_0_0F@h
    ori         r5, r5, CONST_0_0F@l
    lfs         c_zero, 0(r5)

    stfs        xT, 12(m)                   // m03
    stfs        yT, 28(m)                   // m13
    ps_merge00  c_01, c_zero, c_one         // c_01 <- 0.0, 1.0
    stfs        zT, 44(m)                   // m23
    psq_st      c_one,   0(m), 1, 0         // m00 (single-element store)
    psq_st      c_zero,  4(m), 0, 0         // m01,m02
    psq_st      c_01,   16(m), 0, 0         // m10,m11
    psq_st      c_zero, 24(m), 1, 0         // m12 (single-element store)
    psq_st      c_zero, 32(m), 0, 0         // m20,m21
    psq_st      c_one,  40(m), 1, 0         // m22 (single-element store)
    psq_st      c_zero, 48(m), 0, 0         // m30,m31
    psq_st      c_01,   56(m), 0, 0         // m32,m33

    blr
    .size ASM_MTX44Trans,$-ASM_MTX44Trans
#undef m
#undef xT
#undef yT
#undef zT
#undef c_zero
#undef c_one
#undef c_01

////////////////////////////////////////////////////////////////////////////////
//void ASM_MTX44TransApply(const Mtx44 src, Mtx44 dst, f32 xT, f32 yT, f32 zT)
//
// Copies src to dst while adding xT, yT, zT to column 3 of rows 0, 1, 2.
// Row 3 is copied unchanged.
// In:       r3 = src, r4 = dst, fp1 = xT, fp2 = yT, fp3 = zT
// Clobbers: fp1-fp8
    .global ASM_MTX44TransApply
#define src r3
#define dst r4
#define xT  fp1
#define yT  fp2
#define zT  fp3
ASM_MTX44TransApply:
    .type ASM_MTX44TransApply, @function

    psq_l       fp4,  0(src), 0, 0      // fp4 <- src00,src01
    frsp        xT, xT                  // to make sure xT = single precision
    psq_l       fp5,  8(src), 0, 0      // fp5 <- src02,src03
    frsp        yT, yT                  // to make sure yT = single precision
    psq_l       fp6, 16(src), 0, 0      // fp6 <- src10,src11
    frsp        zT, zT                  // to make sure zT = single precision
    psq_l       fp7, 24(src), 0, 0      // fp7 <- src12,src13
    psq_st      fp4,  0(dst), 0, 0      // dst00,dst01
    ps_sum1     fp5, xT, fp5, fp5       // fp5 <- src02,src03+xT
    psq_l       fp4, 40(src), 0, 0      // fp4 <- src22,src23
    psq_st      fp6, 16(dst), 0, 0      // dst10,dst11
    ps_sum1     fp7, yT, fp7, fp7       // fp7 <- src12,src13+yT
    psq_l       fp8, 32(src), 0, 0      // fp8 <- src20,src21
    psq_st      fp5,  8(dst), 0, 0      // dst02,dst03
    ps_sum1     fp4, zT, fp4, fp4       // fp4 <- src22,src23+zT
    psq_st      fp7, 24(dst), 0, 0      // dst12,dst13
    psq_st      fp8, 32(dst), 0, 0      // dst20,dst21
    psq_l       fp5, 48(src), 0, 0      // fp5 <- src30,src31
    psq_l       fp6, 56(src), 0, 0      // fp6 <- src32,src33
    psq_st      fp4, 40(dst), 0, 0      // dst22,dst23
    psq_st      fp5, 48(dst), 0, 0      // dst30,dst31
    psq_st      fp6, 56(dst), 0, 0      // dst32,dst33

    blr
    .size ASM_MTX44TransApply,$-ASM_MTX44TransApply

#undef src
#undef dst
#undef xT
#undef yT
#undef zT

////////////////////////////////////////////////////////////////////////////////
//void ASM_MTX44Scale(Mtx44 m, f32 xS, f32 yS, f32 zS)
//
// Writes a scale matrix to m: xS, yS, zS, 1.0 on the diagonal
// (offsets 0, 20, 40, 60) and 0.0 everywhere else.
// In:       r3 = m, fp1 = xS, fp2 = yS, fp3 = zS
// Clobbers: r4, r5, fp4, fp5
    .global ASM_MTX44Scale
#define m       r3
#define xS      fp1
#define yS      fp2
#define zS      fp3
#define c_one   fp4
#define c_zero  fp5
ASM_MTX44Scale:
    .type ASM_MTX44Scale, @function

    // c_one = 1.0F;
    lis         r4, CONST_1_0F@h
    ori         r4, r4, CONST_1_0F@l
    lfs         c_one, 0(r4)

    // c_zero = 0.0F;
    lis         r5, CONST_0_0F@h
    ori         r5, r5, CONST_0_0F@l
    lfs         c_zero, 0(r5)

    stfs        xS,      0(m)               // m00
    psq_st      c_zero,  4(m), 0, 0         // m01,m02
    psq_st      c_zero, 12(m), 0, 0         // m03,m10
    stfs        yS,     20(m)               // m11
    psq_st      c_zero, 24(m), 0, 0         // m12,m13
    psq_st      c_zero, 32(m), 0, 0         // m20,m21
    stfs        zS,     40(m)               // m22
    psq_st      c_zero, 44(m), 0, 0         // m23,m30
    psq_st      c_zero, 52(m), 0, 0         // m31,m32
    stfs        c_one,  60(m)               // m33

    blr
    .size ASM_MTX44Scale,$-ASM_MTX44Scale

#undef m
#undef xS
#undef yS
#undef zS
#undef c_zero
#undef c_one

////////////////////////////////////////////////////////////////////////////////
//void ASM_MTX44RotTrig(Mtx44 m, char axis, f32 sinA, f32 cosA)
//
// Writes a rotation matrix about the x, y or z axis to m, given the sine and
// cosine of the angle.  The axis character is forced to lower case first, so
// upper-case letters select the same axis.
// Any other axis value branches straight to _end and leaves m unwritten.
// In:       r3 = m, r4 = axis, fp1 = sinA, fp2 = cosA
// Clobbers: r4, r5, r6, cr0, fp1-fp9
    .global ASM_MTX44RotTrig
#define m       r3
#define axis    r4
#define sinA    fp1
#define cosA    fp2
#define ftmp0   fp3
#define ftmp1   fp4
#define ftmp2   fp5
#define ftmp3   fp6
#define ftmp4   fp7
#define c_one   fp8
#define c_zero  fp9
ASM_MTX44RotTrig:
    .type ASM_MTX44RotTrig, @function

    // c_one = 1.0F;
    lis         r5, CONST_1_0F@h
    ori         r5, r5, CONST_1_0F@l
    lfs         c_one, 0(r5)

    // c_zero = 0.0F;
    lis         r6, CONST_0_0F@h
    ori         r6, r6, CONST_0_0F@l
    lfs         c_zero, 0(r6)

    frsp        sinA, sinA              // to make sure sinA = single precision

    // always lower case (sets bit 0x20)
    ori         axis, axis, 0x20

    frsp        cosA, cosA              // to make sure cosA = single precision

    // branches
    cmplwi      axis, 'x'               // if 'x'
    beq         _case_x
    cmplwi      axis, 'y'               // if 'y'
    beq         _case_y
    cmplwi      axis, 'z'               // if 'z'
    beq         _case_z
    b           _end                    // unknown axis: nothing is written

_case_x:
    psq_st      c_one,   0(m), 1, 0     // m00     <= 1.0
    psq_st      c_zero,  4(m), 0, 0     // m01,m02 <= 0.0,0.0
    ps_neg      ftmp0, sinA             // ftmp0   <= -sinA
    psq_st      c_zero, 12(m), 0, 0     // m03,m10 <= 0.0,0.0
    ps_merge00  ftmp1, sinA, cosA       // ftmp1   <= sinA,cosA
    psq_st      c_zero, 28(m), 0, 0     // m13,m20 <= 0.0,0.0
    ps_merge00  ftmp0, cosA, ftmp0      // ftmp0   <= cosA,-sinA
    psq_st      c_zero, 44(m), 0, 0     // m23,m30 <= 0.0,0.0
    psq_st      c_zero, 52(m), 0, 0     // m31,m32 <= 0.0,0.0
    psq_st      ftmp1,  36(m), 0, 0     // m21,m22 <= sinA,cosA
    psq_st      ftmp0,  20(m), 0, 0     // m11,m12 <= cosA,-sinA
    psq_st      c_one,  60(m), 1, 0     // m33     <= 1.0
    b           _end

_case_y:
    ps_merge00  ftmp1, cosA, c_zero     // ftmp1   <= cosA,0.0
    psq_st      c_zero, 48(m), 0, 0     // m30,m31 <= 0.0,0.0
    ps_neg      ftmp0, sinA             // ftmp0   <= -sinA
    psq_st      c_zero, 24(m), 0, 0     // m12,m13 <= 0.0,0.0
    ps_merge00  ftmp3, c_zero, c_one    // ftmp3   <= 0.0,1.0
    psq_st      ftmp1,   0(m), 0, 0     // m00,m01 <= cosA,0.0
    ps_merge00  ftmp4, ftmp0, c_zero    // ftmp4   <= -sinA,0.0
    ps_merge00  ftmp2, sinA, c_zero     // ftmp2   <= sinA,0.0
    psq_st      ftmp3,  16(m), 0, 0     // m10,m11 <= 0.0,1.0
    psq_st      ftmp2,   8(m), 0, 0     // m02,m03 <= sinA,0.0
    psq_st      ftmp4,  32(m), 0, 0     // m20,m21 <= -sinA,0.0
    psq_st      ftmp1,  40(m), 0, 0     // m22,m23 <= cosA,0.0
    psq_st      ftmp3,  56(m), 0, 0     // m32,m33 <= 0.0,1.0
    b           _end

_case_z:
    psq_st      c_zero,  8(m), 0, 0     // m02,m03 <= 0.0,0.0
    ps_neg      ftmp0, sinA             // ftmp0   <= -sinA
    psq_st      c_zero, 24(m), 0, 0     // m12,m13 <= 0.0,0.0
    ps_merge00  ftmp1, sinA, cosA       // ftmp1   <= sinA,cosA
    psq_st      c_zero, 32(m), 0, 0     // m20,m21 <= 0.0,0.0
    ps_merge00  ftmp2, c_one, c_zero    // ftmp2   <= 1.0,0.0
    psq_st      c_zero, 48(m), 0, 0     // m30,m31 <= 0.0,0.0
    ps_merge00  ftmp3, c_zero, c_one    // ftmp3   <= 0.0,1.0
    psq_st      ftmp1,  16(m), 0, 0     // m10,m11 <= sinA,cosA
    ps_merge00  ftmp4, cosA, ftmp0      // ftmp4   <= cosA,-sinA
    psq_st      ftmp2,  40(m), 0, 0     // m22,m23 <= 1.0,0.0
    psq_st      ftmp3,  56(m), 0, 0     // m32,m33 <= 0.0,1.0
    psq_st      ftmp4,   0(m), 0, 0     // m00,m01 <= cosA,-sinA

_end:

    blr
    .size ASM_MTX44RotTrig,$-ASM_MTX44RotTrig

#undef m
#undef axis
#undef sinA
#undef cosA
#undef ftmp0
#undef ftmp1
#undef ftmp2
#undef ftmp3
#undef ftmp4
#undef c_one
#undef c_zero


////////////////////////////////////////////////////////////////////////////////
//void ASM_MTX34To44( MTX_CONST Mtx src, Mtx44 dst )
//
// Copies the first 48 bytes (three rows of four floats) from src to dst and
// writes 0,0,0,1 as the fourth row of dst.
// In:       r3 = src, r4 = dst
// Clobbers: r5, r6, fp1-fp4
    .global ASM_MTX34To44
#define src     r3
#define dst     r4
#define c_00    fp1
#define c_11    fp2
#define c_01    fp3
#define tmp     fp4
ASM_MTX34To44:
    .type ASM_MTX34To44, @function

    // c_11 = 1.0F;
    lis         r5, CONST_1_0F@h
    ori         r5, r5, CONST_1_0F@l
    lfs         c_11, 0(r5)

    // c_00 = 0.0F;
    lis         r6, CONST_0_0F@h
    ori         r6, r6, CONST_0_0F@l
    lfs         c_00, 0(r6)

    psq_l       tmp,   0(src), 0, 0
    psq_st      tmp,   0(dst), 0, 0     // dst00,dst01
    psq_l       tmp,   8(src), 0, 0
    psq_st      tmp,   8(dst), 0, 0     // dst02,dst03
    psq_l       tmp,  16(src), 0, 0
    psq_st      tmp,  16(dst), 0, 0     // dst10,dst11
    ps_merge00  c_01, c_00, c_11        // c_01 <= 0.0,1.0
    psq_l       tmp,  24(src), 0, 0
    psq_st      tmp,  24(dst), 0, 0     // dst12,dst13
    psq_l       tmp,  32(src), 0, 0
    psq_st      tmp,  32(dst), 0, 0     // dst20,dst21
    psq_l       tmp,  40(src), 0, 0
    psq_st      tmp,  40(dst), 0, 0     // dst22,dst23
    psq_st      c_00, 48(dst), 0, 0     // dst30,dst31 <= 0.0,0.0
    psq_st      c_01, 56(dst), 0, 0     // dst32,dst33 <= 0.0,1.0
    blr
    .size ASM_MTX34To44,$-ASM_MTX34To44

#undef src
#undef dst
#undef c_00
#undef c_11
#undef c_01
#undef tmp
