/*---------------------------------------------------------------------------*
  Project: Horizon
  File: math_Matrix33.ipp

  Copyright (C)2009-2010 Nintendo Co., Ltd. All rights reserved.

  These coded instructions, statements, and computer programs contain
  proprietary information of Nintendo of America Inc. and/or Nintendo
  Company Ltd., and are protected by Federal copyright law. They may
  not be disclosed to third parties or copied or duplicated in any form,
  in whole or in part, without the prior written consent of Nintendo.

  $Revision: 13623 $
 *---------------------------------------------------------------------------*/
#include <nn/math/math_Vector3.h>

namespace nn {
namespace math {
namespace ARMv6 {

#include <nn/hw/ARM/code32.h>

/*---------------------------------------------------------------------------*
  MTX33MultAsm_ORG

  Original hand-written VFP assembly implementation of a 3x3 matrix product:
  r0 = pOut, r1 = p1, r2 = p2; computes pOut = p1 * p2 and returns pOut (r0).

  NOTE(review): `inline asm` functions and the `__cpp()` operator are ARM
  RVCT/ADS embedded-assembler syntax; this file is compiler-specific.

  Register plan:
    s0-s8   rows of p1            (caller-saved, no spill needed)
    s9-s17  rows of p2            (s16/s17 overlap callee-saved d8)
    s18-s26 accumulated result    (overlap callee-saved d9-d13)
  d8-d13 are pushed/popped because s16-s26 alias them (VFP register aliasing:
  sN pairs map onto d(N/2)).

  All loads from p1/p2 complete before any store through r0, so pOut may
  alias p1 and/or p2 safely.
 *---------------------------------------------------------------------------*/
template<typename TMatrix>
inline asm TMatrix*
MTX33MultAsm_ORG(TMatrix*, const TMatrix*, const TMatrix*)
{
    VPUSH {d8-d13}                  // Save callee-saved VFP registers (s16-s27)

    // Because a template is being used, compute the byte offset of the
    // float storage member `f` and the row stride at assembly time via
    // the embedded-assembler __cpp() operator.
    MOV r3,#__cpp(offsetof(TMatrix,f))
    ADD r1,r1,r3                    // r1 -> p1->f
    ADD r2,r2,r3                    // r2 -> p2->f
    MOV r3,#__cpp(TMatrix::COLUMN_COUNT)*4  // r3 = row stride in bytes

    VLDMIA r2,{s9-s11}              // Row 0 of matrix p2 -> [S9-S11]
    VLDMIA r1,{s0-s2}               // Row 0 of matrix p1 -> [S0-S2]
    ADD r1,r1,r3
    ADD r2,r2,r3
    VLDMIA r2,{s12-s14}             // Row 1 of matrix p2 -> [S12-S14]
    VLDMIA r1,{s3-s5}               // Row 1 of matrix p1 -> [S3-S5]
    ADD r1,r1,r3
    ADD r2,r2,r3
    VLDMIA r2,{s15-s17}             // Row 2 of matrix p2 -> [S15-S17]
    VLDMIA r1,{s6-s8}               // Row 2 of matrix p1 -> [S6-S8]

    // result row i, col j = sum_k p1[i][k] * p2[k][j]
    // First partial product: p1[i][0] * p2[0][j]
    VMUL.F32 s18,s9,s0
    VMUL.F32 s19,s10,s0
    VMUL.F32 s20,s11,s0

    VMUL.F32 s21,s9,s3
    VMUL.F32 s22,s10,s3
    VMUL.F32 s23,s11,s3

    VMUL.F32 s24,s9,s6
    VMUL.F32 s25,s10,s6
    VMUL.F32 s26,s11,s6

    // Accumulate p1[i][1] * p2[1][j]
    VMLA.F32 s18,s12,s1
    VMLA.F32 s19,s13,s1
    VMLA.F32 s20,s14,s1

    VMLA.F32 s21,s12,s4
    VMLA.F32 s22,s13,s4
    VMLA.F32 s23,s14,s4

    VMLA.F32 s24,s12,s7
    VMLA.F32 s25,s13,s7
    VMLA.F32 s26,s14,s7

    // Accumulate p1[i][2] * p2[2][j]
    VMLA.F32 s18,s15,s2
    VMLA.F32 s19,s16,s2
    VMLA.F32 s20,s17,s2

    VMLA.F32 s21,s15,s5
    VMLA.F32 s22,s16,s5
    VMLA.F32 s23,s17,s5

    VMLA.F32 s24,s15,s8
    VMLA.F32 s25,s16,s8
    VMLA.F32 s26,s17,s8

    // Interleave the three row stores with partial VPOPs of the saved
    // d-registers; the original author notes segmented POPs are faster
    // than one large VPOP here.
    VSTMIA r0,{s18-s20}             // Store result row 0
    VPOP {d8-d9}                    // Faster to segment the POP
    ADD r1,r0,r3
    VSTMIA r1,{s21-s23}             // Store result row 1
    VPOP {d10-d11}                  // Faster to break into small pieces
    ADD r1,r1,r3
    VSTMIA r1,{s24-s26}             // Store result row 2
    VPOP {d12-d13}                  // Faster to break into chunks
    BX lr                           // Return (r0 = pOut, unchanged)
}

/*---------------------------------------------------------------------------*
  MTX33MultAsm

  Tuned replacement for MTX33MultAsm_ORG. Same contract: r0 = pOut,
  r1 = p1, r2 = p2; computes pOut = p1 * p2 and returns pOut.

  Two paths, selected at run time on the row stride:
    - Fast path for a tightly packed 3x3 matrix (stride == 3*4 bytes):
      p2 is bulk-loaded, p1 elements are fetched one at a time with VLDR
      using immediate offsets, and the nine results accumulate in s0-s8
      so they can be stored with a single VSTMIA. Only d8 (s16/s17, the
      scratch slots for p1 elements) needs saving.
    - Generic path (LABELX) for any other row stride (e.g. a 3x3 block
      embedded in a wider matrix type): same schedule as the _ORG version
      but with results in s0-s8 and p1 in s18-s26.

  In both paths every load from p1/p2 happens before any store to pOut,
  so pOut may alias p1 and/or p2.
 *---------------------------------------------------------------------------*/
template<typename TMatrix>
inline asm TMatrix*
MTX33MultAsm(TMatrix*, const TMatrix*, const TMatrix*)
{
    // Because a template is being used, compute the offset of the float
    // member `f` and the row stride for this matrix type.
    MOV r3,#__cpp(offsetof(TMatrix,f))
    ADD r1,r1,r3                    // r1 -> p1->f
    ADD r2,r2,r3                    // r2 -> p2->f
    MOV r3,#__cpp(TMatrix::COLUMN_COUNT)*4  // r3 = row stride in bytes

    CMP r3,#3*4                     // Packed 3-column layout?
    BNE LABELX                      // No: take the stride-generic path

    VPUSH {d8}                      // Save d8 (aliases s16/s17 scratch)
    VLDMIA r2!,{s10-s15}            // Rows 0 and 1 of p2 -> [S10-S15], r2 advances

    VLDR.F32 s16,[r1,#3*4*0+4*0]    // p1[0][0]
    VLDR.F32 s17,[r1,#3*4*1+4*0]    // p1[1][0]

    // Result row i lives in s(3*i)..s(3*i+2); accumulate column by column.
    VMUL.F32 s0,s10,s16             // row 0 partial: p1[0][0] * p2[0][j]
    VMUL.F32 s1,s11,s16
    VMUL.F32 s2,s12,s16
    VLDR.F32 s16,[r1,#3*4*2+4*0]    // p1[2][0]

    VMUL.F32 s3,s10,s17             // row 1 partial: p1[1][0] * p2[0][j]
    VMUL.F32 s4,s11,s17
    VMUL.F32 s5,s12,s17
    VLDR.F32 s17,[r1,#3*4*0+4*1]    // p1[0][1]

    VMUL.F32 s6,s10,s16             // row 2 partial: p1[2][0] * p2[0][j]
    VMUL.F32 s7,s11,s16
    VMUL.F32 s8,s12,s16
    VLDR.F32 s16,[r1,#3*4*1+4*1]    // p1[1][1]

    VLDMIA r2,{s10-s12}             // Row 2 of p2 -> [S10-S12] (reuses regs)
    VMLA.F32 s0,s13,s17             // accumulate p1[i][1] * p2[1][j]
    VMLA.F32 s1,s14,s17
    VMLA.F32 s2,s15,s17
    VLDR.F32 s17,[r1,#3*4*2+4*1]    // p1[2][1]

    VMLA.F32 s3,s13,s16
    VMLA.F32 s4,s14,s16
    VMLA.F32 s5,s15,s16
    VLDR.F32 s16,[r1,#3*4*0+4*2]    // p1[0][2]

    VMLA.F32 s6,s13,s17
    VMLA.F32 s7,s14,s17
    VMLA.F32 s8,s15,s17
    VLDR.F32 s17,[r1,#3*4*1+4*2]    // p1[1][2]

    VMLA.F32 s0,s10,s16             // accumulate p1[i][2] * p2[2][j]
    VMLA.F32 s1,s11,s16
    VMLA.F32 s2,s12,s16
    VLDR.F32 s16,[r1,#3*4*2+4*2]    // p1[2][2]

    VMLA.F32 s3,s10,s17
    VMLA.F32 s4,s11,s17
    VMLA.F32 s5,s12,s17

    VMLA.F32 s6,s10,s16
    VMLA.F32 s7,s11,s16
    VMLA.F32 s8,s12,s16

    VPOP {d8}                       // Restore d8

    VSTMIA r0,{s0-s8}               // Store all nine results contiguously
    BX lr                           // Return (r0 = pOut)

LABELX
    // Stride-generic path: identical schedule to MTX33MultAsm_ORG but
    // with p1 rows in s18-s26 and results in s0-s8.
    VPUSH {d8-d13}                  // Save callee-saved VFP registers
    VLDMIA r2,{s9-s11}              // Row 0 of p2 -> [S9-S11]
    VLDMIA r1,{s18-s20}             // Row 0 of p1 -> [S18-S20]
    ADD r1,r1,r3
    ADD r2,r2,r3
    VLDMIA r2,{s12-s14}             // Row 1 of p2 -> [S12-S14]
    VLDMIA r1,{s21-s23}             // Row 1 of p1 -> [S21-S23]
    ADD r1,r1,r3
    ADD r2,r2,r3
    VLDMIA r2,{s15-s17}             // Row 2 of p2 -> [S15-S17]
    VLDMIA r1,{s24-s26}             // Row 2 of p1 -> [S24-S26]

    VMUL.F32 s0,s9,s18              // p1[i][0] * p2[0][j]
    VMUL.F32 s1,s10,s18
    VMUL.F32 s2,s11,s18

    VMUL.F32 s3,s9,s21
    VMUL.F32 s4,s10,s21
    VMUL.F32 s5,s11,s21

    VMUL.F32 s6,s9,s24
    VMUL.F32 s7,s10,s24
    VMUL.F32 s8,s11,s24

    VMLA.F32 s0,s12,s19             // += p1[i][1] * p2[1][j]
    VMLA.F32 s1,s13,s19
    VMLA.F32 s2,s14,s19

    VMLA.F32 s3,s12,s22
    VMLA.F32 s4,s13,s22
    VMLA.F32 s5,s14,s22

    VMLA.F32 s6,s12,s25
    VMLA.F32 s7,s13,s25
    VMLA.F32 s8,s14,s25

    VMLA.F32 s0,s15,s20             // += p1[i][2] * p2[2][j]
    VMLA.F32 s1,s16,s20
    VMLA.F32 s2,s17,s20

    VMLA.F32 s3,s15,s23
    VMLA.F32 s4,s16,s23
    VMLA.F32 s5,s17,s23

    VMLA.F32 s6,s15,s26
    VMLA.F32 s7,s16,s26
    VMLA.F32 s8,s17,s26

    VPOP {d8-d13}                   // Restore callee-saved VFP registers

    ADD r1,r0,r3                    // r1 -> pOut row 1
    ADD r2,r1,r3                    // r2 -> pOut row 2
    VSTMIA r0,{s0-s2}               // Store result row 0
    VSTMIA r1,{s3-s5}               // Store result row 1
    VSTMIA r2,{s6-s8}               // Store result row 2
    BX lr                           // Return (r0 = pOut)

}

#include <nn/hw/ARM/codereset.h>

/*---------------------------------------------------------------------------*
  VEC3TransformC

  C reference implementation of pOut = pM * pV (3x3 matrix times column
  vector). Returns pOut. Safe when pOut == pV: the product is computed
  into a temporary first and copied back. (pOut aliasing pM is NOT
  handled — presumably callers never do that; verify if in doubt.)
 *---------------------------------------------------------------------------*/
NN_MATH_INLINE VEC3*
VEC3TransformC(VEC3* pOut, const MTX33* pM, const VEC3* pV)
{
    NN_NULL_ASSERT(pOut);
    NN_NULL_ASSERT(pM);
    NN_NULL_ASSERT(pV);

    // Write into a temporary when the output aliases the input vector.
    VEC3 vTmp;
    VEC3* pDst = (pOut == pV) ? &vTmp : pOut;

    pDst->x = pM->f._00 * pV->x + pM->f._01 * pV->y + pM->f._02 * pV->z;
    pDst->y = pM->f._10 * pV->x + pM->f._11 * pV->y + pM->f._12 * pV->z;
    pDst->z = pM->f._20 * pV->x + pM->f._21 * pV->y + pM->f._22 * pV->z;

    if (pDst == &vTmp)
    {
        // Aliased case: copy the buffered result out.
        pOut->x = pDst->x;
        pOut->y = pDst->y;
        pOut->z = pDst->z;
    }

    return pOut;
}

/* ------------------------------------------------------------------------
    MTX33
   ------------------------------------------------------------------------ */

/*---------------------------------------------------------------------------*
  MTX33MultC

  C reference implementation of pOut = p1 * p2 (result element [i][j] is
  row i of p1 dotted with column j of p2, using the f._RC member naming).
  Returns pOut. Safe when pOut aliases p1 or p2: the product is computed
  into a temporary and copied back. Matches the products computed by the
  assembly versions above.
 *---------------------------------------------------------------------------*/
template<typename TMatrix>
inline TMatrix*
MTX33MultC(TMatrix* pOut, const TMatrix* __restrict p1, const TMatrix* __restrict p2)
{
    NN_NULL_ASSERT( p1 );
    NN_NULL_ASSERT( p2 );
    NN_NULL_ASSERT( pOut );

    // Write into a temporary when the output aliases either input.
    TMatrix mTmp;
    TMatrix* __restrict pDst = (pOut == p1 || pOut == p2) ? &mTmp : pOut;

    pDst->f._00 = p1->f._00 * p2->f._00 + p1->f._01 * p2->f._10 + p1->f._02 * p2->f._20;
    pDst->f._01 = p1->f._00 * p2->f._01 + p1->f._01 * p2->f._11 + p1->f._02 * p2->f._21;
    pDst->f._02 = p1->f._00 * p2->f._02 + p1->f._01 * p2->f._12 + p1->f._02 * p2->f._22;

    pDst->f._10 = p1->f._10 * p2->f._00 + p1->f._11 * p2->f._10 + p1->f._12 * p2->f._20;
    pDst->f._11 = p1->f._10 * p2->f._01 + p1->f._11 * p2->f._11 + p1->f._12 * p2->f._21;
    pDst->f._12 = p1->f._10 * p2->f._02 + p1->f._11 * p2->f._12 + p1->f._12 * p2->f._22;

    pDst->f._20 = p1->f._20 * p2->f._00 + p1->f._21 * p2->f._10 + p1->f._22 * p2->f._20;
    pDst->f._21 = p1->f._20 * p2->f._01 + p1->f._21 * p2->f._11 + p1->f._22 * p2->f._21;
    pDst->f._22 = p1->f._20 * p2->f._02 + p1->f._21 * p2->f._12 + p1->f._22 * p2->f._22;

    if (pDst == &mTmp)
    {
        // Aliased case: copy the buffered result out.
        pOut->f._00 = pDst->f._00; pOut->f._01 = pDst->f._01; pOut->f._02 = pDst->f._02;
        pOut->f._10 = pDst->f._10; pOut->f._11 = pDst->f._11; pOut->f._12 = pDst->f._12;
        pOut->f._20 = pDst->f._20; pOut->f._21 = pDst->f._21; pOut->f._22 = pDst->f._22;
    }

    return pOut;
}

/*---------------------------------------------------------------------------*
  MTX33CopyC

  Copies *p into *pOut (skipping the copy when they are the same object)
  and returns pOut.
 *---------------------------------------------------------------------------*/
NN_MATH_INLINE MTX33*
MTX33CopyC(MTX33* pOut, const MTX33* p)
{
    if (pOut != p)
    {
        *pOut = *p;
    }

    return pOut;
}



/*---------------------------------------------------------------------------*
  MTX33MAddC

  Scaled matrix add: pOut = t * p1 + p2, element-wise. Returns pOut.
  NOTE(review): unlike the other helpers here, this one performs no
  NN_NULL_ASSERT on its pointer arguments.
 *---------------------------------------------------------------------------*/
NN_MATH_INLINE MTX33*
MTX33MAddC(MTX33* pOut, f32 t, const MTX33* p1, const MTX33* p2)
{
    pOut->f._00 = t * p1->f._00 + p2->f._00;
    pOut->f._01 = t * p1->f._01 + p2->f._01;
    pOut->f._02 = t * p1->f._02 + p2->f._02;

    pOut->f._10 = t * p1->f._10 + p2->f._10;
    pOut->f._11 = t * p1->f._11 + p2->f._11;
    pOut->f._12 = t * p1->f._12 + p2->f._12;

    pOut->f._20 = t * p1->f._20 + p2->f._20;
    pOut->f._21 = t * p1->f._21 + p2->f._21;
    pOut->f._22 = t * p1->f._22 + p2->f._22;

    return pOut;
}

/*---------------------------------------------------------------------------*
  MTX33MAddC_FAST

  Same computation as MTX33MAddC (pOut = t * p1 + p2), but all nine
  elements are first read into `register` locals so every load from
  p1/p2 completes before any store through pOut — this also makes it
  safe when pOut aliases p1 or p2.
 *---------------------------------------------------------------------------*/
NN_MATH_INLINE MTX33*
MTX33MAddC_FAST(MTX33* pOut, f32 t, const MTX33* p1, const MTX33* p2)
{
    register f32 m00, m01, m02, m10, m11, m12, m20, m21, m22;

    // Load p2 entirely ...
    m00 = p2->f._00;
    m01 = p2->f._01;
    m02 = p2->f._02;

    m10 = p2->f._10;
    m11 = p2->f._11;
    m12 = p2->f._12;

    m20 = p2->f._20;
    m21 = p2->f._21;
    m22 = p2->f._22;

    // ... accumulate t * p1 ...
    m00 += t * p1->f._00;
    m01 += t * p1->f._01;
    m02 += t * p1->f._02;

    m10 += t * p1->f._10;
    m11 += t * p1->f._11;
    m12 += t * p1->f._12;

    m20 += t * p1->f._20;
    m21 += t * p1->f._21;
    m22 += t * p1->f._22;

    // ... then store, only after all reads are done.
    pOut->f._00 = m00;
    pOut->f._01 = m01;
    pOut->f._02 = m02;

    pOut->f._10 = m10;
    pOut->f._11 = m11;
    pOut->f._12 = m12;

    pOut->f._20 = m20;
    pOut->f._21 = m21;
    pOut->f._22 = m22;

    return pOut;

}

} // namespace ARMv6
} // namespace math
} // namespace nn