1 /*---------------------------------------------------------------------------* 2 Project: Horizon 3 File: util_Float24.h 4 5 Copyright (C)2009-2012 Nintendo Co., Ltd. All rights reserved. 6 7 These coded instructions, statements, and computer programs contain 8 proprietary information of Nintendo of America Inc. and/or Nintendo 9 Company Ltd., and are protected by Federal copyright law. They may 10 not be disclosed to third parties or copied or duplicated in any form, 11 in whole or in part, without the prior written consent of Nintendo. 12 13 $Rev: 46347 $ 14 *---------------------------------------------------------------------------*/ 15 16 #ifndef NN_UTIL_UTIL_FLOAT24_H_ 17 #define NN_UTIL_UTIL_FLOAT24_H_ 18 19 #include <nn/math.h> 20 21 #ifdef __cplusplus 22 23 namespace nn { 24 namespace util { 25 26 //-------------------------------------------------------------------------- 27 // 28 // 29 // The 32-bit, floating point bit format is as follows. 30 // | sign | exponent | fraction | 31 // sign : Sign 1 bit. 32 // exponent : Exponent 8 bit. bias 127. 33 // fraction : mantissa 23 bit. 34 //--------------------------------------------------------------------------- 35 class Float32 36 { 37 public: 38 //-------------------------------------------------------------------------- 39 // 40 // 41 // 42 // 43 // 44 //-------------------------------------------------------------------------- Float32ToBits32(f32 value)45 static u32 Float32ToBits32(f32 value) 46 { 47 return *reinterpret_cast<u32*>(&value); 48 } 49 50 //-------------------------------------------------------------------------- 51 // 52 // 53 // 54 // 55 // 56 //-------------------------------------------------------------------------- Bits32ToFloat32(u32 value)57 static f32 Bits32ToFloat32(u32 value) 58 { 59 return *reinterpret_cast<f32*>(&value); 60 } 61 }; 62 63 //-------------------------------------------------------------------------- 64 // 65 // 66 // The 24-bit, floating point bit format is as follows. 67 // | sign | exponent | fraction | 68 // sign : Sign 1 bit. 69 // exponent : Exponent 7 bit. bias 63. 70 // fraction : mantissa 16 bit. 71 //--------------------------------------------------------------------------- 72 class Float24 73 { 74 public: 75 //-------------------------------------------------------------------------- 76 // 77 //-------------------------------------------------------------------------- Float24()78 Float24() : m_Float32( 0.0f ) {} 79 80 //-------------------------------------------------------------------------- 81 // 82 // 83 // 84 //-------------------------------------------------------------------------- Float24(u32 bits24)85 /* implicit */ Float24( u32 bits24 ) 86 { 87 m_Float32 = Bits24ToFloat32( bits24 ); 88 } 89 90 //-------------------------------------------------------------------------- 91 // 92 // 93 // 94 //-------------------------------------------------------------------------- Float24(f32 value)95 /* implicit */ Float24( f32 value ) : m_Float32( value ) {} 96 GetFloat32Value()97 f32 GetFloat32Value() const { return m_Float32; } GetFloat24Value()98 u32 GetFloat24Value() const { return Float32ToBits24( m_Float32 ); } 99 100 Float24& operator =(f32 value) { this->m_Float32 = value; return *this; } 101 Float24& operator =(u32 bits24) { this->m_Float32 = Bits24ToFloat32( bits24 ); return *this; } 102 f32()103 operator f32() const { return m_Float32; } 104 105 f32 operator +(f32 right) const { return this->m_Float32 + right; } 106 f32 operator -(f32 right) const { return this->m_Float32 - right; } 107 f32 operator *(f32 right) const { return this->m_Float32 * right; } 108 f32 operator /(f32 right) const { return this->m_Float32 / right; } 109 110 Float24& operator +=(f32 rhs) { this->m_Float32 += rhs; return *this; } 111 Float24& operator -=(f32 rhs) { this->m_Float32 -= rhs; return *this; } 112 Float24& operator *=(f32 rhs) { this->m_Float32 *= rhs; return *this; } 113 Float24& operator /=(f32 rhs) { this->m_Float32 /= rhs; return *this; } 114 115 bool operator ==(f32 rhs) const { return (rhs == this->m_Float32); } 116 bool operator !=(f32 rhs) const { return !(*this == rhs); } 117 118 //-------------------------------------------------------------------------- 119 // 120 // 121 // 122 // 123 // 124 //-------------------------------------------------------------------------- Bits24ToFloat32(u32 bits24)125 static f32 Bits24ToFloat32(u32 bits24) 126 { 127 u32 sign = bits24 & SIGN24; 128 int exp = (int)((bits24 & EXP_MASK24) >> FRACTION_WIDTH24); 129 u32 fraction = bits24 & FRACTION_MASK24; 130 131 u32 bits32 = 0; 132 bits32 |= (sign != 0) ? SIGN32 : 0; 133 134 if ((bits24 & ~SIGN24) == 0) 135 { 136 exp = 0; 137 } 138 else 139 { 140 exp = exp - EXP_BIAS24 + EXP_BIAS32; 141 } 142 143 fraction = fraction << (FRACTION_WIDTH32 - FRACTION_WIDTH24); 144 145 // This is a bit extension, so 'exp' will not overflow. 146 bits32 |= fraction & FRACTION_MASK32; 147 bits32 |= ((u32)exp & 0xFF) << FRACTION_WIDTH32; 148 149 return *reinterpret_cast<f32*>(&bits32); 150 } 151 152 //-------------------------------------------------------------------------- 153 // 154 // 155 // 156 // 157 // 158 //-------------------------------------------------------------------------- Float32ToBits24(f32 value)159 static u32 Float32ToBits24(f32 value) 160 { 161 u32 bits32 = *reinterpret_cast<u32*>(&value); 162 163 u32 sign = bits32 & SIGN32; 164 int exp = (int)((bits32 & EXP_MASK32) >> FRACTION_WIDTH32); 165 u32 fraction = bits32 & FRACTION_MASK32; 166 167 u32 bits24 = 0; 168 bits24 |= (sign != 0) ? SIGN24 : 0; 169 170 if ((bits32 & ~SIGN32) == 0) 171 { 172 exp = 0; 173 } 174 else 175 { 176 exp = exp - EXP_BIAS32 + EXP_BIAS24; 177 } 178 179 fraction = fraction >> (FRACTION_WIDTH32 - FRACTION_WIDTH24); 180 181 if (exp < 0) 182 { 183 // Use +0 or -0 without change. 184 } 185 else if (exp > 127) 186 { 187 // Infinity process 188 // TODO: Need to check whether expressions of IEEE float infinity are valid on the GPU 189 bits24 = (u32)0x7F << FRACTION_WIDTH24; 190 } 191 else 192 { 193 bits24 |= fraction & FRACTION_MASK24; 194 bits24 |= ((u32)exp & 0x7F) << FRACTION_WIDTH24; 195 } 196 197 return bits24; 198 } 199 200 private: 201 f32 m_Float32; 202 203 enum 204 { 205 SIGN32 = 0x80000000, 206 SIGN24 = 0x00800000, 207 208 EXP_BIAS32 = 127, 209 EXP_BIAS24 = 63, 210 EXP_MASK32 = 0x7F800000, 211 EXP_MASK24 = 0x007F0000, 212 213 FRACTION_WIDTH32 = 23, 214 FRACTION_MASK32 = 0x007FFFFF, 215 FRACTION_WIDTH24 = 16, 216 FRACTION_MASK24 = 0x0000FFFF 217 }; 218 }; 219 220 //-------------------------------------------------------------------------- 221 // 222 // 223 // The 31-bit, floating point bit format is as follows. 224 // | sign | exponent | fraction | 225 // sign : Sign 1 bit. 226 // exponent : Exponent 7 bit. bias 63. 227 // fraction : mantissa 23 bit. 228 //--------------------------------------------------------------------------- 229 class Float31 230 { 231 public: 232 //-------------------------------------------------------------------------- 233 // 234 //-------------------------------------------------------------------------- Float31()235 Float31() : m_Float32( 0.0f ) {} 236 237 //-------------------------------------------------------------------------- 238 // 239 // 240 // 241 //-------------------------------------------------------------------------- Float31(u32 bits31)242 /* implicit */ Float31( u32 bits31 ) 243 { 244 m_Float32 = Bits31ToFloat32( bits31 ); 245 } 246 247 //-------------------------------------------------------------------------- 248 // 249 // 250 // 251 //-------------------------------------------------------------------------- Float31(f32 value)252 /* implicit */ Float31( f32 value ) : m_Float32( value ) {} 253 GetFloat32Value()254 f32 GetFloat32Value() const { return m_Float32; } GetFloat31Value()255 u32 GetFloat31Value() const { return Float32ToBits31( m_Float32 ); } 256 257 Float31& operator =(f32 value) { this->m_Float32 = value; return *this; } 258 Float31& operator =(u32 bits31) { this->m_Float32 = Bits31ToFloat32( bits31 ); return *this; } 259 f32()260 operator f32() const { return m_Float32; } 261 262 f32 operator +(f32 right) const { return this->m_Float32 + right; } 263 f32 operator -(f32 right) const { return this->m_Float32 - right; } 264 f32 operator *(f32 right) const { return this->m_Float32 * right; } 265 f32 operator /(f32 right) const { return this->m_Float32 / right; } 266 267 Float31& operator +=(f32 rhs) { this->m_Float32 += rhs; return *this; } 268 Float31& operator -=(f32 rhs) { this->m_Float32 -= rhs; return *this; } 269 Float31& operator *=(f32 rhs) { this->m_Float32 *= rhs; return *this; } 270 Float31& operator /=(f32 rhs) { this->m_Float32 /= rhs; return *this; } 271 272 bool operator ==(f32 rhs) const { return (rhs == this->m_Float32); } 273 bool operator !=(f32 rhs) const { return !(*this == rhs); } 274 275 //-------------------------------------------------------------------------- 276 // 277 // 278 // 279 // 280 // 281 //-------------------------------------------------------------------------- Bits31ToFloat32(u32 bits31)282 static f32 Bits31ToFloat32(u32 bits31) 283 { 284 u32 sign = bits31 & SIGN31; 285 int exp = (int)((bits31 & EXP_MASK31) >> FRACTION_WIDTH31); 286 u32 fraction = bits31 & FRACTION_MASK31; 287 288 u32 bits32 = 0; 289 bits32 |= (sign != 0) ? SIGN32 : 0; 290 291 if ((bits31 & ~SIGN31) == 0) 292 { 293 exp = 0; 294 } 295 else 296 { 297 exp = exp - EXP_BIAS31 + EXP_BIAS32; 298 } 299 300 // This is a bit extension, so 'exp' will not overflow. 301 bits32 |= fraction & FRACTION_MASK32; 302 bits32 |= ((u32)exp & 0xFF) << FRACTION_WIDTH32; 303 304 return *reinterpret_cast<f32*>(&bits32); 305 } 306 307 //-------------------------------------------------------------------------- 308 // 309 // 310 // 311 // 312 // 313 //-------------------------------------------------------------------------- Float32ToBits31(f32 value)314 static u32 Float32ToBits31(f32 value) 315 { 316 u32 bits32 = *reinterpret_cast<u32*>(&value); 317 318 u32 sign = bits32 & SIGN32; 319 int exp = (int)((bits32 & EXP_MASK32) >> FRACTION_WIDTH32); 320 u32 fraction = bits32 & FRACTION_MASK32; 321 322 u32 bits31 = 0; 323 bits31 |= (sign != 0) ? SIGN31 : 0; 324 325 if ((bits32 & ~SIGN32) == 0) 326 { 327 exp = 0; 328 } 329 else 330 { 331 exp = exp - EXP_BIAS32 + EXP_BIAS31; 332 } 333 334 if (exp < 0) 335 { 336 // Use +0 or -0 without change. 337 } 338 else if (exp > 127) 339 { 340 // Infinity process 341 // TODO: Need to check whether expressions of IEEE float infinity are valid on the GPU 342 bits31 = (u32)0x7F << FRACTION_WIDTH31; 343 } 344 else 345 { 346 bits31 |= fraction & FRACTION_MASK31; 347 bits31 |= ((u32)exp & 0x7F) << FRACTION_WIDTH31; 348 } 349 350 return bits31; 351 } 352 353 private: 354 f32 m_Float32; 355 356 enum 357 { 358 SIGN32 = 0x80000000, 359 SIGN31 = 0x40000000, 360 361 EXP_BIAS32 = 127, 362 EXP_BIAS31 = 63, 363 EXP_MASK32 = 0x7F800000, 364 EXP_MASK31 = 0x3F800000, 365 366 FRACTION_WIDTH32 = 23, 367 FRACTION_MASK32 = 0x007FFFFF, 368 FRACTION_WIDTH31 = 23, 369 FRACTION_MASK31 = 0x007FFFFF 370 }; 371 }; 372 373 374 //-------------------------------------------------------------------------- 375 // 376 // 377 // The 20-bit, floating point bit format is as follows. 378 // | sign | exponent | fraction | 379 // sign : Sign 1 bit. 380 // exponent : Exponent 7 bit. bias 63. 381 // fraction : mantissa 12 bit. 382 //--------------------------------------------------------------------------- 383 class Float20 384 { 385 public: 386 //-------------------------------------------------------------------------- 387 // 388 //-------------------------------------------------------------------------- Float20()389 Float20() : m_Float32( 0.0f ) {} 390 391 //-------------------------------------------------------------------------- 392 // 393 // 394 // 395 //-------------------------------------------------------------------------- Float20(u32 bits20)396 /* implicit */ Float20( u32 bits20 ) 397 { 398 m_Float32 = Bits20ToFloat32( bits20 ); 399 } 400 401 //-------------------------------------------------------------------------- 402 // 403 // 404 // 405 //-------------------------------------------------------------------------- Float20(f32 value)406 /* implicit */ Float20( f32 value ) : m_Float32( value ) {} 407 GetFloat32Value()408 f32 GetFloat32Value() const { return m_Float32; } GetFloat20Value()409 u32 GetFloat20Value() const { return Float32ToBits20( m_Float32 ); } 410 411 Float20& operator =(f32 value) { this->m_Float32 = value; return *this; } 412 Float20& operator =(u32 bits20) { this->m_Float32 = Bits20ToFloat32( bits20 ); return *this; } 413 f32()414 operator f32() const { return m_Float32; } 415 416 f32 operator +(f32 right) const { return this->m_Float32 + right; } 417 f32 operator -(f32 right) const { return this->m_Float32 - right; } 418 f32 operator *(f32 right) const { return this->m_Float32 * right; } 419 f32 operator /(f32 right) const { return this->m_Float32 / right; } 420 421 Float20& operator +=(f32 rhs) { this->m_Float32 += rhs; return *this; } 422 Float20& operator -=(f32 rhs) { this->m_Float32 -= rhs; return *this; } 423 Float20& operator *=(f32 rhs) { this->m_Float32 *= rhs; return *this; } 424 Float20& operator /=(f32 rhs) { this->m_Float32 /= rhs; return *this; } 425 426 bool operator ==(f32 rhs) const { return (rhs == this->m_Float32); } 427 bool operator !=(f32 rhs) const { return !(*this == rhs); } 428 429 //-------------------------------------------------------------------------- 430 // 431 // 432 // 433 // 434 // 435 //-------------------------------------------------------------------------- Bits20ToFloat32(u32 bits20)436 static f32 Bits20ToFloat32(u32 bits20) 437 { 438 u32 sign = bits20 & SIGN20; 439 int exp = (int)((bits20 & EXP_MASK20) >> FRACTION_WIDTH20); 440 u32 fraction = bits20 & FRACTION_MASK20; 441 442 u32 bits32 = 0; 443 bits32 |= (sign != 0) ? SIGN32 : 0; 444 445 if ((bits20 & ~SIGN20) == 0) 446 { 447 exp = 0; 448 } 449 else 450 { 451 exp = exp - EXP_BIAS20 + EXP_BIAS32; 452 } 453 454 fraction = fraction << (FRACTION_WIDTH32 - FRACTION_WIDTH20); 455 456 // This is a bit extension, so 'exp' will not overflow. 457 bits32 |= fraction & FRACTION_MASK32; 458 bits32 |= ((u32)exp & 0xFF) << FRACTION_WIDTH32; 459 460 return *reinterpret_cast<f32*>(&bits32); 461 } 462 463 //-------------------------------------------------------------------------- 464 // 465 // 466 // 467 // 468 // 469 //-------------------------------------------------------------------------- Float32ToBits20(f32 value)470 static u32 Float32ToBits20(f32 value) 471 { 472 u32 bits32 = *reinterpret_cast<u32*>(&value); 473 474 u32 sign = bits32 & SIGN32; 475 int exp = (int)((bits32 & EXP_MASK32) >> FRACTION_WIDTH32); 476 u32 fraction = bits32 & FRACTION_MASK32; 477 478 u32 bits20 = 0; 479 bits20 |= (sign != 0) ? SIGN20 : 0; 480 481 if ((bits32 & ~SIGN32) == 0) 482 { 483 exp = 0; 484 } 485 else 486 { 487 exp = exp - EXP_BIAS32 + EXP_BIAS20; 488 } 489 490 fraction = fraction >> (FRACTION_WIDTH32 - FRACTION_WIDTH20); 491 492 if (exp < 0) 493 { 494 // Use +0 or -0 without change. 495 } 496 else if (exp > 127) 497 { 498 // Infinity process 499 // TODO: Need to check whether expressions of IEEE float infinity are valid on the GPU 500 bits20 = (u32)0x7F << FRACTION_WIDTH20; 501 } 502 else 503 { 504 bits20 |= fraction & FRACTION_MASK20; 505 bits20 |= ((u32)exp & 0x7F) << FRACTION_WIDTH20; 506 } 507 508 return bits20; 509 } 510 511 private: 512 f32 m_Float32; 513 514 enum 515 { 516 SIGN32 = 0x80000000, 517 SIGN20 = 0x00080000, 518 519 EXP_BIAS32 = 127, 520 EXP_BIAS20 = 63, 521 EXP_MASK32 = 0x7F800000, 522 EXP_MASK20 = 0x0007F000, 523 524 FRACTION_WIDTH32 = 23, 525 FRACTION_MASK32 = 0x007FFFFF, 526 FRACTION_WIDTH20 = 12, 527 FRACTION_MASK20 = 0x00000FFF 528 }; 529 }; 530 531 532 //-------------------------------------------------------------------------- 533 // 534 // 535 // The 16-bit, floating point bit format is as follows. 536 // | sign | exponent | fraction | 537 // sign : Sign 1 bit. 538 // exponent : Exponent 5 bit. bias 15. 539 // fraction : mantissa 10 bit. 540 //--------------------------------------------------------------------------- 541 class Float16 542 { 543 public: 544 //-------------------------------------------------------------------------- 545 // 546 //-------------------------------------------------------------------------- Float16()547 Float16() : m_Float32( 0.0f ) {} 548 549 //-------------------------------------------------------------------------- 550 // 551 // 552 // 553 //-------------------------------------------------------------------------- Float16(u32 bits16)554 /* implicit */ Float16( u32 bits16 ) 555 { 556 m_Float32 = Bits16ToFloat32( bits16 ); 557 } 558 559 //-------------------------------------------------------------------------- 560 // 561 // 562 // 563 //-------------------------------------------------------------------------- Float16(f32 value)564 /* implicit */ Float16( f32 value ) : m_Float32( value ) {} 565 GetFloat32Value()566 f32 GetFloat32Value() const { return m_Float32; } GetFloat16Value()567 u16 GetFloat16Value() const { return Float32ToBits16( m_Float32 ); } 568 569 Float16& operator =(f32 value) { this->m_Float32 = value; return *this; } 570 Float16& operator =(u32 bits16) { this->m_Float32 = Bits16ToFloat32( bits16 ); return *this; } 571 f32()572 operator f32() const { return m_Float32; } u16()573 operator u16() const { return GetFloat16Value(); } 574 575 f32 operator +(f32 right) const { return this->m_Float32 + right; } 576 f32 operator -(f32 right) const { return this->m_Float32 - right; } 577 f32 operator *(f32 right) const { return this->m_Float32 * right; } 578 f32 operator /(f32 right) const { return this->m_Float32 / right; } 579 580 Float16& operator +=(f32 rhs) { this->m_Float32 += rhs; return *this; } 581 Float16& operator -=(f32 rhs) { this->m_Float32 -= rhs; return *this; } 582 Float16& operator *=(f32 rhs) { this->m_Float32 *= rhs; return *this; } 583 Float16& operator /=(f32 rhs) { this->m_Float32 /= rhs; return *this; } 584 585 bool operator ==(f32 rhs) const { return (rhs == this->m_Float32); } 586 bool operator !=(f32 rhs) const { return !(*this == rhs); } 587 588 //-------------------------------------------------------------------------- 589 // 590 // 591 // 592 // 593 // 594 //-------------------------------------------------------------------------- Bits16ToFloat32(u32 bits16)595 static f32 Bits16ToFloat32(u32 bits16) 596 { 597 u32 sign = bits16 & SIGN16; 598 int exp = (int)((bits16 & EXP_MASK16) >> FRACTION_WIDTH16); 599 u32 fraction = bits16 & FRACTION_MASK16; 600 601 u32 bits32 = 0; 602 bits32 |= (sign != 0) ? SIGN32 : 0; 603 604 if ((bits16 & ~SIGN16) == 0) 605 { 606 exp = 0; 607 } 608 else 609 { 610 exp = exp - EXP_BIAS16 + EXP_BIAS32; 611 } 612 613 fraction = fraction << (FRACTION_WIDTH32 - FRACTION_WIDTH16); 614 615 // This is a bit extension, so 'exp' will not overflow. 616 bits32 |= fraction & FRACTION_MASK32; 617 bits32 |= ((u32)exp & 0xFF) << FRACTION_WIDTH32; 618 619 return *reinterpret_cast<f32*>(&bits32); 620 } 621 622 //-------------------------------------------------------------------------- 623 // 624 // 625 // 626 // 627 // 628 //-------------------------------------------------------------------------- Float32ToBits16(f32 value)629 static u16 Float32ToBits16(f32 value) 630 { 631 u32 bits32 = *reinterpret_cast<u32*>(&value); 632 633 u32 sign = bits32 & SIGN32; 634 int exp = (int)((bits32 & EXP_MASK32) >> FRACTION_WIDTH32); 635 u32 fraction = bits32 & FRACTION_MASK32; 636 637 u32 bits16 = 0; 638 bits16 |= (sign != 0) ? SIGN16 : 0; 639 640 if ((bits32 & ~SIGN32) == 0) 641 { 642 exp = 0; 643 } 644 else 645 { 646 exp = exp - EXP_BIAS32 + EXP_BIAS16; 647 } 648 649 fraction = fraction >> (FRACTION_WIDTH32 - FRACTION_WIDTH16); 650 651 if (exp < 0) 652 { 653 // Use +0 or -0 without change. 654 } 655 else if (exp > 31) 656 { 657 // Infinity process 658 // TODO: Need to check whether expressions of IEEE float infinity are valid on the GPU 659 bits16 = (u32)0x1F << FRACTION_WIDTH16; 660 } 661 else 662 { 663 bits16 |= fraction & FRACTION_MASK16; 664 bits16 |= ((u32)exp & 0x1F) << FRACTION_WIDTH16; 665 } 666 667 return static_cast<u16>(bits16); 668 } 669 670 private: 671 f32 m_Float32; 672 673 enum 674 { 675 SIGN32 = 0x80000000, 676 SIGN16 = 0x00008000, 677 678 EXP_BIAS32 = 127, 679 EXP_BIAS16 = 15, 680 EXP_MASK32 = 0x7F800000, 681 EXP_MASK16 = 0x00007C00, 682 683 FRACTION_WIDTH32 = 23, 684 FRACTION_MASK32 = 0x007FFFFF, 685 FRACTION_WIDTH16 = 10, 686 FRACTION_MASK16 = 0x000003FF 687 }; 688 }; 689 690 //-------------------------------------------------------------------------- 691 // 692 // 693 // The 13-bit, fixed-point bit format is as follows. 694 // | int | decimal | (2's complement) 695 // int : Integer portion 2 bit. 696 // decimal : Fractional portion 11 bit. 697 //--------------------------------------------------------------------------- 698 class Fixed13 699 { 700 public: 701 //-------------------------------------------------------------------------- 702 // 703 //-------------------------------------------------------------------------- Fixed13()704 Fixed13() : m_Float32( 0.0f ) {} 705 706 //-------------------------------------------------------------------------- 707 // 708 // 709 // 710 //-------------------------------------------------------------------------- Fixed13(u32 fixed13)711 explicit Fixed13( u32 fixed13 ) 712 { 713 m_Float32 = Fixed13ToFloat32( fixed13 ); 714 } 715 716 //-------------------------------------------------------------------------- 717 // 718 // 719 // 720 //-------------------------------------------------------------------------- Fixed13(f32 fvalue)721 explicit Fixed13( f32 fvalue ) 722 { 723 m_Float32 = fvalue; 724 } 725 726 GetFloat32Value()727 f32 GetFloat32Value() const { return m_Float32; } GetFixed13Value()728 u16 GetFixed13Value() const { return Float32ToFixed13( m_Float32 ); } 729 730 //-------------------------------------------------------------------------- 731 // 732 // 733 // 734 // 735 // 736 //-------------------------------------------------------------------------- Fixed13ToFloat32(u32 fixed13)737 static f32 Fixed13ToFloat32(u32 fixed13) 738 { 739 f32 float32 = static_cast<f32>(fixed13); 740 741 if (fixed13 & (0x1 << (TOTAL_WIDTH - 1))) 742 { 743 float32 -= (0x1 << TOTAL_WIDTH); 744 } 745 746 return float32 / (0x1 << DECIMAL_WIDTH); 747 } 748 749 //-------------------------------------------------------------------------- 750 // 751 // 752 // 753 // 754 // 755 //-------------------------------------------------------------------------- Float32ToFixed13(f32 value)756 static u16 Float32ToFixed13(f32 value) 757 { 758 f32 fixed = value; 759 fixed += ((0x1 << INT_WIDTH) / 2); 760 fixed *= (0x1 << DECIMAL_WIDTH); 761 762 if (fixed < 0) 763 { 764 fixed = 0.0f; 765 } 766 else if (fixed >= (0x1 << TOTAL_WIDTH)) 767 { 768 fixed = (0x1 << TOTAL_WIDTH) - 1; 769 } 770 771 fixed -= 0x1 << (TOTAL_WIDTH - 1); 772 773 return static_cast<u16>(static_cast<s16>(fixed) & MASK); 774 } 775 776 private: 777 f32 m_Float32; 778 779 enum 780 { 781 INT_WIDTH = 2, 782 DECIMAL_WIDTH = 11, 783 TOTAL_WIDTH = 13, 784 MASK = (0x1 << TOTAL_WIDTH) - 1 785 }; 786 }; 787 788 789 //-------------------------------------------------------------------------- 790 // 791 // 792 // The 11-bit, fixed-point bit format is as follows. 793 // decimal : Fractional portion 11 bit. 794 //--------------------------------------------------------------------------- 795 class Fixed11 796 { 797 public: 798 //-------------------------------------------------------------------------- 799 // 800 //-------------------------------------------------------------------------- Fixed11()801 Fixed11() : m_Float32( 0.0f ) {} 802 803 //-------------------------------------------------------------------------- 804 // 805 // 806 // 807 //-------------------------------------------------------------------------- Fixed11(u32 fixed11)808 explicit Fixed11( u32 fixed11 ) 809 { 810 m_Float32 = Fixed11ToFloat32( fixed11 ); 811 } 812 813 //-------------------------------------------------------------------------- 814 // 815 // 816 // 817 //-------------------------------------------------------------------------- Fixed11(f32 fvalue)818 explicit Fixed11( f32 fvalue ) 819 { 820 m_Float32 = fvalue; 821 } 822 823 GetFloat32Value()824 f32 GetFloat32Value() const { return m_Float32; } GetFixed11Value()825 u16 GetFixed11Value() const { return Float32ToFixed11( m_Float32 ); } 826 827 //-------------------------------------------------------------------------- 828 // 829 // 830 // 831 // 832 // 833 //-------------------------------------------------------------------------- Fixed11ToFloat32(u32 fixed11)834 static f32 Fixed11ToFloat32(u32 fixed11) 835 { 836 f32 float32 = static_cast<f32>(fixed11); 837 return float32 / MASK; 838 } 839 840 //-------------------------------------------------------------------------- 841 // 842 // 843 // 844 // 845 // 846 //-------------------------------------------------------------------------- Float32ToFixed11(f32 value)847 static u16 Float32ToFixed11(f32 value) 848 { 849 u32 fixed; 850 u32 v_ = *(u32*)&value; 851 852 if (value <= 0 || (v_ & 0x7f800000) == 0x7f800000) 853 { 854 fixed = 0; 855 } 856 else 857 { 858 value *= 1 << (DECIMAL_WIDTH - 0); 859 if (value >= (1 << DECIMAL_WIDTH)) 860 { 861 fixed = (1 << DECIMAL_WIDTH) - 1; 862 } 863 else 864 { 865 fixed = (unsigned)(value); 866 } 867 } 868 869 return static_cast<u16>(static_cast<s16>(fixed) & MASK); 870 } 871 872 private: 873 f32 m_Float32; 874 875 enum 876 { 877 INT_WIDTH = 2, 878 DECIMAL_WIDTH = 11, 879 TOTAL_WIDTH = 11, 880 MASK = (0x1 << TOTAL_WIDTH) - 1 881 }; 882 }; 883 884 } /* namespace util */ 885 } /* namespace nn */ 886 887 #endif // __cplusplus 888 889 #endif // NN_UTIL_UTIL_FLOAT24_H_ 890