1 /*---------------------------------------------------------------------------*
2   Project:  Horizon
3   File:     util_Float24.h
4 
5   Copyright (C)2009-2012 Nintendo Co., Ltd.  All rights reserved.
6 
7   These coded instructions, statements, and computer programs contain
8   proprietary information of Nintendo of America Inc. and/or Nintendo
9   Company Ltd., and are protected by Federal copyright law.  They may
10   not be disclosed to third parties or copied or duplicated in any form,
11   in whole or in part, without the prior written consent of Nintendo.
12 
13   $Rev: 46347 $
14  *---------------------------------------------------------------------------*/
15 
16 #ifndef NN_UTIL_UTIL_FLOAT24_H_
17 #define NN_UTIL_UTIL_FLOAT24_H_
18 
#include <cstring>

#include <nn/math.h>
20 
21 #ifdef __cplusplus
22 
23 namespace nn {
24 namespace util {
25 
//--------------------------------------------------------------------------
// Bit-level access to 32-bit floating point values.
//
// The 32-bit, floating point bit format is as follows.
// | sign | exponent | fraction |
// sign     : Sign 1 bit.
// exponent : Exponent  8 bit. bias 127.
// fraction : mantissa  23 bit.
//---------------------------------------------------------------------------
class Float32
{
public:
    //--------------------------------------------------------------------------
    // Returns the raw bit pattern of a 32-bit float.
    //
    // value : Float whose bits are wanted.
    //
    // The bytes are copied with memcpy instead of being read through a
    // reinterpret_cast'ed pointer, which would violate strict aliasing.
    // Assumes sizeof(u32) == sizeof(f32), as the original code did.
    //--------------------------------------------------------------------------
    static u32 Float32ToBits32(f32 value)
    {
        u32 bits32 = 0;
        std::memcpy(&bits32, &value, sizeof(bits32));
        return bits32;
    }

    //--------------------------------------------------------------------------
    // Builds a 32-bit float from a raw bit pattern.
    //
    // value : Bit pattern to reinterpret.
    //
    // The bytes are copied with memcpy for the same aliasing reason as
    // Float32ToBits32.
    //--------------------------------------------------------------------------
    static f32 Bits32ToFloat32(u32 value)
    {
        f32 result = 0.0f;
        std::memcpy(&result, &value, sizeof(result));
        return result;
    }
};
62 
//--------------------------------------------------------------------------
// Stores a value as f32 and converts it to and from a packed 24-bit float.
//
// The 24-bit, floating point bit format is as follows.
// | sign | exponent | fraction |
// sign     : Sign 1 bit.
// exponent : Exponent  7 bit. bias 63.
// fraction : mantissa  16 bit.
//---------------------------------------------------------------------------
class Float24
{
public:
    //--------------------------------------------------------------------------
    // Default constructor. The held value is 0.0f.
    //--------------------------------------------------------------------------
    Float24() : m_Float32( 0.0f ) {}

    //--------------------------------------------------------------------------
    // Constructs from a packed 24-bit pattern.
    //
    // bits24 : Packed 24-bit float, decoded with Bits24ToFloat32.
    //--------------------------------------------------------------------------
    /* implicit */ Float24( u32 bits24 ) : m_Float32( Bits24ToFloat32( bits24 ) ) {}

    //--------------------------------------------------------------------------
    // Constructs from a 32-bit float.
    //
    // value : Value stored as-is; it is only reduced to 24 bits on request.
    //--------------------------------------------------------------------------
    /* implicit */ Float24( f32 value ) : m_Float32( value ) {}

    // Returns the held value as a 32-bit float.
    f32     GetFloat32Value() const { return m_Float32; }
    // Returns the held value encoded as a packed 24-bit float.
    u32     GetFloat24Value() const { return Float32ToBits24( m_Float32 ); }

    // Assignment from a 32-bit float (stored as-is) or a packed 24-bit pattern (decoded).
    Float24& operator =(f32 value) { m_Float32 = value; return *this; }
    Float24& operator =(u32 bits24) { m_Float32 = Bits24ToFloat32( bits24 ); return *this; }

    // Implicit conversion to the held 32-bit float.
    operator f32() const { return m_Float32; }

    // Arithmetic is carried out at f32 precision and yields an f32.
    f32 operator +(f32 right) const { return m_Float32 + right; }
    f32 operator -(f32 right) const { return m_Float32 - right; }
    f32 operator *(f32 right) const { return m_Float32 * right; }
    f32 operator /(f32 right) const { return m_Float32 / right; }

    // Compound assignment updates the held f32 in place.
    Float24& operator +=(f32 rhs) { m_Float32 += rhs; return *this; }
    Float24& operator -=(f32 rhs) { m_Float32 -= rhs; return *this; }
    Float24& operator *=(f32 rhs) { m_Float32 *= rhs; return *this; }
    Float24& operator /=(f32 rhs) { m_Float32 /= rhs; return *this; }

    // Exact comparison against the held f32 (no 24-bit rounding is applied).
    bool operator ==(f32 rhs) const { return (rhs == m_Float32); }
    bool operator !=(f32 rhs) const { return !(*this == rhs); }

    //--------------------------------------------------------------------------
    // Decodes a packed 24-bit float into a 32-bit float.
    //
    // bits24 : Packed value. Bits above the sign bit are ignored by the
    //          field extraction but do take part in the zero test.
    //
    // Returns the widened value. A pattern whose exponent and fraction are
    // all zero decodes to +0/-0. Every other pattern, including an all-ones
    // exponent, is re-biased as an ordinary number.
    //--------------------------------------------------------------------------
    static f32 Bits24ToFloat32(u32 bits24)
    {
        u32 bits32 = ((bits24 & SIGN24) != 0) ? static_cast<u32>(SIGN32) : 0u;

        if ((bits24 & ~static_cast<u32>(SIGN24)) != 0)
        {
            // Re-bias in signed arithmetic; widening 7 -> 8 bits cannot overflow.
            const int exp24 = static_cast<int>((bits24 & EXP_MASK24) >> FRACTION_WIDTH24);
            const int exp32 = exp24 - static_cast<int>(EXP_BIAS24) + static_cast<int>(EXP_BIAS32);
            bits32 |= (static_cast<u32>(exp32) & 0xFF) << FRACTION_WIDTH32;
        }

        // Left-align the 16-bit fraction inside the 23-bit field.
        const u32 fraction = (bits24 & FRACTION_MASK24) << (FRACTION_WIDTH32 - FRACTION_WIDTH24);
        bits32 |= fraction & FRACTION_MASK32;

        // memcpy avoids the strict-aliasing violation of a pointer cast.
        f32 result = 0.0f;
        std::memcpy(&result, &bits32, sizeof(result));
        return result;
    }

    //--------------------------------------------------------------------------
    // Encodes a 32-bit float as a packed 24-bit float.
    //
    // value : Value to encode.
    //
    // Returns the packed pattern. Low fraction bits are truncated. Values
    // too small for the format become a signed zero. Values too large
    // (including f32 infinity and NaN) become an all-ones exponent with a
    // zero fraction, and the sign is kept.
    //--------------------------------------------------------------------------
    static u32 Float32ToBits24(f32 value)
    {
        u32 bits32 = 0;
        std::memcpy(&bits32, &value, sizeof(bits32));

        const u32 signBit = ((bits32 & SIGN32) != 0) ? static_cast<u32>(SIGN24) : 0u;

        // +0 / -0 keep a zero exponent rather than being re-biased.
        if ((bits32 & ~static_cast<u32>(SIGN32)) == 0)
        {
            return signBit;
        }

        const int maxExp = 0x7F;
        const int exp = static_cast<int>((bits32 & EXP_MASK32) >> FRACTION_WIDTH32)
                      - static_cast<int>(EXP_BIAS32) + static_cast<int>(EXP_BIAS24);

        if (exp < 0)
        {
            // Underflow: use +0 or -0.
            return signBit;
        }

        if (exp > maxExp)
        {
            // Overflow: infinity pattern. The sign bit is preserved so that
            // large negative inputs do not turn positive.
            // TODO: Need to check whether expressions of IEEE float infinity are valid on the GPU
            return signBit | (static_cast<u32>(maxExp) << FRACTION_WIDTH24);
        }

        const u32 fraction = (bits32 & FRACTION_MASK32) >> (FRACTION_WIDTH32 - FRACTION_WIDTH24);
        return signBit
             | ((static_cast<u32>(exp) & 0x7F) << FRACTION_WIDTH24)
             | (fraction & FRACTION_MASK24);
    }

private:
    f32 m_Float32;  // Held value at full f32 precision.

    enum
    {
        SIGN32 = 0x80000000,
        SIGN24 = 0x00800000,

        EXP_BIAS32 = 127,
        EXP_BIAS24 = 63,
        EXP_MASK32 = 0x7F800000,
        EXP_MASK24 = 0x007F0000,

        FRACTION_WIDTH32 = 23,
        FRACTION_MASK32  = 0x007FFFFF,
        FRACTION_WIDTH24 = 16,
        FRACTION_MASK24  = 0x0000FFFF
    };
};
219 
//--------------------------------------------------------------------------
// Stores a value as f32 and converts it to and from a packed 31-bit float.
//
// The 31-bit, floating point bit format is as follows.
// | sign | exponent | fraction |
// sign     : Sign 1 bit.
// exponent : Exponent  7 bit. bias 63.
// fraction : mantissa  23 bit.
//---------------------------------------------------------------------------
class Float31
{
public:
    //--------------------------------------------------------------------------
    // Default constructor. The held value is 0.0f.
    //--------------------------------------------------------------------------
    Float31() : m_Float32( 0.0f ) {}

    //--------------------------------------------------------------------------
    // Constructs from a packed 31-bit pattern.
    //
    // bits31 : Packed 31-bit float, decoded with Bits31ToFloat32.
    //--------------------------------------------------------------------------
    /* implicit */ Float31( u32 bits31 ) : m_Float32( Bits31ToFloat32( bits31 ) ) {}

    //--------------------------------------------------------------------------
    // Constructs from a 32-bit float.
    //
    // value : Value stored as-is; it is only reduced to 31 bits on request.
    //--------------------------------------------------------------------------
    /* implicit */ Float31( f32 value ) : m_Float32( value ) {}

    // Returns the held value as a 32-bit float.
    f32     GetFloat32Value() const { return m_Float32; }
    // Returns the held value encoded as a packed 31-bit float.
    u32     GetFloat31Value() const { return Float32ToBits31( m_Float32 ); }

    // Assignment from a 32-bit float (stored as-is) or a packed 31-bit pattern (decoded).
    Float31& operator =(f32 value) { m_Float32 = value; return *this; }
    Float31& operator =(u32 bits31) { m_Float32 = Bits31ToFloat32( bits31 ); return *this; }

    // Implicit conversion to the held 32-bit float.
    operator f32() const { return m_Float32; }

    // Arithmetic is carried out at f32 precision and yields an f32.
    f32 operator +(f32 right) const { return m_Float32 + right; }
    f32 operator -(f32 right) const { return m_Float32 - right; }
    f32 operator *(f32 right) const { return m_Float32 * right; }
    f32 operator /(f32 right) const { return m_Float32 / right; }

    // Compound assignment updates the held f32 in place.
    Float31& operator +=(f32 rhs) { m_Float32 += rhs; return *this; }
    Float31& operator -=(f32 rhs) { m_Float32 -= rhs; return *this; }
    Float31& operator *=(f32 rhs) { m_Float32 *= rhs; return *this; }
    Float31& operator /=(f32 rhs) { m_Float32 /= rhs; return *this; }

    // Exact comparison against the held f32.
    bool operator ==(f32 rhs) const { return (rhs == m_Float32); }
    bool operator !=(f32 rhs) const { return !(*this == rhs); }

    //--------------------------------------------------------------------------
    // Decodes a packed 31-bit float into a 32-bit float.
    //
    // bits31 : Packed value. The top bit is ignored by the field extraction
    //          but does take part in the zero test.
    //
    // Returns the widened value. A pattern whose exponent and fraction are
    // all zero decodes to +0/-0. Every other pattern, including an all-ones
    // exponent, is re-biased as an ordinary number.
    //--------------------------------------------------------------------------
    static f32 Bits31ToFloat32(u32 bits31)
    {
        u32 bits32 = ((bits31 & SIGN31) != 0) ? static_cast<u32>(SIGN32) : 0u;

        if ((bits31 & ~static_cast<u32>(SIGN31)) != 0)
        {
            // Re-bias in signed arithmetic; widening 7 -> 8 bits cannot overflow.
            const int exp31 = static_cast<int>((bits31 & EXP_MASK31) >> FRACTION_WIDTH31);
            const int exp32 = exp31 - static_cast<int>(EXP_BIAS31) + static_cast<int>(EXP_BIAS32);
            bits32 |= (static_cast<u32>(exp32) & 0xFF) << FRACTION_WIDTH32;
        }

        // Both formats carry a 23-bit fraction, so it is copied unshifted.
        bits32 |= (bits31 & FRACTION_MASK31) & FRACTION_MASK32;

        // memcpy avoids the strict-aliasing violation of a pointer cast.
        f32 result = 0.0f;
        std::memcpy(&result, &bits32, sizeof(result));
        return result;
    }

    //--------------------------------------------------------------------------
    // Encodes a 32-bit float as a packed 31-bit float.
    //
    // value : Value to encode.
    //
    // Returns the packed pattern. Values too small for the 7-bit exponent
    // become a signed zero. Values too large (including f32 infinity and
    // NaN) become an all-ones exponent with a zero fraction, and the sign
    // is kept.
    //--------------------------------------------------------------------------
    static u32 Float32ToBits31(f32 value)
    {
        u32 bits32 = 0;
        std::memcpy(&bits32, &value, sizeof(bits32));

        const u32 signBit = ((bits32 & SIGN32) != 0) ? static_cast<u32>(SIGN31) : 0u;

        // +0 / -0 keep a zero exponent rather than being re-biased.
        if ((bits32 & ~static_cast<u32>(SIGN32)) == 0)
        {
            return signBit;
        }

        const int maxExp = 0x7F;
        const int exp = static_cast<int>((bits32 & EXP_MASK32) >> FRACTION_WIDTH32)
                      - static_cast<int>(EXP_BIAS32) + static_cast<int>(EXP_BIAS31);

        if (exp < 0)
        {
            // Underflow: use +0 or -0.
            return signBit;
        }

        if (exp > maxExp)
        {
            // Overflow: infinity pattern. The sign bit is preserved so that
            // large negative inputs do not turn positive.
            // TODO: Need to check whether expressions of IEEE float infinity are valid on the GPU
            return signBit | (static_cast<u32>(maxExp) << FRACTION_WIDTH31);
        }

        const u32 fraction = bits32 & FRACTION_MASK32;
        return signBit
             | ((static_cast<u32>(exp) & 0x7F) << FRACTION_WIDTH31)
             | (fraction & FRACTION_MASK31);
    }

private:
    f32 m_Float32;  // Held value at full f32 precision.

    enum
    {
        SIGN32 = 0x80000000,
        SIGN31 = 0x40000000,

        EXP_BIAS32 = 127,
        EXP_BIAS31 = 63,
        EXP_MASK32 = 0x7F800000,
        EXP_MASK31 = 0x3F800000,

        FRACTION_WIDTH32 = 23,
        FRACTION_MASK32  = 0x007FFFFF,
        FRACTION_WIDTH31 = 23,
        FRACTION_MASK31  = 0x007FFFFF
    };
};
372 
373 
//--------------------------------------------------------------------------
// Stores a value as f32 and converts it to and from a packed 20-bit float.
//
// The 20-bit, floating point bit format is as follows.
// | sign | exponent | fraction |
// sign     : Sign 1 bit.
// exponent : Exponent  7 bit. bias 63.
// fraction : mantissa  12 bit.
//---------------------------------------------------------------------------
class Float20
{
public:
    //--------------------------------------------------------------------------
    // Default constructor. The held value is 0.0f.
    //--------------------------------------------------------------------------
    Float20() : m_Float32( 0.0f ) {}

    //--------------------------------------------------------------------------
    // Constructs from a packed 20-bit pattern.
    //
    // bits20 : Packed 20-bit float, decoded with Bits20ToFloat32.
    //--------------------------------------------------------------------------
    /* implicit */ Float20( u32 bits20 ) : m_Float32( Bits20ToFloat32( bits20 ) ) {}

    //--------------------------------------------------------------------------
    // Constructs from a 32-bit float.
    //
    // value : Value stored as-is; it is only reduced to 20 bits on request.
    //--------------------------------------------------------------------------
    /* implicit */ Float20( f32 value ) : m_Float32( value ) {}

    // Returns the held value as a 32-bit float.
    f32     GetFloat32Value() const { return m_Float32; }
    // Returns the held value encoded as a packed 20-bit float.
    u32     GetFloat20Value() const { return Float32ToBits20( m_Float32 ); }

    // Assignment from a 32-bit float (stored as-is) or a packed 20-bit pattern (decoded).
    Float20& operator =(f32 value) { m_Float32 = value; return *this; }
    Float20& operator =(u32 bits20) { m_Float32 = Bits20ToFloat32( bits20 ); return *this; }

    // Implicit conversion to the held 32-bit float.
    operator f32() const { return m_Float32; }

    // Arithmetic is carried out at f32 precision and yields an f32.
    f32 operator +(f32 right) const { return m_Float32 + right; }
    f32 operator -(f32 right) const { return m_Float32 - right; }
    f32 operator *(f32 right) const { return m_Float32 * right; }
    f32 operator /(f32 right) const { return m_Float32 / right; }

    // Compound assignment updates the held f32 in place.
    Float20& operator +=(f32 rhs) { m_Float32 += rhs; return *this; }
    Float20& operator -=(f32 rhs) { m_Float32 -= rhs; return *this; }
    Float20& operator *=(f32 rhs) { m_Float32 *= rhs; return *this; }
    Float20& operator /=(f32 rhs) { m_Float32 /= rhs; return *this; }

    // Exact comparison against the held f32 (no 20-bit rounding is applied).
    bool operator ==(f32 rhs) const { return (rhs == m_Float32); }
    bool operator !=(f32 rhs) const { return !(*this == rhs); }

    //--------------------------------------------------------------------------
    // Decodes a packed 20-bit float into a 32-bit float.
    //
    // bits20 : Packed value. Bits above the sign bit are ignored by the
    //          field extraction but do take part in the zero test.
    //
    // Returns the widened value. A pattern whose exponent and fraction are
    // all zero decodes to +0/-0. Every other pattern, including an all-ones
    // exponent, is re-biased as an ordinary number.
    //--------------------------------------------------------------------------
    static f32 Bits20ToFloat32(u32 bits20)
    {
        u32 bits32 = ((bits20 & SIGN20) != 0) ? static_cast<u32>(SIGN32) : 0u;

        if ((bits20 & ~static_cast<u32>(SIGN20)) != 0)
        {
            // Re-bias in signed arithmetic; widening 7 -> 8 bits cannot overflow.
            const int exp20 = static_cast<int>((bits20 & EXP_MASK20) >> FRACTION_WIDTH20);
            const int exp32 = exp20 - static_cast<int>(EXP_BIAS20) + static_cast<int>(EXP_BIAS32);
            bits32 |= (static_cast<u32>(exp32) & 0xFF) << FRACTION_WIDTH32;
        }

        // Left-align the 12-bit fraction inside the 23-bit field.
        const u32 fraction = (bits20 & FRACTION_MASK20) << (FRACTION_WIDTH32 - FRACTION_WIDTH20);
        bits32 |= fraction & FRACTION_MASK32;

        // memcpy avoids the strict-aliasing violation of a pointer cast.
        f32 result = 0.0f;
        std::memcpy(&result, &bits32, sizeof(result));
        return result;
    }

    //--------------------------------------------------------------------------
    // Encodes a 32-bit float as a packed 20-bit float.
    //
    // value : Value to encode.
    //
    // Returns the packed pattern. Low fraction bits are truncated. Values
    // too small for the format become a signed zero. Values too large
    // (including f32 infinity and NaN) become an all-ones exponent with a
    // zero fraction, and the sign is kept.
    //--------------------------------------------------------------------------
    static u32 Float32ToBits20(f32 value)
    {
        u32 bits32 = 0;
        std::memcpy(&bits32, &value, sizeof(bits32));

        const u32 signBit = ((bits32 & SIGN32) != 0) ? static_cast<u32>(SIGN20) : 0u;

        // +0 / -0 keep a zero exponent rather than being re-biased.
        if ((bits32 & ~static_cast<u32>(SIGN32)) == 0)
        {
            return signBit;
        }

        const int maxExp = 0x7F;
        const int exp = static_cast<int>((bits32 & EXP_MASK32) >> FRACTION_WIDTH32)
                      - static_cast<int>(EXP_BIAS32) + static_cast<int>(EXP_BIAS20);

        if (exp < 0)
        {
            // Underflow: use +0 or -0.
            return signBit;
        }

        if (exp > maxExp)
        {
            // Overflow: infinity pattern. The sign bit is preserved so that
            // large negative inputs do not turn positive.
            // TODO: Need to check whether expressions of IEEE float infinity are valid on the GPU
            return signBit | (static_cast<u32>(maxExp) << FRACTION_WIDTH20);
        }

        const u32 fraction = (bits32 & FRACTION_MASK32) >> (FRACTION_WIDTH32 - FRACTION_WIDTH20);
        return signBit
             | ((static_cast<u32>(exp) & 0x7F) << FRACTION_WIDTH20)
             | (fraction & FRACTION_MASK20);
    }

private:
    f32 m_Float32;  // Held value at full f32 precision.

    enum
    {
        SIGN32 = 0x80000000,
        SIGN20 = 0x00080000,

        EXP_BIAS32 = 127,
        EXP_BIAS20 = 63,
        EXP_MASK32 = 0x7F800000,
        EXP_MASK20 = 0x0007F000,

        FRACTION_WIDTH32 = 23,
        FRACTION_MASK32  = 0x007FFFFF,
        FRACTION_WIDTH20 = 12,
        FRACTION_MASK20  = 0x00000FFF
    };
};
530 
531 
//--------------------------------------------------------------------------
// Stores a value as f32 and converts it to and from a packed 16-bit float.
//
// The 16-bit, floating point bit format is as follows.
// | sign | exponent | fraction |
// sign     : Sign 1 bit.
// exponent : Exponent  5 bit. bias 15.
// fraction : mantissa  10 bit.
//---------------------------------------------------------------------------
class Float16
{
public:
    //--------------------------------------------------------------------------
    // Default constructor. The held value is 0.0f.
    //--------------------------------------------------------------------------
    Float16() : m_Float32( 0.0f ) {}

    //--------------------------------------------------------------------------
    // Constructs from a packed 16-bit pattern.
    //
    // bits16 : Packed 16-bit float, decoded with Bits16ToFloat32.
    //--------------------------------------------------------------------------
    /* implicit */ Float16( u32 bits16 ) : m_Float32( Bits16ToFloat32( bits16 ) ) {}

    //--------------------------------------------------------------------------
    // Constructs from a 32-bit float.
    //
    // value : Value stored as-is; it is only reduced to 16 bits on request.
    //--------------------------------------------------------------------------
    /* implicit */ Float16( f32 value ) : m_Float32( value ) {}

    // Returns the held value as a 32-bit float.
    f32     GetFloat32Value() const { return m_Float32; }
    // Returns the held value encoded as a packed 16-bit float.
    u16     GetFloat16Value() const { return Float32ToBits16( m_Float32 ); }

    // Assignment from a 32-bit float (stored as-is) or a packed 16-bit pattern (decoded).
    Float16& operator =(f32 value) { m_Float32 = value; return *this; }
    Float16& operator =(u32 bits16) { m_Float32 = Bits16ToFloat32( bits16 ); return *this; }

    // Implicit conversion to the held 32-bit float.
    operator f32() const { return m_Float32; }
    // Implicit conversion to the packed 16-bit encoding.
    operator u16() const { return GetFloat16Value(); }

    // Arithmetic is carried out at f32 precision and yields an f32.
    f32 operator +(f32 right) const { return m_Float32 + right; }
    f32 operator -(f32 right) const { return m_Float32 - right; }
    f32 operator *(f32 right) const { return m_Float32 * right; }
    f32 operator /(f32 right) const { return m_Float32 / right; }

    // Compound assignment updates the held f32 in place.
    Float16& operator +=(f32 rhs) { m_Float32 += rhs; return *this; }
    Float16& operator -=(f32 rhs) { m_Float32 -= rhs; return *this; }
    Float16& operator *=(f32 rhs) { m_Float32 *= rhs; return *this; }
    Float16& operator /=(f32 rhs) { m_Float32 /= rhs; return *this; }

    // Exact comparison against the held f32 (no 16-bit rounding is applied).
    bool operator ==(f32 rhs) const { return (rhs == m_Float32); }
    bool operator !=(f32 rhs) const { return !(*this == rhs); }

    //--------------------------------------------------------------------------
    // Decodes a packed 16-bit float into a 32-bit float.
    //
    // bits16 : Packed value. Bits above the sign bit are ignored by the
    //          field extraction but do take part in the zero test.
    //
    // Returns the widened value. A pattern whose exponent and fraction are
    // all zero decodes to +0/-0. Every other pattern, including an all-ones
    // exponent, is re-biased as an ordinary number.
    //--------------------------------------------------------------------------
    static f32 Bits16ToFloat32(u32 bits16)
    {
        u32 bits32 = ((bits16 & SIGN16) != 0) ? static_cast<u32>(SIGN32) : 0u;

        if ((bits16 & ~static_cast<u32>(SIGN16)) != 0)
        {
            // Re-bias in signed arithmetic; widening 5 -> 8 bits cannot overflow.
            const int exp16 = static_cast<int>((bits16 & EXP_MASK16) >> FRACTION_WIDTH16);
            const int exp32 = exp16 - static_cast<int>(EXP_BIAS16) + static_cast<int>(EXP_BIAS32);
            bits32 |= (static_cast<u32>(exp32) & 0xFF) << FRACTION_WIDTH32;
        }

        // Left-align the 10-bit fraction inside the 23-bit field.
        const u32 fraction = (bits16 & FRACTION_MASK16) << (FRACTION_WIDTH32 - FRACTION_WIDTH16);
        bits32 |= fraction & FRACTION_MASK32;

        // memcpy avoids the strict-aliasing violation of a pointer cast.
        f32 result = 0.0f;
        std::memcpy(&result, &bits32, sizeof(result));
        return result;
    }

    //--------------------------------------------------------------------------
    // Encodes a 32-bit float as a packed 16-bit float.
    //
    // value : Value to encode.
    //
    // Returns the packed pattern. Low fraction bits are truncated. Values
    // too small for the format become a signed zero. Values too large
    // (including f32 infinity and NaN) become an all-ones exponent with a
    // zero fraction, and the sign is kept.
    //--------------------------------------------------------------------------
    static u16 Float32ToBits16(f32 value)
    {
        u32 bits32 = 0;
        std::memcpy(&bits32, &value, sizeof(bits32));

        const u32 signBit = ((bits32 & SIGN32) != 0) ? static_cast<u32>(SIGN16) : 0u;

        // +0 / -0 keep a zero exponent rather than being re-biased.
        if ((bits32 & ~static_cast<u32>(SIGN32)) == 0)
        {
            return static_cast<u16>(signBit);
        }

        const int maxExp = 0x1F;
        const int exp = static_cast<int>((bits32 & EXP_MASK32) >> FRACTION_WIDTH32)
                      - static_cast<int>(EXP_BIAS32) + static_cast<int>(EXP_BIAS16);

        if (exp < 0)
        {
            // Underflow: use +0 or -0.
            return static_cast<u16>(signBit);
        }

        if (exp > maxExp)
        {
            // Overflow: infinity pattern. The sign bit is preserved so that
            // large negative inputs do not turn positive.
            // TODO: Need to check whether expressions of IEEE float infinity are valid on the GPU
            return static_cast<u16>(signBit | (static_cast<u32>(maxExp) << FRACTION_WIDTH16));
        }

        const u32 fraction = (bits32 & FRACTION_MASK32) >> (FRACTION_WIDTH32 - FRACTION_WIDTH16);
        const u32 bits16 = signBit
                         | ((static_cast<u32>(exp) & 0x1F) << FRACTION_WIDTH16)
                         | (fraction & FRACTION_MASK16);
        return static_cast<u16>(bits16);
    }

private:
    f32 m_Float32;  // Held value at full f32 precision.

    enum
    {
        SIGN32 = 0x80000000,
        SIGN16 = 0x00008000,

        EXP_BIAS32 = 127,
        EXP_BIAS16 = 15,
        EXP_MASK32 = 0x7F800000,
        EXP_MASK16 = 0x00007C00,

        FRACTION_WIDTH32 = 23,
        FRACTION_MASK32  = 0x007FFFFF,
        FRACTION_WIDTH16 = 10,
        FRACTION_MASK16  = 0x000003FF
    };
};
689 
//--------------------------------------------------------------------------
// Stores a value as f32 and converts it to and from 13-bit fixed point.
//
// The 13-bit, fixed-point bit format is as follows.
// | int | decimal | (2's complement)
// int      : Integer portion  2 bit.
// decimal  : Fractional portion  11 bit.
//---------------------------------------------------------------------------
class Fixed13
{
public:
    //--------------------------------------------------------------------------
    // Default constructor. The held value is 0.0f.
    //--------------------------------------------------------------------------
    Fixed13() : m_Float32( 0.0f ) {}

    //--------------------------------------------------------------------------
    // Constructs from a 13-bit fixed-point pattern.
    //
    // fixed13 : Fixed-point value, decoded with Fixed13ToFloat32.
    //--------------------------------------------------------------------------
    explicit Fixed13( u32 fixed13 ) : m_Float32( Fixed13ToFloat32( fixed13 ) ) {}

    //--------------------------------------------------------------------------
    // Constructs from a 32-bit float.
    //
    // fvalue : Value stored as-is; it is only quantized on request.
    //--------------------------------------------------------------------------
    explicit Fixed13( f32 fvalue ) : m_Float32( fvalue ) {}

    // Returns the held value as a 32-bit float.
    f32     GetFloat32Value() const { return m_Float32; }
    // Returns the held value encoded as 13-bit fixed point.
    u16     GetFixed13Value() const { return Float32ToFixed13( m_Float32 ); }

    //--------------------------------------------------------------------------
    // Decodes a 13-bit two's complement fixed-point value.
    //
    // fixed13 : Fixed-point pattern. Bits above bit 12 are not masked off,
    //           so the caller must pass a clean 13-bit value.
    //
    // Returns the value as a float: the signed integer divided by 2^11.
    //--------------------------------------------------------------------------
    static f32 Fixed13ToFloat32(u32 fixed13)
    {
        const bool negative = (fixed13 & (0x1 << (TOTAL_WIDTH - 1))) != 0;

        f32 float32 = static_cast<f32>(fixed13);
        if (negative)
        {
            // Undo the two's complement wrap-around.
            float32 -= (0x1 << TOTAL_WIDTH);
        }

        return float32 / (0x1 << DECIMAL_WIDTH);
    }

    //--------------------------------------------------------------------------
    // Encodes a float as 13-bit two's complement fixed point.
    //
    // value : Value to encode. Anything outside the representable range is
    //         clamped to the nearest end. NaN encodes as 0.
    //
    // Returns the 13-bit pattern in the low bits. The fraction is truncated
    // toward zero.
    //--------------------------------------------------------------------------
    static u16 Float32ToFixed13(f32 value)
    {
        // NaN fails both clamp comparisons below; converting it to an
        // integer would be undefined behavior, so it is rejected up front.
        if (value != value)
        {
            return 0;
        }

        // Shift the signed range up to [0, 2^13) so that one clamp covers both ends.
        f32 fixed = value;
        fixed += ((0x1 << INT_WIDTH) / 2);
        fixed *= (0x1 << DECIMAL_WIDTH);

        if (fixed < 0)
        {
            fixed = 0.0f;
        }
        else if (fixed >= (0x1 << TOTAL_WIDTH))
        {
            fixed = (0x1 << TOTAL_WIDTH) - 1;
        }

        // Shift back to the signed range before truncating.
        fixed -= 0x1 << (TOTAL_WIDTH - 1);

        const s16 signedFixed = static_cast<s16>(fixed);
        return static_cast<u16>(signedFixed & MASK);
    }

private:
    f32 m_Float32;  // Held value at full f32 precision.

    enum
    {
        INT_WIDTH = 2,
        DECIMAL_WIDTH = 11,
        TOTAL_WIDTH = 13,
        MASK = (0x1 << TOTAL_WIDTH) - 1
    };
};
787 
788 
//--------------------------------------------------------------------------
// Stores a value as f32 and converts it to and from 11-bit fixed point.
//
// The 11-bit, fixed-point bit format is as follows.
// decimal  : Fractional portion  11 bit.
//---------------------------------------------------------------------------
class Fixed11
{
public:
    //--------------------------------------------------------------------------
    // Default constructor. The held value is 0.0f.
    //--------------------------------------------------------------------------
    Fixed11() : m_Float32( 0.0f ) {}

    //--------------------------------------------------------------------------
    // Constructs from an 11-bit fixed-point pattern.
    //
    // fixed11 : Fixed-point value, decoded with Fixed11ToFloat32.
    //--------------------------------------------------------------------------
    explicit Fixed11( u32 fixed11 ) : m_Float32( Fixed11ToFloat32( fixed11 ) ) {}

    //--------------------------------------------------------------------------
    // Constructs from a 32-bit float.
    //
    // fvalue : Value stored as-is; it is only quantized on request.
    //--------------------------------------------------------------------------
    explicit Fixed11( f32 fvalue ) : m_Float32( fvalue ) {}

    // Returns the held value as a 32-bit float.
    f32     GetFloat32Value() const { return m_Float32; }
    // Returns the held value encoded as 11-bit fixed point.
    u16     GetFixed11Value() const { return Float32ToFixed11( m_Float32 ); }

    //--------------------------------------------------------------------------
    // Decodes an 11-bit unsigned fixed-point value.
    //
    // fixed11 : Fixed-point pattern. It is not masked, so the caller must
    //           pass a clean 11-bit value.
    //
    // Returns fixed11 / 2047, so the all-ones pattern decodes to exactly 1.0.
    // NOTE(review): decoding divides by 2047 while Float32ToFixed11 scales
    // by 2048, so a round trip is not exact - confirm which one the
    // hardware expects.
    //--------------------------------------------------------------------------
    static f32 Fixed11ToFloat32(u32 fixed11)
    {
        return static_cast<f32>(fixed11) / MASK;
    }

    //--------------------------------------------------------------------------
    // Encodes a float as 11-bit unsigned fixed point.
    //
    // value : Value to encode.
    //
    // Returns the 11-bit pattern. Zero, negative values, infinities and NaN
    // all encode as 0. Values of 1.0 and above saturate to the all-ones
    // pattern. The fraction is truncated.
    //--------------------------------------------------------------------------
    static u16 Float32ToFixed11(f32 value)
    {
        // Fetch the raw bits with memcpy; reading them through a casted
        // pointer would violate strict aliasing.
        u32 bits32 = 0;
        std::memcpy(&bits32, &value, sizeof(bits32));

        // An all-ones exponent field marks infinity or NaN.
        const u32 expAllOnes = 0x7f800000;
        if (value <= 0 || (bits32 & expAllOnes) == expAllOnes)
        {
            return 0;
        }

        const f32 scaled = value * (1 << DECIMAL_WIDTH);
        if (scaled >= (1 << DECIMAL_WIDTH))
        {
            // Saturate to the largest representable pattern.
            return static_cast<u16>(MASK);
        }

        return static_cast<u16>(static_cast<u32>(scaled) & MASK);
    }

private:
    f32 m_Float32;  // Held value at full f32 precision.

    enum
    {
        DECIMAL_WIDTH = 11,
        TOTAL_WIDTH = 11,
        MASK = (0x1 << TOTAL_WIDTH) - 1
    };
};
883 
884 } /* namespace util */
885 } /* namespace nn */
886 
887 #endif // __cplusplus
888 
889 #endif //  NN_UTIL_UTIL_FLOAT24_H_
890