xref: /aosp_15_r20/external/deqp/framework/delibs/debase/deFloat16.c (revision 35238bce31c2a825756842865a792f8cf7f89930)
1 /*-------------------------------------------------------------------------
2  * drawElements Base Portability Library
3  * -------------------------------------
4  *
5  * Copyright 2014 The Android Open Source Project
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  *      http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  *//*!
20  * \file
21  * \brief 16-bit floating-point math.
22  *//*--------------------------------------------------------------------*/
23 
24 #include "deFloat16.h"
25 
26 DE_BEGIN_EXTERN_C
27 
deFloat32To16(float val32)28 deFloat16 deFloat32To16(float val32)
29 {
30     uint32_t sign;
31     int expotent;
32     uint32_t mantissa;
33     union
34     {
35         float f;
36         uint32_t u;
37     } x;
38 
39     x.f      = val32;
40     sign     = (x.u >> 16u) & 0x00008000u;
41     expotent = (int)((x.u >> 23u) & 0x000000ffu) - (127 - 15);
42     mantissa = x.u & 0x007fffffu;
43 
44     if (expotent <= 0)
45     {
46         if (expotent < -10)
47         {
48             /* Rounds to zero. */
49             return (deFloat16)sign;
50         }
51 
52         /* Converted to denormalized half, add leading 1 to significand. */
53         mantissa = mantissa | 0x00800000u;
54 
55         /* Round mantissa to nearest (10+e) */
56         {
57             uint32_t t = 14u - expotent;
58             uint32_t a = (1u << (t - 1u)) - 1u;
59             uint32_t b = (mantissa >> t) & 1u;
60 
61             mantissa = (mantissa + a + b) >> t;
62         }
63 
64         return (deFloat16)(sign | mantissa);
65     }
66     else if (expotent == 0xff - (127 - 15))
67     {
68         if (mantissa == 0u)
69         {
70             /* InF */
71             return (deFloat16)(sign | 0x7c00u);
72         }
73         else
74         {
75             /* NaN */
76             mantissa >>= 13u;
77             return (deFloat16)(sign | 0x7c00u | mantissa | (mantissa == 0u));
78         }
79     }
80     else
81     {
82         /* Normalized float. */
83         mantissa = mantissa + 0x00000fffu + ((mantissa >> 13u) & 1u);
84 
85         if (mantissa & 0x00800000u)
86         {
87             /* Overflow in mantissa. */
88             mantissa = 0u;
89             expotent += 1;
90         }
91 
92         if (expotent > 30)
93         {
94             /* \todo [pyry] Cause hw fp overflow */
95             return (deFloat16)(sign | 0x7c00u);
96         }
97 
98         return (deFloat16)(sign | ((uint32_t)expotent << 10u) | (mantissa >> 13u));
99     }
100 }
101 
deFloat64To16(double val64)102 deFloat16 deFloat64To16(double val64)
103 {
104     uint64_t sign;
105     long expotent;
106     uint64_t mantissa;
107     union
108     {
109         double f;
110         uint64_t u;
111     } x;
112 
113     x.f      = val64;
114     sign     = (x.u >> 48u) & 0x00008000u;
115     expotent = (long int)((x.u >> 52u) & 0x000007ffu) - (1023 - 15);
116     mantissa = x.u & 0x00fffffffffffffu;
117 
118     if (expotent <= 0)
119     {
120         if (expotent < -10)
121         {
122             /* Rounds to zero. */
123             return (deFloat16)sign;
124         }
125 
126         /* Converted to denormalized half, add leading 1 to significand. */
127         mantissa = mantissa | 0x0010000000000000u;
128 
129         /* Round mantissa to nearest (10+e) */
130         {
131             uint64_t t = 43u - expotent;
132             uint64_t a = (1u << (t - 1u)) - 1u;
133             uint64_t b = (mantissa >> t) & 1u;
134 
135             mantissa = (mantissa + a + b) >> t;
136         }
137 
138         return (deFloat16)(sign | mantissa);
139     }
140     else if (expotent == 0x7ff - (1023 - 15))
141     {
142         if (mantissa == 0u)
143         {
144             /* InF */
145             return (deFloat16)(sign | 0x7c00u);
146         }
147         else
148         {
149             /* NaN */
150             mantissa >>= 42u;
151             return (deFloat16)(sign | 0x7c00u | mantissa | (mantissa == 0u));
152         }
153     }
154     else
155     {
156         /* Normalized float. */
157         mantissa = mantissa + 0x000001ffffffffffu + ((mantissa >> 42u) & 1u);
158 
159         if (mantissa & 0x010000000000000u)
160         {
161             /* Overflow in mantissa. */
162             mantissa = 0u;
163             expotent += 1;
164         }
165 
166         if (expotent > 30)
167         {
168             return (deFloat16)(sign | 0x7c00u);
169         }
170 
171         return (deFloat16)(sign | ((uint32_t)expotent << 10u) | (mantissa >> 42u));
172     }
173 }
174 
175 /*--------------------------------------------------------------------*//*!
176  * \brief Round the given number `val` to nearest even by discarding
177  *        the last `numBitsToDiscard` bits.
178  * \param val value to round
179  * \param numBitsToDiscard number of (least significant) bits to discard
180  * \return The rounded value with the last `numBitsToDiscard` removed
181  *//*--------------------------------------------------------------------*/
roundToNearestEven(uint32_t val,const uint32_t numBitsToDiscard)182 static uint32_t roundToNearestEven(uint32_t val, const uint32_t numBitsToDiscard)
183 {
184     const uint32_t lastBits = val & ((1 << numBitsToDiscard) - 1);
185     const uint32_t headBit  = val & (1 << (numBitsToDiscard - 1));
186 
187     DE_ASSERT(numBitsToDiscard > 0 && numBitsToDiscard < 32); /* Make sure no overflow. */
188     val >>= numBitsToDiscard;
189 
190     if (headBit == 0)
191     {
192         return val;
193     }
194     else if (headBit == lastBits)
195     {
196         if ((val & 0x1) == 0x1)
197         {
198             return val + 1;
199         }
200         else
201         {
202             return val;
203         }
204     }
205     else
206     {
207         return val + 1;
208     }
209 }
210 
deFloat32To16Round(float val32,deRoundingMode mode)211 deFloat16 deFloat32To16Round(float val32, deRoundingMode mode)
212 {
213     union
214     {
215         float f;    /* Interpret as 32-bit float */
216         uint32_t u; /* Interpret as 32-bit unsigned integer */
217     } x;
218     uint32_t sign;  /* sign : 0000 0000 0000 0000 X000 0000 0000 0000 */
219     uint32_t exp32; /* exp32: biased exponent for 32-bit floats */
220     int exp16;      /* exp16: biased exponent for 16-bit floats */
221     uint32_t mantissa;
222 
223     /* We only support these two rounding modes for now */
224     DE_ASSERT(mode == DE_ROUNDINGMODE_TO_ZERO || mode == DE_ROUNDINGMODE_TO_NEAREST_EVEN);
225 
226     x.f      = val32;
227     sign     = (x.u >> 16u) & 0x00008000u;
228     exp32    = (x.u >> 23u) & 0x000000ffu;
229     exp16    = (int)(exp32)-127 + 15; /* 15/127: exponent bias for 16-bit/32-bit floats */
230     mantissa = x.u & 0x007fffffu;
231 
232     /* Case: zero and denormalized floats */
233     if (exp32 == 0)
234     {
235         /* Denormalized floats are < 2^(1-127), not representable in 16-bit floats, rounding to zero. */
236         return (deFloat16)sign;
237     }
238     /* Case: Inf and NaN */
239     else if (exp32 == 0x000000ffu)
240     {
241         if (mantissa == 0u)
242         {
243             /* Inf */
244             return (deFloat16)(sign | 0x7c00u);
245         }
246         else
247         {
248             /* NaN */
249             mantissa >>= 13u; /* 16-bit floats has 10-bit for mantissa, 13-bit less than 32-bit floats. */
250             /* Make sure we don't turn NaN into zero by | (mantissa == 0). */
251             return (deFloat16)(sign | 0x7c00u | mantissa | (mantissa == 0u));
252         }
253     }
254     /* The following are cases for normalized floats.
255      *
256      * * If exp16 is less than 0, we are experiencing underflow for the exponent. To encode this underflowed exponent,
257      *   we can only shift the mantissa further right.
258      *   The real exponent is exp16 - 15. A denormalized 16-bit float can represent -14 via its exponent.
259      *   Note that the most significant bit in the mantissa of a denormalized float is already -1 as for exponent.
260      *   So, we just need to right shift the mantissa -exp16 bits.
261      * * If exp16 is 0, mantissa shifting requirement is similar to the above.
262      * * If exp16 is greater than 30 (0b11110), we are experiencing overflow for the exponent of 16-bit normalized floats.
263      */
264     /* Case: normalized floats -> zero */
265     else if (exp16 < -10)
266     {
267         /* 16-bit floats have only 10 bits for mantissa. Minimal 16-bit denormalized float is (2^-10) * (2^-14). */
268         /* Expecting a number < (2^-10) * (2^-14) here, not representable, round to zero. */
269         return (deFloat16)sign;
270     }
271     /* Case: normalized floats -> zero and denormalized halfs */
272     else if (exp16 <= 0)
273     {
274         /* Add the implicit leading 1 in mormalized float to mantissa. */
275         mantissa |= 0x00800000u;
276         /* We have a (23 + 1)-bit mantissa, but 16-bit floats only expect 10-bit mantissa.
277          * Need to discard the last 14-bits considering rounding mode.
278          * We also need to shift right -exp16 bits to encode the underflowed exponent.
279          */
280         if (mode == DE_ROUNDINGMODE_TO_ZERO)
281         {
282             mantissa >>= (14 - exp16);
283         }
284         else
285         {
286             /* mantissa in the above may exceed 10-bits, in which case overflow happens.
287              * The overflowed bit is automatically carried to exponent then.
288              */
289             mantissa = roundToNearestEven(mantissa, 14 - exp16);
290         }
291         return (deFloat16)(sign | mantissa);
292     }
293     /* Case: normalized floats -> normalized floats */
294     else if (exp16 <= 30)
295     {
296         if (mode == DE_ROUNDINGMODE_TO_ZERO)
297         {
298             return (deFloat16)(sign | ((uint32_t)exp16 << 10u) | (mantissa >> 13u));
299         }
300         else
301         {
302             mantissa = roundToNearestEven(mantissa, 13);
303             /* Handle overflow. exp16 may overflow (and become Inf) itself, but that's correct. */
304             exp16 = (exp16 << 10u) + (mantissa & (1 << 10));
305             mantissa &= (1u << 10) - 1;
306             return (deFloat16)(sign | ((uint32_t)exp16) | mantissa);
307         }
308     }
309     /* Case: normalized floats (too large to be representable as 16-bit floats) */
310     else
311     {
312         /* According to IEEE Std 754-2008 Section 7.4,
313          * * roundTiesToEven and roundTiesToAway carry all overflows to Inf with the sign
314          *   of the intermediate  result.
315          * * roundTowardZero carries all overflows to the format's largest finite number
316          *   with the sign of the intermediate result.
317          */
318         if (mode == DE_ROUNDINGMODE_TO_ZERO)
319         {
320             return (deFloat16)(sign | 0x7bffu); /* 111 1011 1111 1111 */
321         }
322         else
323         {
324             return (deFloat16)(sign | (0x1f << 10));
325         }
326     }
327 
328     /* Make compiler happy */
329     return (deFloat16)0;
330 }
331 
332 /*--------------------------------------------------------------------*//*!
333  * \brief Round the given number `val` to nearest even by discarding
334  *        the last `numBitsToDiscard` bits.
335  * \param val value to round
336  * \param numBitsToDiscard number of (least significant) bits to discard
337  * \return The rounded value with the last `numBitsToDiscard` removed
338  *//*--------------------------------------------------------------------*/
roundToNearestEven64(uint64_t val,const uint64_t numBitsToDiscard)339 static uint64_t roundToNearestEven64(uint64_t val, const uint64_t numBitsToDiscard)
340 {
341     const uint64_t lastBits = val & (((uint64_t)1 << numBitsToDiscard) - 1);
342     const uint64_t headBit  = val & ((uint64_t)1 << (numBitsToDiscard - 1));
343 
344     DE_ASSERT(numBitsToDiscard > 0 && numBitsToDiscard < 64); /* Make sure no overflow. */
345     val >>= numBitsToDiscard;
346 
347     if (headBit == 0)
348     {
349         return val;
350     }
351     else if (headBit == lastBits)
352     {
353         if ((val & 0x1) == 0x1)
354         {
355             return val + 1;
356         }
357         else
358         {
359             return val;
360         }
361     }
362     else
363     {
364         return val + 1;
365     }
366 }
367 
deFloat64To16Round(double val64,deRoundingMode mode)368 deFloat16 deFloat64To16Round(double val64, deRoundingMode mode)
369 {
370     union
371     {
372         double f;   /* Interpret as 64-bit float */
373         uint64_t u; /* Interpret as 64-bit unsigned integer */
374     } x;
375     uint64_t sign;  /* sign : 0000 0000 0000 0000 X000 0000 0000 0000 */
376     uint64_t exp64; /* exp32: biased exponent for 64-bit floats */
377     int exp16;      /* exp16: biased exponent for 16-bit floats */
378     uint64_t mantissa;
379 
380     /* We only support these two rounding modes for now */
381     DE_ASSERT(mode == DE_ROUNDINGMODE_TO_ZERO || mode == DE_ROUNDINGMODE_TO_NEAREST_EVEN);
382 
383     x.f      = val64;
384     sign     = (x.u >> 48u) & 0x00008000u;
385     exp64    = (x.u >> 52u) & 0x000007ffu;
386     exp16    = (int)(exp64)-1023 + 15; /* 15/127: exponent bias for 16-bit/32-bit floats */
387     mantissa = x.u & 0x00fffffffffffffu;
388 
389     /* Case: zero and denormalized floats */
390     if (exp64 == 0)
391     {
392         /* Denormalized floats are < 2^(1-1023), not representable in 16-bit floats, rounding to zero. */
393         return (deFloat16)sign;
394     }
395     /* Case: Inf and NaN */
396     else if (exp64 == 0x000007ffu)
397     {
398         if (mantissa == 0u)
399         {
400             /* Inf */
401             return (deFloat16)(sign | 0x7c00u);
402         }
403         else
404         {
405             /* NaN */
406             mantissa >>= 42u; /* 16-bit floats has 10-bit for mantissa, 42-bit less than 64-bit floats. */
407             /* Make sure we don't turn NaN into zero by | (mantissa == 0). */
408             return (deFloat16)(sign | 0x7c00u | mantissa | (mantissa == 0u));
409         }
410     }
411     /* The following are cases for normalized floats.
412      *
413      * * If exp16 is less than 0, we are experiencing underflow for the exponent. To encode this underflowed exponent,
414      *   we can only shift the mantissa further right.
415      *   The real exponent is exp16 - 15. A denormalized 16-bit float can represent -14 via its exponent.
416      *   Note that the most significant bit in the mantissa of a denormalized float is already -1 as for exponent.
417      *   So, we just need to right shift the mantissa -exp16 bits.
418      * * If exp16 is 0, mantissa shifting requirement is similar to the above.
419      * * If exp16 is greater than 30 (0b11110), we are experiencing overflow for the exponent of 16-bit normalized floats.
420      */
421     /* Case: normalized floats -> zero */
422     else if (exp16 < -10)
423     {
424         /* 16-bit floats have only 10 bits for mantissa. Minimal 16-bit denormalized float is (2^-10) * (2^-14). */
425         /* Expecting a number < (2^-10) * (2^-14) here, not representable, round to zero. */
426         return (deFloat16)sign;
427     }
428     /* Case: normalized floats -> zero and denormalized halfs */
429     else if (exp16 <= 0)
430     {
431         /* Add the implicit leading 1 in mormalized float to mantissa. */
432         mantissa |= 0x0010000000000000u;
433         /* We have a (23 + 1)-bit mantissa, but 16-bit floats only expect 10-bit mantissa.
434          * Need to discard the last 14-bits considering rounding mode.
435          * We also need to shift right -exp16 bits to encode the underflowed exponent.
436          */
437         if (mode == DE_ROUNDINGMODE_TO_ZERO)
438         {
439             mantissa >>= (43 - exp16);
440         }
441         else
442         {
443             /* mantissa in the above may exceed 10-bits, in which case overflow happens.
444              * The overflowed bit is automatically carried to exponent then.
445              */
446             mantissa = roundToNearestEven64(mantissa, 43 - exp16);
447         }
448         return (deFloat16)(sign | mantissa);
449     }
450     /* Case: normalized floats -> normalized floats */
451     else if (exp16 <= 30)
452     {
453         if (mode == DE_ROUNDINGMODE_TO_ZERO)
454         {
455             return (deFloat16)(sign | ((uint32_t)exp16 << 10u) | (mantissa >> 42u));
456         }
457         else
458         {
459             mantissa = roundToNearestEven64(mantissa, 42);
460             /* Handle overflow. exp16 may overflow (and become Inf) itself, but that's correct. */
461             exp16 = (exp16 << 10u) + (deFloat16)(mantissa & (1 << 10));
462             mantissa &= (1u << 10) - 1;
463             return (deFloat16)(sign | ((uint32_t)exp16) | mantissa);
464         }
465     }
466     /* Case: normalized floats (too large to be representable as 16-bit floats) */
467     else
468     {
469         /* According to IEEE Std 754-2008 Section 7.4,
470          * * roundTiesToEven and roundTiesToAway carry all overflows to Inf with the sign
471          *   of the intermediate  result.
472          * * roundTowardZero carries all overflows to the format's largest finite number
473          *   with the sign of the intermediate result.
474          */
475         if (mode == DE_ROUNDINGMODE_TO_ZERO)
476         {
477             return (deFloat16)(sign | 0x7bffu); /* 111 1011 1111 1111 */
478         }
479         else
480         {
481             return (deFloat16)(sign | (0x1f << 10));
482         }
483     }
484 
485     /* Make compiler happy */
486     return (deFloat16)0;
487 }
488 
deFloat16To32(deFloat16 val16)489 float deFloat16To32(deFloat16 val16)
490 {
491     uint32_t sign;
492     uint32_t expotent;
493     uint32_t mantissa;
494     union
495     {
496         float f;
497         uint32_t u;
498     } x;
499 
500     x.u = 0u;
501 
502     sign     = ((uint32_t)val16 >> 15u) & 0x00000001u;
503     expotent = ((uint32_t)val16 >> 10u) & 0x0000001fu;
504     mantissa = (uint32_t)val16 & 0x000003ffu;
505 
506     if (expotent == 0u)
507     {
508         if (mantissa == 0u)
509         {
510             /* +/- 0 */
511             x.u = sign << 31u;
512             return x.f;
513         }
514         else
515         {
516             /* Denormalized, normalize it. */
517 
518             while (!(mantissa & 0x00000400u))
519             {
520                 mantissa <<= 1u;
521                 expotent -= 1u;
522             }
523 
524             expotent += 1u;
525             mantissa &= ~0x00000400u;
526         }
527     }
528     else if (expotent == 31u)
529     {
530         if (mantissa == 0u)
531         {
532             /* +/- InF */
533             x.u = (sign << 31u) | 0x7f800000u;
534             return x.f;
535         }
536         else
537         {
538             /* +/- NaN */
539             x.u = (sign << 31u) | 0x7f800000u | (mantissa << 13u);
540             return x.f;
541         }
542     }
543 
544     expotent = expotent + (127u - 15u);
545     mantissa = mantissa << 13u;
546 
547     x.u = (sign << 31u) | (expotent << 23u) | mantissa;
548     return x.f;
549 }
550 
deFloat16To64(deFloat16 val16)551 double deFloat16To64(deFloat16 val16)
552 {
553     uint64_t sign;
554     uint64_t expotent;
555     uint64_t mantissa;
556     union
557     {
558         double f;
559         uint64_t u;
560     } x;
561 
562     x.u = 0u;
563 
564     sign     = ((uint32_t)val16 >> 15u) & 0x00000001u;
565     expotent = ((uint32_t)val16 >> 10u) & 0x0000001fu;
566     mantissa = (uint32_t)val16 & 0x000003ffu;
567 
568     if (expotent == 0u)
569     {
570         if (mantissa == 0u)
571         {
572             /* +/- 0 */
573             x.u = sign << 63u;
574             return x.f;
575         }
576         else
577         {
578             /* Denormalized, normalize it. */
579 
580             while (!(mantissa & 0x00000400u))
581             {
582                 mantissa <<= 1u;
583                 expotent -= 1u;
584             }
585 
586             expotent += 1u;
587             mantissa &= ~0x00000400u;
588         }
589     }
590     else if (expotent == 31u)
591     {
592         if (mantissa == 0u)
593         {
594             /* +/- InF */
595             x.u = (sign << 63u) | 0x7ff0000000000000u;
596             return x.f;
597         }
598         else
599         {
600             /* +/- NaN */
601             x.u = (sign << 63u) | 0x7ff0000000000000u | (mantissa << 42u);
602             return x.f;
603         }
604     }
605 
606     expotent = expotent + (1023u - 15u);
607     mantissa = mantissa << 42u;
608 
609     x.u = (sign << 63u) | (expotent << 52u) | mantissa;
610     return x.f;
611 }
612 
613 DE_END_EXTERN_C
614