xref: /aosp_15_r20/external/deqp/framework/common/tcuFloatFormat.cpp (revision 35238bce31c2a825756842865a792f8cf7f89930)
1 /*-------------------------------------------------------------------------
2  * drawElements Quality Program Tester Core
3  * ----------------------------------------
4  *
5  * Copyright 2014 The Android Open Source Project
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  *      http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  *//*!
20  * \file
21  * \brief Adjustable-precision floating point operations.
22  *//*--------------------------------------------------------------------*/
23 
24 #include "tcuFloatFormat.hpp"
25 
26 #include "deMath.h"
27 #include "deUniquePtr.hpp"
28 
29 #include <sstream>
30 #include <iomanip>
31 #include <limits>
32 
33 namespace tcu
34 {
35 namespace
36 {
37 
chooseInterval(YesNoMaybe choice,const Interval & no,const Interval & yes)38 Interval chooseInterval(YesNoMaybe choice, const Interval &no, const Interval &yes)
39 {
40     switch (choice)
41     {
42     case NO:
43         return no;
44     case YES:
45         return yes;
46     case MAYBE:
47         return no | yes;
48     default:
49         DE_FATAL("Impossible case");
50     }
51 
52     return Interval();
53 }
54 
computeMaxValue(int maxExp,int fractionBits)55 double computeMaxValue(int maxExp, int fractionBits)
56 {
57     return (deLdExp(1.0, maxExp) + deLdExp(double((1ull << fractionBits) - 1), maxExp - fractionBits));
58 }
59 
60 } // namespace
61 
FloatFormat(int minExp,int maxExp,int fractionBits,bool exactPrecision,YesNoMaybe hasSubnormal_,YesNoMaybe hasInf_,YesNoMaybe hasNaN_)62 FloatFormat::FloatFormat(int minExp, int maxExp, int fractionBits, bool exactPrecision, YesNoMaybe hasSubnormal_,
63                          YesNoMaybe hasInf_, YesNoMaybe hasNaN_)
64     : m_minExp(minExp)
65     , m_maxExp(maxExp)
66     , m_fractionBits(fractionBits)
67     , m_hasSubnormal(hasSubnormal_)
68     , m_hasInf(hasInf_)
69     , m_hasNaN(hasNaN_)
70     , m_exactPrecision(exactPrecision)
71     , m_maxValue(computeMaxValue(maxExp, fractionBits))
72 {
73     DE_ASSERT(minExp <= maxExp);
74 }
75 
76 /*-------------------------------------------------------------------------
77  * On the definition of ULP
78  *
79  * The GLSL spec does not define ULP. However, it refers to IEEE 754, which
80  * (reportedly) uses Harrison's definition:
81  *
82  * ULP(x) is the distance between the closest floating point numbers
83  * a and be such that a <= x <= b and a != b
84  *
85  * Note that this means that when x = 2^n, ULP(x) = 2^(n-p-1), i.e. it is the
86  * distance to the next lowest float, not next highest.
87  *
88  * Furthermore, it is assumed that ULP is calculated relative to the exact
89  * value, not the approximation. This is because otherwise a less accurate
90  * approximation could be closer in ULPs, because its ULPs are bigger.
91  *
92  * For details, see "On the definition of ulp(x)" by Jean-Michel Muller
93  *
94  *-----------------------------------------------------------------------*/
95 
ulp(double x,double count) const96 double FloatFormat::ulp(double x, double count) const
97 {
98     int exp           = 0;
99     const double frac = deFractExp(deAbs(x), &exp);
100 
101     if (deIsNaN(frac))
102         return TCU_NAN;
103     else if (deIsInf(frac))
104         return deLdExp(1.0, m_maxExp - m_fractionBits);
105     else if (frac == 1.0)
106     {
107         // Harrison's ULP: choose distance to closest (i.e. next lower) at binade
108         // boundary.
109         --exp;
110     }
111     else if (frac == 0.0)
112         exp = m_minExp;
113 
114     // ULP cannot be lower than the smallest quantum.
115     exp = de::max(exp, m_minExp);
116 
117     {
118         const double oneULP = deLdExp(1.0, exp - m_fractionBits);
119         ScopedRoundingMode ctx(DE_ROUNDINGMODE_TO_POSITIVE_INF);
120 
121         return oneULP * count;
122     }
123 }
124 
125 //! Return the difference between the given nominal exponent and
126 //! the exponent of the lowest significand bit of the
127 //! representation of a number with this format.
128 //! For normal numbers this is the number of significand bits, but
129 //! for subnormals it is less and for values of exp where 2^exp is too
130 //! small to represent it is <0
exponentShift(int exp) const131 int FloatFormat::exponentShift(int exp) const
132 {
133     return m_fractionBits - de::max(m_minExp - exp, 0);
134 }
135 
136 //! Return the number closest to `d` that is exactly representable with the
137 //! significand bits and minimum exponent of the floatformat. Round up if
138 //! `upward` is true, otherwise down.
round(double d,bool upward) const139 double FloatFormat::round(double d, bool upward) const
140 {
141     int exp                = 0;
142     const double frac      = deFractExp(d, &exp);
143     const int shift        = exponentShift(exp);
144     const double shiftFrac = deLdExp(frac, shift);
145     const double roundFrac = upward ? deCeil(shiftFrac) : deFloor(shiftFrac);
146 
147     return deLdExp(roundFrac, exp - shift);
148 }
149 
150 //! Return the range of numbers that `d` might be converted to in the
151 //! floatformat, given its limitations with infinities, subnormals and maximum
152 //! exponent.
clampValue(double d) const153 Interval FloatFormat::clampValue(double d) const
154 {
155     const double rSign = deSign(d);
156     int rExp           = 0;
157 
158     DE_ASSERT(!deIsNaN(d));
159 
160     deFractExp(d, &rExp);
161     if (rExp < m_minExp)
162         return chooseInterval(m_hasSubnormal, rSign * 0.0, d);
163     else if (deIsInf(d) || rExp > m_maxExp)
164         return chooseInterval(m_hasInf, rSign * getMaxValue(), rSign * TCU_INFINITY);
165 
166     return Interval(d);
167 }
168 
169 //! Return the range of numbers that might be used with this format to
170 //! represent a number within `x`.
convert(const Interval & x) const171 Interval FloatFormat::convert(const Interval &x) const
172 {
173     Interval ret;
174     Interval tmp = x;
175 
176     if (x.hasNaN())
177     {
178         // If NaN might be supported, NaN is a legal return value
179         if (m_hasNaN != NO)
180             ret |= TCU_NAN;
181 
182         // If NaN might not be supported, any (non-NaN) value is legal,
183         // _subject_ to clamping. Hence we modify tmp, not ret.
184         if (m_hasNaN != YES)
185             tmp = Interval::unbounded();
186     }
187 
188     // Round both bounds _inwards_ to closest representable values.
189     if (!tmp.empty())
190         ret |= clampValue(round(tmp.lo(), false)) | clampValue(round(tmp.hi(), true));
191 
192     // If this format's precision is not exact, the (possibly out-of-bounds)
193     // original value is also a possible result.
194     if (!m_exactPrecision)
195         ret |= x;
196 
197     return ret;
198 }
199 
roundOut(double d,bool upward,bool roundUnderOverflow) const200 double FloatFormat::roundOut(double d, bool upward, bool roundUnderOverflow) const
201 {
202     int exp = 0;
203     deFractExp(d, &exp);
204 
205     if (roundUnderOverflow && exp > m_maxExp && (upward == (d < 0.0)))
206         return deSign(d) * getMaxValue();
207     else
208         return round(d, upward);
209 }
210 
211 //! Round output of an operation.
212 //! \param roundUnderOverflow Can +/-inf rounded to min/max representable;
213 //!                              should be false if any of operands was inf, true otherwise.
roundOut(const Interval & x,bool roundUnderOverflow) const214 Interval FloatFormat::roundOut(const Interval &x, bool roundUnderOverflow) const
215 {
216     Interval ret = x.nan();
217 
218     if (!x.empty())
219         ret |= Interval(roundOut(x.lo(), false, roundUnderOverflow), roundOut(x.hi(), true, roundUnderOverflow));
220 
221     return ret;
222 }
223 
floatToHex(double x) const224 std::string FloatFormat::floatToHex(double x) const
225 {
226     if (deIsNaN(x))
227         return "NaN";
228     else if (deIsInf(x))
229         return (x < 0.0 ? "-" : "+") + std::string("inf");
230     else if (x == 0.0) // \todo [2014-03-27 lauri] Negative zero
231         return "0.0";
232 
233     int exp                 = 0;
234     const double frac       = deFractExp(deAbs(x), &exp);
235     const int shift         = exponentShift(exp);
236     const uint64_t bits     = uint64_t(deLdExp(frac, shift));
237     const uint64_t whole    = bits >> m_fractionBits;
238     const uint64_t fraction = bits & ((uint64_t(1) << m_fractionBits) - 1);
239     const int exponent      = exp + m_fractionBits - shift;
240     const int numDigits     = (m_fractionBits + 3) / 4;
241     const uint64_t aligned  = fraction << (numDigits * 4 - m_fractionBits);
242     std::ostringstream oss;
243 
244     oss << (x < 0 ? "-" : "") << "0x" << whole << "." << std::hex << std::setw(numDigits) << std::setfill('0')
245         << aligned << "p" << std::dec << std::setw(0) << exponent;
246 
247     return oss.str();
248 }
249 
intervalToHex(const Interval & interval) const250 std::string FloatFormat::intervalToHex(const Interval &interval) const
251 {
252     if (interval.empty())
253         return interval.hasNaN() ? "{ NaN }" : "{}";
254 
255     else if (interval.lo() == interval.hi())
256         return (std::string(interval.hasNaN() ? "{ NaN, " : "{ ") + floatToHex(interval.lo()) + " }");
257     else if (interval == Interval::unbounded(true))
258         return "<any>";
259 
260     return (std::string(interval.hasNaN() ? "{ NaN } | " : "") + "[" + floatToHex(interval.lo()) + ", " +
261             floatToHex(interval.hi()) + "]");
262 }
263 
264 template <typename T>
nativeFormat(void)265 static FloatFormat nativeFormat(void)
266 {
267     typedef std::numeric_limits<T> Limits;
268 
269     DE_ASSERT(Limits::radix == 2);
270 
271     return FloatFormat(Limits::min_exponent - 1, // These have a built-in offset of one
272                        Limits::max_exponent - 1,
273                        Limits::digits - 1, // don't count the hidden bit
274                        Limits::has_denorm != std::denorm_absent, Limits::has_infinity ? YES : NO,
275                        Limits::has_quiet_NaN ? YES : NO,
276                        ((Limits::has_denorm == std::denorm_present) ? YES :
277                         (Limits::has_denorm == std::denorm_absent)  ? NO :
278                                                                       MAYBE));
279 }
280 
nativeFloat(void)281 FloatFormat FloatFormat::nativeFloat(void)
282 {
283     return nativeFormat<float>();
284 }
285 
nativeDouble(void)286 FloatFormat FloatFormat::nativeDouble(void)
287 {
288     return nativeFormat<double>();
289 }
290 
NormalizedFormat(int fractionBits)291 NormalizedFormat::NormalizedFormat(int fractionBits) : FloatFormat(0, 0, fractionBits, true, tcu::YES)
292 {
293 }
294 
round(double d,bool upward) const295 double NormalizedFormat::round(double d, bool upward) const
296 {
297     const int fractionBits = getFractionBits();
298 
299     if (fractionBits <= 0)
300         return d;
301 
302     const int maxIntValue  = (1 << fractionBits) - 1;
303     const double value     = d * maxIntValue;
304     const double normValue = upward ? deCeil(value) : deFloor(value);
305     return normValue / maxIntValue;
306 }
307 
ulp(double x,double count) const308 double NormalizedFormat::ulp(double x, double count) const
309 {
310     (void)x;
311 
312     const int maxIntValue  = (1 << getFractionBits()) - 1;
313     const double precision = 1.0 / maxIntValue;
314     return precision * count;
315 }
316 
roundOut(double d,bool upward,bool roundUnderOverflow) const317 double NormalizedFormat::roundOut(double d, bool upward, bool roundUnderOverflow) const
318 {
319     if (roundUnderOverflow && deAbs(d) > 1.0 && (upward == (d < 0.0)))
320         return deSign(d);
321     else
322         return round(d, upward);
323 }
324 
325 namespace
326 {
327 
328 using de::MovePtr;
329 using de::UniquePtr;
330 using std::ostringstream;
331 using std::string;
332 
333 class Test
334 {
335 protected:
Test(MovePtr<FloatFormat> fmt)336     Test(MovePtr<FloatFormat> fmt) : m_fmt(fmt)
337     {
338     }
p(int e) const339     double p(int e) const
340     {
341         return deLdExp(1.0, e);
342     }
343     void check(const string &expr, double result, double reference) const;
344     void testULP(double arg, double ref) const;
345     void testRound(double arg, double refDown, double refUp) const;
346 
347     UniquePtr<FloatFormat> m_fmt;
348 };
349 
check(const string & expr,double result,double reference) const350 void Test::check(const string &expr, double result, double reference) const
351 {
352     if (result != reference)
353     {
354         ostringstream oss;
355         oss << expr << " returned " << result << ", expected " << reference;
356         TCU_FAIL(oss.str().c_str());
357     }
358 }
359 
testULP(double arg,double ref) const360 void Test::testULP(double arg, double ref) const
361 {
362     ostringstream oss;
363 
364     oss << "ulp(" << arg << ")";
365     check(oss.str(), m_fmt->ulp(arg), ref);
366 }
367 
testRound(double arg,double refDown,double refUp) const368 void Test::testRound(double arg, double refDown, double refUp) const
369 {
370     {
371         ostringstream oss;
372         oss << "round(" << arg << ", false)";
373         check(oss.str(), m_fmt->round(arg, false), refDown);
374     }
375     {
376         ostringstream oss;
377         oss << "round(" << arg << ", true)";
378         check(oss.str(), m_fmt->round(arg, true), refUp);
379     }
380 }
381 
382 class TestBinary32 : public Test
383 {
384 public:
TestBinary32(void)385     TestBinary32(void) : Test(MovePtr<FloatFormat>(new FloatFormat(-126, 127, 23, true)))
386     {
387     }
388 
389     void runTest(void) const;
390 };
391 
runTest(void) const392 void TestBinary32::runTest(void) const
393 {
394     testULP(p(0), p(-24));
395     testULP(p(0) + p(-23), p(-23));
396     testULP(p(-124), p(-148));
397     testULP(p(-125), p(-149));
398     testULP(p(-125) + p(-140), p(-148));
399     testULP(p(-126), p(-149));
400     testULP(p(-130), p(-149));
401 
402     testRound(p(0) + p(-20) + p(-40), p(0) + p(-20), p(0) + p(-20) + p(-23));
403     testRound(p(-126) - p(-150), p(-126) - p(-149), p(-126));
404 
405     TCU_CHECK(m_fmt->floatToHex(p(0)) == "0x1.000000p0");
406     TCU_CHECK(m_fmt->floatToHex(p(8) + p(-4)) == "0x1.001000p8");
407     TCU_CHECK(m_fmt->floatToHex(p(-140)) == "0x0.000400p-126");
408     TCU_CHECK(m_fmt->floatToHex(p(-140)) == "0x0.000400p-126");
409     TCU_CHECK(m_fmt->floatToHex(p(-126) + p(-125)) == "0x1.800000p-125");
410 }
411 
412 } // namespace
413 
FloatFormat_selfTest(void)414 void FloatFormat_selfTest(void)
415 {
416     TestBinary32 test32;
417     test32.runTest();
418 }
419 
420 } // namespace tcu
421