xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/math_brute_force/binary_double.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cstring>
23 
24 namespace {
25 
26 const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
27 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)28 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
29 {
30     BuildKernelInfo &info = *(BuildKernelInfo *)p;
31     auto generator = [](const std::string &kernel_name, const char *builtin,
32                         cl_uint vector_size_index) {
33         return GetBinaryKernel(kernel_name, builtin, ParameterType::Double,
34                                ParameterType::Double, ParameterType::Double,
35                                vector_size_index);
36     };
37     return BuildKernels(info, job_id, generator);
38 }
39 
40 // Thread specific data for a worker thread
41 struct ThreadInfo
42 {
43     // Input and output buffers for the thread
44     clMemWrapper inBuf;
45     clMemWrapper inBuf2;
46     Buffers outBuf;
47 
48     float maxError; // max error value. Init to 0.
49     double
50         maxErrorValue; // position of the max error value (param 1).  Init to 0.
51     double maxErrorValue2; // position of the max error value (param 2).  Init
52                            // to 0.
53     MTdataHolder d;
54 
55     // Per thread command queue to improve performance
56     clCommandQueueWrapper tQueue;
57 };
58 
59 struct TestInfo
60 {
61     size_t subBufferSize; // Size of the sub-buffer in elements
62     const Func *f; // A pointer to the function info
63 
64     // Programs for various vector sizes.
65     Programs programs;
66 
67     // Thread-specific kernels for each vector size:
68     // k[vector_size][thread_id]
69     KernelMatrix k;
70 
71     // Array of thread specific information
72     std::vector<ThreadInfo> tinfo;
73 
74     cl_uint threadCount; // Number of worker threads
75     cl_uint jobCount; // Number of jobs
76     cl_uint step; // step between each chunk and the next.
77     cl_uint scale; // stride between individual test values
78     float ulps; // max_allowed ulps
79     int ftz; // non-zero if running in flush to zero mode
80 
81     int isFDim;
82     int skipNanInf;
83     int isNextafter;
84     bool relaxedMode; // True if test is running in relaxed mode, false
85                       // otherwise.
86 };
87 
88 // A table of more difficult cases to get right
89 const double specialValues[] = {
90     -NAN,
91     -INFINITY,
92     -DBL_MAX,
93     MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
94     MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
95     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
96     MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
97     MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
98     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
99     MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
100     MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
101     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
102     MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
103     MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
104     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
105     -1000.0,
106     -100.0,
107     -4.0,
108     -3.5,
109     -3.0,
110     MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
111     -2.5,
112     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
113     -2.0,
114     MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
115     -1.5,
116     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
117     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
118     -1.0,
119     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
120     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
121     -0.5,
122     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
123     MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
124     -0.25,
125     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
126     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
127     -DBL_MIN,
128     MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
129     MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
130     MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
131     MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
132     MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
133     MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
134     MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
135     MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
136     MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
137     MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
138     MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
139     MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
140     MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
141     MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
142     -0.0,
143 
144     +NAN,
145     +INFINITY,
146     +DBL_MAX,
147     MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
148     MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
149     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
150     MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
151     MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
152     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
153     MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
154     MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
155     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
156     MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
157     MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
158     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
159     +1000.0,
160     +100.0,
161     +4.0,
162     +3.5,
163     +3.0,
164     MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
165     +2.5,
166     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
167     +2.0,
168     MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
169     +1.5,
170     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
171     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
172     +1.0,
173     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
174     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
175     +0.5,
176     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
177     MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
178     +0.25,
179     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
180     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
181     +DBL_MIN,
182     MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
183     MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
184     MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
185     MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
186     MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
187     MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
188     MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
189     MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
190     MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
191     MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
192     MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
193     MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
194     MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
195     MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
196     +0.0,
197 };
198 
199 constexpr size_t specialValuesCount =
200     sizeof(specialValues) / sizeof(specialValues[0]);
201 
Test(cl_uint job_id,cl_uint thread_id,void * data)202 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
203 {
204     TestInfo *job = (TestInfo *)data;
205     size_t buffer_elements = job->subBufferSize;
206     size_t buffer_size = buffer_elements * sizeof(cl_double);
207     cl_uint base = job_id * (cl_uint)job->step;
208     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
209     float ulps = job->ulps;
210     dptr func = job->f->dfunc;
211     int ftz = job->ftz;
212     bool relaxedMode = job->relaxedMode;
213     MTdata d = tinfo->d;
214     cl_int error;
215     const char *name = job->f->name;
216 
217     int isNextafter = job->isNextafter;
218     cl_ulong *t;
219     cl_double *r;
220     cl_double *s;
221     cl_double *s2;
222 
223     Force64BitFPUPrecision();
224 
225     cl_event e[VECTOR_SIZE_COUNT];
226     cl_ulong *out[VECTOR_SIZE_COUNT];
227     if (gHostFill)
228     {
229         // start the map of the output arrays
230         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
231         {
232             out[j] = (cl_ulong *)clEnqueueMapBuffer(
233                 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
234                 buffer_size, 0, NULL, e + j, &error);
235             if (error || NULL == out[j])
236             {
237                 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
238                            error);
239                 return error;
240             }
241         }
242 
243         // Get that moving
244         if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
245     }
246 
247     // Init input array
248     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
249     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
250     cl_uint idx = 0;
251     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
252     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
253 
254     // Test edge cases
255     if (job_id <= (cl_uint)lastSpecialJobIndex)
256     {
257         cl_double *fp = (cl_double *)p;
258         cl_double *fp2 = (cl_double *)p2;
259         uint32_t x, y;
260 
261         x = (job_id * buffer_elements) % specialValuesCount;
262         y = (job_id * buffer_elements) / specialValuesCount;
263 
264         for (; idx < buffer_elements; idx++)
265         {
266             fp[idx] = specialValues[x];
267             fp2[idx] = specialValues[y];
268             if (++x >= specialValuesCount)
269             {
270                 x = 0;
271                 y++;
272                 if (y >= specialValuesCount) break;
273             }
274         }
275     }
276 
277     // Init any remaining values
278     for (; idx < buffer_elements; idx++)
279     {
280         p[idx] = genrand_int64(d);
281         p2[idx] = genrand_int64(d);
282     }
283 
284     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
285                                       buffer_size, p, 0, NULL, NULL)))
286     {
287         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
288         return error;
289     }
290 
291     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
292                                       buffer_size, p2, 0, NULL, NULL)))
293     {
294         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
295         return error;
296     }
297 
298     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
299     {
300         if (gHostFill)
301         {
302             // Wait for the map to finish
303             if ((error = clWaitForEvents(1, e + j)))
304             {
305                 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
306                 return error;
307             }
308             if ((error = clReleaseEvent(e[j])))
309             {
310                 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
311                 return error;
312             }
313         }
314 
315         // Fill the result buffer with garbage, so that old results don't carry
316         // over
317         uint32_t pattern = 0xffffdead;
318         if (gHostFill)
319         {
320             memset_pattern4(out[j], &pattern, buffer_size);
321             if ((error = clEnqueueUnmapMemObject(
322                      tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
323             {
324                 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
325                            error);
326                 return error;
327             }
328         }
329         else
330         {
331             if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
332                                              &pattern, sizeof(pattern), 0,
333                                              buffer_size, 0, NULL, NULL)))
334             {
335                 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
336                            error);
337                 return error;
338             }
339         }
340 
341         // Run the kernel
342         size_t vectorCount =
343             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
344         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
345                                                  // own copy of the cl_kernel
346         cl_program program = job->programs[j];
347 
348         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
349                                     &tinfo->outBuf[j])))
350         {
351             LogBuildError(program);
352             return error;
353         }
354         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
355                                     &tinfo->inBuf)))
356         {
357             LogBuildError(program);
358             return error;
359         }
360         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
361                                     &tinfo->inBuf2)))
362         {
363             LogBuildError(program);
364             return error;
365         }
366 
367         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
368                                             &vectorCount, NULL, 0, NULL, NULL)))
369         {
370             vlog_error("FAILED -- could not execute kernel\n");
371             return error;
372         }
373     }
374 
375     // Get that moving
376     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
377 
378     if (gSkipCorrectnessTesting) return CL_SUCCESS;
379 
380     // Calculate the correctly rounded reference result
381     r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
382     s = (cl_double *)gIn + thread_id * buffer_elements;
383     s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
384     for (size_t j = 0; j < buffer_elements; j++)
385         r[j] = (cl_double)func.f_ff(s[j], s2[j]);
386 
387     // Read the data back -- no need to wait for the first N-1 buffers but wait
388     // for the last buffer. This is an in order queue.
389     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
390     {
391         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
392         out[j] = (cl_ulong *)clEnqueueMapBuffer(
393             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
394             buffer_size, 0, NULL, NULL, &error);
395         if (error || NULL == out[j])
396         {
397             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
398                        error);
399             return error;
400         }
401     }
402 
403     // Verify data
404     t = (cl_ulong *)r;
405     for (size_t j = 0; j < buffer_elements; j++)
406     {
407         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
408         {
409             cl_ulong *q = out[k];
410 
411             // If we aren't getting the correctly rounded result
412             if (t[j] != q[j])
413             {
414                 cl_double test = ((cl_double *)q)[j];
415                 long double correct = func.f_ff(s[j], s2[j]);
416                 float err = Bruteforce_Ulp_Error_Double(test, correct);
417                 int fail = !(fabsf(err) <= ulps);
418 
419                 if (fail && (ftz || relaxedMode))
420                 {
421                     // retry per section 6.5.3.2
422                     if (IsDoubleResultSubnormal(correct, ulps))
423                     {
424                         fail = fail && (test != 0.0f);
425                         if (!fail) err = 0.0f;
426                     }
427 
428                     // nextafter on FTZ platforms may return the smallest
429                     // normal float (2^-126) given a denormal or a zero
430                     // as the first argument. The rationale here is that
431                     // nextafter flushes the argument to zero and then
432                     // returns the next representable number in the
433                     // direction of the second argument, and since
434                     // denorms are considered as zero, the smallest
435                     // normal number is the next representable number.
436                     // In which case, it should have the same sign as the
437                     // second argument.
438                     if (isNextafter)
439                     {
440                         if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
441                         {
442                             cl_double value = copysign(twoToMinus1022, s2[j]);
443                             fail = fail && (test != value);
444                             if (!fail) err = 0.0f;
445                         }
446                     }
447                     else
448                     {
449                         // retry per section 6.5.3.3
450                         if (IsDoubleSubnormal(s[j]))
451                         {
452                             long double correct2 = func.f_ff(0.0, s2[j]);
453                             long double correct3 = func.f_ff(-0.0, s2[j]);
454                             float err2 =
455                                 Bruteforce_Ulp_Error_Double(test, correct2);
456                             float err3 =
457                                 Bruteforce_Ulp_Error_Double(test, correct3);
458                             fail = fail
459                                 && ((!(fabsf(err2) <= ulps))
460                                     && (!(fabsf(err3) <= ulps)));
461                             if (fabsf(err2) < fabsf(err)) err = err2;
462                             if (fabsf(err3) < fabsf(err)) err = err3;
463 
464                             // retry per section 6.5.3.4
465                             if (IsDoubleResultSubnormal(correct2, ulps)
466                                 || IsDoubleResultSubnormal(correct3, ulps))
467                             {
468                                 fail = fail && (test != 0.0f);
469                                 if (!fail) err = 0.0f;
470                             }
471 
472                             // try with both args as zero
473                             if (IsDoubleSubnormal(s2[j]))
474                             {
475                                 correct2 = func.f_ff(0.0, 0.0);
476                                 correct3 = func.f_ff(-0.0, 0.0);
477                                 long double correct4 = func.f_ff(0.0, -0.0);
478                                 long double correct5 = func.f_ff(-0.0, -0.0);
479                                 err2 =
480                                     Bruteforce_Ulp_Error_Double(test, correct2);
481                                 err3 =
482                                     Bruteforce_Ulp_Error_Double(test, correct3);
483                                 float err4 =
484                                     Bruteforce_Ulp_Error_Double(test, correct4);
485                                 float err5 =
486                                     Bruteforce_Ulp_Error_Double(test, correct5);
487                                 fail = fail
488                                     && ((!(fabsf(err2) <= ulps))
489                                         && (!(fabsf(err3) <= ulps))
490                                         && (!(fabsf(err4) <= ulps))
491                                         && (!(fabsf(err5) <= ulps)));
492                                 if (fabsf(err2) < fabsf(err)) err = err2;
493                                 if (fabsf(err3) < fabsf(err)) err = err3;
494                                 if (fabsf(err4) < fabsf(err)) err = err4;
495                                 if (fabsf(err5) < fabsf(err)) err = err5;
496 
497                                 // retry per section 6.5.3.4
498                                 if (IsDoubleResultSubnormal(correct2, ulps)
499                                     || IsDoubleResultSubnormal(correct3, ulps)
500                                     || IsDoubleResultSubnormal(correct4, ulps)
501                                     || IsDoubleResultSubnormal(correct5, ulps))
502                                 {
503                                     fail = fail && (test != 0.0f);
504                                     if (!fail) err = 0.0f;
505                                 }
506                             }
507                         }
508                         else if (IsDoubleSubnormal(s2[j]))
509                         {
510                             long double correct2 = func.f_ff(s[j], 0.0);
511                             long double correct3 = func.f_ff(s[j], -0.0);
512                             float err2 =
513                                 Bruteforce_Ulp_Error_Double(test, correct2);
514                             float err3 =
515                                 Bruteforce_Ulp_Error_Double(test, correct3);
516                             fail = fail
517                                 && ((!(fabsf(err2) <= ulps))
518                                     && (!(fabsf(err3) <= ulps)));
519                             if (fabsf(err2) < fabsf(err)) err = err2;
520                             if (fabsf(err3) < fabsf(err)) err = err3;
521 
522                             // retry per section 6.5.3.4
523                             if (IsDoubleResultSubnormal(correct2, ulps)
524                                 || IsDoubleResultSubnormal(correct3, ulps))
525                             {
526                                 fail = fail && (test != 0.0f);
527                                 if (!fail) err = 0.0f;
528                             }
529                         }
530                     }
531                 }
532 
533                 if (fabsf(err) > tinfo->maxError)
534                 {
535                     tinfo->maxError = fabsf(err);
536                     tinfo->maxErrorValue = s[j];
537                     tinfo->maxErrorValue2 = s2[j];
538                 }
539                 if (fail)
540                 {
541                     vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, "
542                                "%.13la}: *%.13la vs. %.13la\n",
543                                name, sizeNames[k], err, s[j], s2[j], r[j],
544                                test);
545                     return -1;
546                 }
547             }
548         }
549     }
550 
551     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
552     {
553         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
554                                              out[j], 0, NULL, NULL)))
555         {
556             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
557                        j, error);
558             return error;
559         }
560     }
561 
562     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
563 
564 
565     if (0 == (base & 0x0fffffff))
566     {
567         if (gVerboseBruteForce)
568         {
569             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
570                  "ThreadCount:%2u\n",
571                  base, job->step, job->scale, buffer_elements, job->ulps,
572                  job->threadCount);
573         }
574         else
575         {
576             vlog(".");
577         }
578         fflush(stdout);
579     }
580 
581     return CL_SUCCESS;
582 }
583 
584 } // anonymous namespace
585 
TestFunc_Double_Double_Double(const Func * f,MTdata d,bool relaxedMode)586 int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
587 {
588     TestInfo test_info{};
589     cl_int error;
590     float maxError = 0.0f;
591     double maxErrorVal = 0.0;
592     double maxErrorVal2 = 0.0;
593 
594     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
595 
596     // Init test_info
597     test_info.threadCount = GetThreadCount();
598     test_info.subBufferSize = BUFFER_SIZE
599         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
600     test_info.scale = getTestScale(sizeof(cl_double));
601 
602     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
603     if (test_info.step / test_info.subBufferSize != test_info.scale)
604     {
605         // there was overflow
606         test_info.jobCount = 1;
607     }
608     else
609     {
610         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
611     }
612 
613     test_info.f = f;
614     test_info.ulps = f->double_ulps;
615     test_info.ftz = f->ftz || gForceFTZ;
616     test_info.relaxedMode = relaxedMode;
617 
618     test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
619     test_info.skipNanInf = 0;
620     test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
621 
622     test_info.tinfo.resize(test_info.threadCount);
623     for (cl_uint i = 0; i < test_info.threadCount; i++)
624     {
625         cl_buffer_region region = {
626             i * test_info.subBufferSize * sizeof(cl_double),
627             test_info.subBufferSize * sizeof(cl_double)
628         };
629         test_info.tinfo[i].inBuf =
630             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
631                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
632         if (error || NULL == test_info.tinfo[i].inBuf)
633         {
634             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
635                        "region {%zd, %zd}\n",
636                        region.origin, region.size);
637             return error;
638         }
639         test_info.tinfo[i].inBuf2 =
640             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
641                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
642         if (error || NULL == test_info.tinfo[i].inBuf2)
643         {
644             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
645                        "region {%zd, %zd}\n",
646                        region.origin, region.size);
647             return error;
648         }
649 
650         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
651         {
652             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
653                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
654                 &region, &error);
655             if (error || NULL == test_info.tinfo[i].outBuf[j])
656             {
657                 vlog_error("Error: Unable to create sub-buffer of "
658                            "gOutBuffer[%d] for region {%zd, %zd}\n",
659                            (int)j, region.origin, region.size);
660                 return error;
661             }
662         }
663         test_info.tinfo[i].tQueue =
664             clCreateCommandQueue(gContext, gDevice, 0, &error);
665         if (NULL == test_info.tinfo[i].tQueue || error)
666         {
667             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
668             return error;
669         }
670 
671         test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
672     }
673 
674     // Init the kernels
675     BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
676                                 test_info.programs, f->nameInCode,
677                                 relaxedMode };
678     if ((error = ThreadPool_Do(BuildKernelFn,
679                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
680                                &build_info)))
681         return error;
682 
683     // Run the kernels
684     if (!gSkipCorrectnessTesting)
685     {
686         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
687         if (error) return error;
688 
689         // Accumulate the arithmetic errors
690         for (cl_uint i = 0; i < test_info.threadCount; i++)
691         {
692             if (test_info.tinfo[i].maxError > maxError)
693             {
694                 maxError = test_info.tinfo[i].maxError;
695                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
696                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
697             }
698         }
699 
700         if (gWimpyMode)
701             vlog("Wimp pass");
702         else
703             vlog("passed");
704 
705         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
706     }
707 
708     vlog("\n");
709 
710     return CL_SUCCESS;
711 }
712