xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/math_brute_force/binary_operator_double.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cstring>
23 
24 namespace {
25 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)26 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
27 {
28     BuildKernelInfo &info = *(BuildKernelInfo *)p;
29     auto generator = [](const std::string &kernel_name, const char *builtin,
30                         cl_uint vector_size_index) {
31         return GetBinaryKernel(kernel_name, builtin, ParameterType::Double,
32                                ParameterType::Double, ParameterType::Double,
33                                vector_size_index);
34     };
35     return BuildKernels(info, job_id, generator);
36 }
37 
38 // Thread specific data for a worker thread
39 struct ThreadInfo
40 {
41     // Input and output buffers for the thread
42     clMemWrapper inBuf;
43     clMemWrapper inBuf2;
44     Buffers outBuf;
45 
46     float maxError; // max error value. Init to 0.
47     double
48         maxErrorValue; // position of the max error value (param 1).  Init to 0.
49     double maxErrorValue2; // position of the max error value (param 2).  Init
50                            // to 0.
51     MTdataHolder d;
52 
53     // Per thread command queue to improve performance
54     clCommandQueueWrapper tQueue;
55 };
56 
57 struct TestInfo
58 {
59     size_t subBufferSize; // Size of the sub-buffer in elements
60     const Func *f; // A pointer to the function info
61 
62     // Programs for various vector sizes.
63     Programs programs;
64 
65     // Thread-specific kernels for each vector size:
66     // k[vector_size][thread_id]
67     KernelMatrix k;
68 
69     // Array of thread specific information
70     std::vector<ThreadInfo> tinfo;
71 
72     cl_uint threadCount; // Number of worker threads
73     cl_uint jobCount; // Number of jobs
74     cl_uint step; // step between each chunk and the next.
75     cl_uint scale; // stride between individual test values
76     float ulps; // max_allowed ulps
77     int ftz; // non-zero if running in flush to zero mode
78     bool relaxedMode; // True if the test is being run in relaxed mode, false
79                       // otherwise.
80 
81     // no special fields
82 };
83 
84 // A table of more difficult cases to get right
85 const double specialValues[] = {
86     -NAN,
87     -INFINITY,
88     -DBL_MAX,
89     MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
90     MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
91     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
92     MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
93     MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
94     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
95     MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
96     MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
97     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
98     MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
99     MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
100     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
101     -1000.,
102     -100.,
103     -4.0,
104     -3.5,
105     -3.0,
106     MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
107     -2.5,
108     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
109     -2.0,
110     MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
111     -1.5,
112     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
113     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
114     -1.0,
115     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
116     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
117     -0.5,
118     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
119     MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
120     -0.25,
121     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
122     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
123     -DBL_MIN,
124     MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
125     MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
126     MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
127     MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
128     MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
129     MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
130     MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
131     MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
132     MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
133     MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
134     MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
135     MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
136     MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
137     MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
138     -0.0,
139 
140     +NAN,
141     +INFINITY,
142     +DBL_MAX,
143     MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
144     MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
145     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
146     MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
147     MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
148     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
149     MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
150     MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
151     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
152     MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
153     MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
154     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
155     +1000.0,
156     +100.0,
157     +4.0,
158     +3.5,
159     +3.0,
160     MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
161     +2.5,
162     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
163     +2.0,
164     MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
165     +1.5,
166     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
167     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
168     +1.0,
169     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
170     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
171     +0.5,
172     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
173     MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
174     +0.25,
175     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
176     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
177     +DBL_MIN,
178     MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
179     MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
180     MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
181     MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
182     MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
183     MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
184     MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
185     MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
186     MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
187     MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
188     MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
189     MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
190     MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
191     MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
192     +0.0,
193 };
194 
195 constexpr size_t specialValuesCount =
196     sizeof(specialValues) / sizeof(specialValues[0]);
197 
Test(cl_uint job_id,cl_uint thread_id,void * data)198 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
199 {
200     TestInfo *job = (TestInfo *)data;
201     size_t buffer_elements = job->subBufferSize;
202     size_t buffer_size = buffer_elements * sizeof(cl_double);
203     cl_uint base = job_id * (cl_uint)job->step;
204     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
205     float ulps = job->ulps;
206     dptr func = job->f->dfunc;
207     int ftz = job->ftz;
208     bool relaxedMode = job->relaxedMode;
209     MTdata d = tinfo->d;
210     cl_int error;
211     const char *name = job->f->name;
212     cl_ulong *t;
213     cl_double *r;
214     cl_double *s;
215     cl_double *s2;
216 
217     Force64BitFPUPrecision();
218 
219     cl_event e[VECTOR_SIZE_COUNT];
220     cl_ulong *out[VECTOR_SIZE_COUNT];
221     if (gHostFill)
222     {
223         // start the map of the output arrays
224         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
225         {
226             out[j] = (cl_ulong *)clEnqueueMapBuffer(
227                 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
228                 buffer_size, 0, NULL, e + j, &error);
229             if (error || NULL == out[j])
230             {
231                 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
232                            error);
233                 return error;
234             }
235         }
236     }
237 
238     // Get that moving
239     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
240 
241     // Init input array
242     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
243     cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
244     cl_uint idx = 0;
245     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
246     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
247 
248     // Test edge cases
249     if (job_id <= (cl_uint)lastSpecialJobIndex)
250     {
251         cl_double *fp = (cl_double *)p;
252         cl_double *fp2 = (cl_double *)p2;
253         uint32_t x, y;
254 
255         x = (job_id * buffer_elements) % specialValuesCount;
256         y = (job_id * buffer_elements) / specialValuesCount;
257 
258         for (; idx < buffer_elements; idx++)
259         {
260             fp[idx] = specialValues[x];
261             fp2[idx] = specialValues[y];
262             if (++x >= specialValuesCount)
263             {
264                 x = 0;
265                 y++;
266                 if (y >= specialValuesCount) break;
267             }
268         }
269     }
270 
271     // Init any remaining values
272     for (; idx < buffer_elements; idx++)
273     {
274         p[idx] = genrand_int64(d);
275         p2[idx] = genrand_int64(d);
276     }
277 
278     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
279                                       buffer_size, p, 0, NULL, NULL)))
280     {
281         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
282         return error;
283     }
284 
285     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
286                                       buffer_size, p2, 0, NULL, NULL)))
287     {
288         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
289         return error;
290     }
291 
292     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
293     {
294         if (gHostFill)
295         {
296             // Wait for the map to finish
297             if ((error = clWaitForEvents(1, e + j)))
298             {
299                 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
300                 return error;
301             }
302             if ((error = clReleaseEvent(e[j])))
303             {
304                 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
305                 return error;
306             }
307         }
308 
309         // Fill the result buffer with garbage, so that old results don't carry
310         // over
311         uint32_t pattern = 0xffffdead;
312         if (gHostFill)
313         {
314             memset_pattern4(out[j], &pattern, buffer_size);
315             if ((error = clEnqueueUnmapMemObject(
316                      tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
317             {
318                 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
319                            error);
320                 return error;
321             }
322         }
323         else
324         {
325             if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
326                                              &pattern, sizeof(pattern), 0,
327                                              buffer_size, 0, NULL, NULL)))
328             {
329                 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
330                            error);
331                 return error;
332             }
333         }
334 
335         // Run the kernel
336         size_t vectorCount =
337             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
338         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
339                                                  // own copy of the cl_kernel
340         cl_program program = job->programs[j];
341 
342         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
343                                     &tinfo->outBuf[j])))
344         {
345             LogBuildError(program);
346             return error;
347         }
348         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
349                                     &tinfo->inBuf)))
350         {
351             LogBuildError(program);
352             return error;
353         }
354         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
355                                     &tinfo->inBuf2)))
356         {
357             LogBuildError(program);
358             return error;
359         }
360 
361         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
362                                             &vectorCount, NULL, 0, NULL, NULL)))
363         {
364             vlog_error("FAILED -- could not execute kernel\n");
365             return error;
366         }
367     }
368 
369     // Get that moving
370     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
371 
372     if (gSkipCorrectnessTesting) return CL_SUCCESS;
373 
374     // Calculate the correctly rounded reference result
375     r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
376     s = (cl_double *)gIn + thread_id * buffer_elements;
377     s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
378     for (size_t j = 0; j < buffer_elements; j++)
379         r[j] = (cl_double)func.f_ff(s[j], s2[j]);
380 
381     // Read the data back -- no need to wait for the first N-1 buffers but wait
382     // for the last buffer. This is an in order queue.
383     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
384     {
385         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
386         out[j] = (cl_ulong *)clEnqueueMapBuffer(
387             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
388             buffer_size, 0, NULL, NULL, &error);
389         if (error || NULL == out[j])
390         {
391             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
392                        error);
393             return error;
394         }
395     }
396 
397     // Verify data
398     t = (cl_ulong *)r;
399     for (size_t j = 0; j < buffer_elements; j++)
400     {
401         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
402         {
403             cl_ulong *q = out[k];
404 
405             // If we aren't getting the correctly rounded result
406             if (t[j] != q[j])
407             {
408                 cl_double test = ((cl_double *)q)[j];
409                 long double correct = func.f_ff(s[j], s2[j]);
410                 float err = Bruteforce_Ulp_Error_Double(test, correct);
411                 int fail = !(fabsf(err) <= ulps);
412 
413                 if (fail && (ftz || relaxedMode))
414                 {
415                     // retry per section 6.5.3.2
416                     if (IsDoubleResultSubnormal(correct, ulps))
417                     {
418                         fail = fail && (test != 0.0f);
419                         if (!fail) err = 0.0f;
420                     }
421 
422 
423                     // retry per section 6.5.3.3
424                     if (IsDoubleSubnormal(s[j]))
425                     {
426                         long double correct2 = func.f_ff(0.0, s2[j]);
427                         long double correct3 = func.f_ff(-0.0, s2[j]);
428                         float err2 =
429                             Bruteforce_Ulp_Error_Double(test, correct2);
430                         float err3 =
431                             Bruteforce_Ulp_Error_Double(test, correct3);
432                         fail = fail
433                             && ((!(fabsf(err2) <= ulps))
434                                 && (!(fabsf(err3) <= ulps)));
435                         if (fabsf(err2) < fabsf(err)) err = err2;
436                         if (fabsf(err3) < fabsf(err)) err = err3;
437 
438                         // retry per section 6.5.3.4
439                         if (IsDoubleResultSubnormal(correct2, ulps)
440                             || IsDoubleResultSubnormal(correct3, ulps))
441                         {
442                             fail = fail && (test != 0.0f);
443                             if (!fail) err = 0.0f;
444                         }
445 
446                         // try with both args as zero
447                         if (IsDoubleSubnormal(s2[j]))
448                         {
449                             correct2 = func.f_ff(0.0, 0.0);
450                             correct3 = func.f_ff(-0.0, 0.0);
451                             long double correct4 = func.f_ff(0.0, -0.0);
452                             long double correct5 = func.f_ff(-0.0, -0.0);
453                             err2 = Bruteforce_Ulp_Error_Double(test, correct2);
454                             err3 = Bruteforce_Ulp_Error_Double(test, correct3);
455                             float err4 =
456                                 Bruteforce_Ulp_Error_Double(test, correct4);
457                             float err5 =
458                                 Bruteforce_Ulp_Error_Double(test, correct5);
459                             fail = fail
460                                 && ((!(fabsf(err2) <= ulps))
461                                     && (!(fabsf(err3) <= ulps))
462                                     && (!(fabsf(err4) <= ulps))
463                                     && (!(fabsf(err5) <= ulps)));
464                             if (fabsf(err2) < fabsf(err)) err = err2;
465                             if (fabsf(err3) < fabsf(err)) err = err3;
466                             if (fabsf(err4) < fabsf(err)) err = err4;
467                             if (fabsf(err5) < fabsf(err)) err = err5;
468 
469                             // retry per section 6.5.3.4
470                             if (IsDoubleResultSubnormal(correct2, ulps)
471                                 || IsDoubleResultSubnormal(correct3, ulps)
472                                 || IsDoubleResultSubnormal(correct4, ulps)
473                                 || IsDoubleResultSubnormal(correct5, ulps))
474                             {
475                                 fail = fail && (test != 0.0f);
476                                 if (!fail) err = 0.0f;
477                             }
478                         }
479                     }
480                     else if (IsDoubleSubnormal(s2[j]))
481                     {
482                         long double correct2 = func.f_ff(s[j], 0.0);
483                         long double correct3 = func.f_ff(s[j], -0.0);
484                         float err2 =
485                             Bruteforce_Ulp_Error_Double(test, correct2);
486                         float err3 =
487                             Bruteforce_Ulp_Error_Double(test, correct3);
488                         fail = fail
489                             && ((!(fabsf(err2) <= ulps))
490                                 && (!(fabsf(err3) <= ulps)));
491                         if (fabsf(err2) < fabsf(err)) err = err2;
492                         if (fabsf(err3) < fabsf(err)) err = err3;
493 
494                         // retry per section 6.5.3.4
495                         if (IsDoubleResultSubnormal(correct2, ulps)
496                             || IsDoubleResultSubnormal(correct3, ulps))
497                         {
498                             fail = fail && (test != 0.0f);
499                             if (!fail) err = 0.0f;
500                         }
501                     }
502                 }
503 
504                 if (fabsf(err) > tinfo->maxError)
505                 {
506                     tinfo->maxError = fabsf(err);
507                     tinfo->maxErrorValue = s[j];
508                     tinfo->maxErrorValue2 = s2[j];
509                 }
510                 if (fail)
511                 {
512                     vlog_error(
513                         "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n",
514                         name, sizeNames[k], err, s[j], s2[j], r[j], test);
515                     return -1;
516                 }
517             }
518         }
519     }
520 
521     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
522     {
523         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
524                                              out[j], 0, NULL, NULL)))
525         {
526             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
527                        j, error);
528             return error;
529         }
530     }
531 
532     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
533 
534 
535     if (0 == (base & 0x0fffffff))
536     {
537         if (gVerboseBruteForce)
538         {
539             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
540                  "ThreadCount:%2u\n",
541                  base, job->step, job->scale, buffer_elements, job->ulps,
542                  job->threadCount);
543         }
544         else
545         {
546             vlog(".");
547         }
548         fflush(stdout);
549     }
550 
551     return CL_SUCCESS;
552 }
553 
554 } // anonymous namespace
555 
TestFunc_Double_Double_Double_Operator(const Func * f,MTdata d,bool relaxedMode)556 int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
557                                            bool relaxedMode)
558 {
559     TestInfo test_info{};
560     cl_int error;
561     float maxError = 0.0f;
562     double maxErrorVal = 0.0;
563     double maxErrorVal2 = 0.0;
564 
565     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
566 
567     // Init test_info
568     test_info.threadCount = GetThreadCount();
569     test_info.subBufferSize = BUFFER_SIZE
570         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
571     test_info.scale = getTestScale(sizeof(cl_double));
572 
573     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
574     if (test_info.step / test_info.subBufferSize != test_info.scale)
575     {
576         // there was overflow
577         test_info.jobCount = 1;
578     }
579     else
580     {
581         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
582     }
583 
584     test_info.f = f;
585     test_info.ulps = f->double_ulps;
586     test_info.ftz = f->ftz || gForceFTZ;
587 
588     test_info.tinfo.resize(test_info.threadCount);
589     for (cl_uint i = 0; i < test_info.threadCount; i++)
590     {
591         cl_buffer_region region = {
592             i * test_info.subBufferSize * sizeof(cl_double),
593             test_info.subBufferSize * sizeof(cl_double)
594         };
595         test_info.tinfo[i].inBuf =
596             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
597                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
598         if (error || NULL == test_info.tinfo[i].inBuf)
599         {
600             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
601                        "region {%zd, %zd}\n",
602                        region.origin, region.size);
603             return error;
604         }
605         test_info.tinfo[i].inBuf2 =
606             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
607                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
608         if (error || NULL == test_info.tinfo[i].inBuf2)
609         {
610             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
611                        "region {%zd, %zd}\n",
612                        region.origin, region.size);
613             return error;
614         }
615 
616         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
617         {
618             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
619                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
620                 &region, &error);
621             if (error || NULL == test_info.tinfo[i].outBuf[j])
622             {
623                 vlog_error("Error: Unable to create sub-buffer of "
624                            "gOutBuffer[%d] for region {%zd, %zd}\n",
625                            (int)j, region.origin, region.size);
626                 return error;
627             }
628         }
629         test_info.tinfo[i].tQueue =
630             clCreateCommandQueue(gContext, gDevice, 0, &error);
631         if (NULL == test_info.tinfo[i].tQueue || error)
632         {
633             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
634             return error;
635         }
636 
637         test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
638     }
639 
640     // Init the kernels
641     BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
642                                 test_info.programs, f->nameInCode,
643                                 relaxedMode };
644     if ((error = ThreadPool_Do(BuildKernelFn,
645                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
646                                &build_info)))
647         return error;
648 
649     // Run the kernels
650     if (!gSkipCorrectnessTesting)
651     {
652         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
653         if (error) return error;
654 
655         // Accumulate the arithmetic errors
656         for (cl_uint i = 0; i < test_info.threadCount; i++)
657         {
658             if (test_info.tinfo[i].maxError > maxError)
659             {
660                 maxError = test_info.tinfo[i].maxError;
661                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
662                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
663             }
664         }
665 
666         if (gWimpyMode)
667             vlog("Wimp pass");
668         else
669             vlog("passed");
670 
671         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
672     }
673 
674     vlog("\n");
675 
676     return CL_SUCCESS;
677 }
678