1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21
22 #include <cstring>
23
24 namespace {
25
26 const double twoToMinus1022 = MAKE_HEX_DOUBLE(0x1p-1022, 1, -1022);
27
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)28 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
29 {
30 BuildKernelInfo &info = *(BuildKernelInfo *)p;
31 auto generator = [](const std::string &kernel_name, const char *builtin,
32 cl_uint vector_size_index) {
33 return GetBinaryKernel(kernel_name, builtin, ParameterType::Double,
34 ParameterType::Double, ParameterType::Double,
35 vector_size_index);
36 };
37 return BuildKernels(info, job_id, generator);
38 }
39
40 // Thread specific data for a worker thread
41 struct ThreadInfo
42 {
43 // Input and output buffers for the thread
44 clMemWrapper inBuf;
45 clMemWrapper inBuf2;
46 Buffers outBuf;
47
48 float maxError; // max error value. Init to 0.
49 double
50 maxErrorValue; // position of the max error value (param 1). Init to 0.
51 double maxErrorValue2; // position of the max error value (param 2). Init
52 // to 0.
53 MTdataHolder d;
54
55 // Per thread command queue to improve performance
56 clCommandQueueWrapper tQueue;
57 };
58
59 struct TestInfo
60 {
61 size_t subBufferSize; // Size of the sub-buffer in elements
62 const Func *f; // A pointer to the function info
63
64 // Programs for various vector sizes.
65 Programs programs;
66
67 // Thread-specific kernels for each vector size:
68 // k[vector_size][thread_id]
69 KernelMatrix k;
70
71 // Array of thread specific information
72 std::vector<ThreadInfo> tinfo;
73
74 cl_uint threadCount; // Number of worker threads
75 cl_uint jobCount; // Number of jobs
76 cl_uint step; // step between each chunk and the next.
77 cl_uint scale; // stride between individual test values
78 float ulps; // max_allowed ulps
79 int ftz; // non-zero if running in flush to zero mode
80
81 int isFDim;
82 int skipNanInf;
83 int isNextafter;
84 bool relaxedMode; // True if test is running in relaxed mode, false
85 // otherwise.
86 };
87
88 // A table of more difficult cases to get right
89 const double specialValues[] = {
90 -NAN,
91 -INFINITY,
92 -DBL_MAX,
93 MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
94 MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
95 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
96 MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
97 MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
98 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
99 MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
100 MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
101 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
102 MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
103 MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
104 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
105 -1000.0,
106 -100.0,
107 -4.0,
108 -3.5,
109 -3.0,
110 MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
111 -2.5,
112 MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
113 -2.0,
114 MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
115 -1.5,
116 MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
117 MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
118 -1.0,
119 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
120 MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
121 -0.5,
122 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
123 MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
124 -0.25,
125 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
126 MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
127 -DBL_MIN,
128 MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
129 MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
130 MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
131 MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
132 MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
133 MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
134 MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
135 MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
136 MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
137 MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
138 MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
139 MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
140 MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
141 MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
142 -0.0,
143
144 +NAN,
145 +INFINITY,
146 +DBL_MAX,
147 MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
148 MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
149 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
150 MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
151 MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
152 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
153 MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
154 MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
155 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
156 MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
157 MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
158 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
159 +1000.0,
160 +100.0,
161 +4.0,
162 +3.5,
163 +3.0,
164 MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
165 +2.5,
166 MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
167 +2.0,
168 MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
169 +1.5,
170 MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
171 MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
172 +1.0,
173 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
174 MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
175 +0.5,
176 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
177 MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
178 +0.25,
179 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
180 MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
181 +DBL_MIN,
182 MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
183 MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
184 MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
185 MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
186 MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
187 MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
188 MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
189 MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
190 MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
191 MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
192 MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
193 MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
194 MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
195 MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
196 +0.0,
197 };
198
199 constexpr size_t specialValuesCount =
200 sizeof(specialValues) / sizeof(specialValues[0]);
201
Test(cl_uint job_id,cl_uint thread_id,void * data)202 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
203 {
204 TestInfo *job = (TestInfo *)data;
205 size_t buffer_elements = job->subBufferSize;
206 size_t buffer_size = buffer_elements * sizeof(cl_double);
207 cl_uint base = job_id * (cl_uint)job->step;
208 ThreadInfo *tinfo = &(job->tinfo[thread_id]);
209 float ulps = job->ulps;
210 dptr func = job->f->dfunc;
211 int ftz = job->ftz;
212 bool relaxedMode = job->relaxedMode;
213 MTdata d = tinfo->d;
214 cl_int error;
215 const char *name = job->f->name;
216
217 int isNextafter = job->isNextafter;
218 cl_ulong *t;
219 cl_double *r;
220 cl_double *s;
221 cl_double *s2;
222
223 Force64BitFPUPrecision();
224
225 cl_event e[VECTOR_SIZE_COUNT];
226 cl_ulong *out[VECTOR_SIZE_COUNT];
227 if (gHostFill)
228 {
229 // start the map of the output arrays
230 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
231 {
232 out[j] = (cl_ulong *)clEnqueueMapBuffer(
233 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
234 buffer_size, 0, NULL, e + j, &error);
235 if (error || NULL == out[j])
236 {
237 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
238 error);
239 return error;
240 }
241 }
242
243 // Get that moving
244 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
245 }
246
247 // Init input array
248 cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
249 cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
250 cl_uint idx = 0;
251 int totalSpecialValueCount = specialValuesCount * specialValuesCount;
252 int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
253
254 // Test edge cases
255 if (job_id <= (cl_uint)lastSpecialJobIndex)
256 {
257 cl_double *fp = (cl_double *)p;
258 cl_double *fp2 = (cl_double *)p2;
259 uint32_t x, y;
260
261 x = (job_id * buffer_elements) % specialValuesCount;
262 y = (job_id * buffer_elements) / specialValuesCount;
263
264 for (; idx < buffer_elements; idx++)
265 {
266 fp[idx] = specialValues[x];
267 fp2[idx] = specialValues[y];
268 if (++x >= specialValuesCount)
269 {
270 x = 0;
271 y++;
272 if (y >= specialValuesCount) break;
273 }
274 }
275 }
276
277 // Init any remaining values
278 for (; idx < buffer_elements; idx++)
279 {
280 p[idx] = genrand_int64(d);
281 p2[idx] = genrand_int64(d);
282 }
283
284 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
285 buffer_size, p, 0, NULL, NULL)))
286 {
287 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
288 return error;
289 }
290
291 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
292 buffer_size, p2, 0, NULL, NULL)))
293 {
294 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
295 return error;
296 }
297
298 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
299 {
300 if (gHostFill)
301 {
302 // Wait for the map to finish
303 if ((error = clWaitForEvents(1, e + j)))
304 {
305 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
306 return error;
307 }
308 if ((error = clReleaseEvent(e[j])))
309 {
310 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
311 return error;
312 }
313 }
314
315 // Fill the result buffer with garbage, so that old results don't carry
316 // over
317 uint32_t pattern = 0xffffdead;
318 if (gHostFill)
319 {
320 memset_pattern4(out[j], &pattern, buffer_size);
321 if ((error = clEnqueueUnmapMemObject(
322 tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
323 {
324 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
325 error);
326 return error;
327 }
328 }
329 else
330 {
331 if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
332 &pattern, sizeof(pattern), 0,
333 buffer_size, 0, NULL, NULL)))
334 {
335 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
336 error);
337 return error;
338 }
339 }
340
341 // Run the kernel
342 size_t vectorCount =
343 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
344 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
345 // own copy of the cl_kernel
346 cl_program program = job->programs[j];
347
348 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
349 &tinfo->outBuf[j])))
350 {
351 LogBuildError(program);
352 return error;
353 }
354 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
355 &tinfo->inBuf)))
356 {
357 LogBuildError(program);
358 return error;
359 }
360 if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
361 &tinfo->inBuf2)))
362 {
363 LogBuildError(program);
364 return error;
365 }
366
367 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
368 &vectorCount, NULL, 0, NULL, NULL)))
369 {
370 vlog_error("FAILED -- could not execute kernel\n");
371 return error;
372 }
373 }
374
375 // Get that moving
376 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
377
378 if (gSkipCorrectnessTesting) return CL_SUCCESS;
379
380 // Calculate the correctly rounded reference result
381 r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
382 s = (cl_double *)gIn + thread_id * buffer_elements;
383 s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
384 for (size_t j = 0; j < buffer_elements; j++)
385 r[j] = (cl_double)func.f_ff(s[j], s2[j]);
386
387 // Read the data back -- no need to wait for the first N-1 buffers but wait
388 // for the last buffer. This is an in order queue.
389 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
390 {
391 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
392 out[j] = (cl_ulong *)clEnqueueMapBuffer(
393 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
394 buffer_size, 0, NULL, NULL, &error);
395 if (error || NULL == out[j])
396 {
397 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
398 error);
399 return error;
400 }
401 }
402
403 // Verify data
404 t = (cl_ulong *)r;
405 for (size_t j = 0; j < buffer_elements; j++)
406 {
407 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
408 {
409 cl_ulong *q = out[k];
410
411 // If we aren't getting the correctly rounded result
412 if (t[j] != q[j])
413 {
414 cl_double test = ((cl_double *)q)[j];
415 long double correct = func.f_ff(s[j], s2[j]);
416 float err = Bruteforce_Ulp_Error_Double(test, correct);
417 int fail = !(fabsf(err) <= ulps);
418
419 if (fail && (ftz || relaxedMode))
420 {
421 // retry per section 6.5.3.2
422 if (IsDoubleResultSubnormal(correct, ulps))
423 {
424 fail = fail && (test != 0.0f);
425 if (!fail) err = 0.0f;
426 }
427
428 // nextafter on FTZ platforms may return the smallest
429 // normal float (2^-126) given a denormal or a zero
430 // as the first argument. The rationale here is that
431 // nextafter flushes the argument to zero and then
432 // returns the next representable number in the
433 // direction of the second argument, and since
434 // denorms are considered as zero, the smallest
435 // normal number is the next representable number.
436 // In which case, it should have the same sign as the
437 // second argument.
438 if (isNextafter)
439 {
440 if (IsDoubleSubnormal(s[j]) || s[j] == 0.0f)
441 {
442 cl_double value = copysign(twoToMinus1022, s2[j]);
443 fail = fail && (test != value);
444 if (!fail) err = 0.0f;
445 }
446 }
447 else
448 {
449 // retry per section 6.5.3.3
450 if (IsDoubleSubnormal(s[j]))
451 {
452 long double correct2 = func.f_ff(0.0, s2[j]);
453 long double correct3 = func.f_ff(-0.0, s2[j]);
454 float err2 =
455 Bruteforce_Ulp_Error_Double(test, correct2);
456 float err3 =
457 Bruteforce_Ulp_Error_Double(test, correct3);
458 fail = fail
459 && ((!(fabsf(err2) <= ulps))
460 && (!(fabsf(err3) <= ulps)));
461 if (fabsf(err2) < fabsf(err)) err = err2;
462 if (fabsf(err3) < fabsf(err)) err = err3;
463
464 // retry per section 6.5.3.4
465 if (IsDoubleResultSubnormal(correct2, ulps)
466 || IsDoubleResultSubnormal(correct3, ulps))
467 {
468 fail = fail && (test != 0.0f);
469 if (!fail) err = 0.0f;
470 }
471
472 // try with both args as zero
473 if (IsDoubleSubnormal(s2[j]))
474 {
475 correct2 = func.f_ff(0.0, 0.0);
476 correct3 = func.f_ff(-0.0, 0.0);
477 long double correct4 = func.f_ff(0.0, -0.0);
478 long double correct5 = func.f_ff(-0.0, -0.0);
479 err2 =
480 Bruteforce_Ulp_Error_Double(test, correct2);
481 err3 =
482 Bruteforce_Ulp_Error_Double(test, correct3);
483 float err4 =
484 Bruteforce_Ulp_Error_Double(test, correct4);
485 float err5 =
486 Bruteforce_Ulp_Error_Double(test, correct5);
487 fail = fail
488 && ((!(fabsf(err2) <= ulps))
489 && (!(fabsf(err3) <= ulps))
490 && (!(fabsf(err4) <= ulps))
491 && (!(fabsf(err5) <= ulps)));
492 if (fabsf(err2) < fabsf(err)) err = err2;
493 if (fabsf(err3) < fabsf(err)) err = err3;
494 if (fabsf(err4) < fabsf(err)) err = err4;
495 if (fabsf(err5) < fabsf(err)) err = err5;
496
497 // retry per section 6.5.3.4
498 if (IsDoubleResultSubnormal(correct2, ulps)
499 || IsDoubleResultSubnormal(correct3, ulps)
500 || IsDoubleResultSubnormal(correct4, ulps)
501 || IsDoubleResultSubnormal(correct5, ulps))
502 {
503 fail = fail && (test != 0.0f);
504 if (!fail) err = 0.0f;
505 }
506 }
507 }
508 else if (IsDoubleSubnormal(s2[j]))
509 {
510 long double correct2 = func.f_ff(s[j], 0.0);
511 long double correct3 = func.f_ff(s[j], -0.0);
512 float err2 =
513 Bruteforce_Ulp_Error_Double(test, correct2);
514 float err3 =
515 Bruteforce_Ulp_Error_Double(test, correct3);
516 fail = fail
517 && ((!(fabsf(err2) <= ulps))
518 && (!(fabsf(err3) <= ulps)));
519 if (fabsf(err2) < fabsf(err)) err = err2;
520 if (fabsf(err3) < fabsf(err)) err = err3;
521
522 // retry per section 6.5.3.4
523 if (IsDoubleResultSubnormal(correct2, ulps)
524 || IsDoubleResultSubnormal(correct3, ulps))
525 {
526 fail = fail && (test != 0.0f);
527 if (!fail) err = 0.0f;
528 }
529 }
530 }
531 }
532
533 if (fabsf(err) > tinfo->maxError)
534 {
535 tinfo->maxError = fabsf(err);
536 tinfo->maxErrorValue = s[j];
537 tinfo->maxErrorValue2 = s2[j];
538 }
539 if (fail)
540 {
541 vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, "
542 "%.13la}: *%.13la vs. %.13la\n",
543 name, sizeNames[k], err, s[j], s2[j], r[j],
544 test);
545 return -1;
546 }
547 }
548 }
549 }
550
551 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
552 {
553 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
554 out[j], 0, NULL, NULL)))
555 {
556 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
557 j, error);
558 return error;
559 }
560 }
561
562 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
563
564
565 if (0 == (base & 0x0fffffff))
566 {
567 if (gVerboseBruteForce)
568 {
569 vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
570 "ThreadCount:%2u\n",
571 base, job->step, job->scale, buffer_elements, job->ulps,
572 job->threadCount);
573 }
574 else
575 {
576 vlog(".");
577 }
578 fflush(stdout);
579 }
580
581 return CL_SUCCESS;
582 }
583
584 } // anonymous namespace
585
TestFunc_Double_Double_Double(const Func * f,MTdata d,bool relaxedMode)586 int TestFunc_Double_Double_Double(const Func *f, MTdata d, bool relaxedMode)
587 {
588 TestInfo test_info{};
589 cl_int error;
590 float maxError = 0.0f;
591 double maxErrorVal = 0.0;
592 double maxErrorVal2 = 0.0;
593
594 logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
595
596 // Init test_info
597 test_info.threadCount = GetThreadCount();
598 test_info.subBufferSize = BUFFER_SIZE
599 / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
600 test_info.scale = getTestScale(sizeof(cl_double));
601
602 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
603 if (test_info.step / test_info.subBufferSize != test_info.scale)
604 {
605 // there was overflow
606 test_info.jobCount = 1;
607 }
608 else
609 {
610 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
611 }
612
613 test_info.f = f;
614 test_info.ulps = f->double_ulps;
615 test_info.ftz = f->ftz || gForceFTZ;
616 test_info.relaxedMode = relaxedMode;
617
618 test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
619 test_info.skipNanInf = 0;
620 test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
621
622 test_info.tinfo.resize(test_info.threadCount);
623 for (cl_uint i = 0; i < test_info.threadCount; i++)
624 {
625 cl_buffer_region region = {
626 i * test_info.subBufferSize * sizeof(cl_double),
627 test_info.subBufferSize * sizeof(cl_double)
628 };
629 test_info.tinfo[i].inBuf =
630 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
631 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
632 if (error || NULL == test_info.tinfo[i].inBuf)
633 {
634 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
635 "region {%zd, %zd}\n",
636 region.origin, region.size);
637 return error;
638 }
639 test_info.tinfo[i].inBuf2 =
640 clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
641 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
642 if (error || NULL == test_info.tinfo[i].inBuf2)
643 {
644 vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
645 "region {%zd, %zd}\n",
646 region.origin, region.size);
647 return error;
648 }
649
650 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
651 {
652 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
653 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
654 ®ion, &error);
655 if (error || NULL == test_info.tinfo[i].outBuf[j])
656 {
657 vlog_error("Error: Unable to create sub-buffer of "
658 "gOutBuffer[%d] for region {%zd, %zd}\n",
659 (int)j, region.origin, region.size);
660 return error;
661 }
662 }
663 test_info.tinfo[i].tQueue =
664 clCreateCommandQueue(gContext, gDevice, 0, &error);
665 if (NULL == test_info.tinfo[i].tQueue || error)
666 {
667 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
668 return error;
669 }
670
671 test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
672 }
673
674 // Init the kernels
675 BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
676 test_info.programs, f->nameInCode,
677 relaxedMode };
678 if ((error = ThreadPool_Do(BuildKernelFn,
679 gMaxVectorSizeIndex - gMinVectorSizeIndex,
680 &build_info)))
681 return error;
682
683 // Run the kernels
684 if (!gSkipCorrectnessTesting)
685 {
686 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
687 if (error) return error;
688
689 // Accumulate the arithmetic errors
690 for (cl_uint i = 0; i < test_info.threadCount; i++)
691 {
692 if (test_info.tinfo[i].maxError > maxError)
693 {
694 maxError = test_info.tinfo[i].maxError;
695 maxErrorVal = test_info.tinfo[i].maxErrorValue;
696 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
697 }
698 }
699
700 if (gWimpyMode)
701 vlog("Wimp pass");
702 else
703 vlog("passed");
704
705 vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
706 }
707
708 vlog("\n");
709
710 return CL_SUCCESS;
711 }
712