1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21
22 #include <cstring>
23
24 namespace {
25
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)26 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
27 {
28 BuildKernelInfo &info = *(BuildKernelInfo *)p;
29 auto generator = [](const std::string &kernel_name, const char *builtin,
30 cl_uint vector_size_index) {
31 return GetBinaryKernel(kernel_name, builtin, ParameterType::Double,
32 ParameterType::Double, ParameterType::Double,
33 vector_size_index);
34 };
35 return BuildKernels(info, job_id, generator);
36 }
37
38 // Thread specific data for a worker thread
39 struct ThreadInfo
40 {
41 // Input and output buffers for the thread
42 clMemWrapper inBuf;
43 clMemWrapper inBuf2;
44 Buffers outBuf;
45
46 float maxError; // max error value. Init to 0.
47 double
48 maxErrorValue; // position of the max error value (param 1). Init to 0.
49 double maxErrorValue2; // position of the max error value (param 2). Init
50 // to 0.
51 MTdataHolder d;
52
53 // Per thread command queue to improve performance
54 clCommandQueueWrapper tQueue;
55 };
56
57 struct TestInfo
58 {
59 size_t subBufferSize; // Size of the sub-buffer in elements
60 const Func *f; // A pointer to the function info
61
62 // Programs for various vector sizes.
63 Programs programs;
64
65 // Thread-specific kernels for each vector size:
66 // k[vector_size][thread_id]
67 KernelMatrix k;
68
69 // Array of thread specific information
70 std::vector<ThreadInfo> tinfo;
71
72 cl_uint threadCount; // Number of worker threads
73 cl_uint jobCount; // Number of jobs
74 cl_uint step; // step between each chunk and the next.
75 cl_uint scale; // stride between individual test values
76 float ulps; // max_allowed ulps
77 int ftz; // non-zero if running in flush to zero mode
78 bool relaxedMode; // True if the test is being run in relaxed mode, false
79 // otherwise.
80
81 // no special fields
82 };
83
84 // A table of more difficult cases to get right
85 const double specialValues[] = {
86 -NAN,
87 -INFINITY,
88 -DBL_MAX,
89 MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
90 MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
91 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
92 MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
93 MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
94 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
95 MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
96 MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
97 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
98 MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
99 MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
100 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
101 -1000.,
102 -100.,
103 -4.0,
104 -3.5,
105 -3.0,
106 MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
107 -2.5,
108 MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
109 -2.0,
110 MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
111 -1.5,
112 MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
113 MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
114 -1.0,
115 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
116 MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
117 -0.5,
118 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
119 MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
120 -0.25,
121 MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
122 MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
123 -DBL_MIN,
124 MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
125 MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
126 MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
127 MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
128 MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
129 MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
130 MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
131 MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
132 MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
133 MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
134 MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
135 MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
136 MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
137 MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
138 -0.0,
139
140 +NAN,
141 +INFINITY,
142 +DBL_MAX,
143 MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
144 MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
145 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
146 MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
147 MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
148 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
149 MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
150 MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
151 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
152 MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
153 MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
154 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
155 +1000.0,
156 +100.0,
157 +4.0,
158 +3.5,
159 +3.0,
160 MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
161 +2.5,
162 MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
163 +2.0,
164 MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
165 +1.5,
166 MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
167 MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
168 +1.0,
169 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
170 MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
171 +0.5,
172 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
173 MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
174 +0.25,
175 MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
176 MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
177 +DBL_MIN,
178 MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
179 MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
180 MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
181 MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
182 MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
183 MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
184 MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
185 MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
186 MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
187 MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
188 MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
189 MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
190 MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
191 MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
192 +0.0,
193 };
194
195 constexpr size_t specialValuesCount =
196 sizeof(specialValues) / sizeof(specialValues[0]);
197
Test(cl_uint job_id,cl_uint thread_id,void * data)198 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
199 {
200 TestInfo *job = (TestInfo *)data;
201 size_t buffer_elements = job->subBufferSize;
202 size_t buffer_size = buffer_elements * sizeof(cl_double);
203 cl_uint base = job_id * (cl_uint)job->step;
204 ThreadInfo *tinfo = &(job->tinfo[thread_id]);
205 float ulps = job->ulps;
206 dptr func = job->f->dfunc;
207 int ftz = job->ftz;
208 bool relaxedMode = job->relaxedMode;
209 MTdata d = tinfo->d;
210 cl_int error;
211 const char *name = job->f->name;
212 cl_ulong *t;
213 cl_double *r;
214 cl_double *s;
215 cl_double *s2;
216
217 Force64BitFPUPrecision();
218
219 cl_event e[VECTOR_SIZE_COUNT];
220 cl_ulong *out[VECTOR_SIZE_COUNT];
221 if (gHostFill)
222 {
223 // start the map of the output arrays
224 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
225 {
226 out[j] = (cl_ulong *)clEnqueueMapBuffer(
227 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
228 buffer_size, 0, NULL, e + j, &error);
229 if (error || NULL == out[j])
230 {
231 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
232 error);
233 return error;
234 }
235 }
236 }
237
238 // Get that moving
239 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
240
241 // Init input array
242 cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
243 cl_ulong *p2 = (cl_ulong *)gIn2 + thread_id * buffer_elements;
244 cl_uint idx = 0;
245 int totalSpecialValueCount = specialValuesCount * specialValuesCount;
246 int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
247
248 // Test edge cases
249 if (job_id <= (cl_uint)lastSpecialJobIndex)
250 {
251 cl_double *fp = (cl_double *)p;
252 cl_double *fp2 = (cl_double *)p2;
253 uint32_t x, y;
254
255 x = (job_id * buffer_elements) % specialValuesCount;
256 y = (job_id * buffer_elements) / specialValuesCount;
257
258 for (; idx < buffer_elements; idx++)
259 {
260 fp[idx] = specialValues[x];
261 fp2[idx] = specialValues[y];
262 if (++x >= specialValuesCount)
263 {
264 x = 0;
265 y++;
266 if (y >= specialValuesCount) break;
267 }
268 }
269 }
270
271 // Init any remaining values
272 for (; idx < buffer_elements; idx++)
273 {
274 p[idx] = genrand_int64(d);
275 p2[idx] = genrand_int64(d);
276 }
277
278 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
279 buffer_size, p, 0, NULL, NULL)))
280 {
281 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
282 return error;
283 }
284
285 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
286 buffer_size, p2, 0, NULL, NULL)))
287 {
288 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
289 return error;
290 }
291
292 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
293 {
294 if (gHostFill)
295 {
296 // Wait for the map to finish
297 if ((error = clWaitForEvents(1, e + j)))
298 {
299 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
300 return error;
301 }
302 if ((error = clReleaseEvent(e[j])))
303 {
304 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
305 return error;
306 }
307 }
308
309 // Fill the result buffer with garbage, so that old results don't carry
310 // over
311 uint32_t pattern = 0xffffdead;
312 if (gHostFill)
313 {
314 memset_pattern4(out[j], &pattern, buffer_size);
315 if ((error = clEnqueueUnmapMemObject(
316 tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
317 {
318 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
319 error);
320 return error;
321 }
322 }
323 else
324 {
325 if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
326 &pattern, sizeof(pattern), 0,
327 buffer_size, 0, NULL, NULL)))
328 {
329 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
330 error);
331 return error;
332 }
333 }
334
335 // Run the kernel
336 size_t vectorCount =
337 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
338 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
339 // own copy of the cl_kernel
340 cl_program program = job->programs[j];
341
342 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
343 &tinfo->outBuf[j])))
344 {
345 LogBuildError(program);
346 return error;
347 }
348 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
349 &tinfo->inBuf)))
350 {
351 LogBuildError(program);
352 return error;
353 }
354 if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
355 &tinfo->inBuf2)))
356 {
357 LogBuildError(program);
358 return error;
359 }
360
361 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
362 &vectorCount, NULL, 0, NULL, NULL)))
363 {
364 vlog_error("FAILED -- could not execute kernel\n");
365 return error;
366 }
367 }
368
369 // Get that moving
370 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
371
372 if (gSkipCorrectnessTesting) return CL_SUCCESS;
373
374 // Calculate the correctly rounded reference result
375 r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
376 s = (cl_double *)gIn + thread_id * buffer_elements;
377 s2 = (cl_double *)gIn2 + thread_id * buffer_elements;
378 for (size_t j = 0; j < buffer_elements; j++)
379 r[j] = (cl_double)func.f_ff(s[j], s2[j]);
380
381 // Read the data back -- no need to wait for the first N-1 buffers but wait
382 // for the last buffer. This is an in order queue.
383 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
384 {
385 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
386 out[j] = (cl_ulong *)clEnqueueMapBuffer(
387 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
388 buffer_size, 0, NULL, NULL, &error);
389 if (error || NULL == out[j])
390 {
391 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
392 error);
393 return error;
394 }
395 }
396
397 // Verify data
398 t = (cl_ulong *)r;
399 for (size_t j = 0; j < buffer_elements; j++)
400 {
401 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
402 {
403 cl_ulong *q = out[k];
404
405 // If we aren't getting the correctly rounded result
406 if (t[j] != q[j])
407 {
408 cl_double test = ((cl_double *)q)[j];
409 long double correct = func.f_ff(s[j], s2[j]);
410 float err = Bruteforce_Ulp_Error_Double(test, correct);
411 int fail = !(fabsf(err) <= ulps);
412
413 if (fail && (ftz || relaxedMode))
414 {
415 // retry per section 6.5.3.2
416 if (IsDoubleResultSubnormal(correct, ulps))
417 {
418 fail = fail && (test != 0.0f);
419 if (!fail) err = 0.0f;
420 }
421
422
423 // retry per section 6.5.3.3
424 if (IsDoubleSubnormal(s[j]))
425 {
426 long double correct2 = func.f_ff(0.0, s2[j]);
427 long double correct3 = func.f_ff(-0.0, s2[j]);
428 float err2 =
429 Bruteforce_Ulp_Error_Double(test, correct2);
430 float err3 =
431 Bruteforce_Ulp_Error_Double(test, correct3);
432 fail = fail
433 && ((!(fabsf(err2) <= ulps))
434 && (!(fabsf(err3) <= ulps)));
435 if (fabsf(err2) < fabsf(err)) err = err2;
436 if (fabsf(err3) < fabsf(err)) err = err3;
437
438 // retry per section 6.5.3.4
439 if (IsDoubleResultSubnormal(correct2, ulps)
440 || IsDoubleResultSubnormal(correct3, ulps))
441 {
442 fail = fail && (test != 0.0f);
443 if (!fail) err = 0.0f;
444 }
445
446 // try with both args as zero
447 if (IsDoubleSubnormal(s2[j]))
448 {
449 correct2 = func.f_ff(0.0, 0.0);
450 correct3 = func.f_ff(-0.0, 0.0);
451 long double correct4 = func.f_ff(0.0, -0.0);
452 long double correct5 = func.f_ff(-0.0, -0.0);
453 err2 = Bruteforce_Ulp_Error_Double(test, correct2);
454 err3 = Bruteforce_Ulp_Error_Double(test, correct3);
455 float err4 =
456 Bruteforce_Ulp_Error_Double(test, correct4);
457 float err5 =
458 Bruteforce_Ulp_Error_Double(test, correct5);
459 fail = fail
460 && ((!(fabsf(err2) <= ulps))
461 && (!(fabsf(err3) <= ulps))
462 && (!(fabsf(err4) <= ulps))
463 && (!(fabsf(err5) <= ulps)));
464 if (fabsf(err2) < fabsf(err)) err = err2;
465 if (fabsf(err3) < fabsf(err)) err = err3;
466 if (fabsf(err4) < fabsf(err)) err = err4;
467 if (fabsf(err5) < fabsf(err)) err = err5;
468
469 // retry per section 6.5.3.4
470 if (IsDoubleResultSubnormal(correct2, ulps)
471 || IsDoubleResultSubnormal(correct3, ulps)
472 || IsDoubleResultSubnormal(correct4, ulps)
473 || IsDoubleResultSubnormal(correct5, ulps))
474 {
475 fail = fail && (test != 0.0f);
476 if (!fail) err = 0.0f;
477 }
478 }
479 }
480 else if (IsDoubleSubnormal(s2[j]))
481 {
482 long double correct2 = func.f_ff(s[j], 0.0);
483 long double correct3 = func.f_ff(s[j], -0.0);
484 float err2 =
485 Bruteforce_Ulp_Error_Double(test, correct2);
486 float err3 =
487 Bruteforce_Ulp_Error_Double(test, correct3);
488 fail = fail
489 && ((!(fabsf(err2) <= ulps))
490 && (!(fabsf(err3) <= ulps)));
491 if (fabsf(err2) < fabsf(err)) err = err2;
492 if (fabsf(err3) < fabsf(err)) err = err3;
493
494 // retry per section 6.5.3.4
495 if (IsDoubleResultSubnormal(correct2, ulps)
496 || IsDoubleResultSubnormal(correct3, ulps))
497 {
498 fail = fail && (test != 0.0f);
499 if (!fail) err = 0.0f;
500 }
501 }
502 }
503
504 if (fabsf(err) > tinfo->maxError)
505 {
506 tinfo->maxError = fabsf(err);
507 tinfo->maxErrorValue = s[j];
508 tinfo->maxErrorValue2 = s2[j];
509 }
510 if (fail)
511 {
512 vlog_error(
513 "\nERROR: %s%s: %f ulp error at {%a, %a}: *%a vs. %a\n",
514 name, sizeNames[k], err, s[j], s2[j], r[j], test);
515 return -1;
516 }
517 }
518 }
519 }
520
521 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
522 {
523 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
524 out[j], 0, NULL, NULL)))
525 {
526 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
527 j, error);
528 return error;
529 }
530 }
531
532 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
533
534
535 if (0 == (base & 0x0fffffff))
536 {
537 if (gVerboseBruteForce)
538 {
539 vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
540 "ThreadCount:%2u\n",
541 base, job->step, job->scale, buffer_elements, job->ulps,
542 job->threadCount);
543 }
544 else
545 {
546 vlog(".");
547 }
548 fflush(stdout);
549 }
550
551 return CL_SUCCESS;
552 }
553
554 } // anonymous namespace
555
TestFunc_Double_Double_Double_Operator(const Func * f,MTdata d,bool relaxedMode)556 int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata d,
557 bool relaxedMode)
558 {
559 TestInfo test_info{};
560 cl_int error;
561 float maxError = 0.0f;
562 double maxErrorVal = 0.0;
563 double maxErrorVal2 = 0.0;
564
565 logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
566
567 // Init test_info
568 test_info.threadCount = GetThreadCount();
569 test_info.subBufferSize = BUFFER_SIZE
570 / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
571 test_info.scale = getTestScale(sizeof(cl_double));
572
573 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
574 if (test_info.step / test_info.subBufferSize != test_info.scale)
575 {
576 // there was overflow
577 test_info.jobCount = 1;
578 }
579 else
580 {
581 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
582 }
583
584 test_info.f = f;
585 test_info.ulps = f->double_ulps;
586 test_info.ftz = f->ftz || gForceFTZ;
587
588 test_info.tinfo.resize(test_info.threadCount);
589 for (cl_uint i = 0; i < test_info.threadCount; i++)
590 {
591 cl_buffer_region region = {
592 i * test_info.subBufferSize * sizeof(cl_double),
593 test_info.subBufferSize * sizeof(cl_double)
594 };
595 test_info.tinfo[i].inBuf =
596 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
597 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
598 if (error || NULL == test_info.tinfo[i].inBuf)
599 {
600 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
601 "region {%zd, %zd}\n",
602 region.origin, region.size);
603 return error;
604 }
605 test_info.tinfo[i].inBuf2 =
606 clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
607 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
608 if (error || NULL == test_info.tinfo[i].inBuf2)
609 {
610 vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
611 "region {%zd, %zd}\n",
612 region.origin, region.size);
613 return error;
614 }
615
616 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
617 {
618 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
619 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
620 ®ion, &error);
621 if (error || NULL == test_info.tinfo[i].outBuf[j])
622 {
623 vlog_error("Error: Unable to create sub-buffer of "
624 "gOutBuffer[%d] for region {%zd, %zd}\n",
625 (int)j, region.origin, region.size);
626 return error;
627 }
628 }
629 test_info.tinfo[i].tQueue =
630 clCreateCommandQueue(gContext, gDevice, 0, &error);
631 if (NULL == test_info.tinfo[i].tQueue || error)
632 {
633 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
634 return error;
635 }
636
637 test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
638 }
639
640 // Init the kernels
641 BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
642 test_info.programs, f->nameInCode,
643 relaxedMode };
644 if ((error = ThreadPool_Do(BuildKernelFn,
645 gMaxVectorSizeIndex - gMinVectorSizeIndex,
646 &build_info)))
647 return error;
648
649 // Run the kernels
650 if (!gSkipCorrectnessTesting)
651 {
652 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
653 if (error) return error;
654
655 // Accumulate the arithmetic errors
656 for (cl_uint i = 0; i < test_info.threadCount; i++)
657 {
658 if (test_info.tinfo[i].maxError > maxError)
659 {
660 maxError = test_info.tinfo[i].maxError;
661 maxErrorVal = test_info.tinfo[i].maxErrorValue;
662 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
663 }
664 }
665
666 if (gWimpyMode)
667 vlog("Wimp pass");
668 else
669 vlog("passed");
670
671 vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
672 }
673
674 vlog("\n");
675
676 return CL_SUCCESS;
677 }
678