//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include "common.h"
#include "function_list.h"
#include "test_functions.h"
#include "utility.h"

#include <cinttypes>
#include <cstring>

namespace {

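// ThreadPool callback that builds the program and per-thread kernels for one
// vector size, using the double -> double unary kernel generator.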
cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
{
    BuildKernelInfo &info = *(BuildKernelInfo *)p;
    auto generator = [](const std::string &kernel_name, const char *builtin,
                        cl_uint vector_size_index) {
        return GetUnaryKernel(kernel_name, builtin, ParameterType::Double,
                              ParameterType::Double, vector_size_index);
    };
    return BuildKernels(info, job_id, generator);
}

// Thread specific data for a worker thread
struct ThreadInfo
{
    // Input and output buffers for the thread
    clMemWrapper inBuf;
    Buffers outBuf;

    float maxError; // max error value. Init to 0.
    double maxErrorValue; // position of the max error value. Init to 0.

    // Per thread command queue to improve performance
    clCommandQueueWrapper tQueue;
};

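// Parameters and state for one function under test, shared by all worker
// threads.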
struct TestInfo
{
    size_t subBufferSize; // Size of the sub-buffer in elements
    const Func *f; // A pointer to the function info

    // Programs for various vector sizes.
    Programs programs;

    // Thread-specific kernels for each vector size:
    // k[vector_size][thread_id]
    KernelMatrix k;

    // Array of thread specific information
    std::vector<ThreadInfo> tinfo;

    cl_uint threadCount; // Number of worker threads
    cl_uint jobCount; // Number of jobs
    cl_uint step; // step between each chunk and the next.
    cl_uint scale; // stride between individual test values
    float ulps; // max_allowed ulps
    int ftz; // non-zero if running in flush to zero mode

    int isRangeLimited; // 1 if the function is only to be evaluated over a
                        // range
    float half_sin_cos_tan_limit;
    bool relaxedMode; // True if test is running in relaxed mode, false
                      // otherwise.
};

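// Test one chunk of the input space: generate this job's double inputs, run
// the kernel for each enabled vector size, compute the reference result with
// the host long-double function, and compare within the ULP tolerance (with
// FTZ/subnormal retries where allowed).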
cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
{
    TestInfo *job = (TestInfo *)data;
    size_t buffer_elements = job->subBufferSize;
    size_t buffer_size = buffer_elements * sizeof(cl_double);
    cl_uint scale = job->scale;
    cl_uint base = job_id * (cl_uint)job->step;
    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
    float ulps = job->ulps;
    dptr func = job->f->dfunc;
    cl_int error;
    int ftz = job->ftz;
    bool relaxedMode = job->relaxedMode;

    Force64BitFPUPrecision();

    cl_event e[VECTOR_SIZE_COUNT];
    cl_ulong *out[VECTOR_SIZE_COUNT];
    if (gHostFill)
    {
        // start the map of the output arrays
        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
        {
            out[j] = (cl_ulong *)clEnqueueMapBuffer(
                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
                buffer_size, 0, NULL, e + j, &error);
            if (error || NULL == out[j])
            {
                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                           error);
                return error;
            }
        }

        // Get that moving
        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
    }

    // Write the new values to the input array
    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
    for (size_t j = 0; j < buffer_elements; j++)
        p[j] = DoubleFromUInt32(base + j * scale);

    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
                                      buffer_size, p, 0, NULL, NULL)))
    {
        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
        return error;
    }

    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        if (gHostFill)
        {
            // Wait for the map to finish
            if ((error = clWaitForEvents(1, e + j)))
            {
                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
                return error;
            }
            if ((error = clReleaseEvent(e[j])))
            {
                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
                return error;
            }
        }

        // Fill the result buffer with garbage, so that old results don't carry
        // over
        uint32_t pattern = 0xffffdead;
        if (gHostFill)
        {
            memset_pattern4(out[j], &pattern, buffer_size);
            if ((error = clEnqueueUnmapMemObject(
                     tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
            {
                vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
                           error);
                return error;
            }
        }
        else
        {
            if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
                                             &pattern, sizeof(pattern), 0,
                                             buffer_size, 0, NULL, NULL)))
            {
                vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
                           error);
                return error;
            }
        }

        // Run the kernel
        size_t vectorCount =
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
        cl_program program = job->programs[j];

        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
                                    &tinfo->outBuf[j])))
        {
            LogBuildError(program);
            return error;
        }
        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
                                    &tinfo->inBuf)))
        {
            LogBuildError(program);
            return error;
        }

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
        {
            vlog_error("FAILED -- could not execute kernel\n");
            return error;
        }
    }


    // Get that moving
    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");

    if (gSkipCorrectnessTesting) return CL_SUCCESS;

    // Calculate the correctly rounded reference result
    cl_double *r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
    cl_double *s = (cl_double *)p;
    for (size_t j = 0; j < buffer_elements; j++)
        r[j] = (cl_double)func.f_f(s[j]);

    // Read the data back -- no need to wait for the first N-1 buffers but wait
    // for the last buffer. This is an in order queue.
    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
        out[j] = (cl_ulong *)clEnqueueMapBuffer(
            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
            buffer_size, 0, NULL, NULL, &error);
        if (error || NULL == out[j])
        {
            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                       error);
            return error;
        }
    }

    // Verify data
    cl_ulong *t = (cl_ulong *)r;
    for (size_t j = 0; j < buffer_elements; j++)
    {
        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
        {
            cl_ulong *q = out[k];

            // If we aren't getting the correctly rounded result
            if (t[j] != q[j])
            {
                cl_double test = ((cl_double *)q)[j];
                long double correct = func.f_f(s[j]);
                float err = Bruteforce_Ulp_Error_Double(test, correct);
                int fail = !(fabsf(err) <= ulps);

                if (fail)
                {
                    if (ftz || relaxedMode)
                    {
                        // retry per section 6.5.3.2
                        if (IsDoubleResultSubnormal(correct, ulps))
                        {
                            fail = fail && (test != 0.0f);
                            if (!fail) err = 0.0f;
                        }

                        // retry per section 6.5.3.3
                        if (IsDoubleSubnormal(s[j]))
                        {
                            long double correct2 = func.f_f(0.0L);
                            long double correct3 = func.f_f(-0.0L);
                            float err2 =
                                Bruteforce_Ulp_Error_Double(test, correct2);
                            float err3 =
                                Bruteforce_Ulp_Error_Double(test, correct3);
                            fail = fail
                                && ((!(fabsf(err2) <= ulps))
                                    && (!(fabsf(err3) <= ulps)));
                            if (fabsf(err2) < fabsf(err)) err = err2;
                            if (fabsf(err3) < fabsf(err)) err = err3;

                            // retry per section 6.5.3.4
                            if (IsDoubleResultSubnormal(correct2, ulps)
                                || IsDoubleResultSubnormal(correct3, ulps))
                            {
                                fail = fail && (test != 0.0f);
                                if (!fail) err = 0.0f;
                            }
                        }
                    }
                }
                if (fabsf(err) > tinfo->maxError)
                {
                    tinfo->maxError = fabsf(err);
                    tinfo->maxErrorValue = s[j];
                }
                if (fail)
                {
                    // Report the failing value from this thread's slice of the
                    // input/reference buffers (s and r)
                    vlog_error("\nERROR: %s%s: %f ulp error at %.13la "
                               "(0x%16.16" PRIx64 "): *%.13la vs. %.13la\n",
                               job->f->name, sizeNames[k], err, s[j],
                               ((cl_ulong *)s)[j], r[j], test);
                    return -1;
                }
            }
        }
    }

    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                             out[j], 0, NULL, NULL)))
        {
            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
                       j, error);
            return error;
        }
    }

    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");


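    // Emit a progress marker whenever the low 28 bits of 'base' are zero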
    if (0 == (base & 0x0fffffff))
    {
        if (gVerboseBruteForce)
        {
            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
                 "ThreadCount:%2u\n",
                 base, job->step, job->scale, buffer_elements, job->ulps,
                 job->threadCount);
        }
        else
        {
            vlog(".");
        }
        fflush(stdout);
    }

    return CL_SUCCESS;
}

} // anonymous namespace

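// Entry point for the double -> double brute-force test: set up per-thread
// sub-buffers and command queues, build kernels for every enabled vector size,
// run the Test jobs on the thread pool, and report the worst-case ULP error.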
int TestFunc_Double_Double(const Func *f, MTdata d, bool relaxedMode)
{
    TestInfo test_info{};
    cl_int error;
    float maxError = 0.0f;
    double maxErrorVal = 0.0;

    logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
    // Init test_info
    test_info.threadCount = GetThreadCount();
    test_info.subBufferSize = BUFFER_SIZE
        / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
    test_info.scale = getTestScale(sizeof(cl_double));

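    // Each job covers step = subBufferSize * scale input patterns; size the
    // job count so that all jobs together cover the full 2^32 space of 32-bit
    // seed values, falling back to a single job if step overflows.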
    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
    if (test_info.step / test_info.subBufferSize != test_info.scale)
    {
        // there was overflow
        test_info.jobCount = 1;
    }
    else
    {
        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
    }

    test_info.f = f;
    test_info.ulps = f->double_ulps;
    test_info.ftz = f->ftz || gForceFTZ;
    test_info.relaxedMode = relaxedMode;

    test_info.tinfo.resize(test_info.threadCount);
    for (cl_uint i = 0; i < test_info.threadCount; i++)
    {
        cl_buffer_region region = {
            i * test_info.subBufferSize * sizeof(cl_double),
            test_info.subBufferSize * sizeof(cl_double)
        };
        test_info.tinfo[i].inBuf =
            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
        if (error || NULL == test_info.tinfo[i].inBuf)
        {
            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
                       "region {%zd, %zd}\n",
                       region.origin, region.size);
            return error;
        }

        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
        {
            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
                &region, &error);
            if (error || NULL == test_info.tinfo[i].outBuf[j])
            {
                vlog_error("Error: Unable to create sub-buffer of "
                           "gOutBuffer[%d] for region {%zd, %zd}\n",
                           (int)j, region.origin, region.size);
                return error;
            }
        }
        test_info.tinfo[i].tQueue =
            clCreateCommandQueue(gContext, gDevice, 0, &error);
        if (NULL == test_info.tinfo[i].tQueue || error)
        {
            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
            return error;
        }
    }

    // Init the kernels
    BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
                                test_info.programs, f->nameInCode,
                                relaxedMode };
    if ((error = ThreadPool_Do(BuildKernelFn,
                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
                               &build_info)))
        return error;

    // Run the kernels
    if (!gSkipCorrectnessTesting)
    {
        error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
        if (error) return error;

        // Accumulate the arithmetic errors
        for (cl_uint i = 0; i < test_info.threadCount; i++)
        {
            if (test_info.tinfo[i].maxError > maxError)
            {
                maxError = test_info.tinfo[i].maxError;
                maxErrorVal = test_info.tinfo[i].maxErrorValue;
            }
        }

        if (gWimpyMode)
            vlog("Wimp pass");
        else
            vlog("passed");

        vlog("\t%8.2f @ %a", maxError, maxErrorVal);
    }

    vlog("\n");

    return CL_SUCCESS;
}