1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <stdio.h>
17 #include <string.h>
18 #include "harness/testHarness.h"
19 #include "harness/typeWrappers.h"
20
21 #include <algorithm>
22 #include <vector>
23
24 #include "procs.h"
25 #include "utils.h"
26 #include <time.h>
27
28
29 #ifdef CL_VERSION_2_0
30 extern int gWimpyMode;
/* OpenCL C source for the 1D global-size-only test: the parent kernel enqueues
 * one child 1D grid per entry of glob_size_arr; every child work-item
 * atomically increments val[global_linear_id % len] for host validation. */
static const char *helper_ndrange_1d_glo[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "    atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
    "memory_order_relaxed, memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "    size_t tid = get_global_id(0);" NL,
    "    void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "    for(int i = 0; i < n; i++)" NL,
    "    {" NL,
    "        ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);" NL,
    "        int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    /* Any enqueue failure is reported through the per-work-item status buffer. */
    "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "    }" NL,
    "}" NL
};
55
/* 1D variant with explicit local size: enqueues every (global, local) pair for
 * which the global size can hold at least one work-group of the local size. */
static const char *helper_ndrange_1d_loc[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "    atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
    "memory_order_relaxed, memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "    size_t tid = get_global_id(0);" NL,
    "    void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "    for(int k = 0; k < n; k++)" NL,
    "    {" NL,
    "        for(int i = 0; i < n; i++)" NL,
    "        {" NL,
    /* Skip combinations the runtime would reject (local > global). */
    "            if (glob_size_arr[i] >= loc_size_arr[k])" NL,
    "            {" NL,
    "                ndrange_t ndrange = ndrange_1D(glob_size_arr[i], "
    "loc_size_arr[k]);" NL,
    "                int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    "                if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "            }" NL,
    "        }" NL,
    "    }" NL,
    "}" NL
};
87
/* 1D variant with global offset: iterates (offset, local, global) triples; the
 * child folds its offset into the counter index, so the host reference must
 * apply the same (offset + id) % len mapping. */
static const char *helper_ndrange_1d_ofs[] = {
    NL,
    "void block_fn(int len, __global atomic_uint* val)" NL,
    "{" NL,
    "    atomic_fetch_add_explicit(&val[(get_global_offset(0) + "
    "get_global_linear_id()) % len], 1u, memory_order_relaxed, "
    "memory_scope_device);" NL,
    "}" NL,
    "" NL,
    "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, "
    "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
    "atomic_uint* val, __global uint* ofs_arr)" NL,
    "{" NL,
    "    size_t tid = get_global_id(0);" NL,
    "    void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
    "" NL,
    "    for(int l = 0; l < n; l++)" NL,
    "    {" NL,
    "        for(int k = 0; k < n; k++)" NL,
    "        {" NL,
    "            for(int i = 0; i < n; i++)" NL,
    "            {" NL,
    /* Only enqueue when the local size fits the global size. */
    "                if (glob_size_arr[i] >= loc_size_arr[k])" NL,
    "                {" NL,
    "                    ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], "
    "loc_size_arr[k]);" NL,
    "                    int enq_res = enqueue_kernel(get_default_queue(), "
    "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
    "                    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
    "                }" NL,
    "            }" NL,
    "        }" NL,
    "    }" NL,
    "}" NL
};
123
124 static const char *helper_ndrange_2d_glo[] = {
125 NL,
126 "void block_fn(int len, __global atomic_uint* val)" NL,
127 "{" NL,
128 " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
129 "memory_order_relaxed, memory_scope_device);" NL,
130 "}" NL,
131 "" NL,
132 "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, "
133 "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
134 "val, __global uint* ofs_arr)" NL,
135 "{" NL,
136 " size_t tid = get_global_id(0);" NL,
137 " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
138 "" NL,
139 " for(int i = 0; i < n; i++)" NL,
140 " {" NL,
141 " size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] "
142 "};" NL,
143 " ndrange_t ndrange = ndrange_2D(glob_size);" NL,
144 " int enq_res = enqueue_kernel(get_default_queue(), "
145 "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
146 " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
147 " }" NL,
148 "}" NL
149 };
150
151 static const char *helper_ndrange_2d_loc[] = {
152 NL,
153 "void block_fn(int len, __global atomic_uint* val)" NL,
154 "{" NL,
155 " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
156 "memory_order_relaxed, memory_scope_device);" NL,
157 "}" NL,
158 "" NL,
159 "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, "
160 "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
161 "val, __global uint* ofs_arr)" NL,
162 "{" NL,
163 " size_t tid = get_global_id(0);" NL,
164 " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
165 "" NL,
166 " for(int k = 0; k < n; k++)" NL,
167 " {" NL,
168 " for(int i = 0; i < n; i++)" NL,
169 " {" NL,
170 " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
171 " {" NL,
172 " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
173 "n] };" NL,
174 " size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
175 "" NL,
176 " ndrange_t ndrange = ndrange_2D(glob_size, loc_size);" NL,
177 " int enq_res = enqueue_kernel(get_default_queue(), "
178 "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
179 " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
180 " }" NL,
181 " }" NL,
182 " }" NL,
183 "}" NL
184 };
185
186
187 static const char *helper_ndrange_2d_ofs[] = {
188 NL,
189 "void block_fn(int len, __global atomic_uint* val)" NL,
190 "{" NL,
191 " atomic_fetch_add_explicit(&val[(get_global_offset(1) * "
192 "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
193 "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
194 "}" NL,
195 "" NL,
196 "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, "
197 "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
198 "val, __global uint* ofs_arr)" NL,
199 "{" NL,
200 " size_t tid = get_global_id(0);" NL,
201 " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
202 "" NL,
203 " for(int l = 0; l < n; l++)" NL,
204 " {" NL,
205 " for(int k = 0; k < n; k++)" NL,
206 " {" NL,
207 " for(int i = 0; i < n; i++)" NL,
208 " {" NL,
209 " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
210 " {" NL,
211 " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) "
212 "% n]};" NL,
213 " size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
214 " size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };" NL,
215 "" NL,
216 " ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);" NL,
217 " int enq_res = enqueue_kernel(get_default_queue(), "
218 "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
219 " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
220 " }" NL,
221 " }" NL,
222 " }" NL,
223 " }" NL,
224 "}" NL
225 };
226
227
228 static const char *helper_ndrange_3d_glo[] = {
229 NL,
230 "void block_fn(int len, __global atomic_uint* val)" NL,
231 "{" NL,
232 " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
233 "memory_order_relaxed, memory_scope_device);" NL,
234 "}" NL,
235 "" NL,
236 "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, "
237 "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
238 "val, __global uint* ofs_arr)" NL,
239 "{" NL,
240 " size_t tid = get_global_id(0);" NL,
241 " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
242 "" NL,
243 " for(int i = 0; i < n; i++)" NL,
244 " {" NL,
245 " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % "
246 "n] * glob_size_arr[(i + 2) % n];" NL,
247 " if (global_work_size <= (len * len))" NL,
248 " {" NL,
249 " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
250 "n], glob_size_arr[(i + 2) % n] };" NL,
251 " ndrange_t ndrange = ndrange_3D(glob_size);" NL,
252 " int enq_res = enqueue_kernel(get_default_queue(), "
253 "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
254 " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
255 " }" NL,
256 " }" NL,
257 "}" NL
258 };
259
260
261 static const char *helper_ndrange_3d_loc[] = {
262 NL,
263 "void block_fn(int len, __global atomic_uint* val)" NL,
264 "{" NL,
265 " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
266 "memory_order_relaxed, memory_scope_device);" NL,
267 "}" NL,
268 "" NL,
269 "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, "
270 "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
271 "val, __global uint* ofs_arr)" NL,
272 "{" NL,
273 " size_t tid = get_global_id(0);" NL,
274 " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
275 "" NL,
276 " for(int k = 0; k < n; k++)" NL,
277 " {" NL,
278 " for(int i = 0; i < n; i++)" NL,
279 " {" NL,
280 " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % "
281 "n] * glob_size_arr[(i + 2) % n];" NL,
282 " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
283 "global_work_size <= (len * len))" NL,
284 " {" NL,
285 " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
286 "n], glob_size_arr[(i + 2) % n] };" NL,
287 " size_t loc_size[] = { 1, 1, loc_size_arr[k] };" NL,
288 " ndrange_t ndrange = ndrange_3D(glob_size,loc_size);" NL,
289 " int enq_res = enqueue_kernel(get_default_queue(), "
290 "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
291 " " NL,
292 " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
293 " }" NL,
294 " }" NL,
295 " }" NL,
296 "}" NL
297 };
298
299 static const char *helper_ndrange_3d_ofs[] = {
300 NL,
301 "void block_fn(int len, __global atomic_uint* val)" NL,
302 "{" NL,
303 " atomic_fetch_add_explicit(&val[(get_global_offset(2) * "
304 "get_global_size(0) * get_global_size(1) + get_global_offset(1) * "
305 "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
306 "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
307 "}" NL,
308 "" NL,
309 "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, "
310 "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
311 "val, __global uint* ofs_arr)" NL,
312 "{" NL,
313 " size_t tid = get_global_id(0);" NL,
314 " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
315 "" NL,
316 " for(int l = 0; l < n; l++)" NL,
317 " {" NL,
318 " for(int k = 0; k < n; k++)" NL,
319 " {" NL,
320 " for(int i = 0; i < n; i++)" NL,
321 " {" NL,
322 " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) "
323 "% n] * glob_size_arr[(i + 2) % n];" NL,
324 " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
325 "global_work_size <= (len * len))" NL,
326 " {" NL,
327 " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) "
328 "% n], glob_size_arr[(i + 2) % n]};" NL,
329 " size_t loc_size[3] = { 1, 1, loc_size_arr[k] };" NL,
330 " size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l "
331 "+ 2) % n] };" NL,
332 " ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);" NL,
333 " int enq_res = enqueue_kernel(get_default_queue(), "
334 "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
335 " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
336 " }" NL,
337 " }" NL,
338 " }" NL,
339 " }" NL,
340 "}" NL
341 };
342
// Table of all helper kernels under test.  Per kernel_src_dim_check entry:
// kernel source, work dimensionality, whether the variant passes an explicit
// local size, and whether it passes a global offset.  check_kernel_results()
// dispatches on the last three fields to pick the matching host reference.
static const kernel_src_dim_check sources_ndrange_Xd[] =
{
    { KERNEL(helper_ndrange_1d_glo), 1, CL_FALSE, CL_FALSE},
    { KERNEL(helper_ndrange_1d_loc), 1, CL_TRUE, CL_FALSE},
    { KERNEL(helper_ndrange_1d_ofs), 1, CL_TRUE, CL_TRUE},
    { KERNEL(helper_ndrange_2d_glo), 2, CL_FALSE, CL_FALSE},
    { KERNEL(helper_ndrange_2d_loc), 2, CL_TRUE, CL_FALSE},
    { KERNEL(helper_ndrange_2d_ofs), 2, CL_TRUE, CL_TRUE},
    { KERNEL(helper_ndrange_3d_glo), 3, CL_FALSE, CL_FALSE},
    { KERNEL(helper_ndrange_3d_loc), 3, CL_TRUE, CL_FALSE},
    { KERNEL(helper_ndrange_3d_ofs), 3, CL_TRUE, CL_TRUE},
};
// Number of entries in the table above.
static const size_t num_kernels_ndrange_Xd = arr_size(sources_ndrange_Xd);
356
check_kernel_results(cl_int * results,cl_int len)357 static int check_kernel_results(cl_int* results, cl_int len)
358 {
359 for(cl_int i = 0; i < len; ++i)
360 {
361 if(results[i] != 0) return i;
362 }
363 return -1;
364 }
365
generate_reference_1D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr)366 void generate_reference_1D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr)
367 {
368 for (size_t g = 0; g < glob_size_arr.size(); ++g)
369 {
370 for (size_t w = 0; w < glob_size_arr[g]; ++w)
371 {
372 ++reference_results[w];
373 }
374 }
375 }
376
generate_reference_1D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr)377 void generate_reference_1D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr)
378 {
379 for (size_t g = 0; g < glob_size_arr.size(); ++g)
380 {
381 for (size_t l = 0; l < loc_size_arr.size(); ++l)
382 {
383 if (glob_size_arr[g] >= loc_size_arr[l])
384 {
385 for (size_t w = 0; w < glob_size_arr[g]; ++w)
386 {
387 ++reference_results[w];
388 }
389 }
390 }
391 }
392 }
393
generate_reference_1D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)394 void generate_reference_1D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
395 {
396 for (size_t g = 0; g < glob_size_arr.size(); ++g)
397 {
398 for (size_t l = 0; l < loc_size_arr.size(); ++l)
399 {
400 if (glob_size_arr[g] >= loc_size_arr[l])
401 {
402 for (size_t o = 0; o < offset.size(); ++o)
403 {
404 for (size_t w = 0; w < glob_size_arr[g]; ++w)
405 {
406 ++reference_results[(offset[o] + w) % len];
407 }
408 }
409 }
410 }
411 }
412 }
413
generate_reference_2D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,cl_uint len)414 void generate_reference_2D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
415 {
416 for (size_t g = 0; g < glob_size_arr.size(); ++g)
417 {
418 for (size_t h = 0; h < glob_size_arr[(g + 1) % glob_size_arr.size()]; ++h)
419 {
420 for (size_t w = 0; w < glob_size_arr[g]; ++w)
421 {
422 ++reference_results[(h * glob_size_arr[g] + w) % len];
423 }
424 }
425 }
426 }
427
generate_reference_2D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,cl_uint len)428 void generate_reference_2D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
429 {
430 size_t n = glob_size_arr.size();
431 for (size_t g = 0; g < glob_size_arr.size(); ++g)
432 {
433 for (size_t l = 0; l < loc_size_arr.size(); ++l)
434 {
435 if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
436 {
437 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
438 {
439 for (size_t w = 0; w < glob_size_arr[g]; ++w)
440 {
441 ++reference_results[(h * glob_size_arr[g] + w) % len];
442 }
443 }
444 }
445 }
446 }
447 }
448
/* Host reference for helper_ndrange_2d_ofs: mirrors the device-side loops over
 * (global, local, offset) combinations.  Per the kernel's block_fn, a
 * work-item at (w, h) with offset (offset[o], offset[(o+1)%n]) increments
 * counter (off_y * dim_x + off_x + h * dim_x + w) % len. */
void generate_reference_2D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
{
    size_t n = glob_size_arr.size();
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        for (size_t l = 0; l < loc_size_arr.size(); ++l)
        {
            // Device helper only enqueues when dim 1 can hold the local size.
            if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
            {
                for (size_t o = 0; o < offset.size(); ++o)
                {
                    for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
                    {
                        for (size_t w = 0; w < glob_size_arr[g]; ++w)
                        {
                            // off_y*dim_x + off_x + linear id, wrapped to len.
                            ++reference_results[(glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + h * glob_size_arr[g] + w) % len];
                        }
                    }
                }
            }
        }
    }
}
472
generate_reference_3D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,cl_uint len)473 void generate_reference_3D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
474 {
475 size_t n = glob_size_arr.size();
476 for (size_t g = 0; g < glob_size_arr.size(); ++g)
477 {
478 size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
479 if(global_work_size <= (len * len))
480 {
481 for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
482 {
483 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
484 {
485 for (size_t w = 0; w < glob_size_arr[g]; ++w)
486 {
487 ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
488 }
489 }
490 }
491 }
492 }
493 }
494
/* Host reference for helper_ndrange_3d_loc: 3D row-major flattening
 * (d * dim_y * dim_x + h * dim_x + w) % len, counted only for (global, local)
 * pairs where dim 2 can hold the local size and the whole grid fits len*len —
 * both guards replicate the device-side helper. */
void generate_reference_3D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
{
    size_t n = glob_size_arr.size();
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        for (size_t l = 0; l < loc_size_arr.size(); ++l)
        {
            size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
            // Same skip conditions as the enqueue loop in the kernel source.
            if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
            {
                for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
                {
                    for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
                    {
                        for (size_t w = 0; w < glob_size_arr[g]; ++w)
                        {
                            ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
                        }
                    }
                }
            }
        }
    }
}
519
/* Host reference for helper_ndrange_3d_ofs: adds the flattened 3D offset
 * (off_z * dim_y * dim_x + off_y * dim_x + off_x) to each work-item's linear
 * id before wrapping to len, exactly as the device-side block_fn computes its
 * counter index.  Skip conditions replicate the kernel's enqueue guards. */
void generate_reference_3D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
{
    size_t n = glob_size_arr.size();
    for (size_t g = 0; g < glob_size_arr.size(); ++g)
    {
        for (size_t l = 0; l < loc_size_arr.size(); ++l)
        {
            size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
            if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
            {
                for (size_t o = 0; o < offset.size(); ++o)
                {
                    for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
                    {
                        for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
                        {
                            for (size_t w = 0; w < glob_size_arr[g]; ++w)
                            {
                                // flattened offset + flattened (w,h,d), mod len
                                ++reference_results[(glob_size_arr[g] * glob_size_arr[(g + 1) % n] * offset[(o + 2) % n] + glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
                            }
                        }
                    }
                }
            }
        }
    }
}
547
check_kernel_results(cl_int * results,cl_int len,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_int dim,cl_bool use_local,cl_bool use_offset)548 static int check_kernel_results(cl_int* results, cl_int len, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_int dim, cl_bool use_local, cl_bool use_offset)
549 {
550 std::vector<cl_int> reference_results(len, 0);
551 switch (dim)
552 {
553 case 1:
554 if (use_local == CL_FALSE)
555 {
556 generate_reference_1D(reference_results, glob_size_arr);
557 }
558 else if(use_local == CL_TRUE && use_offset == CL_FALSE)
559 {
560 generate_reference_1D_local(reference_results, glob_size_arr, loc_size_arr);
561 }
562 else
563 {
564 generate_reference_1D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
565 }
566 break;
567 case 2:
568 if (use_local == CL_FALSE)
569 {
570 generate_reference_2D(reference_results, glob_size_arr, len);
571 }
572 else if (use_local == CL_TRUE && use_offset == CL_FALSE)
573 {
574 generate_reference_2D_local(reference_results, glob_size_arr, loc_size_arr, len);
575 }
576 else
577 {
578 generate_reference_2D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
579 }
580 break;
581 case 3:
582 if (use_local == CL_FALSE)
583 {
584 generate_reference_3D(reference_results, glob_size_arr, len);
585 }
586 else if (use_local == CL_TRUE && use_offset == CL_FALSE)
587 {
588 generate_reference_3D_local(reference_results, glob_size_arr, loc_size_arr, len);
589 }
590 else
591 {
592 generate_reference_3D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
593 }
594 break;
595 default:
596 return 0;
597 break;
598 }
599
600 for (cl_int i = 0; i < len; ++i)
601 {
602 if (results[i] != reference_results[i])
603 {
604 log_error("ERROR: Kernel returned %d vs. expected %d\n", results[i], reference_results[i]);
605 return i;
606 }
607 }
608
609 return -1;
610 }
611
test_enqueue_ndrange(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)612 int test_enqueue_ndrange(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
613 {
614 MTdata d;
615 cl_uint i;
616 cl_int err_ret, res = 0;
617 clCommandQueueWrapper dev_queue;
618 cl_int k, kernel_results[MAX_GWS] = { 0 };
619
620 size_t ret_len;
621 cl_uint max_queues = 1;
622 cl_uint maxQueueSize = 0;
623
624 d = init_genrand(gRandomSeed);
625
626 err_ret = clGetDeviceInfo(device, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, sizeof(maxQueueSize), &maxQueueSize, 0);
627 test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE) failed");
628
629 err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_ON_DEVICE_QUEUES, sizeof(max_queues), &max_queues, &ret_len);
630 test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_ON_DEVICE_QUEUES) failed");
631
632 size_t max_local_size = 1;
633 err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, &ret_len);
634 test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed");
635
636 cl_queue_properties queue_prop_def[] =
637 {
638 CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT,
639 CL_QUEUE_SIZE, maxQueueSize,
640 0
641 };
642
643 dev_queue = clCreateCommandQueueWithProperties(context, device, queue_prop_def, &err_ret);
644 test_error(err_ret, "clCreateCommandQueueWithProperties(CL_QUEUE_DEVICE|CL_QUEUE_DEFAULT) failed");
645
646 max_local_size = (max_local_size > MAX_GWS)? MAX_GWS: max_local_size;
647 if(gWimpyMode)
648 {
649 max_local_size = std::min((size_t)8, max_local_size);
650 }
651
652 cl_uint num = 10;
653 cl_uint global_work_size = max_local_size * 2;
654 std::vector<cl_uint> glob_size_arr(num);
655 std::vector<cl_uint> loc_size_arr(num);
656 std::vector<cl_uint> ofs_arr(num);
657 std::vector<cl_int> glob_results(global_work_size, 0);
658
659 glob_size_arr[0] = 1;
660 glob_size_arr[1] = global_work_size;
661 loc_size_arr[0] = 1;
662 loc_size_arr[1] = max_local_size;
663 ofs_arr[0] = 0;
664 ofs_arr[1] = 1;
665
666 for(i = 2; i < num; ++i)
667 {
668 glob_size_arr[i] = genrand_int32(d) % global_work_size;
669 glob_size_arr[i] = glob_size_arr[i] ? glob_size_arr[i]: 1;
670 loc_size_arr[i] = genrand_int32(d) % max_local_size;
671 loc_size_arr[i] = loc_size_arr[i] ? loc_size_arr[i]: 1;
672 ofs_arr[i] = genrand_int32(d) % global_work_size;
673 }
674
675 // check ndrange_dX functions
676 size_t failCnt = 0;
677 for(i = 0; i < num_kernels_ndrange_Xd; ++i)
678 {
679 if (!gKernelName.empty() && gKernelName != sources_ndrange_Xd[i].src.kernel_name)
680 continue;
681
682 clMemWrapper mem1 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, glob_size_arr.size() * sizeof(cl_uint), &glob_size_arr[0], &err_ret);
683 test_error(err_ret, "clCreateBuffer() failed");
684 clMemWrapper mem2 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, loc_size_arr.size() * sizeof(cl_uint), &loc_size_arr[0], &err_ret);
685 test_error(err_ret, "clCreateBuffer() failed");
686 clMemWrapper mem3 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, glob_results.size() * sizeof(cl_int), &glob_results[0], &err_ret);
687 test_error(err_ret, "clCreateBuffer() failed");
688 clMemWrapper mem4 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, ofs_arr.size() * sizeof(cl_uint), &ofs_arr[0], &err_ret);
689 test_error(err_ret, "clCreateBuffer() failed");
690
691 kernel_arg args[] =
692 {
693 { sizeof(cl_uint), &num },
694 { sizeof(cl_uint), &global_work_size },
695 { sizeof(cl_mem), &mem1 },
696 { sizeof(cl_mem), &mem2 },
697 { sizeof(cl_mem), &mem3 },
698 { sizeof(cl_mem), &mem4 },
699 };
700
701 log_info("Running '%s' kernel (%d of %d) ...\n", sources_ndrange_Xd[i].src.kernel_name, i + 1, num_kernels_ndrange_Xd);
702 err_ret = run_single_kernel_args(context, queue, sources_ndrange_Xd[i].src.lines, sources_ndrange_Xd[i].src.num_lines, sources_ndrange_Xd[i].src.kernel_name, kernel_results, sizeof(kernel_results), arr_size(args), args);
703
704 cl_int *ptr = (cl_int *)clEnqueueMapBuffer(queue, mem3, CL_TRUE, CL_MAP_READ, 0, glob_results.size() * sizeof(cl_int), 0, 0, 0, &err_ret);
705 test_error(err_ret, "clEnqueueMapBuffer() failed");
706
707 if(check_error(err_ret, "'%s' kernel execution failed", sources_ndrange_Xd[i].src.kernel_name)) { ++failCnt; res = -1; }
708 else if((k = check_kernel_results(kernel_results, arr_size(kernel_results))) >= 0 && check_error(-1, "'%s' kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, kernel_results[k])) res = -1;
709 else if((k = check_kernel_results(ptr, global_work_size, glob_size_arr, loc_size_arr, ofs_arr, sources_ndrange_Xd[i].dim, sources_ndrange_Xd[i].localSize, sources_ndrange_Xd[i].offset)) >= 0 && check_error(-1, "'%s' global kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, glob_results[k])) res = -1;
710 else log_info("'%s' kernel is OK.\n", sources_ndrange_Xd[i].src.kernel_name);
711
712 err_ret = clEnqueueUnmapMemObject(queue, mem3, ptr, 0, 0, 0);
713 test_error(err_ret, "clEnqueueUnmapMemObject() failed");
714
715 }
716
717 if (failCnt > 0)
718 {
719 log_error("ERROR: %d of %d kernels failed.\n", failCnt, num_kernels_ndrange_Xd);
720 }
721
722 return res;
723 }
724
725
726 #endif
727
728