xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/device_execution/enqueue_ndrange.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include <stdio.h>
17 #include <string.h>
18 #include "harness/testHarness.h"
19 #include "harness/typeWrappers.h"
20 
21 #include <algorithm>
22 #include <vector>
23 
24 #include "procs.h"
25 #include "utils.h"
26 #include <time.h>
27 
28 
29 #ifdef CL_VERSION_2_0
30 extern int gWimpyMode;
31 static const char *helper_ndrange_1d_glo[] = {
32     NL,
33     "void block_fn(int len, __global atomic_uint* val)" NL,
34     "{" NL,
35     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
36     "memory_order_relaxed, memory_scope_device);" NL,
37     "}" NL,
38     "" NL,
39     "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, "
40     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
41     "atomic_uint* val,  __global uint* ofs_arr)" NL,
42     "{" NL,
43     "  size_t tid = get_global_id(0);" NL,
44     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
45     "" NL,
46     "  for(int i = 0; i < n; i++)" NL,
47     "  {" NL,
48     "    ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);" NL,
49     "    int enq_res = enqueue_kernel(get_default_queue(), "
50     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
51     "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
52     "  }" NL,
53     "}" NL
54 };
55 
56 static const char *helper_ndrange_1d_loc[] = {
57     NL,
58     "void block_fn(int len, __global atomic_uint* val)" NL,
59     "{" NL,
60     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
61     "memory_order_relaxed, memory_scope_device);" NL,
62     "}" NL,
63     "" NL,
64     "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, "
65     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
66     "atomic_uint* val,  __global uint* ofs_arr)" NL,
67     "{" NL,
68     "  size_t tid = get_global_id(0);" NL,
69     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
70     "" NL,
71     "  for(int k = 0; k < n; k++)" NL,
72     "  {" NL,
73     "    for(int i = 0; i < n; i++)" NL,
74     "    {" NL,
75     "      if (glob_size_arr[i] >= loc_size_arr[k])" NL,
76     "      {" NL,
77     "        ndrange_t ndrange = ndrange_1D(glob_size_arr[i], "
78     "loc_size_arr[k]);" NL,
79     "        int enq_res = enqueue_kernel(get_default_queue(), "
80     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
81     "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
82     "      }" NL,
83     "    }" NL,
84     "  }" NL,
85     "}" NL
86 };
87 
88 static const char *helper_ndrange_1d_ofs[] = {
89     NL,
90     "void block_fn(int len, __global atomic_uint* val)" NL,
91     "{" NL,
92     "  atomic_fetch_add_explicit(&val[(get_global_offset(0) + "
93     "get_global_linear_id()) % len], 1u, memory_order_relaxed, "
94     "memory_scope_device);" NL,
95     "}" NL,
96     "" NL,
97     "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, "
98     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
99     "atomic_uint* val,  __global uint* ofs_arr)" NL,
100     "{" NL,
101     "  size_t tid = get_global_id(0);" NL,
102     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
103     "" NL,
104     "  for(int l = 0; l < n; l++)" NL,
105     "  {" NL,
106     "    for(int k = 0; k < n; k++)" NL,
107     "    {" NL,
108     "      for(int i = 0; i < n; i++)" NL,
109     "      {" NL,
110     "        if (glob_size_arr[i] >= loc_size_arr[k])" NL,
111     "        {" NL,
112     "          ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], "
113     "loc_size_arr[k]);" NL,
114     "          int enq_res = enqueue_kernel(get_default_queue(), "
115     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
116     "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
117     "        }" NL,
118     "      }" NL,
119     "    }" NL,
120     "  }" NL,
121     "}" NL
122 };
123 
124 static const char *helper_ndrange_2d_glo[] = {
125     NL,
126     "void block_fn(int len, __global atomic_uint* val)" NL,
127     "{" NL,
128     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
129     "memory_order_relaxed, memory_scope_device);" NL,
130     "}" NL,
131     "" NL,
132     "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, "
133     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
134     "val,  __global uint* ofs_arr)" NL,
135     "{" NL,
136     "  size_t tid = get_global_id(0);" NL,
137     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
138     "" NL,
139     "  for(int i = 0; i < n; i++)" NL,
140     "  {" NL,
141     "    size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] "
142     "};" NL,
143     "    ndrange_t ndrange = ndrange_2D(glob_size);" NL,
144     "    int enq_res = enqueue_kernel(get_default_queue(), "
145     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
146     "    if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
147     "  }" NL,
148     "}" NL
149 };
150 
151 static const char *helper_ndrange_2d_loc[] = {
152     NL,
153     "void block_fn(int len, __global atomic_uint* val)" NL,
154     "{" NL,
155     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
156     "memory_order_relaxed, memory_scope_device);" NL,
157     "}" NL,
158     "" NL,
159     "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, "
160     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
161     "val,  __global uint* ofs_arr)" NL,
162     "{" NL,
163     "  size_t tid = get_global_id(0);" NL,
164     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
165     "" NL,
166     "  for(int k = 0; k < n; k++)" NL,
167     "  {" NL,
168     "    for(int i = 0; i < n; i++)" NL,
169     "    {" NL,
170     "      if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
171     "      {" NL,
172     "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
173     "n] };" NL,
174     "        size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
175     "" NL,
176     "        ndrange_t ndrange = ndrange_2D(glob_size, loc_size);" NL,
177     "        int enq_res = enqueue_kernel(get_default_queue(), "
178     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
179     "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
180     "      }" NL,
181     "    }" NL,
182     "  }" NL,
183     "}" NL
184 };
185 
186 
187 static const char *helper_ndrange_2d_ofs[] = {
188     NL,
189     "void block_fn(int len, __global atomic_uint* val)" NL,
190     "{" NL,
191     "  atomic_fetch_add_explicit(&val[(get_global_offset(1) * "
192     "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
193     "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
194     "}" NL,
195     "" NL,
196     "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, "
197     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
198     "val,  __global uint* ofs_arr)" NL,
199     "{" NL,
200     "  size_t tid = get_global_id(0);" NL,
201     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
202     "" NL,
203     "  for(int l = 0; l < n; l++)" NL,
204     "  {" NL,
205     "    for(int k = 0; k < n; k++)" NL,
206     "    {" NL,
207     "      for(int i = 0; i < n; i++)" NL,
208     "      {" NL,
209     "        if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
210     "        {" NL,
211     "          size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) "
212     "% n]};" NL,
213     "          size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
214     "          size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };" NL,
215     "" NL,
216     "          ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);" NL,
217     "          int enq_res = enqueue_kernel(get_default_queue(), "
218     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
219     "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
220     "        }" NL,
221     "      }" NL,
222     "    }" NL,
223     "  }" NL,
224     "}" NL
225 };
226 
227 
228 static const char *helper_ndrange_3d_glo[] = {
229     NL,
230     "void block_fn(int len, __global atomic_uint* val)" NL,
231     "{" NL,
232     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
233     "memory_order_relaxed, memory_scope_device);" NL,
234     "}" NL,
235     "" NL,
236     "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, "
237     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
238     "val,  __global uint* ofs_arr)" NL,
239     "{" NL,
240     "  size_t tid = get_global_id(0);" NL,
241     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
242     "" NL,
243     "  for(int i = 0; i < n; i++)" NL,
244     "  {" NL,
245     "    uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % "
246     "n] * glob_size_arr[(i + 2) % n];" NL,
247     "    if (global_work_size <= (len * len))" NL,
248     "    {" NL,
249     "      size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
250     "n], glob_size_arr[(i + 2) % n] };" NL,
251     "      ndrange_t ndrange = ndrange_3D(glob_size);" NL,
252     "      int enq_res = enqueue_kernel(get_default_queue(), "
253     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
254     "      if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
255     "    }" NL,
256     "  }" NL,
257     "}" NL
258 };
259 
260 
261 static const char *helper_ndrange_3d_loc[] = {
262     NL,
263     "void block_fn(int len, __global atomic_uint* val)" NL,
264     "{" NL,
265     "  atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
266     "memory_order_relaxed, memory_scope_device);" NL,
267     "}" NL,
268     "" NL,
269     "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, "
270     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
271     "val,  __global uint* ofs_arr)" NL,
272     "{" NL,
273     "  size_t tid = get_global_id(0);" NL,
274     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
275     "" NL,
276     "  for(int k = 0; k < n; k++)" NL,
277     "  {" NL,
278     "    for(int i = 0; i < n; i++)" NL,
279     "    {" NL,
280     "      uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) % "
281     "n] * glob_size_arr[(i + 2) % n];" NL,
282     "      if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
283     "global_work_size <= (len * len))" NL,
284     "      {" NL,
285     "        size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
286     "n], glob_size_arr[(i + 2) % n] };" NL,
287     "        size_t loc_size[] = { 1, 1, loc_size_arr[k] };" NL,
288     "        ndrange_t ndrange = ndrange_3D(glob_size,loc_size);" NL,
289     "        int enq_res = enqueue_kernel(get_default_queue(), "
290     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
291     "      " NL,
292     "        if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
293     "      }" NL,
294     "    }" NL,
295     "  }" NL,
296     "}" NL
297 };
298 
299 static const char *helper_ndrange_3d_ofs[] = {
300     NL,
301     "void block_fn(int len, __global atomic_uint* val)" NL,
302     "{" NL,
303     "  atomic_fetch_add_explicit(&val[(get_global_offset(2) * "
304     "get_global_size(0) * get_global_size(1) + get_global_offset(1) * "
305     "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
306     "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
307     "}" NL,
308     "" NL,
309     "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, "
310     "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
311     "val,  __global uint* ofs_arr)" NL,
312     "{" NL,
313     "  size_t tid = get_global_id(0);" NL,
314     "  void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
315     "" NL,
316     "  for(int l = 0; l < n; l++)" NL,
317     "  {" NL,
318     "    for(int k = 0; k < n; k++)" NL,
319     "    {" NL,
320     "      for(int i = 0; i < n; i++)" NL,
321     "      {" NL,
322     "        uint global_work_size = glob_size_arr[i] *  glob_size_arr[(i + 1) "
323     "% n] * glob_size_arr[(i + 2) % n];" NL,
324     "        if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
325     "global_work_size <= (len * len))" NL,
326     "        {" NL,
327     "          size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) "
328     "% n], glob_size_arr[(i + 2) % n]};" NL,
329     "          size_t loc_size[3] = { 1, 1, loc_size_arr[k] };" NL,
330     "          size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l "
331     "+ 2) % n] };" NL,
332     "          ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);" NL,
333     "          int enq_res = enqueue_kernel(get_default_queue(), "
334     "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
335     "          if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
336     "        }" NL,
337     "      }" NL,
338     "    }" NL,
339     "  }" NL,
340     "}" NL
341 };
342 
343 static const kernel_src_dim_check sources_ndrange_Xd[] =
344 {
345     { KERNEL(helper_ndrange_1d_glo), 1, CL_FALSE, CL_FALSE},
346     { KERNEL(helper_ndrange_1d_loc), 1, CL_TRUE, CL_FALSE},
347     { KERNEL(helper_ndrange_1d_ofs), 1, CL_TRUE, CL_TRUE},
348     { KERNEL(helper_ndrange_2d_glo), 2, CL_FALSE, CL_FALSE},
349     { KERNEL(helper_ndrange_2d_loc), 2, CL_TRUE, CL_FALSE},
350     { KERNEL(helper_ndrange_2d_ofs), 2, CL_TRUE, CL_TRUE},
351     { KERNEL(helper_ndrange_3d_glo), 3, CL_FALSE, CL_FALSE},
352     { KERNEL(helper_ndrange_3d_loc), 3, CL_TRUE, CL_FALSE},
353     { KERNEL(helper_ndrange_3d_ofs), 3, CL_TRUE, CL_TRUE},
354 };
355 static const size_t num_kernels_ndrange_Xd = arr_size(sources_ndrange_Xd);
356 
check_kernel_results(cl_int * results,cl_int len)357 static int check_kernel_results(cl_int* results, cl_int len)
358 {
359     for(cl_int i = 0; i < len; ++i)
360     {
361         if(results[i] != 0) return i;
362     }
363     return -1;
364 }
365 
generate_reference_1D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr)366 void generate_reference_1D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr)
367 {
368     for (size_t g = 0; g < glob_size_arr.size(); ++g)
369     {
370         for (size_t w = 0; w < glob_size_arr[g]; ++w)
371         {
372             ++reference_results[w];
373         }
374     }
375 }
376 
generate_reference_1D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr)377 void generate_reference_1D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr)
378 {
379     for (size_t g = 0; g < glob_size_arr.size(); ++g)
380     {
381         for (size_t l = 0; l < loc_size_arr.size(); ++l)
382         {
383             if (glob_size_arr[g] >= loc_size_arr[l])
384             {
385                 for (size_t w = 0; w < glob_size_arr[g]; ++w)
386                 {
387                     ++reference_results[w];
388                 }
389             }
390         }
391     }
392 }
393 
generate_reference_1D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)394 void generate_reference_1D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
395 {
396     for (size_t g = 0; g < glob_size_arr.size(); ++g)
397     {
398         for (size_t l = 0; l < loc_size_arr.size(); ++l)
399         {
400             if (glob_size_arr[g] >= loc_size_arr[l])
401             {
402                 for (size_t o = 0; o < offset.size(); ++o)
403                 {
404                     for (size_t w = 0; w < glob_size_arr[g]; ++w)
405                     {
406                         ++reference_results[(offset[o] + w) % len];
407                     }
408                 }
409             }
410         }
411     }
412 }
413 
generate_reference_2D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,cl_uint len)414 void generate_reference_2D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
415 {
416     for (size_t g = 0; g < glob_size_arr.size(); ++g)
417     {
418         for (size_t h = 0; h < glob_size_arr[(g + 1) % glob_size_arr.size()]; ++h)
419         {
420             for (size_t w = 0; w < glob_size_arr[g]; ++w)
421             {
422                 ++reference_results[(h * glob_size_arr[g] + w) % len];
423             }
424         }
425     }
426 }
427 
generate_reference_2D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,cl_uint len)428 void generate_reference_2D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
429 {
430     size_t n = glob_size_arr.size();
431     for (size_t g = 0; g < glob_size_arr.size(); ++g)
432     {
433         for (size_t l = 0; l < loc_size_arr.size(); ++l)
434         {
435             if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
436             {
437                 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
438                 {
439                     for (size_t w = 0; w < glob_size_arr[g]; ++w)
440                     {
441                         ++reference_results[(h * glob_size_arr[g] + w) % len];
442                     }
443                 }
444             }
445         }
446     }
447 }
448 
generate_reference_2D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)449 void generate_reference_2D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
450 {
451     size_t n = glob_size_arr.size();
452     for (size_t g = 0; g < glob_size_arr.size(); ++g)
453     {
454         for (size_t l = 0; l < loc_size_arr.size(); ++l)
455         {
456             if (glob_size_arr[(g + 1) % n] >= loc_size_arr[l])
457             {
458                 for (size_t o = 0; o < offset.size(); ++o)
459                 {
460                     for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
461                     {
462                         for (size_t w = 0; w < glob_size_arr[g]; ++w)
463                         {
464                             ++reference_results[(glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + h * glob_size_arr[g] + w) % len];
465                         }
466                     }
467                 }
468             }
469         }
470     }
471 }
472 
generate_reference_3D(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,cl_uint len)473 void generate_reference_3D(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, cl_uint len)
474 {
475     size_t n = glob_size_arr.size();
476     for (size_t g = 0; g < glob_size_arr.size(); ++g)
477     {
478         size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
479         if(global_work_size <= (len * len))
480         {
481             for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
482             {
483                 for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
484                 {
485                     for (size_t w = 0; w < glob_size_arr[g]; ++w)
486                     {
487                         ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
488                     }
489                 }
490             }
491         }
492     }
493 }
494 
generate_reference_3D_local(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,cl_uint len)495 void generate_reference_3D_local(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, cl_uint len)
496 {
497     size_t n = glob_size_arr.size();
498     for (size_t g = 0; g < glob_size_arr.size(); ++g)
499     {
500         for (size_t l = 0; l < loc_size_arr.size(); ++l)
501         {
502             size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
503             if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
504             {
505                 for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
506                 {
507                     for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
508                     {
509                         for (size_t w = 0; w < glob_size_arr[g]; ++w)
510                         {
511                             ++reference_results[(d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
512                         }
513                     }
514                 }
515             }
516         }
517     }
518 }
519 
generate_reference_3D_offset(std::vector<cl_int> & reference_results,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_uint len)520 void generate_reference_3D_offset(std::vector<cl_int> &reference_results, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_uint len)
521 {
522     size_t n = glob_size_arr.size();
523     for (size_t g = 0; g < glob_size_arr.size(); ++g)
524     {
525         for (size_t l = 0; l < loc_size_arr.size(); ++l)
526         {
527             size_t global_work_size = glob_size_arr[(g + 2) % n] * glob_size_arr[(g + 1) % n] * glob_size_arr[g];
528             if (glob_size_arr[(g + 2) % n] >= loc_size_arr[l] && global_work_size <= (len * len))
529             {
530                 for (size_t o = 0; o < offset.size(); ++o)
531                 {
532                     for (size_t d = 0; d < glob_size_arr[(g + 2) % n]; ++d)
533                     {
534                         for (size_t h = 0; h < glob_size_arr[(g + 1) % n]; ++h)
535                         {
536                             for (size_t w = 0; w < glob_size_arr[g]; ++w)
537                             {
538                                 ++reference_results[(glob_size_arr[g] * glob_size_arr[(g + 1) % n] * offset[(o + 2) % n] + glob_size_arr[g] * offset[(o + 1) % n] + offset[o] + d * glob_size_arr[(g + 1) % n] * glob_size_arr[g] + h * glob_size_arr[g] + w) % len];
539                             }
540                         }
541                     }
542                 }
543             }
544         }
545     }
546 }
547 
check_kernel_results(cl_int * results,cl_int len,std::vector<cl_uint> & glob_size_arr,std::vector<cl_uint> & loc_size_arr,std::vector<cl_uint> & offset,cl_int dim,cl_bool use_local,cl_bool use_offset)548 static int check_kernel_results(cl_int* results, cl_int len, std::vector<cl_uint> &glob_size_arr, std::vector<cl_uint> &loc_size_arr, std::vector<cl_uint> &offset, cl_int dim, cl_bool use_local, cl_bool use_offset)
549 {
550     std::vector<cl_int> reference_results(len, 0);
551     switch (dim)
552     {
553     case 1:
554         if (use_local == CL_FALSE)
555         {
556             generate_reference_1D(reference_results, glob_size_arr);
557         }
558         else if(use_local == CL_TRUE && use_offset == CL_FALSE)
559         {
560             generate_reference_1D_local(reference_results, glob_size_arr, loc_size_arr);
561         }
562         else
563         {
564             generate_reference_1D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
565         }
566         break;
567     case 2:
568         if (use_local == CL_FALSE)
569         {
570             generate_reference_2D(reference_results, glob_size_arr, len);
571         }
572         else if (use_local == CL_TRUE && use_offset == CL_FALSE)
573         {
574             generate_reference_2D_local(reference_results, glob_size_arr, loc_size_arr, len);
575         }
576         else
577         {
578             generate_reference_2D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
579         }
580         break;
581     case 3:
582         if (use_local == CL_FALSE)
583         {
584             generate_reference_3D(reference_results, glob_size_arr, len);
585         }
586         else if (use_local == CL_TRUE && use_offset == CL_FALSE)
587         {
588             generate_reference_3D_local(reference_results, glob_size_arr, loc_size_arr, len);
589         }
590         else
591         {
592             generate_reference_3D_offset(reference_results, glob_size_arr, loc_size_arr, offset, len);
593         }
594         break;
595     default:
596         return 0;
597         break;
598     }
599 
600     for (cl_int i = 0; i < len; ++i)
601     {
602         if (results[i] != reference_results[i])
603         {
604             log_error("ERROR: Kernel returned %d vs. expected %d\n", results[i], reference_results[i]);
605             return i;
606         }
607     }
608 
609     return -1;
610 }
611 
test_enqueue_ndrange(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)612 int test_enqueue_ndrange(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
613 {
614     MTdata d;
615     cl_uint i;
616     cl_int err_ret, res = 0;
617     clCommandQueueWrapper dev_queue;
618     cl_int k, kernel_results[MAX_GWS] = { 0 };
619 
620     size_t ret_len;
621     cl_uint max_queues = 1;
622     cl_uint maxQueueSize = 0;
623 
624     d = init_genrand(gRandomSeed);
625 
626     err_ret = clGetDeviceInfo(device, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, sizeof(maxQueueSize), &maxQueueSize, 0);
627     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE) failed");
628 
629     err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_ON_DEVICE_QUEUES, sizeof(max_queues), &max_queues, &ret_len);
630     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_ON_DEVICE_QUEUES) failed");
631 
632     size_t max_local_size = 1;
633     err_ret = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, &ret_len);
634     test_error(err_ret, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE) failed");
635 
636     cl_queue_properties queue_prop_def[] =
637     {
638         CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE|CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT,
639         CL_QUEUE_SIZE, maxQueueSize,
640         0
641     };
642 
643     dev_queue = clCreateCommandQueueWithProperties(context, device, queue_prop_def, &err_ret);
644     test_error(err_ret, "clCreateCommandQueueWithProperties(CL_QUEUE_DEVICE|CL_QUEUE_DEFAULT) failed");
645 
646     max_local_size = (max_local_size > MAX_GWS)? MAX_GWS: max_local_size;
647     if(gWimpyMode)
648     {
649         max_local_size = std::min((size_t)8, max_local_size);
650     }
651 
652     cl_uint num = 10;
653     cl_uint global_work_size = max_local_size * 2;
654     std::vector<cl_uint> glob_size_arr(num);
655     std::vector<cl_uint> loc_size_arr(num);
656     std::vector<cl_uint> ofs_arr(num);
657     std::vector<cl_int> glob_results(global_work_size, 0);
658 
659     glob_size_arr[0] = 1;
660     glob_size_arr[1] = global_work_size;
661     loc_size_arr[0] = 1;
662     loc_size_arr[1] = max_local_size;
663     ofs_arr[0] = 0;
664     ofs_arr[1] = 1;
665 
666     for(i = 2; i < num; ++i)
667     {
668         glob_size_arr[i] = genrand_int32(d) % global_work_size;
669         glob_size_arr[i] = glob_size_arr[i] ? glob_size_arr[i]: 1;
670         loc_size_arr[i] = genrand_int32(d) % max_local_size;
671         loc_size_arr[i] = loc_size_arr[i] ? loc_size_arr[i]: 1;
672         ofs_arr[i] = genrand_int32(d) % global_work_size;
673     }
674 
675     // check ndrange_dX functions
676     size_t failCnt = 0;
677     for(i = 0; i < num_kernels_ndrange_Xd; ++i)
678     {
679         if (!gKernelName.empty() && gKernelName != sources_ndrange_Xd[i].src.kernel_name)
680             continue;
681 
682         clMemWrapper mem1 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, glob_size_arr.size() * sizeof(cl_uint), &glob_size_arr[0], &err_ret);
683         test_error(err_ret, "clCreateBuffer() failed");
684         clMemWrapper mem2 = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, loc_size_arr.size() * sizeof(cl_uint), &loc_size_arr[0], &err_ret);
685         test_error(err_ret, "clCreateBuffer() failed");
686         clMemWrapper mem3 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, glob_results.size() * sizeof(cl_int), &glob_results[0], &err_ret);
687         test_error(err_ret, "clCreateBuffer() failed");
688         clMemWrapper mem4 = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, ofs_arr.size() * sizeof(cl_uint), &ofs_arr[0], &err_ret);
689         test_error(err_ret, "clCreateBuffer() failed");
690 
691         kernel_arg args[] =
692         {
693             { sizeof(cl_uint), &num },
694             { sizeof(cl_uint), &global_work_size },
695             { sizeof(cl_mem), &mem1 },
696             { sizeof(cl_mem), &mem2 },
697             { sizeof(cl_mem), &mem3 },
698             { sizeof(cl_mem), &mem4 },
699         };
700 
701         log_info("Running '%s' kernel (%d of %d) ...\n",  sources_ndrange_Xd[i].src.kernel_name, i + 1, num_kernels_ndrange_Xd);
702         err_ret = run_single_kernel_args(context, queue, sources_ndrange_Xd[i].src.lines, sources_ndrange_Xd[i].src.num_lines, sources_ndrange_Xd[i].src.kernel_name, kernel_results, sizeof(kernel_results), arr_size(args), args);
703 
704         cl_int *ptr = (cl_int *)clEnqueueMapBuffer(queue, mem3, CL_TRUE, CL_MAP_READ, 0, glob_results.size() * sizeof(cl_int), 0, 0, 0, &err_ret);
705         test_error(err_ret, "clEnqueueMapBuffer() failed");
706 
707         if(check_error(err_ret, "'%s' kernel execution failed", sources_ndrange_Xd[i].src.kernel_name)) { ++failCnt; res = -1; }
708         else if((k = check_kernel_results(kernel_results, arr_size(kernel_results))) >= 0 && check_error(-1, "'%s' kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, kernel_results[k])) res = -1;
709         else if((k = check_kernel_results(ptr, global_work_size, glob_size_arr, loc_size_arr, ofs_arr, sources_ndrange_Xd[i].dim, sources_ndrange_Xd[i].localSize, sources_ndrange_Xd[i].offset)) >= 0 && check_error(-1, "'%s' global kernel results validation failed: [%d] returned %d expected 0", sources_ndrange_Xd[i].src.kernel_name, k, glob_results[k])) res = -1;
710         else log_info("'%s' kernel is OK.\n", sources_ndrange_Xd[i].src.kernel_name);
711 
712         err_ret = clEnqueueUnmapMemObject(queue, mem3, ptr, 0, 0, 0);
713         test_error(err_ret, "clEnqueueUnmapMemObject() failed");
714 
715     }
716 
717     if (failCnt > 0)
718     {
719         log_error("ERROR: %d of %d kernels failed.\n", failCnt, num_kernels_ndrange_Xd);
720     }
721 
722     return res;
723 }
724 
725 
726 #endif
727 
728