1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "../../test_common/harness/compat.h"
17
18 #include <algorithm>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/types.h>
24
25 #include "../../test_common/harness/conversions.h"
26 #include "procs.h"
27
28 static const char *async_global_to_local_kernel2D = R"OpenCLC(
29 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
30
31 #define STRUCT_SIZE %d
32 typedef struct __attribute__((packed))
33 {
34 uchar byte[STRUCT_SIZE];
35 } VarSizeStruct __attribute__((aligned(1)));
36
37
38 __kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst,
39 __local VarSizeStruct *localBuffer, int numElementsPerLine,
40 int lineCopiesPerWorkgroup, int lineCopiesPerWorkItem,
41 int srcStride, int dstStride) {
42 // Zero the local storage first
43 for (int i = 0; i < lineCopiesPerWorkItem; i++) {
44 for (int j = 0; j < numElementsPerLine; j++) {
45 const int index = (get_local_id(0) * lineCopiesPerWorkItem + i) * dstStride + j;
46 for (int k = 0; k < STRUCT_SIZE; k++) {
47 localBuffer[index].byte[k] = 0;
48 }
49 }
50 }
51
52 // Do this to verify all kernels are done zeroing the local buffer before we
53 // try the copy
54 barrier( CLK_LOCAL_MEM_FENCE );
55 event_t event = async_work_group_copy_2D2D(localBuffer, 0, src,
56 lineCopiesPerWorkgroup * get_group_id(0) * srcStride, sizeof(VarSizeStruct),
57 (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, srcStride, dstStride, 0);
58
59 // Wait for the copy to complete, then verify by manually copying to the dest
60 wait_group_events(1, &event);
61
62 for (int i = 0; i < lineCopiesPerWorkItem; i++) {
63 for (int j = 0; j < numElementsPerLine; j++) {
64 const int local_index = (get_local_id(0) * lineCopiesPerWorkItem + i) * dstStride + j;
65 const int global_index = (get_global_id(0) * lineCopiesPerWorkItem + i) * dstStride + j;
66 dst[global_index] = localBuffer[local_index];
67 }
68 }
69 }
70 )OpenCLC";
71
72 static const char *async_local_to_global_kernel2D = R"OpenCLC(
73 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
74
75 #define STRUCT_SIZE %d
76 typedef struct __attribute__((packed))
77 {
78 uchar byte[STRUCT_SIZE];
79 } VarSizeStruct __attribute__((aligned(1)));
80
81
82 __kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst, __local VarSizeStruct *localBuffer,
83 int numElementsPerLine, int lineCopiesPerWorkgroup,
84 int lineCopiesPerWorkItem, int srcStride, int dstStride) {
85 // Zero the local storage first
86 for (int i = 0; i < lineCopiesPerWorkItem; i++) {
87 for (int j = 0; j < numElementsPerLine; j++) {
88 const int index = (get_local_id(0) * lineCopiesPerWorkItem + i) * srcStride + j;
89 for (int k = 0; k < STRUCT_SIZE; k++) {
90 localBuffer[index].byte[k] = 0;
91 }
92 }
93 }
94
95 // Do this to verify all kernels are done zeroing the local buffer before we try the copy
96 barrier(CLK_LOCAL_MEM_FENCE);
97
98 for (int i = 0; i < lineCopiesPerWorkItem; i++) {
99 for (int j = 0; j < numElementsPerLine; j++) {
100 const int local_index = (get_local_id(0) * lineCopiesPerWorkItem + i) * srcStride + j;
101 const int global_index = (get_global_id(0)*lineCopiesPerWorkItem + i) * srcStride + j;
102 localBuffer[local_index] = src[global_index];
103 }
104 }
105
106 // Do this to verify all kernels are done copying to the local buffer before we try the copy
107 barrier(CLK_LOCAL_MEM_FENCE);
108 event_t event = async_work_group_copy_2D2D(dst, lineCopiesPerWorkgroup * get_group_id(0) * dstStride,
109 localBuffer, 0, sizeof(VarSizeStruct), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, srcStride,
110 dstStride, 0 );
111
112 wait_group_events(1, &event);
113 };
114 )OpenCLC";
115
test_copy2D(const cl_device_id deviceID,const cl_context context,const cl_command_queue queue,const char * const kernelCode,const size_t elementSize,const int srcMargin,const int dstMargin,const bool localIsDst)116 int test_copy2D(const cl_device_id deviceID, const cl_context context,
117 const cl_command_queue queue, const char *const kernelCode,
118 const size_t elementSize, const int srcMargin,
119 const int dstMargin, const bool localIsDst)
120 {
121 int error;
122
123 log_info("Testing %d byte element with srcMargin = %d, dstMargin = %d\n",
124 elementSize, srcMargin, dstMargin);
125
126 cl_long max_local_mem_size;
127 error =
128 clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
129 sizeof(max_local_mem_size), &max_local_mem_size, NULL);
130 test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
131
132 cl_long max_global_mem_size;
133 error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
134 sizeof(max_global_mem_size), &max_global_mem_size,
135 NULL);
136 test_error(error, "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE failed.");
137
138 cl_long max_alloc_size;
139 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
140 sizeof(max_alloc_size), &max_alloc_size, NULL);
141 test_error(error,
142 "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
143
144 cl_long max_work_group_size;
145 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
146 sizeof(max_work_group_size), &max_work_group_size,
147 NULL);
148 test_error(error,
149 "clGetDeviceInfo for CL_DEVICE_MAX_WORK_GROUP_SIZE failed.");
150
151 if (max_alloc_size > max_global_mem_size / 2)
152 max_alloc_size = max_global_mem_size / 2;
153
154 unsigned int num_of_compute_devices;
155 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
156 sizeof(num_of_compute_devices),
157 &num_of_compute_devices, NULL);
158 test_error(error,
159 "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
160
161 char programSource[4096] = { 0 };
162 const char *programPtr = programSource;
163
164 sprintf(programSource, kernelCode, elementSize);
165 // log_info("program: %s\n", programSource);
166
167 clProgramWrapper program;
168 clKernelWrapper kernel;
169
170 error = create_single_kernel_helper(context, &program, &kernel, 1,
171 &programPtr, "test_fn");
172 test_error(error, "Unable to create testing kernel");
173
174 size_t max_workgroup_size;
175 error = clGetKernelWorkGroupInfo(
176 kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
177 &max_workgroup_size, NULL);
178 test_error(
179 error,
180 "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
181
182 size_t max_local_workgroup_size[3];
183 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
184 sizeof(max_local_workgroup_size),
185 max_local_workgroup_size, NULL);
186 test_error(error,
187 "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
188
189 // Pick the minimum of the device and the kernel
190 if (max_workgroup_size > max_local_workgroup_size[0])
191 max_workgroup_size = max_local_workgroup_size[0];
192
193 const size_t numElementsPerLine = 10;
194 const cl_int dstStride = numElementsPerLine + dstMargin;
195 const cl_int srcStride = numElementsPerLine + srcMargin;
196
197 const size_t lineCopiesPerWorkItem = 13;
198 const size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem
199 * elementSize * (localIsDst ? dstStride : srcStride);
200
201 size_t maxLocalWorkgroupSize =
202 (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
203
204 // Calculation can return 0 on embedded devices due to 1KB local mem limit
205 if (maxLocalWorkgroupSize == 0)
206 {
207 maxLocalWorkgroupSize = 1;
208 }
209
210 size_t localWorkgroupSize = maxLocalWorkgroupSize;
211 if (maxLocalWorkgroupSize > max_workgroup_size)
212 localWorkgroupSize = max_workgroup_size;
213
214 const size_t maxTotalLinesIn =
215 (max_alloc_size / elementSize + srcMargin) / srcStride;
216 const size_t maxTotalLinesOut =
217 (max_alloc_size / elementSize + dstMargin) / dstStride;
218 const size_t maxTotalLines = std::min(maxTotalLinesIn, maxTotalLinesOut);
219 const size_t maxLocalWorkgroups =
220 maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem);
221
222 const size_t localBufferSize =
223 localWorkgroupSize * localStorageSpacePerWorkitem
224 - (localIsDst ? dstMargin : srcMargin);
225 const size_t numberOfLocalWorkgroups =
226 std::min(1111, (int)maxLocalWorkgroups);
227 const size_t totalLines =
228 numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem;
229 const size_t inBufferSize = elementSize
230 * (totalLines * numElementsPerLine + (totalLines - 1) * srcMargin);
231 const size_t outBufferSize = elementSize
232 * (totalLines * numElementsPerLine + (totalLines - 1) * dstMargin);
233 const size_t globalWorkgroupSize =
234 numberOfLocalWorkgroups * localWorkgroupSize;
235
236 if ((localBufferSize / 4) > max_work_group_size)
237 {
238 log_info("Skipping due to resource requirements local:%db "
239 "max_work_group_size:%d\n",
240 localBufferSize, max_work_group_size);
241 return 0;
242 }
243
244 void *const inBuffer = (void *)malloc(inBufferSize);
245 void *const outBuffer = (void *)malloc(outBufferSize);
246 void *const outBufferCopy = (void *)malloc(outBufferSize);
247
248 const cl_int lineCopiesPerWorkItemInt =
249 static_cast<cl_int>(lineCopiesPerWorkItem);
250 const cl_int numElementsPerLineInt =
251 static_cast<cl_int>(numElementsPerLine);
252 const cl_int lineCopiesPerWorkgroup =
253 static_cast<cl_int>(lineCopiesPerWorkItem * localWorkgroupSize);
254
255 log_info(
256 "Global: %d, local %d, local buffer %db, global in buffer %db, "
257 "global out buffer %db, each work group will copy %d lines and each "
258 "work item item will copy %d lines.\n",
259 (int)globalWorkgroupSize, (int)localWorkgroupSize, (int)localBufferSize,
260 (int)inBufferSize, (int)outBufferSize, lineCopiesPerWorkgroup,
261 lineCopiesPerWorkItemInt);
262
263 size_t threads[1], localThreads[1];
264
265 threads[0] = globalWorkgroupSize;
266 localThreads[0] = localWorkgroupSize;
267
268 MTdata d = init_genrand(gRandomSeed);
269 generate_random_data(kChar, inBufferSize, d, inBuffer);
270 generate_random_data(kChar, outBufferSize, d, outBuffer);
271 free_mtdata(d);
272 d = NULL;
273 memcpy(outBufferCopy, outBuffer, outBufferSize);
274
275 clMemWrapper streams[2];
276
277 streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
278 inBuffer, &error);
279 test_error(error, "Unable to create input buffer");
280 streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize,
281 outBuffer, &error);
282 test_error(error, "Unable to create output buffer");
283
284 error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
285 test_error(error, "Unable to set kernel argument");
286 error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
287 test_error(error, "Unable to set kernel argument");
288 error = clSetKernelArg(kernel, 2, localBufferSize, NULL);
289 test_error(error, "Unable to set kernel argument");
290 error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt),
291 &numElementsPerLineInt);
292 test_error(error, "Unable to set kernel argument");
293 error = clSetKernelArg(kernel, 4, sizeof(lineCopiesPerWorkgroup),
294 &lineCopiesPerWorkgroup);
295 test_error(error, "Unable to set kernel argument");
296 error = clSetKernelArg(kernel, 5, sizeof(lineCopiesPerWorkItemInt),
297 &lineCopiesPerWorkItemInt);
298 test_error(error, "Unable to set kernel argument");
299 error = clSetKernelArg(kernel, 6, sizeof(srcStride), &srcStride);
300 test_error(error, "Unable to set kernel argument");
301 error = clSetKernelArg(kernel, 7, sizeof(dstStride), &dstStride);
302 test_error(error, "Unable to set kernel argument");
303
304 // Enqueue
305 error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
306 localThreads, 0, NULL, NULL);
307 test_error(error, "Unable to queue kernel");
308
309 // Read
310 error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
311 outBuffer, 0, NULL, NULL);
312 test_error(error, "Unable to read results");
313
314 // Verify
315 int failuresPrinted = 0;
316
317 for (int i = 0;
318 i < (int)globalWorkgroupSize * lineCopiesPerWorkItem * elementSize;
319 i += elementSize)
320 {
321 for (int j = 0; j < (int)numElementsPerLine * elementSize;
322 j += elementSize)
323 {
324 int inIdx = i * srcStride + j;
325 int outIdx = i * dstStride + j;
326 if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx,
327 elementSize)
328 != 0)
329 {
330 unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
331 unsigned char *outchar = (unsigned char *)outBuffer + outIdx;
332 char values[4096] = { 0 };
333
334 if (failuresPrinted == 0)
335 {
336 // Print first failure message
337 log_error("ERROR: Results of copy did not validate!\n");
338 }
339 sprintf(values + strlen(values), "%d -> [", inIdx);
340 for (int k = 0; k < (int)elementSize; k++)
341 sprintf(values + strlen(values), "%2x ", inchar[k]);
342 sprintf(values + strlen(values), "] != [");
343 for (int k = 0; k < (int)elementSize; k++)
344 sprintf(values + strlen(values), "%2x ", outchar[k]);
345 sprintf(values + strlen(values), "]");
346 log_error("%s\n", values);
347 failuresPrinted++;
348 }
349
350 if (failuresPrinted > 5)
351 {
352 log_error("Not printing further failures...\n");
353 return -1;
354 }
355 }
356 if (i < (int)(globalWorkgroupSize * lineCopiesPerWorkItem - 1)
357 * elementSize)
358 {
359 int outIdx = i * dstStride + numElementsPerLine * elementSize;
360 if (memcmp(((char *)outBuffer) + outIdx,
361 ((char *)outBufferCopy) + outIdx,
362 dstMargin * elementSize)
363 != 0)
364 {
365 if (failuresPrinted == 0)
366 {
367 // Print first failure message
368 log_error("ERROR: Results of copy did not validate!\n");
369 }
370 log_error(
371 "2D copy corrupted data in output buffer in the stride "
372 "offset of line %d\n",
373 i);
374 failuresPrinted++;
375 }
376 if (failuresPrinted > 5)
377 {
378 log_error("Not printing further failures...\n");
379 return -1;
380 }
381 }
382 }
383
384 free(inBuffer);
385 free(outBuffer);
386 free(outBufferCopy);
387
388 return failuresPrinted ? -1 : 0;
389 }
390
test_copy2D_all_types(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,bool localIsDst)391 int test_copy2D_all_types(cl_device_id deviceID, cl_context context,
392 cl_command_queue queue, const char *kernelCode,
393 bool localIsDst)
394 {
395 const unsigned int elemSizes[] = { 1, 2, 3, 4, 5, 6, 7,
396 8, 13, 16, 32, 47, 64 };
397 // The margins below represent the number of elements between the end of
398 // one line and the start of the next. The strides are equivalent to the
399 // length of the line plus the chosen margin.
400 // These have to be multipliers, because the margin must be a multiple of
401 // element size.
402 const unsigned int marginMultipliers[] = { 0, 10, 100 };
403
404 int errors = 0;
405
406 if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
407 {
408 log_info(
409 "Device does not support extended async copies. Skipping test.\n");
410 }
411 else
412 {
413 for (const unsigned int elemSize : elemSizes)
414 {
415 for (const unsigned int srcMarginMultiplier : marginMultipliers)
416 {
417 for (const unsigned int dstMarginMultiplier : marginMultipliers)
418 {
419 if (test_copy2D(deviceID, context, queue, kernelCode,
420 elemSize, srcMarginMultiplier * elemSize,
421 dstMarginMultiplier * elemSize, localIsDst))
422 {
423 errors++;
424 }
425 }
426 }
427 }
428 }
429
430 return errors ? -1 : 0;
431 }
432
test_async_copy_global_to_local2D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)433 int test_async_copy_global_to_local2D(cl_device_id deviceID, cl_context context,
434 cl_command_queue queue, int num_elements)
435 {
436 return test_copy2D_all_types(deviceID, context, queue,
437 async_global_to_local_kernel2D, true);
438 }
439
test_async_copy_local_to_global2D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)440 int test_async_copy_local_to_global2D(cl_device_id deviceID, cl_context context,
441 cl_command_queue queue, int num_elements)
442 {
443 return test_copy2D_all_types(deviceID, context, queue,
444 async_local_to_global_kernel2D, false);
445 }
446