1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "../../test_common/harness/compat.h"
17
18 #include <algorithm>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/types.h>
24
25 #include "../../test_common/harness/conversions.h"
26 #include "procs.h"
27
28 static const char *async_global_to_local_kernel3D = R"OpenCLC(
29 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
30
31 #define STRUCT_SIZE %d
32 typedef struct __attribute__((packed))
33 {
34 uchar byte[STRUCT_SIZE];
35 } VarSizeStruct __attribute__((aligned(1)));
36
37 __kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst, __local VarSizeStruct *localBuffer,
38 int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup,
39 int planesCopiesPerWorkItem, int srcLineStride,
40 int dstLineStride, int srcPlaneStride, int dstPlaneStride ) {
41 // Zero the local storage first
42 for (int i = 0; i < planesCopiesPerWorkItem; i++) {
43 for (int j = 0; j < numLines; j++) {
44 for (int k = 0; k < numElementsPerLine; k++) {
45 const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
46 for (int k = 0; k < STRUCT_SIZE; k++) {
47 localBuffer[index].byte[k] = 0;
48 }
49 }
50 }
51 }
52
53 // Do this to verify all kernels are done zeroing the local buffer before we try the copy
54 barrier(CLK_LOCAL_MEM_FENCE);
55
56 event_t event = async_work_group_copy_3D3D(localBuffer, 0, src,
57 planesCopiesPerWorkgroup * get_group_id(0) * srcPlaneStride,
58 sizeof(VarSizeStruct), (size_t)numElementsPerLine, (size_t)numLines,
59 planesCopiesPerWorkgroup, srcLineStride, srcPlaneStride, dstLineStride,
60 dstPlaneStride, 0);
61
62 // Wait for the copy to complete, then verify by manually copying to the dest
63 wait_group_events(1, &event);
64
65 for (int i = 0; i < planesCopiesPerWorkItem; i++) {
66 for (int j = 0; j < numLines; j++) {
67 for(int k = 0; k < numElementsPerLine; k++) {
68 const int local_index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
69 const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
70 dst[global_index] = localBuffer[local_index];
71 }
72 }
73 }
74 }
75 )OpenCLC";
76
77 static const char *async_local_to_global_kernel3D = R"OpenCLC(
78 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
79
80 #define STRUCT_SIZE %d
81 typedef struct __attribute__((packed))
82 {
83 uchar byte[STRUCT_SIZE];
84 } VarSizeStruct __attribute__((aligned(1)));
85
86 __kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst, __local VarSizeStruct *localBuffer,
87 int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup,
88 int planesCopiesPerWorkItem, int srcLineStride,
89 int dstLineStride, int srcPlaneStride, int dstPlaneStride) {
90 // Zero the local storage first
91 for (int i = 0; i < planesCopiesPerWorkItem; i++) {
92 for (int j = 0; j < numLines; j++) {
93 for (int k = 0; k < numElementsPerLine; k++) {
94 const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k;
95 for (int k = 0; k < STRUCT_SIZE; k++) {
96 localBuffer[index].byte[k] = 0;
97 }
98 }
99 }
100 }
101
102 // Do this to verify all kernels are done zeroing the local buffer before we try the copy
103 barrier(CLK_LOCAL_MEM_FENCE);
104
105 for (int i=0; i < planesCopiesPerWorkItem; i++) {
106 for (int j=0; j < numLines; j++) {
107 for (int k=0; k < numElementsPerLine; k++) {
108 const int local_index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k;
109 const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j*srcLineStride + k;
110 localBuffer[local_index] = src[global_index];
111 }
112 }
113 }
114
115 // Do this to verify all kernels are done copying to the local buffer before we try the copy
116 barrier(CLK_LOCAL_MEM_FENCE);
117
118 event_t event = async_work_group_copy_3D3D(dst,
119 planesCopiesPerWorkgroup * get_group_id(0) * dstPlaneStride, localBuffer, 0,
120 sizeof(VarSizeStruct), (size_t)numElementsPerLine, (size_t)numLines, planesCopiesPerWorkgroup,
121 srcLineStride, srcPlaneStride, dstLineStride, dstPlaneStride, 0);
122
123 wait_group_events(1, &event);
124 }
125 )OpenCLC";
126
test_copy3D(const cl_device_id deviceID,const cl_context context,const cl_command_queue queue,const char * const kernelCode,const size_t elementSize,const int srcLineMargin,const int dstLineMargin,const int srcPlaneMargin,const int dstPlaneMargin,const bool localIsDst)127 int test_copy3D(const cl_device_id deviceID, const cl_context context,
128 const cl_command_queue queue, const char *const kernelCode,
129 const size_t elementSize, const int srcLineMargin,
130 const int dstLineMargin, const int srcPlaneMargin,
131 const int dstPlaneMargin, const bool localIsDst)
132 {
133 int error;
134
135 log_info(
136 "Testing %d byte element with srcLineMargin = %d, dstLineMargin = %d, "
137 "srcPlaneMargin = %d, dstPlaneMargin = %d\n",
138 elementSize, srcLineMargin, dstLineMargin, srcPlaneMargin,
139 dstPlaneMargin);
140
141 cl_long max_local_mem_size;
142 error =
143 clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
144 sizeof(max_local_mem_size), &max_local_mem_size, NULL);
145 test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
146
147 cl_long max_global_mem_size;
148 error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
149 sizeof(max_global_mem_size), &max_global_mem_size,
150 NULL);
151 test_error(error, "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE failed.");
152
153 cl_long max_alloc_size;
154 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
155 sizeof(max_alloc_size), &max_alloc_size, NULL);
156 test_error(error,
157 "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
158
159 if (max_alloc_size > max_global_mem_size / 2)
160 max_alloc_size = max_global_mem_size / 2;
161
162 unsigned int num_of_compute_devices;
163 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
164 sizeof(num_of_compute_devices),
165 &num_of_compute_devices, NULL);
166 test_error(error,
167 "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
168
169 char programSource[4096] = { 0 };
170 const char *programPtr = programSource;
171
172 sprintf(programSource, kernelCode, elementSize);
173 // log_info("program: %s\n", programSource);
174 clProgramWrapper program;
175 clKernelWrapper kernel;
176
177 error = create_single_kernel_helper(context, &program, &kernel, 1,
178 &programPtr, "test_fn");
179 test_error(error, "Unable to create testing kernel");
180
181 size_t max_workgroup_size;
182 error = clGetKernelWorkGroupInfo(
183 kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
184 &max_workgroup_size, NULL);
185 test_error(
186 error,
187 "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
188
189 size_t max_local_workgroup_size[3];
190 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
191 sizeof(max_local_workgroup_size),
192 max_local_workgroup_size, NULL);
193 test_error(error,
194 "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
195
196 cl_long max_work_group_size;
197 error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
198 sizeof(max_work_group_size), &max_work_group_size,
199 NULL);
200 test_error(error,
201 "clGetDeviceInfo for CL_DEVICE_MAX_WORK_GROUP_SIZE failed.");
202
203 // Pick the minimum of the device and the kernel
204 if (max_workgroup_size > max_local_workgroup_size[0])
205 max_workgroup_size = max_local_workgroup_size[0];
206
207 const size_t numElementsPerLine = 10;
208 const cl_int dstLineStride = numElementsPerLine + dstLineMargin;
209 const cl_int srcLineStride = numElementsPerLine + srcLineMargin;
210
211 const size_t numLines = 13;
212 const cl_int dstPlaneStride = (numLines * dstLineStride) + dstPlaneMargin;
213 const cl_int srcPlaneStride = (numLines * srcLineStride) + srcPlaneMargin;
214
215 const size_t planesCopiesPerWorkItem = 2;
216 const size_t localStorageSpacePerWorkitem = elementSize
217 * planesCopiesPerWorkItem
218 * (localIsDst ? dstPlaneStride : srcPlaneStride);
219 size_t maxLocalWorkgroupSize =
220 (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
221
222 // Calculation can return 0 on embedded devices due to 1KB local mem limit
223 if (maxLocalWorkgroupSize == 0)
224 {
225 maxLocalWorkgroupSize = 1;
226 }
227
228 size_t localWorkgroupSize = maxLocalWorkgroupSize;
229 if (maxLocalWorkgroupSize > max_workgroup_size)
230 localWorkgroupSize = max_workgroup_size;
231
232 const size_t maxTotalPlanesIn =
233 ((max_alloc_size / elementSize) + srcPlaneMargin) / srcPlaneStride;
234 const size_t maxTotalPlanesOut =
235 ((max_alloc_size / elementSize) + dstPlaneMargin) / dstPlaneStride;
236 const size_t maxTotalPlanes = std::min(maxTotalPlanesIn, maxTotalPlanesOut);
237 const size_t maxLocalWorkgroups =
238 maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem);
239
240 const size_t localBufferSize =
241 localWorkgroupSize * localStorageSpacePerWorkitem
242 - (localIsDst ? dstPlaneMargin : srcPlaneMargin);
243 const size_t numberOfLocalWorkgroups =
244 std::min(1111, (int)maxLocalWorkgroups);
245 const size_t totalPlanes =
246 numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem;
247 const size_t inBufferSize = elementSize
248 * (totalPlanes * numLines * srcLineStride
249 + (totalPlanes - 1) * srcPlaneMargin);
250 const size_t outBufferSize = elementSize
251 * (totalPlanes * numLines * dstLineStride
252 + (totalPlanes - 1) * dstPlaneMargin);
253 const size_t globalWorkgroupSize =
254 numberOfLocalWorkgroups * localWorkgroupSize;
255
256 if ((localBufferSize / 4) > max_work_group_size)
257 {
258 log_info("Skipping due to resource requirements local:%db "
259 "max_work_group_size:%d\n",
260 localBufferSize, max_work_group_size);
261 return 0;
262 }
263
264 void *const inBuffer = (void *)malloc(inBufferSize);
265 void *const outBuffer = (void *)malloc(outBufferSize);
266 void *const outBufferCopy = (void *)malloc(outBufferSize);
267
268 const cl_int planesCopiesPerWorkItemInt =
269 static_cast<cl_int>(planesCopiesPerWorkItem);
270 const cl_int numElementsPerLineInt =
271 static_cast<cl_int>(numElementsPerLine);
272 const cl_int numLinesInt = static_cast<cl_int>(numLines);
273 const cl_int planesCopiesPerWorkgroup =
274 static_cast<cl_int>(planesCopiesPerWorkItem * localWorkgroupSize);
275
276 log_info("Global: %d, local %d, local buffer %db, global in buffer %db, "
277 "global out buffer %db, each work group will copy %d planes and "
278 "each work item item will copy %d planes.\n",
279 (int)globalWorkgroupSize, (int)localWorkgroupSize,
280 (int)localBufferSize, (int)inBufferSize, (int)outBufferSize,
281 planesCopiesPerWorkgroup, planesCopiesPerWorkItemInt);
282
283 size_t threads[1], localThreads[1];
284
285 threads[0] = globalWorkgroupSize;
286 localThreads[0] = localWorkgroupSize;
287
288 MTdata d = init_genrand(gRandomSeed);
289 generate_random_data(kChar, inBufferSize, d, inBuffer);
290 generate_random_data(kChar, outBufferSize, d, outBuffer);
291 free_mtdata(d);
292 d = NULL;
293 memcpy(outBufferCopy, outBuffer, outBufferSize);
294
295 clMemWrapper streams[2];
296
297 streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
298 inBuffer, &error);
299 test_error(error, "Unable to create input buffer");
300 streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize,
301 outBuffer, &error);
302 test_error(error, "Unable to create output buffer");
303
304 error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
305 test_error(error, "Unable to set kernel argument");
306 error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
307 test_error(error, "Unable to set kernel argument");
308 error = clSetKernelArg(kernel, 2, localBufferSize, NULL);
309 test_error(error, "Unable to set kernel argument");
310 error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt),
311 &numElementsPerLineInt);
312 test_error(error, "Unable to set kernel argument");
313 error = clSetKernelArg(kernel, 4, sizeof(numLinesInt), &numLinesInt);
314 test_error(error, "Unable to set kernel argument");
315 error = clSetKernelArg(kernel, 5, sizeof(planesCopiesPerWorkgroup),
316 &planesCopiesPerWorkgroup);
317 test_error(error, "Unable to set kernel argument");
318 error = clSetKernelArg(kernel, 6, sizeof(planesCopiesPerWorkItemInt),
319 &planesCopiesPerWorkItemInt);
320 test_error(error, "Unable to set kernel argument");
321 error = clSetKernelArg(kernel, 7, sizeof(srcLineStride), &srcLineStride);
322 test_error(error, "Unable to set kernel argument");
323 error = clSetKernelArg(kernel, 8, sizeof(dstLineStride), &dstLineStride);
324 test_error(error, "Unable to set kernel argument");
325 error = clSetKernelArg(kernel, 9, sizeof(srcPlaneStride), &srcPlaneStride);
326 test_error(error, "Unable to set kernel argument");
327 error = clSetKernelArg(kernel, 10, sizeof(dstPlaneStride), &dstPlaneStride);
328 test_error(error, "Unable to set kernel argument");
329
330 // Enqueue
331 error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
332 localThreads, 0, NULL, NULL);
333 test_error(error, "Unable to queue kernel");
334
335 // Read
336 error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
337 outBuffer, 0, NULL, NULL);
338 test_error(error, "Unable to read results");
339
340 // Verify
341 int failuresPrinted = 0;
342
343 for (int i = 0;
344 i < (int)globalWorkgroupSize * planesCopiesPerWorkItem * elementSize;
345 i += elementSize)
346 {
347 for (int j = 0; j < (int)numLines * elementSize; j += elementSize)
348 {
349 for (int k = 0; k < (int)numElementsPerLine * elementSize;
350 k += elementSize)
351 {
352 int inIdx = i * srcPlaneStride + j * srcLineStride + k;
353 int outIdx = i * dstPlaneStride + j * dstLineStride + k;
354 if (memcmp(((char *)inBuffer) + inIdx,
355 ((char *)outBuffer) + outIdx, elementSize)
356 != 0)
357 {
358 unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
359 unsigned char *outchar =
360 (unsigned char *)outBuffer + outIdx;
361 char values[4096] = { 0 };
362
363 if (failuresPrinted == 0)
364 {
365 // Print first failure message
366 log_error("ERROR: Results of copy did not validate!");
367 }
368 sprintf(values + strlen(values), "%d -> [", inIdx);
369 for (int l = 0; l < (int)elementSize; l++)
370 sprintf(values + strlen(values), "%2x ", inchar[l]);
371 sprintf(values + strlen(values), "] != [");
372 for (int l = 0; l < (int)elementSize; l++)
373 sprintf(values + strlen(values), "%2x ", outchar[l]);
374 sprintf(values + strlen(values), "]");
375 log_error("%s\n", values);
376 failuresPrinted++;
377 }
378
379 if (failuresPrinted > 5)
380 {
381 log_error("Not printing further failures...\n");
382 return -1;
383 }
384 }
385 if (j < (int)numLines * elementSize)
386 {
387 int outIdx = i * dstPlaneStride + j * dstLineStride
388 + numElementsPerLine * elementSize;
389 if (memcmp(((char *)outBuffer) + outIdx,
390 ((char *)outBufferCopy) + outIdx,
391 dstLineMargin * elementSize)
392 != 0)
393 {
394 if (failuresPrinted == 0)
395 {
396 // Print first failure message
397 log_error("ERROR: Results of copy did not validate!\n");
398 }
399 log_error(
400 "3D copy corrupted data in output buffer in the line "
401 "stride offset of plane %d line %d\n",
402 i, j);
403 failuresPrinted++;
404 }
405 if (failuresPrinted > 5)
406 {
407 log_error("Not printing further failures...\n");
408 return -1;
409 }
410 }
411 }
412 if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1)
413 * elementSize)
414 {
415 int outIdx =
416 i * dstPlaneStride + numLines * dstLineStride * elementSize;
417 if (memcmp(((char *)outBuffer) + outIdx,
418 ((char *)outBufferCopy) + outIdx,
419 dstPlaneMargin * elementSize)
420 != 0)
421 {
422 if (failuresPrinted == 0)
423 {
424 // Print first failure message
425 log_error("ERROR: Results of copy did not validate!\n");
426 }
427 log_error("3D copy corrupted data in output buffer in the "
428 "plane stride "
429 "offset of plane %d\n",
430 i);
431 failuresPrinted++;
432 }
433 if (failuresPrinted > 5)
434 {
435 log_error("Not printing further failures...\n");
436 return -1;
437 }
438 }
439 }
440
441 free(inBuffer);
442 free(outBuffer);
443 free(outBufferCopy);
444
445 return failuresPrinted ? -1 : 0;
446 }
447
test_copy3D_all_types(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,bool localIsDst)448 int test_copy3D_all_types(cl_device_id deviceID, cl_context context,
449 cl_command_queue queue, const char *kernelCode,
450 bool localIsDst)
451 {
452 const unsigned int elemSizes[] = { 1, 2, 3, 4, 5, 6, 7,
453 8, 13, 16, 32, 47, 64 };
454 // The margins below represent the number of elements between the end of
455 // one line and the start of the next. The strides are equivalent to the
456 // size of the line or plane plus the chosen margin.
457 // These have to be multipliers, because the margin must be a multiple of
458 // element size.
459 const unsigned int marginMultipliers[] = { 0, 10, 100 };
460
461 int errors = 0;
462
463 if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
464 {
465 log_info(
466 "Device does not support extended async copies. Skipping test.\n");
467 }
468 else
469 {
470 for (const unsigned int elemSize : elemSizes)
471 {
472 for (const unsigned int srcLineMarginMultiplier : marginMultipliers)
473 {
474 for (const unsigned int dstLineMarginMultiplier :
475 marginMultipliers)
476 {
477 for (const unsigned int srcPlaneMarginMultiplier :
478 marginMultipliers)
479 {
480 for (const unsigned int dstPlaneMarginMultiplier :
481 marginMultipliers)
482 {
483 if (test_copy3D(deviceID, context, queue,
484 kernelCode, elemSize,
485 srcLineMarginMultiplier * elemSize,
486 dstLineMarginMultiplier * elemSize,
487 srcPlaneMarginMultiplier * elemSize,
488 dstPlaneMarginMultiplier * elemSize,
489 localIsDst))
490 {
491 errors++;
492 }
493 }
494 }
495 }
496 }
497 }
498 }
499 if (errors) return -1;
500 return 0;
501 }
502
test_async_copy_global_to_local3D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)503 int test_async_copy_global_to_local3D(cl_device_id deviceID, cl_context context,
504 cl_command_queue queue, int num_elements)
505 {
506 return test_copy3D_all_types(deviceID, context, queue,
507 async_global_to_local_kernel3D, true);
508 }
509
test_async_copy_local_to_global3D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)510 int test_async_copy_local_to_global3D(cl_device_id deviceID, cl_context context,
511 cl_command_queue queue, int num_elements)
512 {
513 return test_copy3D_all_types(deviceID, context, queue,
514 async_local_to_global_kernel3D, false);
515 }
516