xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/basic/test_async_copy3D.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "../../test_common/harness/compat.h"
17 
18 #include <algorithm>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/types.h>
24 
25 #include "../../test_common/harness/conversions.h"
26 #include "procs.h"
27 
28 static const char *async_global_to_local_kernel3D = R"OpenCLC(
29 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
30 
31 #define STRUCT_SIZE %d
32 typedef struct __attribute__((packed))
33 {
34     uchar byte[STRUCT_SIZE];
35 } VarSizeStruct __attribute__((aligned(1)));
36 
37 __kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst, __local VarSizeStruct *localBuffer,
38                       int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup,
39                       int planesCopiesPerWorkItem, int srcLineStride,
40                       int dstLineStride, int srcPlaneStride, int dstPlaneStride ) {
41   // Zero the local storage first
42   for (int i = 0; i < planesCopiesPerWorkItem; i++) {
43     for (int j = 0; j < numLines; j++) {
44       for (int k = 0; k < numElementsPerLine; k++) {
45         const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
46         for (int k = 0; k < STRUCT_SIZE; k++) {
47           localBuffer[index].byte[k] = 0;
48         }
49       }
50     }
51   }
52 
53   // Do this to verify all kernels are done zeroing the local buffer before we try the copy
54   barrier(CLK_LOCAL_MEM_FENCE);
55 
56   event_t event = async_work_group_copy_3D3D(localBuffer, 0, src,
57     planesCopiesPerWorkgroup * get_group_id(0) * srcPlaneStride,
58     sizeof(VarSizeStruct), (size_t)numElementsPerLine, (size_t)numLines,
59     planesCopiesPerWorkgroup, srcLineStride, srcPlaneStride, dstLineStride,
60     dstPlaneStride, 0);
61 
62   // Wait for the copy to complete, then verify by manually copying to the dest
63   wait_group_events(1, &event);
64 
65   for (int i = 0; i < planesCopiesPerWorkItem; i++) {
66     for (int j = 0; j < numLines; j++) {
67       for(int k = 0; k < numElementsPerLine; k++) {
68         const int local_index = (get_local_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
69         const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * dstPlaneStride + j * dstLineStride + k;
70         dst[global_index] = localBuffer[local_index];
71       }
72     }
73   }
74 }
75 )OpenCLC";
76 
77 static const char *async_local_to_global_kernel3D = R"OpenCLC(
78 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
79 
80 #define STRUCT_SIZE %d
81 typedef struct __attribute__((packed))
82 {
83     uchar byte[STRUCT_SIZE];
84 } VarSizeStruct __attribute__((aligned(1)));
85 
86 __kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst, __local VarSizeStruct *localBuffer,
87                       int numElementsPerLine, int numLines, int planesCopiesPerWorkgroup,
88                       int planesCopiesPerWorkItem, int srcLineStride,
89                       int dstLineStride, int srcPlaneStride, int dstPlaneStride) {
90   // Zero the local storage first
91   for (int i = 0; i < planesCopiesPerWorkItem; i++) {
92     for (int j = 0; j < numLines; j++) {
93       for (int k = 0; k < numElementsPerLine; k++) {
94         const int index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k;
95         for (int k = 0; k < STRUCT_SIZE; k++) {
96           localBuffer[index].byte[k] = 0;
97         }
98       }
99     }
100   }
101 
102   // Do this to verify all kernels are done zeroing the local buffer before we try the copy
103   barrier(CLK_LOCAL_MEM_FENCE);
104 
105   for (int i=0; i < planesCopiesPerWorkItem; i++) {
106     for (int j=0; j < numLines; j++) {
107       for (int k=0; k < numElementsPerLine; k++) {
108         const int local_index = (get_local_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j * srcLineStride + k;
109         const int global_index = (get_global_id(0) * planesCopiesPerWorkItem + i) * srcPlaneStride + j*srcLineStride + k;
110         localBuffer[local_index] = src[global_index];
111       }
112     }
113   }
114 
115   // Do this to verify all kernels are done copying to the local buffer before we try the copy
116   barrier(CLK_LOCAL_MEM_FENCE);
117 
118   event_t event = async_work_group_copy_3D3D(dst,
119     planesCopiesPerWorkgroup * get_group_id(0) * dstPlaneStride, localBuffer, 0,
120     sizeof(VarSizeStruct), (size_t)numElementsPerLine, (size_t)numLines, planesCopiesPerWorkgroup,
121     srcLineStride, srcPlaneStride, dstLineStride, dstPlaneStride, 0);
122 
123   wait_group_events(1, &event);
124 }
125 )OpenCLC";
126 
test_copy3D(const cl_device_id deviceID,const cl_context context,const cl_command_queue queue,const char * const kernelCode,const size_t elementSize,const int srcLineMargin,const int dstLineMargin,const int srcPlaneMargin,const int dstPlaneMargin,const bool localIsDst)127 int test_copy3D(const cl_device_id deviceID, const cl_context context,
128                 const cl_command_queue queue, const char *const kernelCode,
129                 const size_t elementSize, const int srcLineMargin,
130                 const int dstLineMargin, const int srcPlaneMargin,
131                 const int dstPlaneMargin, const bool localIsDst)
132 {
133     int error;
134 
135     log_info(
136         "Testing %d byte element with srcLineMargin = %d, dstLineMargin = %d, "
137         "srcPlaneMargin = %d, dstPlaneMargin = %d\n",
138         elementSize, srcLineMargin, dstLineMargin, srcPlaneMargin,
139         dstPlaneMargin);
140 
141     cl_long max_local_mem_size;
142     error =
143         clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
144                         sizeof(max_local_mem_size), &max_local_mem_size, NULL);
145     test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
146 
147     cl_long max_global_mem_size;
148     error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
149                             sizeof(max_global_mem_size), &max_global_mem_size,
150                             NULL);
151     test_error(error, "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE failed.");
152 
153     cl_long max_alloc_size;
154     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
155                             sizeof(max_alloc_size), &max_alloc_size, NULL);
156     test_error(error,
157                "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
158 
159     if (max_alloc_size > max_global_mem_size / 2)
160         max_alloc_size = max_global_mem_size / 2;
161 
162     unsigned int num_of_compute_devices;
163     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
164                             sizeof(num_of_compute_devices),
165                             &num_of_compute_devices, NULL);
166     test_error(error,
167                "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
168 
169     char programSource[4096] = { 0 };
170     const char *programPtr = programSource;
171 
172     sprintf(programSource, kernelCode, elementSize);
173     // log_info("program: %s\n", programSource);
174     clProgramWrapper program;
175     clKernelWrapper kernel;
176 
177     error = create_single_kernel_helper(context, &program, &kernel, 1,
178                                         &programPtr, "test_fn");
179     test_error(error, "Unable to create testing kernel");
180 
181     size_t max_workgroup_size;
182     error = clGetKernelWorkGroupInfo(
183         kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
184         &max_workgroup_size, NULL);
185     test_error(
186         error,
187         "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
188 
189     size_t max_local_workgroup_size[3];
190     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
191                             sizeof(max_local_workgroup_size),
192                             max_local_workgroup_size, NULL);
193     test_error(error,
194                "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
195 
196     cl_long max_work_group_size;
197     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
198                             sizeof(max_work_group_size), &max_work_group_size,
199                             NULL);
200     test_error(error,
201                "clGetDeviceInfo for CL_DEVICE_MAX_WORK_GROUP_SIZE failed.");
202 
203     // Pick the minimum of the device and the kernel
204     if (max_workgroup_size > max_local_workgroup_size[0])
205         max_workgroup_size = max_local_workgroup_size[0];
206 
207     const size_t numElementsPerLine = 10;
208     const cl_int dstLineStride = numElementsPerLine + dstLineMargin;
209     const cl_int srcLineStride = numElementsPerLine + srcLineMargin;
210 
211     const size_t numLines = 13;
212     const cl_int dstPlaneStride = (numLines * dstLineStride) + dstPlaneMargin;
213     const cl_int srcPlaneStride = (numLines * srcLineStride) + srcPlaneMargin;
214 
215     const size_t planesCopiesPerWorkItem = 2;
216     const size_t localStorageSpacePerWorkitem = elementSize
217         * planesCopiesPerWorkItem
218         * (localIsDst ? dstPlaneStride : srcPlaneStride);
219     size_t maxLocalWorkgroupSize =
220         (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
221 
222     // Calculation can return 0 on embedded devices due to 1KB local mem limit
223     if (maxLocalWorkgroupSize == 0)
224     {
225         maxLocalWorkgroupSize = 1;
226     }
227 
228     size_t localWorkgroupSize = maxLocalWorkgroupSize;
229     if (maxLocalWorkgroupSize > max_workgroup_size)
230         localWorkgroupSize = max_workgroup_size;
231 
232     const size_t maxTotalPlanesIn =
233         ((max_alloc_size / elementSize) + srcPlaneMargin) / srcPlaneStride;
234     const size_t maxTotalPlanesOut =
235         ((max_alloc_size / elementSize) + dstPlaneMargin) / dstPlaneStride;
236     const size_t maxTotalPlanes = std::min(maxTotalPlanesIn, maxTotalPlanesOut);
237     const size_t maxLocalWorkgroups =
238         maxTotalPlanes / (localWorkgroupSize * planesCopiesPerWorkItem);
239 
240     const size_t localBufferSize =
241         localWorkgroupSize * localStorageSpacePerWorkitem
242         - (localIsDst ? dstPlaneMargin : srcPlaneMargin);
243     const size_t numberOfLocalWorkgroups =
244         std::min(1111, (int)maxLocalWorkgroups);
245     const size_t totalPlanes =
246         numberOfLocalWorkgroups * localWorkgroupSize * planesCopiesPerWorkItem;
247     const size_t inBufferSize = elementSize
248         * (totalPlanes * numLines * srcLineStride
249            + (totalPlanes - 1) * srcPlaneMargin);
250     const size_t outBufferSize = elementSize
251         * (totalPlanes * numLines * dstLineStride
252            + (totalPlanes - 1) * dstPlaneMargin);
253     const size_t globalWorkgroupSize =
254         numberOfLocalWorkgroups * localWorkgroupSize;
255 
256     if ((localBufferSize / 4) > max_work_group_size)
257     {
258         log_info("Skipping due to resource requirements local:%db  "
259                  "max_work_group_size:%d\n",
260                  localBufferSize, max_work_group_size);
261         return 0;
262     }
263 
264     void *const inBuffer = (void *)malloc(inBufferSize);
265     void *const outBuffer = (void *)malloc(outBufferSize);
266     void *const outBufferCopy = (void *)malloc(outBufferSize);
267 
268     const cl_int planesCopiesPerWorkItemInt =
269         static_cast<cl_int>(planesCopiesPerWorkItem);
270     const cl_int numElementsPerLineInt =
271         static_cast<cl_int>(numElementsPerLine);
272     const cl_int numLinesInt = static_cast<cl_int>(numLines);
273     const cl_int planesCopiesPerWorkgroup =
274         static_cast<cl_int>(planesCopiesPerWorkItem * localWorkgroupSize);
275 
276     log_info("Global: %d, local %d, local buffer %db, global in buffer %db, "
277              "global out buffer %db, each work group will copy %d planes and "
278              "each work item item will copy %d planes.\n",
279              (int)globalWorkgroupSize, (int)localWorkgroupSize,
280              (int)localBufferSize, (int)inBufferSize, (int)outBufferSize,
281              planesCopiesPerWorkgroup, planesCopiesPerWorkItemInt);
282 
283     size_t threads[1], localThreads[1];
284 
285     threads[0] = globalWorkgroupSize;
286     localThreads[0] = localWorkgroupSize;
287 
288     MTdata d = init_genrand(gRandomSeed);
289     generate_random_data(kChar, inBufferSize, d, inBuffer);
290     generate_random_data(kChar, outBufferSize, d, outBuffer);
291     free_mtdata(d);
292     d = NULL;
293     memcpy(outBufferCopy, outBuffer, outBufferSize);
294 
295     clMemWrapper streams[2];
296 
297     streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
298                                 inBuffer, &error);
299     test_error(error, "Unable to create input buffer");
300     streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize,
301                                 outBuffer, &error);
302     test_error(error, "Unable to create output buffer");
303 
304     error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
305     test_error(error, "Unable to set kernel argument");
306     error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
307     test_error(error, "Unable to set kernel argument");
308     error = clSetKernelArg(kernel, 2, localBufferSize, NULL);
309     test_error(error, "Unable to set kernel argument");
310     error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt),
311                            &numElementsPerLineInt);
312     test_error(error, "Unable to set kernel argument");
313     error = clSetKernelArg(kernel, 4, sizeof(numLinesInt), &numLinesInt);
314     test_error(error, "Unable to set kernel argument");
315     error = clSetKernelArg(kernel, 5, sizeof(planesCopiesPerWorkgroup),
316                            &planesCopiesPerWorkgroup);
317     test_error(error, "Unable to set kernel argument");
318     error = clSetKernelArg(kernel, 6, sizeof(planesCopiesPerWorkItemInt),
319                            &planesCopiesPerWorkItemInt);
320     test_error(error, "Unable to set kernel argument");
321     error = clSetKernelArg(kernel, 7, sizeof(srcLineStride), &srcLineStride);
322     test_error(error, "Unable to set kernel argument");
323     error = clSetKernelArg(kernel, 8, sizeof(dstLineStride), &dstLineStride);
324     test_error(error, "Unable to set kernel argument");
325     error = clSetKernelArg(kernel, 9, sizeof(srcPlaneStride), &srcPlaneStride);
326     test_error(error, "Unable to set kernel argument");
327     error = clSetKernelArg(kernel, 10, sizeof(dstPlaneStride), &dstPlaneStride);
328     test_error(error, "Unable to set kernel argument");
329 
330     // Enqueue
331     error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
332                                    localThreads, 0, NULL, NULL);
333     test_error(error, "Unable to queue kernel");
334 
335     // Read
336     error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
337                                 outBuffer, 0, NULL, NULL);
338     test_error(error, "Unable to read results");
339 
340     // Verify
341     int failuresPrinted = 0;
342 
343     for (int i = 0;
344          i < (int)globalWorkgroupSize * planesCopiesPerWorkItem * elementSize;
345          i += elementSize)
346     {
347         for (int j = 0; j < (int)numLines * elementSize; j += elementSize)
348         {
349             for (int k = 0; k < (int)numElementsPerLine * elementSize;
350                  k += elementSize)
351             {
352                 int inIdx = i * srcPlaneStride + j * srcLineStride + k;
353                 int outIdx = i * dstPlaneStride + j * dstLineStride + k;
354                 if (memcmp(((char *)inBuffer) + inIdx,
355                            ((char *)outBuffer) + outIdx, elementSize)
356                     != 0)
357                 {
358                     unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
359                     unsigned char *outchar =
360                         (unsigned char *)outBuffer + outIdx;
361                     char values[4096] = { 0 };
362 
363                     if (failuresPrinted == 0)
364                     {
365                         // Print first failure message
366                         log_error("ERROR: Results of copy did not validate!");
367                     }
368                     sprintf(values + strlen(values), "%d -> [", inIdx);
369                     for (int l = 0; l < (int)elementSize; l++)
370                         sprintf(values + strlen(values), "%2x ", inchar[l]);
371                     sprintf(values + strlen(values), "] != [");
372                     for (int l = 0; l < (int)elementSize; l++)
373                         sprintf(values + strlen(values), "%2x ", outchar[l]);
374                     sprintf(values + strlen(values), "]");
375                     log_error("%s\n", values);
376                     failuresPrinted++;
377                 }
378 
379                 if (failuresPrinted > 5)
380                 {
381                     log_error("Not printing further failures...\n");
382                     return -1;
383                 }
384             }
385             if (j < (int)numLines * elementSize)
386             {
387                 int outIdx = i * dstPlaneStride + j * dstLineStride
388                     + numElementsPerLine * elementSize;
389                 if (memcmp(((char *)outBuffer) + outIdx,
390                            ((char *)outBufferCopy) + outIdx,
391                            dstLineMargin * elementSize)
392                     != 0)
393                 {
394                     if (failuresPrinted == 0)
395                     {
396                         // Print first failure message
397                         log_error("ERROR: Results of copy did not validate!\n");
398                     }
399                     log_error(
400                         "3D copy corrupted data in output buffer in the line "
401                         "stride offset of plane %d line %d\n",
402                         i, j);
403                     failuresPrinted++;
404                 }
405                 if (failuresPrinted > 5)
406                 {
407                     log_error("Not printing further failures...\n");
408                     return -1;
409                 }
410             }
411         }
412         if (i < (int)(globalWorkgroupSize * planesCopiesPerWorkItem - 1)
413                 * elementSize)
414         {
415             int outIdx =
416                 i * dstPlaneStride + numLines * dstLineStride * elementSize;
417             if (memcmp(((char *)outBuffer) + outIdx,
418                        ((char *)outBufferCopy) + outIdx,
419                        dstPlaneMargin * elementSize)
420                 != 0)
421             {
422                 if (failuresPrinted == 0)
423                 {
424                     // Print first failure message
425                     log_error("ERROR: Results of copy did not validate!\n");
426                 }
427                 log_error("3D copy corrupted data in output buffer in the "
428                           "plane stride "
429                           "offset of plane %d\n",
430                           i);
431                 failuresPrinted++;
432             }
433             if (failuresPrinted > 5)
434             {
435                 log_error("Not printing further failures...\n");
436                 return -1;
437             }
438         }
439     }
440 
441     free(inBuffer);
442     free(outBuffer);
443     free(outBufferCopy);
444 
445     return failuresPrinted ? -1 : 0;
446 }
447 
test_copy3D_all_types(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,bool localIsDst)448 int test_copy3D_all_types(cl_device_id deviceID, cl_context context,
449                           cl_command_queue queue, const char *kernelCode,
450                           bool localIsDst)
451 {
452     const unsigned int elemSizes[] = { 1, 2,  3,  4,  5,  6, 7,
453                                        8, 13, 16, 32, 47, 64 };
454     // The margins below represent the number of elements between the end of
455     // one line and the start of the next. The strides are equivalent to the
456     // size of the line or plane plus the chosen margin.
457     // These have to be multipliers, because the margin must be a multiple of
458     // element size.
459     const unsigned int marginMultipliers[] = { 0, 10, 100 };
460 
461     int errors = 0;
462 
463     if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
464     {
465         log_info(
466             "Device does not support extended async copies. Skipping test.\n");
467     }
468     else
469     {
470         for (const unsigned int elemSize : elemSizes)
471         {
472             for (const unsigned int srcLineMarginMultiplier : marginMultipliers)
473             {
474                 for (const unsigned int dstLineMarginMultiplier :
475                      marginMultipliers)
476                 {
477                     for (const unsigned int srcPlaneMarginMultiplier :
478                          marginMultipliers)
479                     {
480                         for (const unsigned int dstPlaneMarginMultiplier :
481                              marginMultipliers)
482                         {
483                             if (test_copy3D(deviceID, context, queue,
484                                             kernelCode, elemSize,
485                                             srcLineMarginMultiplier * elemSize,
486                                             dstLineMarginMultiplier * elemSize,
487                                             srcPlaneMarginMultiplier * elemSize,
488                                             dstPlaneMarginMultiplier * elemSize,
489                                             localIsDst))
490                             {
491                                 errors++;
492                             }
493                         }
494                     }
495                 }
496             }
497         }
498     }
499     if (errors) return -1;
500     return 0;
501 }
502 
test_async_copy_global_to_local3D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)503 int test_async_copy_global_to_local3D(cl_device_id deviceID, cl_context context,
504                                       cl_command_queue queue, int num_elements)
505 {
506     return test_copy3D_all_types(deviceID, context, queue,
507                                  async_global_to_local_kernel3D, true);
508 }
509 
test_async_copy_local_to_global3D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)510 int test_async_copy_local_to_global3D(cl_device_id deviceID, cl_context context,
511                                       cl_command_queue queue, int num_elements)
512 {
513     return test_copy3D_all_types(deviceID, context, queue,
514                                  async_local_to_global_kernel3D, false);
515 }
516