xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/basic/test_async_copy2D.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "../../test_common/harness/compat.h"
17 
18 #include <algorithm>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/types.h>
24 
25 #include "../../test_common/harness/conversions.h"
26 #include "procs.h"
27 
28 static const char *async_global_to_local_kernel2D = R"OpenCLC(
29 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
30 
31 #define STRUCT_SIZE %d
32 typedef struct __attribute__((packed))
33 {
34     uchar byte[STRUCT_SIZE];
35 } VarSizeStruct __attribute__((aligned(1)));
36 
37 
38 __kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst,
39                       __local VarSizeStruct *localBuffer, int numElementsPerLine,
40                       int lineCopiesPerWorkgroup, int lineCopiesPerWorkItem,
41                       int srcStride, int dstStride) {
42   // Zero the local storage first
43   for (int i = 0; i < lineCopiesPerWorkItem; i++) {
44     for (int j = 0; j < numElementsPerLine; j++) {
45       const int index = (get_local_id(0) * lineCopiesPerWorkItem + i) * dstStride + j;
46       for (int k = 0; k < STRUCT_SIZE; k++) {
47         localBuffer[index].byte[k] = 0;
48       }
49     }
50   }
51 
52   // Do this to verify all kernels are done zeroing the local buffer before we
53   // try the copy
54   barrier( CLK_LOCAL_MEM_FENCE );
55   event_t event = async_work_group_copy_2D2D(localBuffer, 0, src,
56     lineCopiesPerWorkgroup * get_group_id(0) * srcStride, sizeof(VarSizeStruct),
57     (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, srcStride, dstStride, 0);
58 
59   // Wait for the copy to complete, then verify by manually copying to the dest
60   wait_group_events(1, &event);
61 
62   for (int i = 0; i < lineCopiesPerWorkItem; i++) {
63     for (int j = 0; j < numElementsPerLine; j++) {
64       const int local_index = (get_local_id(0) * lineCopiesPerWorkItem + i) * dstStride + j;
65       const int global_index = (get_global_id(0) * lineCopiesPerWorkItem + i) * dstStride + j;
66       dst[global_index] = localBuffer[local_index];
67     }
68   }
69 }
70 )OpenCLC";
71 
72 static const char *async_local_to_global_kernel2D = R"OpenCLC(
73 #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
74 
75 #define STRUCT_SIZE %d
76 typedef struct __attribute__((packed))
77 {
78     uchar byte[STRUCT_SIZE];
79 } VarSizeStruct __attribute__((aligned(1)));
80 
81 
82 __kernel void test_fn(const __global VarSizeStruct *src, __global VarSizeStruct *dst, __local VarSizeStruct *localBuffer,
83                       int numElementsPerLine, int lineCopiesPerWorkgroup,
84                       int lineCopiesPerWorkItem, int srcStride, int dstStride) {
85   // Zero the local storage first
86   for (int i = 0; i < lineCopiesPerWorkItem; i++) {
87     for (int j = 0; j < numElementsPerLine; j++) {
88       const int index = (get_local_id(0) * lineCopiesPerWorkItem + i) * srcStride + j;
89       for (int k = 0; k < STRUCT_SIZE; k++) {
90         localBuffer[index].byte[k] = 0;
91       }
92     }
93   }
94 
95   // Do this to verify all kernels are done zeroing the local buffer before we try the copy
96   barrier(CLK_LOCAL_MEM_FENCE);
97 
98   for (int i = 0; i < lineCopiesPerWorkItem; i++) {
99     for (int j = 0; j < numElementsPerLine; j++) {
100       const int local_index = (get_local_id(0) * lineCopiesPerWorkItem + i) * srcStride + j;
101       const int global_index = (get_global_id(0)*lineCopiesPerWorkItem + i) * srcStride + j;
102       localBuffer[local_index] = src[global_index];
103     }
104   }
105 
106   // Do this to verify all kernels are done copying to the local buffer before we try the copy
107   barrier(CLK_LOCAL_MEM_FENCE);
108   event_t event = async_work_group_copy_2D2D(dst, lineCopiesPerWorkgroup * get_group_id(0) * dstStride,
109     localBuffer, 0, sizeof(VarSizeStruct), (size_t)numElementsPerLine, (size_t)lineCopiesPerWorkgroup, srcStride,
110    dstStride, 0 );
111 
112   wait_group_events(1, &event);
113 };
114 )OpenCLC";
115 
test_copy2D(const cl_device_id deviceID,const cl_context context,const cl_command_queue queue,const char * const kernelCode,const size_t elementSize,const int srcMargin,const int dstMargin,const bool localIsDst)116 int test_copy2D(const cl_device_id deviceID, const cl_context context,
117                 const cl_command_queue queue, const char *const kernelCode,
118                 const size_t elementSize, const int srcMargin,
119                 const int dstMargin, const bool localIsDst)
120 {
121     int error;
122 
123     log_info("Testing %d byte element with srcMargin = %d, dstMargin = %d\n",
124              elementSize, srcMargin, dstMargin);
125 
126     cl_long max_local_mem_size;
127     error =
128         clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
129                         sizeof(max_local_mem_size), &max_local_mem_size, NULL);
130     test_error(error, "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE failed.");
131 
132     cl_long max_global_mem_size;
133     error = clGetDeviceInfo(deviceID, CL_DEVICE_GLOBAL_MEM_SIZE,
134                             sizeof(max_global_mem_size), &max_global_mem_size,
135                             NULL);
136     test_error(error, "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE failed.");
137 
138     cl_long max_alloc_size;
139     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
140                             sizeof(max_alloc_size), &max_alloc_size, NULL);
141     test_error(error,
142                "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE failed.");
143 
144     cl_long max_work_group_size;
145     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE,
146                             sizeof(max_work_group_size), &max_work_group_size,
147                             NULL);
148     test_error(error,
149                "clGetDeviceInfo for CL_DEVICE_MAX_WORK_GROUP_SIZE failed.");
150 
151     if (max_alloc_size > max_global_mem_size / 2)
152         max_alloc_size = max_global_mem_size / 2;
153 
154     unsigned int num_of_compute_devices;
155     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_COMPUTE_UNITS,
156                             sizeof(num_of_compute_devices),
157                             &num_of_compute_devices, NULL);
158     test_error(error,
159                "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS failed.");
160 
161     char programSource[4096] = { 0 };
162     const char *programPtr = programSource;
163 
164     sprintf(programSource, kernelCode, elementSize);
165     // log_info("program: %s\n", programSource);
166 
167     clProgramWrapper program;
168     clKernelWrapper kernel;
169 
170     error = create_single_kernel_helper(context, &program, &kernel, 1,
171                                         &programPtr, "test_fn");
172     test_error(error, "Unable to create testing kernel");
173 
174     size_t max_workgroup_size;
175     error = clGetKernelWorkGroupInfo(
176         kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_workgroup_size),
177         &max_workgroup_size, NULL);
178     test_error(
179         error,
180         "clGetKernelWorkGroupInfo failed for CL_KERNEL_WORK_GROUP_SIZE.");
181 
182     size_t max_local_workgroup_size[3];
183     error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES,
184                             sizeof(max_local_workgroup_size),
185                             max_local_workgroup_size, NULL);
186     test_error(error,
187                "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
188 
189     // Pick the minimum of the device and the kernel
190     if (max_workgroup_size > max_local_workgroup_size[0])
191         max_workgroup_size = max_local_workgroup_size[0];
192 
193     const size_t numElementsPerLine = 10;
194     const cl_int dstStride = numElementsPerLine + dstMargin;
195     const cl_int srcStride = numElementsPerLine + srcMargin;
196 
197     const size_t lineCopiesPerWorkItem = 13;
198     const size_t localStorageSpacePerWorkitem = lineCopiesPerWorkItem
199         * elementSize * (localIsDst ? dstStride : srcStride);
200 
201     size_t maxLocalWorkgroupSize =
202         (((int)max_local_mem_size / 2) / localStorageSpacePerWorkitem);
203 
204     // Calculation can return 0 on embedded devices due to 1KB local mem limit
205     if (maxLocalWorkgroupSize == 0)
206     {
207         maxLocalWorkgroupSize = 1;
208     }
209 
210     size_t localWorkgroupSize = maxLocalWorkgroupSize;
211     if (maxLocalWorkgroupSize > max_workgroup_size)
212         localWorkgroupSize = max_workgroup_size;
213 
214     const size_t maxTotalLinesIn =
215         (max_alloc_size / elementSize + srcMargin) / srcStride;
216     const size_t maxTotalLinesOut =
217         (max_alloc_size / elementSize + dstMargin) / dstStride;
218     const size_t maxTotalLines = std::min(maxTotalLinesIn, maxTotalLinesOut);
219     const size_t maxLocalWorkgroups =
220         maxTotalLines / (localWorkgroupSize * lineCopiesPerWorkItem);
221 
222     const size_t localBufferSize =
223         localWorkgroupSize * localStorageSpacePerWorkitem
224         - (localIsDst ? dstMargin : srcMargin);
225     const size_t numberOfLocalWorkgroups =
226         std::min(1111, (int)maxLocalWorkgroups);
227     const size_t totalLines =
228         numberOfLocalWorkgroups * localWorkgroupSize * lineCopiesPerWorkItem;
229     const size_t inBufferSize = elementSize
230         * (totalLines * numElementsPerLine + (totalLines - 1) * srcMargin);
231     const size_t outBufferSize = elementSize
232         * (totalLines * numElementsPerLine + (totalLines - 1) * dstMargin);
233     const size_t globalWorkgroupSize =
234         numberOfLocalWorkgroups * localWorkgroupSize;
235 
236     if ((localBufferSize / 4) > max_work_group_size)
237     {
238         log_info("Skipping due to resource requirements local:%db  "
239                  "max_work_group_size:%d\n",
240                  localBufferSize, max_work_group_size);
241         return 0;
242     }
243 
244     void *const inBuffer = (void *)malloc(inBufferSize);
245     void *const outBuffer = (void *)malloc(outBufferSize);
246     void *const outBufferCopy = (void *)malloc(outBufferSize);
247 
248     const cl_int lineCopiesPerWorkItemInt =
249         static_cast<cl_int>(lineCopiesPerWorkItem);
250     const cl_int numElementsPerLineInt =
251         static_cast<cl_int>(numElementsPerLine);
252     const cl_int lineCopiesPerWorkgroup =
253         static_cast<cl_int>(lineCopiesPerWorkItem * localWorkgroupSize);
254 
255     log_info(
256         "Global: %d, local %d, local buffer %db, global in buffer %db, "
257         "global out buffer %db, each work group will copy %d lines and each "
258         "work item item will copy %d lines.\n",
259         (int)globalWorkgroupSize, (int)localWorkgroupSize, (int)localBufferSize,
260         (int)inBufferSize, (int)outBufferSize, lineCopiesPerWorkgroup,
261         lineCopiesPerWorkItemInt);
262 
263     size_t threads[1], localThreads[1];
264 
265     threads[0] = globalWorkgroupSize;
266     localThreads[0] = localWorkgroupSize;
267 
268     MTdata d = init_genrand(gRandomSeed);
269     generate_random_data(kChar, inBufferSize, d, inBuffer);
270     generate_random_data(kChar, outBufferSize, d, outBuffer);
271     free_mtdata(d);
272     d = NULL;
273     memcpy(outBufferCopy, outBuffer, outBufferSize);
274 
275     clMemWrapper streams[2];
276 
277     streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, inBufferSize,
278                                 inBuffer, &error);
279     test_error(error, "Unable to create input buffer");
280     streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, outBufferSize,
281                                 outBuffer, &error);
282     test_error(error, "Unable to create output buffer");
283 
284     error = clSetKernelArg(kernel, 0, sizeof(streams[0]), &streams[0]);
285     test_error(error, "Unable to set kernel argument");
286     error = clSetKernelArg(kernel, 1, sizeof(streams[1]), &streams[1]);
287     test_error(error, "Unable to set kernel argument");
288     error = clSetKernelArg(kernel, 2, localBufferSize, NULL);
289     test_error(error, "Unable to set kernel argument");
290     error = clSetKernelArg(kernel, 3, sizeof(numElementsPerLineInt),
291                            &numElementsPerLineInt);
292     test_error(error, "Unable to set kernel argument");
293     error = clSetKernelArg(kernel, 4, sizeof(lineCopiesPerWorkgroup),
294                            &lineCopiesPerWorkgroup);
295     test_error(error, "Unable to set kernel argument");
296     error = clSetKernelArg(kernel, 5, sizeof(lineCopiesPerWorkItemInt),
297                            &lineCopiesPerWorkItemInt);
298     test_error(error, "Unable to set kernel argument");
299     error = clSetKernelArg(kernel, 6, sizeof(srcStride), &srcStride);
300     test_error(error, "Unable to set kernel argument");
301     error = clSetKernelArg(kernel, 7, sizeof(dstStride), &dstStride);
302     test_error(error, "Unable to set kernel argument");
303 
304     // Enqueue
305     error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threads,
306                                    localThreads, 0, NULL, NULL);
307     test_error(error, "Unable to queue kernel");
308 
309     // Read
310     error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, outBufferSize,
311                                 outBuffer, 0, NULL, NULL);
312     test_error(error, "Unable to read results");
313 
314     // Verify
315     int failuresPrinted = 0;
316 
317     for (int i = 0;
318          i < (int)globalWorkgroupSize * lineCopiesPerWorkItem * elementSize;
319          i += elementSize)
320     {
321         for (int j = 0; j < (int)numElementsPerLine * elementSize;
322              j += elementSize)
323         {
324             int inIdx = i * srcStride + j;
325             int outIdx = i * dstStride + j;
326             if (memcmp(((char *)inBuffer) + inIdx, ((char *)outBuffer) + outIdx,
327                        elementSize)
328                 != 0)
329             {
330                 unsigned char *inchar = (unsigned char *)inBuffer + inIdx;
331                 unsigned char *outchar = (unsigned char *)outBuffer + outIdx;
332                 char values[4096] = { 0 };
333 
334                 if (failuresPrinted == 0)
335                 {
336                     // Print first failure message
337                     log_error("ERROR: Results of copy did not validate!\n");
338                 }
339                 sprintf(values + strlen(values), "%d -> [", inIdx);
340                 for (int k = 0; k < (int)elementSize; k++)
341                     sprintf(values + strlen(values), "%2x ", inchar[k]);
342                 sprintf(values + strlen(values), "] != [");
343                 for (int k = 0; k < (int)elementSize; k++)
344                     sprintf(values + strlen(values), "%2x ", outchar[k]);
345                 sprintf(values + strlen(values), "]");
346                 log_error("%s\n", values);
347                 failuresPrinted++;
348             }
349 
350             if (failuresPrinted > 5)
351             {
352                 log_error("Not printing further failures...\n");
353                 return -1;
354             }
355         }
356         if (i < (int)(globalWorkgroupSize * lineCopiesPerWorkItem - 1)
357                 * elementSize)
358         {
359             int outIdx = i * dstStride + numElementsPerLine * elementSize;
360             if (memcmp(((char *)outBuffer) + outIdx,
361                        ((char *)outBufferCopy) + outIdx,
362                        dstMargin * elementSize)
363                 != 0)
364             {
365                 if (failuresPrinted == 0)
366                 {
367                     // Print first failure message
368                     log_error("ERROR: Results of copy did not validate!\n");
369                 }
370                 log_error(
371                     "2D copy corrupted data in output buffer in the stride "
372                     "offset of line %d\n",
373                     i);
374                 failuresPrinted++;
375             }
376             if (failuresPrinted > 5)
377             {
378                 log_error("Not printing further failures...\n");
379                 return -1;
380             }
381         }
382     }
383 
384     free(inBuffer);
385     free(outBuffer);
386     free(outBufferCopy);
387 
388     return failuresPrinted ? -1 : 0;
389 }
390 
test_copy2D_all_types(cl_device_id deviceID,cl_context context,cl_command_queue queue,const char * kernelCode,bool localIsDst)391 int test_copy2D_all_types(cl_device_id deviceID, cl_context context,
392                           cl_command_queue queue, const char *kernelCode,
393                           bool localIsDst)
394 {
395     const unsigned int elemSizes[] = { 1, 2,  3,  4,  5,  6, 7,
396                                        8, 13, 16, 32, 47, 64 };
397     // The margins below represent the number of elements between the end of
398     // one line and the start of the next. The strides are equivalent to the
399     // length of the line plus the chosen margin.
400     // These have to be multipliers, because the margin must be a multiple of
401     // element size.
402     const unsigned int marginMultipliers[] = { 0, 10, 100 };
403 
404     int errors = 0;
405 
406     if (!is_extension_available(deviceID, "cl_khr_extended_async_copies"))
407     {
408         log_info(
409             "Device does not support extended async copies. Skipping test.\n");
410     }
411     else
412     {
413         for (const unsigned int elemSize : elemSizes)
414         {
415             for (const unsigned int srcMarginMultiplier : marginMultipliers)
416             {
417                 for (const unsigned int dstMarginMultiplier : marginMultipliers)
418                 {
419                     if (test_copy2D(deviceID, context, queue, kernelCode,
420                                     elemSize, srcMarginMultiplier * elemSize,
421                                     dstMarginMultiplier * elemSize, localIsDst))
422                     {
423                         errors++;
424                     }
425                 }
426             }
427         }
428     }
429 
430     return errors ? -1 : 0;
431 }
432 
test_async_copy_global_to_local2D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)433 int test_async_copy_global_to_local2D(cl_device_id deviceID, cl_context context,
434                                       cl_command_queue queue, int num_elements)
435 {
436     return test_copy2D_all_types(deviceID, context, queue,
437                                  async_global_to_local_kernel2D, true);
438 }
439 
test_async_copy_local_to_global2D(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)440 int test_async_copy_local_to_global2D(cl_device_id deviceID, cl_context context,
441                                       cl_command_queue queue, int num_elements)
442 {
443     return test_copy2D_all_types(deviceID, context, queue,
444                                  async_local_to_global_kernel2D, false);
445 }
446