/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ClKernelRuntime.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "src/core/CL/CLUtils.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
#include "src/gpu/cl/ClKernelLibrary.h"

#include "support/Cast.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
using namespace arm_compute::opencl;

void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKernelSourceCode &code)
{
    // Create the kernel from the kernel source string
    opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
    _kernel                       = static_cast<cl::Kernel>(compile_ctx.create_kernel(code.name(),
                                                                                      code.name(), // The program name has to be provided to differentiate between the kernels of different unfusable components
                                                                                      code.code(),
                                                                                      klib.kernel_path() /* Kernel path: used in cases of embedded kernels */,
                                                                                      code.build_options().options(),
                                                                                      false /* Is source binary */));

    // Configure the execution window
    IClKernel::configure_internal(code.window());

    // Set the config id for LWS (local work-group size) tuning
    _config_id = code.config_id();

    // Store the kernel arguments
    _arguments = code.arguments();
}

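/* Illustrative usage sketch for ClKernelRuntime::configure() above, not part of the
 * runtime itself. It assumes a GpuKernelSourceCode 'code' produced by the dynamic
 * fusion sketch/compilation pipeline; retrieving the compile context from
 * CLKernelLibrary is just one possible way to obtain one.
 *
 *     ClKernelRuntime runtime{};
 *     CLCompileContext &compile_ctx = CLKernelLibrary::get().get_compile_context();
 *     runtime.configure(compile_ctx, code); // Builds the cl::Kernel and stores window, config id and arguments
 */
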
inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images)
{
    switch(arg.type)
    {
        case GpuKernelArgumentInfo::Type::Scalar:
        {
            ARM_COMPUTE_ERROR("Unsupported yet");
            break;
        }

        case GpuKernelArgumentInfo::Type::Vector:
        {
            add_1D_tensor_argument(idx, tensor, arg_slice);
            break;
        }

        case GpuKernelArgumentInfo::Type::Image:
        {
            add_2D_tensor_argument(idx, tensor, arg_slice);
            break;
        }
        case GpuKernelArgumentInfo::Type::Image_Reinterpret_As_3D:
        {
            add_2D_tensor_argument(idx, tensor, arg_slice);
            // Pass the total cross-plane padding so the kernel can reinterpret the 2D image as 3D
            const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom;
            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
            break;
        }
        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
        {
            // Export the buffer to an OpenCL image: four elements are packed per texel, so the image width is dimension(0) / 4
            const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
            const size_t      image_row_pitch = tensor->info()->strides_in_bytes()[1];
            cl::Image2D       tensor_image2d  = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
            cl_images.push_back(tensor_image2d);
            _kernel.setArg(idx++, tensor_image2d);
            break;
        }

        case GpuKernelArgumentInfo::Type::Image_3D:
        {
            add_2D_tensor_argument(idx, tensor, arg_slice);
            // Additionally pass the Z-dimension stride in bytes
            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
            break;
        }
        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
        {
            const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
            const size_t      image_row_pitch = tensor->info()->strides_in_bytes()[1];
            cl::Image2D       tensor_image2d  = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
            cl_images.push_back(tensor_image2d);
            _kernel.setArg(idx++, tensor_image2d);
            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
            break;
        }

        case GpuKernelArgumentInfo::Type::Tensor_3D:
        {
            add_3D_tensor_argument(idx, tensor, arg_slice);
            break;
        }

        case GpuKernelArgumentInfo::Type::Tensor_4D:
        {
            add_4D_tensor_argument(idx, tensor, arg_slice);
            break;
        }
        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
        {
            add_4d_tensor_nhwc_argument(idx, tensor);
            break;
        }
        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
        {
            const size_t image_w        = tensor->info()->dimension(0) / 4;
            const size_t image_h        = tensor->info()->tensor_shape().total_size_upper(1);
            const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];

            cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
                                                                    TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly);
            cl_images.push_back(tensor_image2d);

            _kernel.setArg(idx++, tensor_image2d);
            add_4d_tensor_nhwc_argument(idx, tensor);
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("Unsupported");
        }
    }
}

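/* Illustrative sketch of the buffer-to-image2d mapping used by the
 * *_Export_To_ClImage2D cases above (not called anywhere): an OpenCL image texel
 * packs four tensor elements along dimension 0, so the image width is
 * dimension(0) / 4, all higher dimensions are collapsed into the image height,
 * and the row pitch must equal the tensor's dimension-1 stride in bytes.
 *
 *     const ITensorInfo *info    = tensor->info();
 *     const size_t       image_w = info->dimension(0) / 4;                   // 4 elements per texel
 *     const size_t       image_h = info->tensor_shape().total_size_upper(1); // Collapse dims >= 1
 *     const size_t       pitch   = info->strides_in_bytes()[1];              // Bytes per image row
 */
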
void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

    Window slice = window.first_slice_window_3D();
    // Don't slice the matrix along the z dimension if it has just 2 dimensions while matrix A has more than 2.
    // This scenario can occur when matrix multiplication is used to perform a convolution operation.
    Window slice_fixed_z = slice;
    slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1));
    slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1));

    /// NOTE: These parameters were extracted from the old kernels. So far they appear to be constant,
    /// but they may need to become another configuration passed in from GpuWorkloadSourceCode in the future.
    constexpr bool slide_along_dimz     = true;
    constexpr bool skip_sliding_window  = false;
    constexpr bool use_dummy_work_items = false;

    do
    {
        // Set the kernel arguments for the current slice, starting from argument index 0
        unsigned int idx       = 0;
        Window       arg_slice = slice;
        // CL images created from the tensor arguments; they must be retained until the kernel is enqueued
        std::vector<cl::Image2D> cl_images;
        for(auto id_arg : _arguments)
        {
            const auto arg    = id_arg.second;
            auto       tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(id_arg.first));
            ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
            ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
            if(!slide_along_dimz)
            {
                // The stride_z for the matrix must be zero if we do not slice
                ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0);
                arg_slice = slice_fixed_z;
            }
            add_tensor_argument(idx, *arg.kernel_argument_info(), tensor, arg_slice, cl_images);
        }

        // Dispatch the kernel
        enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
    }
    while(!skip_sliding_window && window.slide_window_slice_3D(slice)); // Advance to the next 3D slice unless sliding is skipped
}

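/* Illustrative sketch of the 3D window slicing pattern used by run_op() above,
 * assuming an already-configured Window 'window': each iteration sets the
 * per-slice arguments and dispatches one enqueue until the window is exhausted.
 *
 *     Window slice = window.first_slice_window_3D();
 *     do
 *     {
 *         // ... set per-slice kernel arguments, then enqueue the kernel ...
 *     }
 *     while(window.slide_window_slice_3D(slice));
 */
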
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute