1 /*
2 * Copyright (c) 2022-2023 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h"
26
27 #include "src/core/helpers/WindowHelpers.h"
28 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
29 #include "support/StringSupport.h"
30
31 namespace arm_compute
32 {
33 namespace experimental
34 {
35 namespace dynamic_fusion
36 {
namespace
{
/** Vector width requested from adjust_vec_size() when deriving the N0 build option.
 *
 * The value actually used may differ: it is whatever adjust_vec_size() returns for
 * this request and the size of dimension 0 of the source tensor.
 */
constexpr unsigned int serial_vector_size{ 8U };
} // namespace
ClTemplateLogits1DMaxShiftExpSum(ComponentId id,const ArgumentPack<ITensorInfo> & tensors,const Attributes & attributes)41 ClTemplateLogits1DMaxShiftExpSum::ClTemplateLogits1DMaxShiftExpSum(ComponentId id,
42 const ArgumentPack<ITensorInfo> &tensors,
43 const Attributes &attributes)
44 : IGpuTemplateComponentWriter{ id, tensors },
45 _src{},
46 _sum{},
47 _dst{},
48 _attributes{ attributes }
49 {
50 _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
51 _sum = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
52 _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_1);
53 ARM_COMPUTE_ERROR_ON_NULLPTR(_src);
54 ARM_COMPUTE_ERROR_ON_NULLPTR(_sum);
55 ARM_COMPUTE_ERROR_ON_NULLPTR(_dst);
56 }
57
get_name() const58 std::string ClTemplateLogits1DMaxShiftExpSum::get_name() const
59 {
60 return "logits_1d_max_shift_exp_sum";
61 }
62
get_component_code(const ComponentGroup & comp_group) const63 std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const ComponentGroup &comp_group) const
64 {
65 ARM_COMPUTE_UNUSED(comp_group);
66
67 std::string code = R"_(
68 //------------------ START KERNEL {{meta_kernel_id}} ---------------------
69 #define VEC_TYPE VEC_DATA_TYPE({{DATA_TYPE}}, N0)
70 #define SELECT_TYPE SELECT_VEC_DATA_TYPE({{DATA_TYPE}}, N0)
71 {
72 __global uchar *src_addr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + g_ind_1 * {{src}}_stride_y + g_ind_2 * {{src}}_stride_z;
73 __global uchar *dst_addr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + g_ind_1 * {{dst}}_stride_y + g_ind_2 * {{dst}}_stride_z;
74 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT({{sum}});
75 VEC_TYPE max_val_vec = (VEC_TYPE)({{MINVAL}});
76 )_";
77
78 const bool beta_defined = (_attributes.beta() != 1.f);
79
80 if(beta_defined)
81 {
82 code += R"_(
83 VEC_TYPE beta = (VEC_TYPE){{BETA}};
84 )_";
85 }
86
87 constexpr unsigned int _serial_vector_size = 8;
88 const unsigned int reduction_dim_size = _src->dimension(0);
89 const unsigned int vector_size = adjust_vec_size(_serial_vector_size, reduction_dim_size);
90 const bool non_multiple_of_n0 = ((reduction_dim_size % vector_size) != 0);
91
92 if(non_multiple_of_n0)
93 {
94 code += R"_(
95 VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr);
96 SELECT_TYPE widx = (SELECT_TYPE)PARTIAL_N0 > VEC_OFFS(SELECT_DATA_TYPE({{DATA_TYPE}}), N0);
97 max_val_vec = max(max_val_vec, select((VEC_TYPE)({{MINVAL}}), data, widx));
98 )_";
99 }
100
101 code += R"_(
102 for(uint i = PARTIAL_N0; i < {{SRC_WIDTH}}; i += N0)
103 {
104 VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(src_addr + i * sizeof({{DATA_TYPE}})));
105 max_val_vec = max(data, max_val_vec);
106 }
107
108 {{DATA_TYPE}} max_val = MAX_REDUCE(max_val_vec, N0);
109 VEC_TYPE sum1D = 0;
110 )_";
111
112 if(non_multiple_of_n0)
113 {
114 code += R"_(
115 data -= max_val;
116 )_";
117 if(beta_defined)
118 {
119 code += R"_(
120 data *= beta;
121 )_";
122 }
123
124 if(_attributes.is_log_softmax())
125 {
126 code += R"_(
127 VSTORE_PARTIAL(N0, PARTIAL_N0)
128 (data, 0, (__global {{DATA_TYPE}} *)dst_addr);
129 data = exp(data);
130 data = select(0, data, widx);
131 )_";
132 }
133 else
134 {
135 code += R"_(
136 data = exp(data);
137 data = select(0, data, widx);
138 VSTORE_PARTIAL(N0, PARTIAL_N0)
139 (data, 0, (__global {{DATA_TYPE}} *)dst_addr);
140 )_";
141 }
142
143 code += R"_(
144 sum1D += data;
145 )_";
146 }
147 code += R"_(
148 for(uint i = PARTIAL_N0; i < {{SRC_WIDTH}}; i += N0)
149 {
150 VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(src_addr + i * sizeof({{DATA_TYPE}})));
151 data -= max_val;
152 )_";
153
154 if(beta_defined)
155 {
156 code += R"_(
157 data *= beta;
158 )_";
159 }
160
161 if(_attributes.is_log_softmax())
162 {
163 code += R"_(
164 VSTORE(N0)
165 (data, 0, (__global {{DATA_TYPE}} *)(dst_addr + i * sizeof({{DATA_TYPE}})));
166 data = exp(data);
167 )_";
168 }
169 else
170 {
171 code += R"_(
172 data = exp(data);
173 VSTORE(N0)
174 (data, 0, (__global {{DATA_TYPE}} *)(dst_addr + i * sizeof({{DATA_TYPE}})));
175 )_";
176 }
177
178 code += R"_(
179 sum1D += data;
180 }
181 )_";
182
183 code += R"_(
184 *((__global {{DATA_TYPE}} *)sum.ptr) = SUM_REDUCE(sum1D, N0);
185 }
186 //------------------ END KERNEL {{meta_kernel_id}} ---------------------
187 )_";
188
189 return code;
190 }
191
declare_variables(GpuKernelVariableTable & vtable,const ComponentGroup & comp_group) const192 void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
193 {
194 vtable.declare_variable(
195 comp_group,
196 _src,
197 GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
198 "src");
199
200 vtable.declare_variable(
201 comp_group,
202 _sum,
203 GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
204 "sum");
205
206 vtable.declare_variable(
207 comp_group,
208 _dst,
209 GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
210 "dst");
211 }
212
get_tag_lut(const GpuKernelVariableTable & vtable,const ComponentGroup & comp_group) const213 TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
214 {
215 ARM_COMPUTE_UNUSED(comp_group);
216
217 TagLUT lut{};
218
219 // Arguments and global shared variables
220 lut["src"] = vtable.get_variable(_src);
221 lut["sum"] = vtable.get_variable(_sum);
222 lut["dst"] = vtable.get_variable(_dst);
223
224 // Local build options
225 lut["meta_kernel_id"] = id();
226
227 const DataType data_type = _src->data_type();
228
229 lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type);
230 lut["BETA"] = float_to_string_with_full_precision(_attributes.beta());
231 lut["MINVAL"] = (data_type == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX");
232 lut["SRC_WIDTH"] = support::cpp11::to_string(_src->dimension(0));
233
234 return lut;
235 }
236
get_build_options(const ComponentGroup & comp_group) const237 CLBuildOptions ClTemplateLogits1DMaxShiftExpSum::get_build_options(const ComponentGroup &comp_group) const
238 {
239 ARM_COMPUTE_UNUSED(comp_group);
240 CLBuildOptions build_opts{};
241
242 const unsigned int reduction_dim_size = _src->dimension(0);
243 const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size);
244
245 build_opts.add_option("-DN0=" + support::cpp11::to_string(vector_size));
246 build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string((reduction_dim_size % vector_size)));
247
248 return build_opts;
249 }
250
get_config_id() const251 std::string ClTemplateLogits1DMaxShiftExpSum::get_config_id() const
252 {
253 std::string config_id = get_name();
254
255 config_id += "_";
256 config_id += support::cpp11::to_string(_src->dimension(0));
257 config_id += "_";
258 config_id += string_from_data_type(_src->data_type());
259
260 return config_id;
261 }
262
get_headers_list() const263 std::set<std::string> ClTemplateLogits1DMaxShiftExpSum::get_headers_list() const
264 {
265 return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
266 }
267
get_window() const268 Window ClTemplateLogits1DMaxShiftExpSum::get_window() const
269 {
270 ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
271
272 Window win = calculate_max_window(*_dst, Steps(_src->dimension(0)));
273 return win.collapse(win, Window::DimZ);
274 }
275
276 } // namespace dynamic_fusion
277 } // namespace experimental
278 } // namespace arm_compute
279