/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "GpuKernelComponentGroup.h"

#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"

#include <algorithm>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
bool GpuKernelComponentGroup::add_component(ComponentPtr component)
{
    ARM_COMPUTE_ERROR_ON_MSG(
        _finalized, "The component group has been finalized and cannot be altered.");

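    // The checks below enforce the group's fusion constraints: a cap on the number of fused
    // components, the allowed ordering of component types (only Simple and Output components
    // can follow the root, and an Unfusable root can only be followed by an Output component),
    // a single destination tensor for any component that is not Unfusable, kernel properties
    // matching the root, matching destination tensor shapes and data layouts across the group,
    // and a limit on the number of destination tensors.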
    // note: Constraint 1 is guaranteed as a precondition
    // Constraint 2
    if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
    {
        return false;
    }
    // Constraint 3.1: Pattern: (Unfusable + Output)
    if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output)
    {
        return false;
    }
    // Constraint 3.2
    if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
    {
        return false;
    }
    // Constraint 4
    if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
    {
        return false;
    }
    // Constraint 5
    if(!_components.empty() && !(get_root_component()->properties() == component->properties()))
    {
        return false;
    }
    // Constraint 7
    if(!_components.empty())
    {
        const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
        ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
        const auto first_dst_tensor = root_dst_tensors[0];
        const auto dst_tensors = component->tensors().get_const_dst_tensors();
        for(const auto &t : root_dst_tensors)
        {
            if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
            {
                return false;
            }
        }
        for(const auto &t : dst_tensors)
        {
            if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
            {
                return false;
            }
        }
    }
    // Constraint 8
    if(!_components.empty())
    {
        const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
        ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
        const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
        const auto dst_tensors = component->tensors().get_const_dst_tensors();
        for(const auto &t : root_dst_tensors)
        {
            if(t->data_layout() != first_dst_tensor_layout)
            {
                return false;
            }
        }
        for(const auto &t : dst_tensors)
        {
            if(t->data_layout() != first_dst_tensor_layout)
            {
                return false;
            }
        }
    }
    // Constraint 9
    if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
    {
        return false;
    }
    // Constraint 9 corollary
    if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
    {
        return false;
    }
    _components.push_back(component);
    return true;
}

void GpuKernelComponentGroup::finalize()
{
    if(_finalized)
    {
        return;
    }

    _finalized = true;

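    // output_tensors: tensors written by a component and not yet consumed by a later one;
    // whatever remains after the pass below is an output of the whole group.
    // possible_tile_map: for each tensor, the source tensors whose tiles it could reuse.
    // tile_usages: how many components rely on a tensor's tile content; a tile with at most
    // one usage can be overwritten by its consumer's destination data (see the reuse pass below).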
    std::set<const ITensorInfo *> output_tensors;
    std::map<const ITensorInfo *, std::vector<const ITensorInfo *>> possible_tile_map;
    std::map<const ITensorInfo *, int32_t> tile_usages;

    for(auto component : _components)
    {
        const auto tensors = component->tensors();
        const auto src_tensors = tensors.get_const_src_tensors();
        const auto dst_tensors = tensors.get_const_dst_tensors();

        // Detect input, output and intermediate tensors.
        for(auto tensor : src_tensors)
        {
            const auto output_tensors_it = output_tensors.find(tensor);

            if(output_tensors_it != output_tensors.end())
            {
                // This tensor is the output of another operator.
                // It must be marked as an intermediate tensor.
                output_tensors.erase(output_tensors_it);
                _interm_tensors.insert(tensor);
            }
            else if(_interm_tensors.find(tensor) == _interm_tensors.end())
            {
                _input_tensors.insert(tensor);

                tile_usages[tensor] = 0;
                possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
            }
        }

        for(auto tensor : dst_tensors)
        {
            ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end());
            ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end());
            ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end());
            output_tensors.insert(tensor);

            tile_usages[tensor] = 0;
            possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
        }

        // Check if the output can overwrite the input tile.
        const auto component_type = component->type();
        if(component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output)
        {
            ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1);

            const auto dst_tensor = dst_tensors[0];
            const auto &dst_shape = dst_tensor->tensor_shape();
            const auto &dst_type = dst_tensor->data_type();

            tile_usages[dst_tensor] = 0;

            for(auto src_tensor : src_tensors)
            {
                const auto &src_shape = src_tensor->tensor_shape();
                const auto &src_type = src_tensor->data_type();

                if(src_shape == dst_shape && src_type == dst_type)
                {
                    const auto tile_usages_it = tile_usages.find(src_tensor);
                    ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end());

                    if(component_type == GpuComponentType::Simple || tile_usages_it->second > 0)
                    {
                        // Increase the number of tile usages unless this component is an output
                        // and the tile has not been shared with any component.
                        // (Reason: the output component doesn't change the content of the tile.)
                        ++tile_usages_it->second;
                    }

                    possible_tile_map[dst_tensor].push_back(src_tensor);
                }
            }
        }
        else
        {
            // Outputs of complex and unfusable components need a dedicated tile.
            for(auto tensor : dst_tensors)
            {
                tile_usages[tensor] = 0;
            }
        }
    }

    // Find the smallest list of tiles that the intermediate tensors need to write to.
    for(auto tensor : _input_tensors)
    {
        _tile_map[tensor] = tensor;
    }

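    // Illustrative note: a Simple elementwise component whose destination matches the shape
    // and data type of one of its sources can write its result into that source's tile,
    // provided no other component still consumes the original tile content; in that case the
    // destination is mapped onto the source's tile and no extra tile variable is declared.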
    for(auto component : _components)
    {
        const auto dst_tensors = component->tensors().get_const_dst_tensors();

        for(auto tensor : dst_tensors)
        {
            const auto target_tiles = possible_tile_map.at(tensor);
            _tile_map[tensor] = tensor;

            for(auto target : target_tiles)
            {
                const auto num_usage = tile_usages[target];

                if(num_usage <= 1)
                {
                    // The target tile is consumed only by this operator, so we can reuse it
                    // for the destination tensor data.
                    _tile_map[tensor] = _tile_map.at(target);
                    break;
                }
            }
        }
    }

    for(auto tensor : output_tensors)
    {
        _tile_map[tensor] = tensor;
    }

    // All intermediate tensors that cannot be shared with any previous tensor
    // will need to be declared as tile variables.
    for(auto tensor_tile : _tile_map)
    {
        if(tensor_tile.first == tensor_tile.second &&
           _interm_tensors.find(tensor_tile.first) != _interm_tensors.end())
        {
            _tiles.push_back(tensor_tile.first);
        }
    }

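    // The kernel argument list is the union of the group's input tensors and the tensors
    // still marked as outputs after the passes above.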
    std::set_union(
        _input_tensors.begin(), _input_tensors.end(),
        output_tensors.begin(), output_tensors.end(),
        std::back_inserter(_argument_tensors));
    _any_output_tensor = *output_tensors.begin();
}

std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_tiles() const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
    return _tiles;
}

const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInfo *tensor) const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");

    if(_tile_map.find(tensor) != _tile_map.end())
    {
        return _tile_map.at(tensor);
    }

    return tensor;
}

const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
    return _any_output_tensor;
}

std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
    return _argument_tensors;
}

GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
{
    if(empty())
    {
        return nullptr;
    }
    return _components[0];
}

bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
    return _interm_tensors.find(tensor) != _interm_tensors.end();
}

bool GpuKernelComponentGroup::is_input_tensor(const ITensorInfo *tensor) const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
    return _input_tensors.find(tensor) != _input_tensors.end();
}

size_t GpuKernelComponentGroup::size() const
{
    return _components.size();
}
bool GpuKernelComponentGroup::empty() const
{
    return _components.empty();
}
GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index)
{
    return _components[index];
}
const GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) const
{
    return _components[index];
}
typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::begin()
{
    return _components.begin();
}
typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::end()
{
    return _components.end();
}
typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::begin() const
{
    return _components.cbegin();
}
typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::end() const
{
    return _components.cend();
}
typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cbegin() const
{
    return _components.cbegin();
}
typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cend() const
{
    return _components.cend();
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute