/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "GpuKernelComponentGroup.h"

#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"

#include <algorithm>
// Standard containers used directly in this file.
#include <map>
#include <set>
#include <vector>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
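// A GpuKernelComponentGroup collects the kernel components that will be fused into a
// single GPU kernel. add_component() checks the fusion constraints below before
// accepting a new component; finalize() then classifies the tensors involved and
// computes which tensors can share a tile.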
bool GpuKernelComponentGroup::add_component(ComponentPtr component)
{
    ARM_COMPUTE_ERROR_ON_MSG(
        _finalized, "The component group has been finalized and cannot be altered.");

    // Note: Constraint 1 is guaranteed as a precondition of this method.
    // Constraint 2: only output components may be added once the group already holds max_fused_components components.
    if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
    {
        return false;
    }
    // Constraint 3.1: Pattern (Unfusable + Output): an unfusable root component can only be followed by output components.
    if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output)
    {
        return false;
    }
    // Constraint 3.2: only simple and output components can be fused onto an existing root component.
    if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
    {
        return false;
    }
    // Constraint 4: every fusable (non-unfusable) component must have exactly one destination tensor.
    if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
    {
        return false;
    }
    // Constraint 5: the component must share the same kernel properties as the root component.
    if(!_components.empty() && !(get_root_component()->properties() == component->properties()))
    {
        return false;
    }
    // Constraint 7: all destination tensors in the group must have the same shape
    // (every shape is compared against the root component's first destination tensor).
    if(!_components.empty())
    {
        const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
        ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
        const auto first_dst_tensor = root_dst_tensors[0];
        const auto dst_tensors      = component->tensors().get_const_dst_tensors();
        for(const auto &t : root_dst_tensors)
        {
            if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
            {
                return false;
            }
        }
        for(const auto &t : dst_tensors)
        {
            if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
            {
                return false;
            }
        }
    }
    // Constraint 8: all destination tensors in the group must have the same data layout
    // (compared against the layout of the root component's first destination tensor).
    if(!_components.empty())
    {
        const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
        ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
        const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
        const auto dst_tensors             = component->tensors().get_const_dst_tensors();
        for(const auto &t : root_dst_tensors)
        {
            if(t->data_layout() != first_dst_tensor_layout)
            {
                return false;
            }
        }
        for(const auto &t : dst_tensors)
        {
            if(t->data_layout() != first_dst_tensor_layout)
            {
                return false;
            }
        }
    }
    // Constraint 9: the number of destination tensors of the component must stay below max_dst_tensors.
    if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
    {
        return false;
    }
    // Constraint 9 corollary: output components are rejected once the group has reached
    // its total capacity of max_fused_components + max_dst_tensors components.
    if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
    {
        return false;
    }
    _components.push_back(component);
    return true;
}

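// Usage sketch (illustrative only; the component pointers are hypothetical and would
// normally come from the dynamic fusion component factory):
//
//   GpuKernelComponentGroup group;
//   if(!group.add_component(root_component))
//   {
//       // The component violates a fusion constraint: start a new group for it.
//   }
//   group.add_component(simple_component); // e.g. a Simple elementwise component
//   group.finalize();                      // freeze the group and compute the tile maps
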
void GpuKernelComponentGroup::finalize()
{
    if(_finalized)
    {
        return;
    }

    _finalized = true;

    // Bookkeeping used to decide which tensors can share a tile:
    // - output_tensors:    tensors written by a component and (so far) not read by any later component.
    // - possible_tile_map: for each tensor, the source tensors whose tiles its data could overwrite.
    // - tile_usages:       a usage count per tensor's tile, used below to decide whether
    //                      the tile can be safely overwritten.
    std::set<const ITensorInfo *> output_tensors;
    std::map<const ITensorInfo *, std::vector<const ITensorInfo *>> possible_tile_map;
    std::map<const ITensorInfo *, int32_t> tile_usages;

    for(auto component : _components)
    {
        const auto tensors     = component->tensors();
        const auto src_tensors = tensors.get_const_src_tensors();
        const auto dst_tensors = tensors.get_const_dst_tensors();

        // Detect input, output and intermediate tensors.
        for(auto tensor : src_tensors)
        {
            const auto output_tensors_it = output_tensors.find(tensor);

            if(output_tensors_it != output_tensors.end())
            {
                // This tensor is the output of another component,
                // so it must be reclassified as an intermediate tensor.
                output_tensors.erase(output_tensors_it);
                _interm_tensors.insert(tensor);
            }
            else if(_interm_tensors.find(tensor) == _interm_tensors.end())
            {
                _input_tensors.insert(tensor);

                tile_usages[tensor] = 0;
                possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
            }
        }

        for(auto tensor : dst_tensors)
        {
            // A destination tensor must not have been seen before in any role.
            ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end());
            ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end());
            ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end());
            output_tensors.insert(tensor);

            tile_usages[tensor] = 0;
            possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
        }

        // Check if the output can overwrite the input tile.
        const auto component_type = component->type();
        if(component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output)
        {
            ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1);

            const auto  dst_tensor = dst_tensors[0];
            const auto &dst_shape  = dst_tensor->tensor_shape();
            const auto &dst_type   = dst_tensor->data_type();

            tile_usages[dst_tensor] = 0;

            for(auto src_tensor : src_tensors)
            {
                const auto &src_shape = src_tensor->tensor_shape();
                const auto &src_type  = src_tensor->data_type();

                // A source tile is a reuse candidate only if it matches the destination's shape and data type.
                if(src_shape == dst_shape && src_type == dst_type)
                {
                    const auto tile_usages_it = tile_usages.find(src_tensor);
                    ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end());

                    if(component_type == GpuComponentType::Simple || tile_usages_it->second > 0)
                    {
                        // Increase the number of tile usages unless this component is an output
                        // component and the tile has not been shared with any other component.
                        // (Reason: an output component doesn't change the content of the tile.)
                        ++tile_usages_it->second;
                    }

                    possible_tile_map[dst_tensor].push_back(src_tensor);
                }
            }
        }
        else
        {
            // Outputs of complex and unfusable components need a dedicated tile.
            for(auto tensor : dst_tensors)
            {
                tile_usages[tensor] = 0;
            }
        }
    }
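
    // Illustrative walk-through (hypothetical tensors; not part of the implementation):
    // with a Simple component B reading t0 and writing t1 of identical shape and type,
    // followed by an Output component C reading t1 and writing t2, the loop above
    // records t0 as a candidate tile for t1, and t1 as a candidate for t2, while
    //   tile_usages ends as { t0 -> 1, t1 -> 0, t2 -> 0 }
    // (t1's count stays 0 because C is an output component and t1's tile was never shared).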

    // Map each tensor to the smallest possible set of tiles:
    // input tensors always provide their own tile, while an intermediate tensor reuses
    // a candidate source tile when that tile has at most one remaining consumer.
    for(auto tensor : _input_tensors)
    {
        _tile_map[tensor] = tensor;
    }

    for(auto component : _components)
    {
        const auto dst_tensors = component->tensors().get_const_dst_tensors();

        for(auto tensor : dst_tensors)
        {
            const auto target_tiles = possible_tile_map.at(tensor);
            _tile_map[tensor]       = tensor;

            for(auto target : target_tiles)
            {
                const auto num_usage = tile_usages[target];

                if(num_usage <= 1)
                {
                    // The target tile is consumed by only this component, so it can be
                    // reused for the destination tensor data.
                    _tile_map[tensor] = _tile_map.at(target);
                    break;
                }
            }
        }
    }

    for(auto tensor : output_tensors)
    {
        _tile_map[tensor] = tensor;
    }

    // All intermediate tensors that cannot be shared with any previous tensor
    // will need to be declared as tile variables.
    for(auto tensor_tile : _tile_map)
    {
        if(tensor_tile.first == tensor_tile.second &&
           _interm_tensors.find(tensor_tile.first) != _interm_tensors.end())
        {
            _tiles.push_back(tensor_tile.first);
        }
    }
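
    // Continuing the hypothetical example: the passes above produce
    //   _tile_map = { t0 -> t0, t1 -> t0, t2 -> t2 }
    // i.e. the intermediate tensor t1 reuses t0's tile (its only consumer), real outputs
    // keep their own tiles, and _tiles stays empty since no intermediate tensor required
    // a dedicated tile variable.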

    // The kernel argument list is the union of the real input and output tensors.
    std::set_union(
        _input_tensors.begin(), _input_tensors.end(),
        output_tensors.begin(), output_tensors.end(),
        std::back_inserter(_argument_tensors));
    _any_output_tensor = *output_tensors.begin();
}

std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_tiles() const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
    return _tiles;
}

const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInfo *tensor) const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");

    // Tensors without an explicit mapping provide their own tile.
    if(_tile_map.find(tensor) != _tile_map.end())
    {
        return _tile_map.at(tensor);
    }

    return tensor;
}

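// Query sketch (illustrative; `group` and `t1` are hypothetical):
//
//   const ITensorInfo *tile = group.get_tile_for_tensor(t1);
//   // After finalize(), `tile` is either `t1` itself or the tensor whose tile
//   // t1 shares, as computed by the tile-mapping passes in finalize().
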
const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
    return _any_output_tensor;
}

std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
    return _argument_tensors;
}

GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
{
    if(empty())
    {
        return nullptr;
    }
    return _components[0];
}

bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
    return _interm_tensors.find(tensor) != _interm_tensors.end();
}

bool GpuKernelComponentGroup::is_input_tensor(const ITensorInfo *tensor) const
{
    ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
    return _input_tensors.find(tensor) != _input_tensors.end();
}

size_t GpuKernelComponentGroup::size() const
{
    return _components.size();
}

bool GpuKernelComponentGroup::empty() const
{
    return _components.empty();
}

GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index)
{
    return _components[index];
}

const GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) const
{
    return _components[index];
}

typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::begin()
{
    return _components.begin();
}

typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::end()
{
    return _components.end();
}

typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::begin() const
{
    return _components.cbegin();
}

typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::end() const
{
    return _components.cend();
}

typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cbegin() const
{
    return _components.cbegin();
}

typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cend() const
{
    return _components.cend();
}

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute