xref: /aosp_15_r20/external/ComputeLibrary/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp (revision c217d954acce2dbc11938adb493fc0abd69584f3)
1 /*
2  * Copyright (c) 2019-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h"
25 
26 #include "arm_compute/core/Error.h"
27 #include "arm_compute/core/Helpers.h"
28 #include "arm_compute/core/Validate.h"
29 #include "src/core/helpers/AutoConfiguration.h"
30 
31 #include "src/common/utils/Log.h"
32 
33 #include <cstddef>
34 #include <ios>
35 #include <list>
36 
37 namespace arm_compute
38 {
39 namespace
40 {
validate_arguments(const ITensorInfo * input_box_encoding,const ITensorInfo * input_class_score,const ITensorInfo * input_anchors,ITensorInfo * output_boxes,ITensorInfo * output_classes,ITensorInfo * output_scores,ITensorInfo * num_detection,DetectionPostProcessLayerInfo info,const unsigned int kBatchSize,const unsigned int kNumCoordBox)41 Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
42                           ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection,
43                           DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox)
44 {
45     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors);
46     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
47     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors);
48     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize].");
49     if(input_box_encoding->num_dimensions() > 2)
50     {
51         ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize);
52     }
53     ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox);
54     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_class_score->dimension(0) != (info.num_classes() + 1),
55                                     "The first dimension of the input class_prediction should be equal to the number of classes plus one.");
56 
57     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, "The anchors input tensor shape should be [4, N, kBatchSize].");
58     if(input_anchors->num_dimensions() > 2)
59     {
60         ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox);
61     }
62     ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1))
63                                     || (input_box_encoding->dimension(1) != input_anchors->dimension(1)),
64                                     "The second dimension of the inputs should be the same.");
65     ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, "The num_detection output tensor shape should be [M].");
66     ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), "The intersection over union should be positive and less than 1.");
67     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, "The number of max classes per detection should be positive.");
68 
69     const unsigned int num_detected_boxes = info.max_detections() * info.max_classes_per_detection();
70 
71     // Validate configured outputs
72     if(output_boxes->total_size() != 0)
73     {
74         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), TensorShape(4U, num_detected_boxes, 1U));
75         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_boxes, 1, DataType::F32);
76     }
77     if(output_classes->total_size() != 0)
78     {
79         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), TensorShape(num_detected_boxes, 1U));
80         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_classes, 1, DataType::F32);
81     }
82     if(output_scores->total_size() != 0)
83     {
84         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), TensorShape(num_detected_boxes, 1U));
85         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_scores, 1, DataType::F32);
86     }
87     if(num_detection->total_size() != 0)
88     {
89         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(num_detection->tensor_shape(), TensorShape(1U));
90         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_detection, 1, DataType::F32);
91     }
92 
93     return Status{};
94 }
95 
DecodeBoxCorner(BBox & box_centersize,BBox & anchor,Iterator & decoded_it,DetectionPostProcessLayerInfo info)96 inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
97 {
98     const float half_factor = 0.5f;
99 
100     // BBox is equavalent to CenterSizeEncoding [y,x,h,w]
101     const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0];
102     const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1];
103     const float half_h   = half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
104     const float half_w   = half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
105 
106     // Box Corner encoding boxes are saved as [xmin, ymin, xmax, ymax]
107     auto decoded_ptr   = reinterpret_cast<float *>(decoded_it.ptr());
108     *(decoded_ptr)     = x_center - half_w; // xmin
109     *(1 + decoded_ptr) = y_center - half_h; // ymin
110     *(2 + decoded_ptr) = x_center + half_w; // xmax
111     *(3 + decoded_ptr) = y_center + half_h; // ymax
112 }
113 
114 /** Decode a bbox according to a anchors and scale info.
115  *
116  * @param[in]  input_box_encoding The input prior bounding boxes.
117  * @param[in]  input_anchors      The corresponding input variance.
118  * @param[in]  info               The detection informations
119  * @param[out] decoded_boxes      The decoded bboxes.
120  */
DecodeCenterSizeBoxes(const ITensor * input_box_encoding,const ITensor * input_anchors,DetectionPostProcessLayerInfo info,Tensor * decoded_boxes)121 void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *input_anchors, DetectionPostProcessLayerInfo info, Tensor *decoded_boxes)
122 {
123     const QuantizationInfo &qi_box     = input_box_encoding->info()->quantization_info();
124     const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info();
125     BBox                    box_centersize{ {} };
126     BBox                    anchor{ {} };
127 
128     Window win;
129     win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape());
130     win.set_dimension_step(0U, 4U);
131     win.set_dimension_step(1U, 1U);
132     Iterator box_it(input_box_encoding, win);
133     Iterator anchor_it(input_anchors, win);
134     Iterator decoded_it(decoded_boxes, win);
135 
136     if(input_box_encoding->info()->data_type() == DataType::QASYMM8)
137     {
138         execute_window_loop(win, [&](const Coordinates &)
139         {
140             const auto box_ptr    = reinterpret_cast<const qasymm8_t *>(box_it.ptr());
141             const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr());
142             box_centersize        = BBox({ dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box),
143                                            dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)
144                                          });
145             anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors),
146                             dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)
147                           });
148             DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
149         },
150         box_it, anchor_it, decoded_it);
151     }
152     else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
153     {
154         execute_window_loop(win, [&](const Coordinates &)
155         {
156             const auto box_ptr    = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr());
157             const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr());
158             box_centersize        = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box),
159                                            dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)
160                                          });
161             anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors),
162                             dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)
163                           });
164             DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
165         },
166         box_it, anchor_it, decoded_it);
167     }
168     else
169     {
170         execute_window_loop(win, [&](const Coordinates &)
171         {
172             const auto box_ptr    = reinterpret_cast<const float *>(box_it.ptr());
173             const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr());
174             box_centersize        = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) });
175             anchor                = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) });
176             DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
177         },
178         box_it, anchor_it, decoded_it);
179     }
180 }
181 
SaveOutputs(const Tensor * decoded_boxes,const std::vector<int> & result_idx_boxes_after_nms,const std::vector<float> & result_scores_after_nms,const std::vector<int> & result_classes_after_nms,std::vector<unsigned int> & sorted_indices,const unsigned int num_output,const unsigned int max_detections,ITensor * output_boxes,ITensor * output_classes,ITensor * output_scores,ITensor * num_detection)182 void SaveOutputs(const Tensor *decoded_boxes, const std::vector<int> &result_idx_boxes_after_nms, const std::vector<float> &result_scores_after_nms, const std::vector<int> &result_classes_after_nms,
183                  std::vector<unsigned int> &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores,
184                  ITensor *num_detection)
185 {
186     // xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax
187     unsigned int i = 0;
188     for(; i < num_output; ++i)
189     {
190         const unsigned int box_in_idx = result_idx_boxes_after_nms[sorted_indices[i]];
191         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx))));
192         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx))));
193         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx))));
194         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx))));
195         *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = static_cast<float>(result_classes_after_nms[sorted_indices[i]]);
196         *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i))))  = result_scores_after_nms[sorted_indices[i]];
197     }
198     for(; i < max_detections; ++i)
199     {
200         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = 0.0f;
201         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = 0.0f;
202         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = 0.0f;
203         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = 0.0f;
204         *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f;
205         *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i))))  = 0.0f;
206     }
207     *(reinterpret_cast<float *>(num_detection->ptr_to_element(Coordinates(0)))) = num_output;
208 }
209 } // namespace
210 
CPPDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)211 CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
212     : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr),
213       _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _dequantize_scores(false), _decoded_boxes(), _decoded_scores(),
214       _selected_indices(), _class_scores(), _input_scores_to_use(nullptr)
215 {
216 }
217 
configure(const ITensor * input_box_encoding,const ITensor * input_scores,const ITensor * input_anchors,ITensor * output_boxes,ITensor * output_classes,ITensor * output_scores,ITensor * num_detection,DetectionPostProcessLayerInfo info)218 void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores,
219                                              const ITensor *input_anchors, ITensor *output_boxes, ITensor *output_classes,
220                                              ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
221 {
222     ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
223     ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores,
224                            num_detection, info);
225 
226     _num_max_detected_boxes = info.max_detections() * info.max_classes_per_detection();
227 
228     auto_init_if_empty(*output_boxes->info(), TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
229     auto_init_if_empty(*output_classes->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
230     auto_init_if_empty(*output_scores->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
231     auto_init_if_empty(*num_detection->info(), TensorInfo(TensorShape(1U), 1, DataType::F32));
232 
233     // Perform validation step
234     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), output_scores->info(),
235                                                   num_detection->info(),
236                                                   info, _kBatchSize, _kNumCoordBox));
237 
238     _input_box_encoding          = input_box_encoding;
239     _input_scores                = input_scores;
240     _input_anchors               = input_anchors;
241     _output_boxes                = output_boxes;
242     _output_classes              = output_classes;
243     _output_scores               = output_scores;
244     _num_detection               = num_detection;
245     _info                        = info;
246     _num_boxes                   = input_box_encoding->info()->dimension(1);
247     _num_classes_with_background = _input_scores->info()->dimension(0);
248     _dequantize_scores           = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type()));
249 
250     auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32));
251     auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32));
252     auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32));
253     const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes());
254     auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32));
255 
256     _input_scores_to_use = _dequantize_scores ? &_decoded_scores : _input_scores;
257 
258     // Manage intermediate buffers
259     _memory_group.manage(&_decoded_boxes);
260     _memory_group.manage(&_decoded_scores);
261     _memory_group.manage(&_selected_indices);
262     _memory_group.manage(&_class_scores);
263     _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), info.nms_score_threshold(), info.iou_threshold());
264 
265     // Allocate and reserve intermediate tensors and vectors
266     _decoded_boxes.allocator()->allocate();
267     _decoded_scores.allocator()->allocate();
268     _selected_indices.allocator()->allocate();
269     _class_scores.allocator()->allocate();
270 }
271 
validate(const ITensorInfo * input_box_encoding,const ITensorInfo * input_class_score,const ITensorInfo * input_anchors,ITensorInfo * output_boxes,ITensorInfo * output_classes,ITensorInfo * output_scores,ITensorInfo * num_detection,DetectionPostProcessLayerInfo info)272 Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
273                                               ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
274 {
275     constexpr unsigned int kBatchSize             = 1;
276     constexpr unsigned int kNumCoordBox           = 4;
277     const TensorInfo       _decoded_boxes_info    = TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32);
278     const TensorInfo       _decoded_scores_info   = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32);
279     const TensorInfo       _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32);
280 
281     ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, &_selected_indices_info, info.max_detections(), info.nms_score_threshold(),
282                                                                    info.iou_threshold()));
283     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, output_classes, output_scores, num_detection, info, kBatchSize, kNumCoordBox));
284 
285     return Status{};
286 }
287 
run()288 void CPPDetectionPostProcessLayer::run()
289 {
290     const unsigned int num_classes    = _info.num_classes();
291     const unsigned int max_detections = _info.max_detections();
292 
293     DecodeCenterSizeBoxes(_input_box_encoding, _input_anchors, _info, &_decoded_boxes);
294 
295     // Decode scores if necessary
296     if(_dequantize_scores)
297     {
298         if(_input_box_encoding->info()->data_type() == DataType::QASYMM8)
299         {
300             for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
301             {
302                 for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
303                 {
304                     *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
305                         dequantize_qasymm8(*(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
306                 }
307             }
308         }
309         else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
310         {
311             for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
312             {
313                 for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
314                 {
315                     *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
316                         dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
317                 }
318             }
319         }
320     }
321 
322     // Regular NMS
323     if(_info.use_regular_nms())
324     {
325         std::vector<int>          result_idx_boxes_after_nms;
326         std::vector<int>          result_classes_after_nms;
327         std::vector<float>        result_scores_after_nms;
328         std::vector<unsigned int> sorted_indices;
329 
330         for(unsigned int c = 0; c < num_classes; ++c)
331         {
332             // For each boxes get scores of the boxes for the class c
333             for(unsigned int i = 0; i < _num_boxes; ++i)
334             {
335                 *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(i)))) =
336                     *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1
337             }
338 
339             // Run Non-maxima Suppression
340             _nms.run();
341 
342             for(unsigned int i = 0; i < _info.detection_per_class(); ++i)
343             {
344                 const auto selected_index = *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i))));
345                 if(selected_index == -1)
346                 {
347                     // Nms will return -1 for all the last M-elements not valid
348                     break;
349                 }
350                 result_idx_boxes_after_nms.emplace_back(selected_index);
351                 result_scores_after_nms.emplace_back((reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]);
352                 result_classes_after_nms.emplace_back(c);
353             }
354         }
355 
356         // We select the max detection numbers of the highest score of all classes
357         const auto num_selected = result_scores_after_nms.size();
358         const auto num_output   = std::min<unsigned int>(max_detections, num_selected);
359 
360         // Sort selected indices based on result scores
361         sorted_indices.resize(num_selected);
362         std::iota(sorted_indices.begin(), sorted_indices.end(), 0);
363         std::partial_sort(sorted_indices.data(),
364                           sorted_indices.data() + num_output,
365                           sorted_indices.data() + num_selected,
366                           [&](unsigned int first, unsigned int second)
367         {
368 
369             return result_scores_after_nms[first] > result_scores_after_nms[second];
370         });
371 
372         SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices,
373                     num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
374     }
375     // Fast NMS
376     else
377     {
378         const unsigned int num_classes_per_box = std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes());
379         std::vector<float> max_scores;
380         std::vector<int>   box_indices;
381         std::vector<int>   max_score_classes;
382 
383         for(unsigned int b = 0; b < _num_boxes; ++b)
384         {
385             std::vector<float> box_scores;
386             for(unsigned int c = 0; c < num_classes; ++c)
387             {
388                 box_scores.emplace_back(*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b)))));
389             }
390 
391             std::vector<unsigned int> max_score_indices;
392             max_score_indices.resize(_info.num_classes());
393             std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0);
394             std::partial_sort(max_score_indices.data(),
395                               max_score_indices.data() + num_classes_per_box,
396                               max_score_indices.data() + num_classes,
397                               [&](unsigned int first, unsigned int second)
398             {
399                 return box_scores[first] > box_scores[second];
400             });
401 
402             for(unsigned int i = 0; i < num_classes_per_box; ++i)
403             {
404                 const float score_to_add                                                                             = box_scores[max_score_indices[i]];
405                 *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add;
406                 max_scores.emplace_back(score_to_add);
407                 box_indices.emplace_back(b);
408                 max_score_classes.emplace_back(max_score_indices[i]);
409             }
410         }
411 
412         // Run Non-maxima Suppression
413         _nms.run();
414         std::vector<unsigned int> selected_indices;
415         for(unsigned int i = 0; i < max_detections; ++i)
416         {
417             // NMS returns M valid indices, the not valid tail is filled with -1
418             if(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1)
419             {
420                 // Nms will return -1 for all the last M-elements not valid
421                 break;
422             }
423             selected_indices.emplace_back(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))));
424         }
425         // We select the max detection numbers of the highest score of all classes
426         const auto num_output = std::min<unsigned int>(_info.max_detections(), selected_indices.size());
427 
428         SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices,
429                     num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
430     }
431 }
432 } // namespace arm_compute
433