/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"

#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
#include "arm_compute/runtime/Scheduler.h"

#include "src/common/utils/Log.h"

namespace arm_compute
{
namespace
{
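// Dequantize a QASYMM8/QASYMM8_SIGNED/QASYMM16 tensor to F32, element by element,
// using the input's uniform quantization info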
void dequantize_tensor(const ITensor *input, ITensor *output)
{
    const UniformQuantizationInfo qinfo     = input->info()->quantization_info().uniform();
    const DataType                data_type = input->info()->data_type();

    Window window;
    window.use_tensor_dimensions(input->info()->tensor_shape());
    Iterator input_it(input, window);
    Iterator output_it(output, window);

    switch(data_type)
    {
        case DataType::QASYMM8:
            execute_window_loop(window, [&](const Coordinates &)
            {
                *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
            },
            input_it, output_it);
            break;
        case DataType::QASYMM8_SIGNED:
            execute_window_loop(window, [&](const Coordinates &)
            {
                *reinterpret_cast<float *>(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo);
            },
            input_it, output_it);
            break;
        case DataType::QASYMM16:
            execute_window_loop(window, [&](const Coordinates &)
            {
                *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
            },
            input_it, output_it);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type");
    }
}

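// Quantize an F32 tensor to QASYMM8/QASYMM8_SIGNED/QASYMM16, element by element,
// using the output's uniform quantization info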
void quantize_tensor(const ITensor *input, ITensor *output)
{
    const UniformQuantizationInfo qinfo     = output->info()->quantization_info().uniform();
    const DataType                data_type = output->info()->data_type();

    Window window;
    window.use_tensor_dimensions(input->info()->tensor_shape());
    Iterator input_it(input, window);
    Iterator output_it(output, window);

    switch(data_type)
    {
        case DataType::QASYMM8:
            execute_window_loop(window, [&](const Coordinates &)
            {
                *reinterpret_cast<uint8_t *>(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
            },
            input_it, output_it);
            break;
        case DataType::QASYMM8_SIGNED:
            execute_window_loop(window, [&](const Coordinates &)
            {
                *reinterpret_cast<int8_t *>(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
            },
            input_it, output_it);
            break;
        case DataType::QASYMM16:
            execute_window_loop(window, [&](const Coordinates &)
            {
                *reinterpret_cast<uint16_t *>(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
            },
            input_it, output_it);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type");
    }
}
} // namespace

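// Default-initialize all members; the optional memory manager services the intermediate F32 tensors used for quantized inputs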
CPPBoxWithNonMaximaSuppressionLimit::CPPBoxWithNonMaximaSuppressionLimit(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)),
      _box_with_nms_limit_kernel(),
      _scores_in(),
      _boxes_in(),
      _batch_splits_in(),
      _scores_out(),
      _boxes_out(),
      _classes(),
      _batch_splits_out(),
      _keeps(),
      _scores_in_f32(),
      _boxes_in_f32(),
      _batch_splits_in_f32(),
      _scores_out_f32(),
      _boxes_out_f32(),
      _classes_f32(),
      _batch_splits_out_f32(),
      _keeps_f32(),
      _is_qasymm8(false)
{
}

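// Configure the function. For quantized inputs (QASYMM8/QASYMM8_SIGNED) the kernel operates on
// intermediate F32 tensors; run() dequantizes the inputs into them and quantizes the results back.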
void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in,
                                                    ITensor *scores_out, ITensor *boxes_out, ITensor *classes, ITensor *batch_splits_out,
                                                    ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
    ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);

    _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED;

    _scores_in        = scores_in;
    _boxes_in         = boxes_in;
    _batch_splits_in  = batch_splits_in;
    _scores_out       = scores_out;
    _boxes_out        = boxes_out;
    _classes          = classes;
    _batch_splits_out = batch_splits_out;
    _keeps            = keeps;

    if(_is_qasymm8)
    {
        // Manage intermediate buffers
        _memory_group.manage(&_scores_in_f32);
        _memory_group.manage(&_boxes_in_f32);
        _memory_group.manage(&_scores_out_f32);
        _memory_group.manage(&_boxes_out_f32);
        _memory_group.manage(&_classes_f32);
        _scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32));
        _boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32));
        if(batch_splits_in != nullptr)
        {
            _memory_group.manage(&_batch_splits_in_f32);
            _batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32));
        }
        _scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32));
        _boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32));
        _classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32));
        if(batch_splits_out != nullptr)
        {
            _memory_group.manage(&_batch_splits_out_f32);
            _batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32));
        }
        if(keeps != nullptr)
        {
            _memory_group.manage(&_keeps_f32);
            _keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32));
        }

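        // Configure the kernel on the F32 intermediates, passing nullptr for any optional tensor not provided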
        _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr,
                                             &_scores_out_f32, &_boxes_out_f32, &_classes_f32,
                                             (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? &_keeps_f32 : nullptr,
                                             keeps_size, info);
    }
    else
    {
        _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
    }

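    // Allocate the intermediate F32 tensors once the kernel has been configured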
    if(_is_qasymm8)
    {
        _scores_in_f32.allocator()->allocate();
        _boxes_in_f32.allocator()->allocate();
        if(_batch_splits_in != nullptr)
        {
            _batch_splits_in_f32.allocator()->allocate();
        }
        _scores_out_f32.allocator()->allocate();
        _boxes_out_f32.allocator()->allocate();
        _classes_f32.allocator()->allocate();
        if(batch_splits_out != nullptr)
        {
            _batch_splits_out_f32.allocator()->allocate();
        }
        if(keeps != nullptr)
        {
            _keeps_f32.allocator()->allocate();
        }
    }
}

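// Static validation of the tensor info; quantized configurations require QASYMM16 boxes with scale 0.125 and zero offset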
Status CPPBoxWithNonMaximaSuppressionLimit::validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out,
                                                     const ITensorInfo *boxes_out, const ITensorInfo *classes,
                                                     const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info)
{
    ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);

    const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED;
    if(is_qasymm8)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(boxes_in, boxes_out);
        const UniformQuantizationInfo boxes_qinfo = boxes_in->quantization_info().uniform();
        ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f);
        ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.offset != 0);
    }

    return Status{};
}

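// Run the function: dequantize quantized inputs, execute the NMS-limit kernel, then quantize the outputs back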
void CPPBoxWithNonMaximaSuppressionLimit::run()
{
    // Acquire all the temporaries
    MemoryGroupResourceScope scope_mg(_memory_group);

    if(_is_qasymm8)
    {
        dequantize_tensor(_scores_in, &_scores_in_f32);
        dequantize_tensor(_boxes_in, &_boxes_in_f32);
        if(_batch_splits_in != nullptr)
        {
            dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32);
        }
    }

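    // Run the kernel, splitting the work along the Y dimension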
    Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY);

    if(_is_qasymm8)
    {
        quantize_tensor(&_scores_out_f32, _scores_out);
        quantize_tensor(&_boxes_out_f32, _boxes_out);
        quantize_tensor(&_classes_f32, _classes);
        if(_batch_splits_out != nullptr)
        {
            quantize_tensor(&_batch_splits_out_f32, _batch_splits_out);
        }
        if(_keeps != nullptr)
        {
            quantize_tensor(&_keeps_f32, _keeps);
        }
    }
}
} // namespace arm_compute