#include <utility>
#include <vector>

#include <ATen/ATen.h>
#include <ATen/nnapi/nnapi_bind.h>
#include <ATen/nnapi/nnapi_wrapper.h>
#include <ATen/nnapi/nnapi_model_loader.h>
#include <c10/util/irange.h>

namespace torch {
namespace nnapi {
namespace bind {

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
nnapi_wrapper* nnapi;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
nnapi_wrapper* check_nnapi;

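// Load the NNAPI shared library exactly once (via a lambda-initialized
// function-local static) and verify that the function pointers needed by the
// RAII deleters (Model_free, Compilation_free, Execution_free) were resolved.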
static void load_platform_library() {
  static int run_once = [](){
    nnapi_wrapper_load(&nnapi, &check_nnapi);
    CAFFE_ENFORCE(nnapi);
    CAFFE_ENFORCE(nnapi->Model_free);
    CAFFE_ENFORCE(nnapi->Compilation_free);
    CAFFE_ENFORCE(nnapi->Execution_free);
    return 0;
  }();
  (void)run_once;
}

// NnapiCompilation function definitions:

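// Rough usage sketch (illustrative only; the serialized model tensor and the
// parameter buffers are normally produced ahead of time by PyTorch's
// Python-side NNAPI converter, and the tensor/variable names below are made up):
//
//   torch::nnapi::bind::NnapiCompilation compilation;
//   compilation.init(ser_model_tensor, weight_buffers);
//   compilation.run({input_tensor}, {output_tensor});
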
// Could possibly call load_platform_library in constructor, but error reporting
// can be complicated if the constructor is called during model loading.
// Instead, delay all work until the explicit init call.
void NnapiCompilation::init(
    at::Tensor serialized_model_tensor,
    std::vector<at::Tensor> parameter_buffers
) {
  init2(
      std::move(serialized_model_tensor),
      std::move(parameter_buffers),
      ANEURALNETWORKS_PREFER_SUSTAINED_SPEED,
      false);
}

void NnapiCompilation::init2(
    at::Tensor serialized_model_tensor,
    const std::vector<at::Tensor>& parameter_buffers,
    int64_t compilation_preference,
    bool relax_f32_to_f16
) {
  TORCH_CHECK(!model_, "Attempted to re-initialize NnapiCompilation.");

  load_platform_library();

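  // Collect raw pointers and byte sizes of the parameter (weight) buffers so
  // the model loader can reference their memory directly.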
  std::vector<const void*> buffers;
  buffers.reserve(parameter_buffers.size());
  std::vector<int32_t> buffer_sizes;
  buffer_sizes.reserve(parameter_buffers.size());
  for (auto& t : parameter_buffers) {
    TORCH_CHECK(t.is_contiguous());
    buffers.push_back(t.data_ptr());
    buffer_sizes.push_back(t.nbytes());
  }

  TORCH_CHECK(serialized_model_tensor.is_contiguous());
  // This is currently always int32_t, but support uint8_t for old models
  // and possible future changes to the generator.
  uint8_t* ser_model_ptr =
      serialized_model_tensor.scalar_type() == at::ScalarType::Byte
          ? serialized_model_tensor.data_ptr<uint8_t>()
          : reinterpret_cast<uint8_t*>(serialized_model_tensor.data_ptr<int32_t>());
  c10::ArrayRef<uint8_t> ser_model = {
      ser_model_ptr,
      serialized_model_tensor.nbytes()
  };
  TORCH_CHECK(!ser_model.empty());

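  // Create the NNAPI model and populate it from the serialized representation,
  // which also reports the model's input and output counts.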
  ANeuralNetworksModel* model{};
  check_nnapi->Model_create(&model);
  CAFFE_ENFORCE(model);
  model_.reset(model);

  int load_result = ::caffe2::nnapi::load_nnapi_model(
      nnapi,
      model_.get(),
      ser_model.data(),
      ser_model.size(),
      buffers.size(),
      buffers.data(),
      buffer_sizes.data(),
      0,
      nullptr,
      nullptr,
      &num_inputs_,
      &num_outputs_,
      nullptr);
  CAFFE_ENFORCE(load_result == 0);

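  // Optionally allow the driver to run float32 computation at float16
  // precision, then finish the model so it can be compiled.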
  if (relax_f32_to_f16) {
    check_nnapi->Model_relaxComputationFloat32toFloat16(model_.get(), true);
  }
  check_nnapi->Model_finish(model_.get());

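  // Compile the finished model with the requested performance preference.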
  ANeuralNetworksCompilation* compilation{};
  check_nnapi->Compilation_create(model_.get(), &compilation);
  // TODO: Make this configurable.
  check_nnapi->Compilation_setPreference(compilation, static_cast<int32_t>(compilation_preference));
  check_nnapi->Compilation_finish(compilation);
  compilation_.reset(compilation);
}

void NnapiCompilation::run(
    std::vector<at::Tensor> inputs,
    std::vector<at::Tensor> outputs) {
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  ANeuralNetworksExecution* execution;
  check_nnapi->Execution_create(compilation_.get(), &execution);
  ExecutionPtr execution_unique_ptr(execution);

  TORCH_CHECK((int32_t)inputs.size() == num_inputs_);
  TORCH_CHECK((int32_t)outputs.size() == num_outputs_);

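  // Describe each input from the actual tensor (dtype, quantization params,
  // shape) and bind its memory to the execution.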
  for (const auto i : c10::irange(inputs.size())) {
    auto& t = inputs[i];
    // TODO: Check contiguous and dtype.
    ANeuralNetworksOperandType op_type;
    std::vector<uint32_t> dim;
    get_operand_type(t, &op_type, &dim);
    check_nnapi->Execution_setInput(
        execution,
        i,
        &op_type,
        t.data_ptr(),
        t.nbytes());
  }

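  // Bind each output buffer. The operand type is left null, so the type
  // declared in the model is used for the output.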
  for (const auto i : c10::irange(outputs.size())) {
    auto& t = outputs[i];
    // TODO: Check contiguous and dtype.
    check_nnapi->Execution_setOutput(
        execution,
        i,
        nullptr,
        t.data_ptr(),
        t.nbytes());
  }

  check_nnapi->Execution_compute(execution);

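  // Output shapes can depend on the execution, so query the rank and
  // dimensions NNAPI actually produced and resize the output tensors to match.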
  // TODO: Maybe skip this for fixed-size outputs?
  for (const auto i : c10::irange(outputs.size())) {
    auto& t = outputs[i];
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    uint32_t rank;
    check_nnapi->Execution_getOutputOperandRank(execution, i, &rank);
    std::vector<uint32_t> dims(rank);
    check_nnapi->Execution_getOutputOperandDimensions(execution, i, dims.data());
    std::vector<int64_t> long_dims(dims.begin(), dims.end());
    // TODO: Maybe check that only the batch dimension is changed?
    t.resize_(long_dims);
  }
}

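// Fill an ANeuralNetworksOperandType describing tensor `t` (element type,
// quantization parameters, and shape). `dims` owns the dimension storage that
// the operand points into, so it must outlive the operand.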
void NnapiCompilation::get_operand_type(const at::Tensor& t, ANeuralNetworksOperandType* operand, std::vector<uint32_t>* dims) {
  operand->dimensionCount = t.dim();
  TORCH_CHECK(operand->dimensionCount == t.dim()); // Check for overflow.
  dims->resize(t.dim());
  operand->dimensions = dims->data();
  for (const auto i : c10::irange(dims->size())) {
    (*dims)[i] = t.sizes()[i];
    TORCH_CHECK((*dims)[i] == t.sizes()[i]); // Check for overflow.
  }
  if (t.scalar_type() == c10::kFloat) {
    operand->type = ANEURALNETWORKS_TENSOR_FLOAT32;
    operand->scale = 0;
    operand->zeroPoint = 0;
    return;
  }
  if (t.scalar_type() == c10::kQUInt8) {
    TORCH_CHECK(t.is_quantized());
    operand->type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
    // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
    operand->scale = t.q_scale();
    operand->zeroPoint = t.q_zero_point();
    return;
  }
  if (t.scalar_type() == c10::kInt) {
    operand->type = ANEURALNETWORKS_TENSOR_INT32;
    operand->scale = 0;
    operand->zeroPoint = 0;
    return;
  }
  if (t.scalar_type() == c10::kShort) {
    TORCH_WARN(
        "NNAPI qint16 inputs to model are only supported for ",
        "testing with fixed scale, zero_point. Please change your ",
        "inputs if you see this in production");
    operand->type = ANEURALNETWORKS_TENSOR_QUANT16_ASYMM;
    // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
    operand->scale = 0.125;
    operand->zeroPoint = 0;
    return;
  }

  // TODO: Support more dtypes.
  CAFFE_THROW("Bad dtype: " + std::to_string(static_cast<int8_t>(t.scalar_type())));
}

} // namespace bind
} // namespace nnapi
} // namespace torch