// aten/src/ATen/nnapi/nnapi_bind.cpp
#include <utility>
#include <vector>

#include <ATen/ATen.h>
#include <ATen/nnapi/nnapi_bind.h>
#include <ATen/nnapi/nnapi_wrapper.h>
#include <ATen/nnapi/nnapi_model_loader.h>
#include <c10/util/irange.h>

namespace torch {
namespace nnapi {
namespace bind {

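// Entry points into the platform NNAPI library, filled in lazily by
// load_platform_library() below. `nnapi` is the raw wrapper table;
// `check_nnapi` is the error-checked variant (its calls are expected to
// enforce successful NNAPI result codes).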
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
nnapi_wrapper* nnapi;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
nnapi_wrapper* check_nnapi;

static void load_platform_library() {
  static int run_once = [](){
    nnapi_wrapper_load(&nnapi, &check_nnapi);
    CAFFE_ENFORCE(nnapi);
    CAFFE_ENFORCE(nnapi->Model_free);
    CAFFE_ENFORCE(nnapi->Compilation_free);
    CAFFE_ENFORCE(nnapi->Execution_free);
    return 0;
  }();
  (void)run_once;
}
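// The initializer of the function-local static above runs exactly once, even
// under concurrent callers (C++11 "magic statics"), so the wrapper is loaded
// and validated a single time. An equivalent formulation (sketch only, not
// used here; it would require <mutex>) is:
//
//   static std::once_flag nnapi_once;
//   std::call_once(nnapi_once, [] {
//     nnapi_wrapper_load(&nnapi, &check_nnapi);
//   });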

// NnapiCompilation function definitions:

// Could possibly call load_platform_library in constructor, but error reporting
// can be complicated if the constructor is called during model loading.
// Instead, delay all work until the explicit init call.
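// init() is the original two-argument entry point; it simply forwards to
// init2() with a sustained-speed preference and no fp32 -> fp16 relaxation.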
void NnapiCompilation::init(
    at::Tensor serialized_model_tensor,
    std::vector<at::Tensor> parameter_buffers
) {
  init2(
    std::move(serialized_model_tensor),
    std::move(parameter_buffers),
    ANEURALNETWORKS_PREFER_SUSTAINED_SPEED,
    false);
}

void NnapiCompilation::init2(
    at::Tensor serialized_model_tensor,
    const std::vector<at::Tensor>& parameter_buffers,
    int64_t compilation_preference,
    bool relax_f32_to_f16
  ) {
  TORCH_CHECK(!model_, "Attempted to re-initialize NnapiCompilation.");

  load_platform_library();

  std::vector<const void*> buffers;
  buffers.reserve(parameter_buffers.size());
  std::vector<int32_t> buffer_sizes;
  buffer_sizes.reserve(parameter_buffers.size());
  for (auto& t : parameter_buffers) {
    TORCH_CHECK(t.is_contiguous());
    buffers.push_back(t.data_ptr());
    buffer_sizes.push_back(t.nbytes());
  }

  TORCH_CHECK(serialized_model_tensor.is_contiguous());
  // The serialized model is currently always int32_t, but also support uint8_t
  // for old models and possible future changes to the generator.
  uint8_t* ser_model_ptr =
    serialized_model_tensor.scalar_type() == at::ScalarType::Byte
      ? serialized_model_tensor.data_ptr<uint8_t>()
      : reinterpret_cast<uint8_t*>(serialized_model_tensor.data_ptr<int32_t>());
  c10::ArrayRef<uint8_t> ser_model = {
    ser_model_ptr,
    serialized_model_tensor.nbytes()
  };
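  // nbytes() counts bytes regardless of dtype, so the ArrayRef spans the
  // entire serialized blob whether it is stored as int32 or uint8.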
  TORCH_CHECK(!ser_model.empty());

  ANeuralNetworksModel* model{};
  check_nnapi->Model_create(&model);
  CAFFE_ENFORCE(model);
  model_.reset(model);

  int load_result = ::caffe2::nnapi::load_nnapi_model(
      nnapi,
      model_.get(),
      ser_model.data(),
      ser_model.size(),
      buffers.size(),
      buffers.data(),
      buffer_sizes.data(),
      0,
      nullptr,
      nullptr,
      &num_inputs_,
      &num_outputs_,
      nullptr);
  CAFFE_ENFORCE(load_result == 0);
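  // load_nnapi_model parses the serialized model, adds its operands and
  // operations to model_, binds the parameter buffers gathered above as
  // operand values, and reports the input/output counts via
  // num_inputs_/num_outputs_.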

  if (relax_f32_to_f16) {
    check_nnapi->Model_relaxComputationFloat32toFloat16(model_.get(), true);
  }
  check_nnapi->Model_finish(model_.get());

  ANeuralNetworksCompilation* compilation{};
  check_nnapi->Compilation_create(model_.get(), &compilation);
  // TODO: Make this configurable.
  check_nnapi->Compilation_setPreference(compilation, static_cast<int32_t>(compilation_preference));
  check_nnapi->Compilation_finish(compilation);
  compilation_.reset(compilation);
}
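// A minimal caller-side sketch (illustrative only; tensor names and shapes are
// hypothetical, and construction details may differ in practice):
//
//   torch::nnapi::bind::NnapiCompilation comp;
//   comp.init2(
//       ser_model_tensor,   // int32/uint8 tensor holding the serialized model
//       weight_buffers,     // std::vector<at::Tensor> of parameter buffers
//       ANEURALNETWORKS_PREFER_SUSTAINED_SPEED,
//       /*relax_f32_to_f16=*/false);
//   std::vector<at::Tensor> outs = {at::empty({1, 1000}, at::kFloat)};
//   comp.run({input_tensor}, outs);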

void NnapiCompilation::run(
    std::vector<at::Tensor> inputs,
    std::vector<at::Tensor> outputs) {
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  ANeuralNetworksExecution* execution;
  check_nnapi->Execution_create(compilation_.get(), &execution);
  ExecutionPtr execution_unique_ptr(execution);

  TORCH_CHECK((int32_t)inputs.size() == num_inputs_);
  TORCH_CHECK((int32_t)outputs.size() == num_outputs_);

  for (const auto i : c10::irange(inputs.size())) {
    auto& t = inputs[i];
    // TODO: Check contiguous and dtype.
    ANeuralNetworksOperandType op_type;
    std::vector<uint32_t> dim;
    get_operand_type(t, &op_type, &dim);
    check_nnapi->Execution_setInput(
        execution,
        i,
        &op_type,
        t.data_ptr(),
        t.nbytes());
  }
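  // NNAPI records the input buffer pointers set above; the input tensors must
  // stay alive (and unmodified) until Execution_compute below has returned.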

  for (const auto i : c10::irange(outputs.size())) {
    auto& t = outputs[i];
    // TODO: Check contiguous and dtype.
    check_nnapi->Execution_setOutput(
        execution,
        i,
        nullptr,
        t.data_ptr(),
        t.nbytes());
  }
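  // Passing nullptr for the operand type above tells NNAPI to use each output
  // operand's type and dimensions exactly as declared in the model.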

  check_nnapi->Execution_compute(execution);
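  // Execution_compute runs the inference synchronously; once it returns, the
  // output buffers are filled and the realized output shapes can be queried.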

  // TODO: Maybe skip this for fixed-size outputs?
  for (const auto i : c10::irange(outputs.size())) {
    auto& t = outputs[i];
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    uint32_t rank;
    check_nnapi->Execution_getOutputOperandRank(execution, i, &rank);
    std::vector<uint32_t> dims(rank);
    check_nnapi->Execution_getOutputOperandDimensions(execution, i, dims.data());
    std::vector<int64_t> long_dims(dims.begin(), dims.end());
    // TODO: Maybe check that only the batch dimension is changed?
    t.resize_(long_dims);
  }
}

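// get_operand_type below maps an ATen tensor to an NNAPI operand descriptor.
// For example (sketch), a float tensor of shape {1, 3, 224, 224} yields
// type = ANEURALNETWORKS_TENSOR_FLOAT32, dimensionCount = 4,
// dimensions = {1, 3, 224, 224}, scale = 0, zeroPoint = 0.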
void NnapiCompilation::get_operand_type(const at::Tensor& t, ANeuralNetworksOperandType* operand, std::vector<uint32_t>* dims) {
  operand->dimensionCount = t.dim();
  TORCH_CHECK(operand->dimensionCount == t.dim()); // Check for overflow.
  dims->resize(t.dim());
  operand->dimensions = dims->data();
  for (const auto i : c10::irange(dims->size())) {
    (*dims)[i] = t.sizes()[i];
    TORCH_CHECK((*dims)[i] == t.sizes()[i]); // Check for overflow.
  }
  if (t.scalar_type() == c10::kFloat) {
    operand->type = ANEURALNETWORKS_TENSOR_FLOAT32;
    operand->scale = 0;
    operand->zeroPoint = 0;
    return;
  }
  if (t.scalar_type() == c10::kQUInt8) {
    TORCH_CHECK(t.is_quantized());
    operand->type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
    // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
    operand->scale = t.q_scale();
    operand->zeroPoint = t.q_zero_point();
    return;
  }
  if (t.scalar_type() == c10::kInt) {
    operand->type = ANEURALNETWORKS_TENSOR_INT32;
    operand->scale = 0;
    operand->zeroPoint = 0;
    return;
  }
  if (t.scalar_type() == c10::kShort) {
    TORCH_WARN(
      "NNAPI qint16 inputs to model are only supported for ",
      "testing with fixed scale, zero_point. Please change your ",
      "inputs if you see this in production");
    operand->type = ANEURALNETWORKS_TENSOR_QUANT16_ASYMM;
    // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
    operand->scale = 0.125;
    operand->zeroPoint = 0;
    return;
  }

  // TODO: Support more dtypes.
  CAFFE_THROW("Bad dtype: " + std::to_string(static_cast<int8_t>(t.scalar_type())));
}

} // namespace bind
} // namespace nnapi
} // namespace torch