// /aosp_15_r20/external/pytorch/aten/src/ATen/native/NNPACK.cpp
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>

#include <c10/util/CallOnce.h>

#include <thread>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_nnpack_available_native.h>
#include <ATen/ops/_nnpack_spatial_convolution_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/zeros.h>
#endif

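// Without NNPACK support, compile stubs: availability reports false, and the
// convolution entry point fails loudly if called anyway.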
#if !AT_NNPACK_ENABLED()

namespace at::native {

at::Tensor _nnpack_spatial_convolution(
    const Tensor& input,
    const Tensor& weight, const std::optional<Tensor>& bias_opt,
    const IntArrayRef padding,
    const IntArrayRef stride) {
  throw std::runtime_error(
      "nnpack_spatial_convolution: ATen not compiled with NNPACK support");
}

bool _nnpack_available() {
  return false;
}

} // namespace at::native

#else

#include <nnpack.h>

#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/Parallel.h>
#include <c10/util/irange.h>

#include <cstdlib> // posix_memalign, std::free
#include <cstring> // strerror
#include <sstream> // std::stringstream

namespace at::native {

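// Initialize NNPACK exactly once per process and cache the outcome;
// nnp_initialize() probes the CPU and fails on hardware without the SIMD
// features NNPACK requires.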
static bool init_nnpack() {
  static c10::once_flag once_;
  static bool nnpack_successfully_initialized_ = false;

  c10::call_once(once_, []() {
    const nnp_status nnpack_status = nnp_initialize();
    nnpack_successfully_initialized_ = (nnp_status_success == nnpack_status);

    if (nnpack_status != nnp_status_success) {
      if (nnpack_status == nnp_status_out_of_memory) {
        LOG(WARNING) << "Could not initialize NNPACK! Reason: Out of memory.";
      } else if (nnpack_status == nnp_status_unsupported_hardware) {
        LOG(WARNING) << "Could not initialize NNPACK! Reason: Unsupported hardware.";
      } else {
        LOG(WARNING) << "Could not initialize NNPACK! Reason: Unknown error!";
      }
    }
  });

  return nnpack_successfully_initialized_;
}

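// The thread pool handed to NNPACK. On mobile this is the shared caffe2 pool;
// elsewhere a dedicated pool is created lazily, once. NNPACK treats a null
// pool as a request to run single-threaded, so creation failure is non-fatal.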
static pthreadpool_t nnpack_threadpool() {
#ifdef C10_MOBILE
  return caffe2::pthreadpool_();
#else
  static pthreadpool_t nnpack_threadpool_ = nullptr;
  static bool called_nnpack_threadpool_ = false;

  if (!called_nnpack_threadpool_) {
    called_nnpack_threadpool_ = true;

#ifdef INTRA_OP_PARALLEL
    const uint32_t threads = at::get_num_threads();
#else
    const uint32_t threads = std::thread::hardware_concurrency();
#endif

    nnpack_threadpool_ = pthreadpool_create(threads);
    if (!nnpack_threadpool_) {
      LOG(WARNING) << "Failed to initialize pthreadpool! Running NNPACK in single-threaded mode.";
    }
  }

  return nnpack_threadpool_;
#endif
}

bool _nnpack_available() {
  return init_nnpack();
}

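// Scratch space for NNPACK convolution kernels. `size` is filled in by
// NNPACK's workspace-size query (see compute() in the function below), and
// allocate() then grabs suitably aligned memory for it.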
namespace {
struct Workspace {
  void* buffer = nullptr;
  size_t size = 0;

  void deallocate() {
    if (buffer) {
      // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
      std::free(buffer);
      buffer = nullptr;
    }
  }

  void allocate() {
    deallocate();

    // NNPACK has alignment requirements
    constexpr size_t nnpack_memory_alignment_boundary = 64;

    // Won't work on Windows, but NNPACK doesn't support Windows either
    const auto res = posix_memalign(&buffer, nnpack_memory_alignment_boundary, size);
    if (res != 0) {
      // posix_memalign() reports failure via its return value, not errno
      TORCH_CHECK(false, "posix_memalign failed: ", strerror(res), " (", res, ")");
    }
  }

  ~Workspace() {
    deallocate();
  }
};
} // namespace

// Keep the workspace thread_local for safety when multiple threads run
// convolutions at once.
static thread_local Workspace workspace;

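// NNPACK-backed 2-D convolution forward pass. Expects 4-D NCHW float32 CPU
// tensors; shapes, dtypes, and devices are validated below before dispatching
// into NNPACK.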
Tensor _nnpack_spatial_convolution(
    const Tensor& input,
    const Tensor& weight, const std::optional<Tensor>& bias_opt,
    const IntArrayRef padding,
    const IntArrayRef stride) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
  const Tensor& bias = *bias_maybe_owned;

  at::Tensor output = at::empty(
      conv_output_size(input.sizes(), weight.sizes(), padding, stride),
      input.options());

  // Our input Tensor must be in the form N,C,H,W
  if (input.ndimension() != 4) {
    throw std::runtime_error(
        "NNPack convolutionOutput expects 4D input Tensor N,C,H,W");
  }
  // Our weight Tensor must be in the form oC,iC,kH,kW
  if (weight.ndimension() != 4) {
    throw std::runtime_error(
        "NNPack convolutionOutput expects 4D weight Tensor oC,iC,kH,kW");
  }
  // Our output Tensor must be in the form N,oC,oH,oW
  if (output.ndimension() != 4) {
    throw std::runtime_error(
        "NNPack convolutionOutput expects 4D output Tensor N,oC,oH,oW");
  }

  // Some basic shape checking, not comprehensive
  if (input.size(1) != weight.size(1)) {
    std::stringstream err;
    err << "Mismatch between number of input channels in input Tensor ("
        << input.size(1) << ") and weight Tensor (" << weight.size(1)
        << ") in NNPack convolutionOutput";
    throw std::runtime_error(err.str());
  }
  if (weight.size(0) != output.size(1)) {
    std::stringstream err;
    err << "Mismatch between number of output channels in weight Tensor ("
        << weight.size(0) << ") and output Tensor (" << output.size(1)
        << ") in NNPack convolutionOutput";
    throw std::runtime_error(err.str());
  }
  if (input.size(0) != output.size(0)) {
    std::stringstream err;
    err << "Mismatch between batch size in input Tensor (" << input.size(0)
        << ") and output Tensor (" << output.size(0)
        << ") in NNPack convolutionOutput";
    throw std::runtime_error(err.str());
  }

  // All tensors must be float CPU tensors
  if (input.device().type() != kCPU || input.scalar_type() != kFloat ||
      weight.device().type() != kCPU || weight.scalar_type() != kFloat ||
      output.device().type() != kCPU || output.scalar_type() != kFloat ||
      (bias.defined() && (bias.device().type() != kCPU || bias.scalar_type() != kFloat))) {
    throw std::runtime_error(
        "Mismatched Tensor types in NNPack convolutionOutput");
  }

  const auto algorithm = nnp_convolution_algorithm_auto;
  const size_t input_channels = input.size(1);
  const size_t output_channels = weight.size(0);
  const struct nnp_size input_size = {
      .width = (size_t)input.size(3),
      .height = (size_t)input.size(2),
  };
  const struct nnp_padding input_padding = {
      .top = (size_t)padding[0],
      .right = (size_t)padding[1],
      .bottom = (size_t)padding[0],
      .left = (size_t)padding[1],
  };
  const struct nnp_size kernel_size = {
      .width = (size_t)weight.size(3),
      .height = (size_t)weight.size(2),
  };
  const struct nnp_size output_size = {
      .width = (size_t)output.size(3),
      .height = (size_t)output.size(2),
  };
  const nnp_size output_subsample = {
      .width = static_cast<std::size_t>(stride[1]),
      .height = static_cast<std::size_t>(stride[0]),
  };

  const auto input_ = input.contiguous();
  const auto weight_ = weight.contiguous();
  // If we don't have a defined bias Tensor, we need to create one filled with zeroes
  const auto bias_ = bias.defined() ? bias.contiguous() : at::zeros({weight.size(0)}, input.options());

  const auto compute = [&](const size_t batch_size) -> nnp_status {
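    // nnp_convolution_output() only supports unit stride, so strided
    // convolutions (and single-image batches, where batching buys nothing)
    // run through per-image nnp_convolution_inference() instead.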
    if ((batch_size == 1) || (output_subsample.width != 1) || (output_subsample.height != 1)) {
      const size_t input_size_per_batch = input_channels * input_size.width * input_size.height;
      const size_t output_size_per_batch = output_channels * output_size.width * output_size.height;

      for (const auto batch : c10::irange(0u, batch_size)) {
        const nnp_status status = nnp_convolution_inference(
            algorithm,
            nnp_convolution_transform_strategy_compute,
            input_channels,
            output_channels,
            input_size,
            input_padding,
            kernel_size,
            output_subsample,
            input_.data_ptr<float>() + batch * input_size_per_batch,
            weight_.data_ptr<float>(),
            bias_.data_ptr<float>(),
            output.data_ptr<float>() + batch * output_size_per_batch,
            workspace.buffer,
            &workspace.size,
            nnp_activation_identity,
            nullptr,
            nnpack_threadpool(),
            nullptr);

        if (nnp_status_success != status) {
          return status;
        }
      }

      return nnp_status_success;
    } else {
      return nnp_convolution_output(
          algorithm,
          batch_size,
          input_channels,
          output_channels,
          input_size,
          input_padding,
          kernel_size,
          input_.data_ptr<float>(),
          weight_.data_ptr<float>(),
          bias_.data_ptr<float>(),
          output.data_ptr<float>(),
          workspace.buffer,
          &workspace.size,
          nnp_activation_identity,
          nullptr,
          nnpack_threadpool(),
          nullptr);
    }
  };

  const size_t batch_size = input.size(0);

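  // NNPACK's workspace protocol: calling a convolution function with a null
  // workspace buffer makes it store the required scratch size in
  // workspace.size and return without computing anything.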
  auto size_and_allocate_ws = [&]() {
    // Run a single sizing pass to query the required workspace buffer size
    const auto status = compute(batch_size);
    if (status != nnp_status_success) {
      throw std::runtime_error("NNPACK SpatialConvolution_updateOutput failed");
    }
    workspace.allocate();
  };

  // If no workspace created yet, allocate it
  if (workspace.buffer == nullptr) {
    size_and_allocate_ws();
  }

  // Try to run with the newly created, or existing, workspace
  auto status = compute(batch_size);

  if (status == nnp_status_insufficient_buffer) {
    // The cached workspace is too small for this shape; reallocate and retry
    workspace.deallocate();
    size_and_allocate_ws();

    // Try one more time
    status = compute(batch_size);
  }

  if (status != nnp_status_success) {
    throw std::runtime_error("NNPACK SpatialConvolution_updateOutput failed");
  }

  return output;
}

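// A minimal usage sketch (illustrative only; shapes and values are not from
// this file) showing how a caller might guard on NNPACK availability:
//
//   at::Tensor input = at::randn({1, 3, 8, 8});   // N,C,H,W
//   at::Tensor weight = at::randn({4, 3, 3, 3});  // oC,iC,kH,kW
//   if (at::native::_nnpack_available()) {
//     at::Tensor out = at::native::_nnpack_spatial_convolution(
//         input, weight, /*bias_opt=*/std::nullopt,
//         /*padding=*/{1, 1}, /*stride=*/{1, 1});
//   }
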
} // namespace at::native

#endif // AT_NNPACK_ENABLED