#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>

#include <c10/util/CallOnce.h>

#include <thread>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_nnpack_available_native.h>
#include <ATen/ops/_nnpack_spatial_convolution_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/zeros.h>
#endif
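
// Illustrative usage (a sketch; `input`, `weight`, `bias`, and `out` are
// placeholders, not defined in this file). The op is normally reached via
// ATen's convolution dispatcher, but it can also be invoked directly:
//
//   at::Tensor out = at::_nnpack_spatial_convolution(
//       input, weight, bias, /*padding=*/{1, 1}, /*stride=*/{1, 1});
//
// with float CPU tensors: input N,C,H,W; weight oC,iC,kH,kW; bias of size oC.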

#if !AT_NNPACK_ENABLED()

namespace at::native {

at::Tensor _nnpack_spatial_convolution(
    const Tensor& input,
    const Tensor& weight, const std::optional<Tensor>& bias_opt,
    const IntArrayRef padding,
    const IntArrayRef stride) {
  throw std::runtime_error(
      "nnpack_spatial_convolution: ATen not compiled with NNPACK support");
}

bool _nnpack_available() {
  return false;
}

} // namespace at::native

#else

#include <nnpack.h>

#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/Parallel.h>
#include <c10/util/irange.h>

namespace at::native {

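// One-time, thread-safe NNPACK initialization. nnp_initialize() is invoked
// exactly once via c10::call_once; the cached result is returned on every
// subsequent call, so a failed initialization is reported consistently
// rather than retried.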
static bool init_nnpack() {
  static c10::once_flag once_;
  static bool nnpack_successfully_initialized_ = false;

  c10::call_once(once_, []() {
    const nnp_status nnpack_status = nnp_initialize();
    nnpack_successfully_initialized_ = (nnp_status_success == nnpack_status);

    if (nnpack_status != nnp_status_success) {
      if (nnpack_status == nnp_status_out_of_memory) {
        LOG(WARNING) << "Could not initialize NNPACK! Reason: Out of memory.";
      } else if (nnpack_status == nnp_status_unsupported_hardware) {
        LOG(WARNING) << "Could not initialize NNPACK! Reason: Unsupported hardware.";
      } else {
        LOG(WARNING) << "Could not initialize NNPACK! Reason: Unknown error!";
      }
    }
  });

  return nnpack_successfully_initialized_;
}

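// Threadpool handed to NNPACK calls. On mobile we reuse caffe2's global
// pthreadpool; elsewhere we lazily create one sized to the intra-op thread
// count (or hardware concurrency) and keep it for the life of the process.
// A null pool simply makes NNPACK run single-threaded.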
static pthreadpool_t nnpack_threadpool() {
#ifdef C10_MOBILE
  return caffe2::pthreadpool_();
#else
  static pthreadpool_t nnpack_threadpool_ = nullptr;
  static bool called_nnpack_threadpool_ = false;

  if (!called_nnpack_threadpool_) {
    called_nnpack_threadpool_ = true;

#ifdef INTRA_OP_PARALLEL
    const uint32_t threads = at::get_num_threads();
#else
    const uint32_t threads = std::thread::hardware_concurrency();
#endif

    nnpack_threadpool_ = pthreadpool_create(threads);
    if (!nnpack_threadpool_) {
      LOG(WARNING) << "Failed to initialize pthreadpool! Running NNPACK in single-threaded mode.";
    }
  }

  return nnpack_threadpool_;
#endif
}

bool _nnpack_available() {
  return init_nnpack();
}

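// Scratch buffer for NNPACK. Per NNPACK's workspace convention, calling a
// convolution with a null workspace buffer and a non-null size pointer makes
// NNPACK write the required byte count to `size` instead of computing; the
// sizing pass below relies on this, and allocate() then reserves a suitably
// aligned buffer of that size.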
namespace {
struct Workspace {
  void* buffer = nullptr;
  size_t size = 0;

  void deallocate() {
    if (buffer) {
      // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
      std::free(buffer);
      buffer = nullptr;
    }
  }

  void allocate() {
    deallocate();

    // NNPACK has alignment requirements
    constexpr size_t nnpack_memory_alignment_boundary = 64;

    // Won't work on Windows, but NNPACK doesn't support Windows either
    auto res = posix_memalign(&buffer, nnpack_memory_alignment_boundary, size);
    if (res != 0) {
      // posix_memalign returns the error code directly; it does not set errno
      TORCH_CHECK(false, "posix_memalign failed: ", strerror(res), " (", res, ")");
    }
  }

  ~Workspace() {
    deallocate();
  }
};
} // namespace

// Make the workspace thread_local so concurrent threads running convolutions
// don't race on a shared scratch buffer.
static thread_local Workspace workspace;

Tensor _nnpack_spatial_convolution(
    const Tensor& input,
    const Tensor& weight, const std::optional<Tensor>& bias_opt,
    const IntArrayRef padding,
    const IntArrayRef stride) {
  // See [Note: hacky wrapper removal for optional tensor]
  c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
  const Tensor& bias = *bias_maybe_owned;

  at::Tensor output = at::empty(
      conv_output_size(input.sizes(), weight.sizes(), padding, stride),
      input.options());

  // Our input Tensor must be in the form N,C,H,W
  if (input.ndimension() != 4) {
    throw std::runtime_error(
        "NNPack convolutionOutput expects 4D input Tensor N,C,H,W");
  }
  // Our weight Tensor must be in the form oC,iC,kH,kW
  if (weight.ndimension() != 4) {
    throw std::runtime_error(
        "NNPack convolutionOutput expects 4D weight Tensor oC,iC,kH,kW");
  }
  // Our output Tensor must be in the form N,oC,oH,oW
  if (output.ndimension() != 4) {
    throw std::runtime_error(
        "NNPack convolutionOutput expects 4D output Tensor N,oC,oH,oW");
  }

  // Some basic shape checking, not comprehensive
  if (input.size(1) != weight.size(1)) {
    std::stringstream err;
    err << "Mismatch between number of input channels in input Tensor ("
        << input.size(1) << ") and weight Tensor (" << weight.size(1)
        << ") in NNPack convolutionOutput";
    throw std::runtime_error(err.str());
  }
  if (weight.size(0) != output.size(1)) {
    std::stringstream err;
    err << "Mismatch between number of output channels in weight Tensor ("
        << weight.size(0) << ") and output Tensor (" << output.size(1)
        << ") in NNPack convolutionOutput";
    throw std::runtime_error(err.str());
  }
  if (input.size(0) != output.size(0)) {
    std::stringstream err;
    err << "Mismatch between batch size in input Tensor (" << input.size(0)
        << ") and output Tensor (" << output.size(0)
        << ") in NNPack convolutionOutput";
    throw std::runtime_error(err.str());
  }

  // All Tensors must be float Tensors
  if (input.device().type() != kCPU || input.scalar_type() != kFloat ||
      weight.device().type() != kCPU || weight.scalar_type() != kFloat ||
      output.device().type() != kCPU || output.scalar_type() != kFloat ||
      (bias.defined() && (bias.device().type() != kCPU || bias.scalar_type() != kFloat))) {
    throw std::runtime_error(
        "Mismatched Tensor types in NNPack convolutionOutput");
  }

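  // Let NNPACK pick among its convolution algorithms (e.g. FFT- or
  // Winograd-based transforms) based on the problem shape.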
  const auto algorithm = nnp_convolution_algorithm_auto;
  const size_t input_channels = input.size(1);
  const size_t output_channels = weight.size(0);
  const struct nnp_size input_size = {
      .width = (size_t)input.size(3),
      .height = (size_t)input.size(2),
  };
  const struct nnp_padding input_padding = {
      .top = (size_t)padding[0],
      .right = (size_t)padding[1],
      .bottom = (size_t)padding[0],
      .left = (size_t)padding[1],
  };
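  // Note the argument order: NNPACK's nnp_size is expressed as width/height
  // (W before H), and ATen's two-element padding is applied symmetrically:
  // padding[0] to top/bottom, padding[1] to left/right.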
  const struct nnp_size kernel_size = {
      .width = (size_t)weight.size(3),
      .height = (size_t)weight.size(2),
  };
  const struct nnp_size output_size = {
      .width = (size_t)output.size(3),
      .height = (size_t)output.size(2),
  };
  const nnp_size output_subsample = {
      .width = static_cast<std::size_t>(stride[1]),
      .height = static_cast<std::size_t>(stride[0]),
  };

  const auto input_ = input.contiguous();
  const auto weight_ = weight.contiguous();
  // If we don't have a defined bias Tensor, we need to create one filled with zeroes
  const auto bias_ = bias.defined() ? bias.contiguous() : at::zeros({weight.size(0)}, input.options());

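  // Dispatch between the two NNPACK entry points: nnp_convolution_output
  // processes a whole batch but only supports unit stride, so strided
  // convolutions (and single images) go through nnp_convolution_inference,
  // one batch element at a time.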
  const auto compute = [&](const size_t batch_size) -> nnp_status {
    if ((batch_size == 1) || (output_subsample.width != 1) || (output_subsample.height != 1)) {
      const size_t input_size_per_batch = input_channels * input_size.width * input_size.height;
      const size_t output_size_per_batch = output_channels * output_size.width * output_size.height;

      for (const auto batch : c10::irange(0u, batch_size)) {
        const nnp_status status = nnp_convolution_inference(
            algorithm,
            nnp_convolution_transform_strategy_compute,
            input_channels,
            output_channels,
            input_size,
            input_padding,
            kernel_size,
            output_subsample,
            input_.data_ptr<float>() + batch * input_size_per_batch,
            weight_.data_ptr<float>(),
            bias_.data_ptr<float>(),
            output.data_ptr<float>() + batch * output_size_per_batch,
            workspace.buffer,
            &workspace.size,
            nnp_activation_identity,
            nullptr,
            nnpack_threadpool(),
            nullptr);

        if (nnp_status_success != status) {
          return status;
        }
      }

      return nnp_status_success;
    } else {
      return nnp_convolution_output(
          algorithm,
          batch_size,
          input_channels,
          output_channels,
          input_size,
          input_padding,
          kernel_size,
          input_.data_ptr<float>(),
          weight_.data_ptr<float>(),
          bias_.data_ptr<float>(),
          output.data_ptr<float>(),
          workspace.buffer,
          &workspace.size,
          nnp_activation_identity,
          nullptr,
          nnpack_threadpool(),
          nullptr);
    }
  };

  const size_t batch_size = input.size(0);

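  // Two-pass workspace protocol: the first compute() call runs with a null
  // buffer, so NNPACK only writes the required scratch size into
  // workspace.size; allocate() then reserves that much aligned memory.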
  auto size_and_allocate_ws = [&]() {
    // Run a single pass to get the size of the workspace buffer
    const auto status = compute(batch_size);
    if (status != nnp_status_success) {
      throw std::runtime_error("NNPACK SpatialConvolution_updateOutput failed");
    }
    workspace.allocate();
  };

  // If no workspace created yet, allocate it
  if (workspace.buffer == nullptr) {
    size_and_allocate_ws();
  }

  // Try to run with the newly created, or existing, workspace
  auto status = compute(batch_size);

  if (status == nnp_status_insufficient_buffer) {
    // Need to reallocate the workspace
    workspace.deallocate();
    size_and_allocate_ws();

    // Try one more time
    status = compute(batch_size);
  }

  if (status != nnp_status_success) {
    throw std::runtime_error("NNPACK SpatialConvolution_updateOutput failed");
  }

  return output;
}

} // namespace at::native

#endif // AT_NNPACK_ENABLED