// /aosp_15_r20/external/pytorch/aten/src/ATen/native/quantized/cudnn/Pooling.cpp
// (revision da0073e96a02ea20f0ac840b70461e3646d07c45)
#include <c10/util/Exception.h>
#ifdef USE_CUDA
#include <ATen/cuda/CUDAConfig.h>  // for the definition of AT_CUDNN_ENABLED

#if AT_CUDNN_ENABLED()
#include <ATen/cuda/Exceptions.h>
#include <ATen/cudnn/Descriptors.h>
#include <ATen/cudnn/Handle.h>
#include <ATen/cudnn/Types.h>
#endif // AT_CUDNN_ENABLED
#endif // USE_CUDA

#include <ATen/ATen.h>
#include <ATen/native/Pool.h>
#include <ATen/native/TensorIterator.h>
#include <c10/core/QScheme.h>
#include <c10/core/ScalarType.h>
#include <c10/util/ArrayRef.h>
#include <torch/library.h>

#include <vector>


namespace at::native {
namespace {
// TODO: This function is the same as the one in Pooling.cpp. We should refactor it into the
// quantized directory so that we don't need to duplicate it.
#ifdef USE_CUDA
#if AT_CUDNN_ENABLED()
void check_maxpool2d_params(
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation) {
  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
              "Expected 1d or 2d kernel size, got ", kernel_size.size());
  TORCH_CHECK(stride.empty() || stride.size() == 2,
              "Expected no strides or 2d strides, got ", stride.size());
  TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
              "Expected 1d or 2d padding, got ", padding.size());
  TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
              "Expected 1d or 2d dilation, got ", dilation.size());
}
#endif
#endif
}

// The current implementation of quantized cuda adaptive average pooling uses the following:
// dequant -> fp32 adaptive average pooling -> quant. This is numerically equivalent to
// quantized adaptive average pooling. It is not the ideal implementation, as we would like to
// operate on the quantized values directly.
// However, we are currently blocked on this as we are waiting for cudnn's 8.5.0 release, which is anticipated
// to support adaptive average pooling. TODO: when that support is made available, use it directly.
Tensor adaptive_avg_pool2d_quantized_cuda(
    const at::Tensor& input,
    IntArrayRef output_size) {
// TODO: re-enable these cudnn preprocessor guards (as in quantized_max_pool2d_cudnn below) when we implement this function with cudnn
#ifdef USE_CUDA
// #if AT_CUDNN_ENABLED()
    // TODO: limit this to per tensor quantized tensors for now, though it should be easy to adapt
    // to per channel quantized tensors
    TORCH_CHECK(input.qscheme() == at::kPerTensorAffine, "adaptive_avg_pool2d_quantized_cuda only supports per tensor quantized tensors");
    auto input_fp32 = at::dequantize(input);
    auto result_fp32 = at::adaptive_avg_pool2d(input_fp32, output_size);
    return at::quantize_per_tensor(result_fp32, input.q_scale(), input.q_zero_point(), input.scalar_type());
#else // USE_CUDA
  AT_ERROR("at::native::adaptive_avg_pool2d_quantized_cuda: ATen not compiled with USE_CUDA support");
  return Tensor{}; // never reached, placates the compiler
#endif
}
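
// A minimal usage sketch of the dequant -> fp32 pool -> quant path above (an added note; the
// shapes, scale, and zero point are illustrative assumptions, not part of the original code):
//   auto x  = at::rand({1, 3, 8, 8}, at::device(at::kCUDA));
//   auto qx = at::quantize_per_tensor(x, /*scale=*/0.1, /*zero_point=*/0, at::kQInt8);
//   auto qy = at::adaptive_avg_pool2d(qx, {4, 4});
// For a per-tensor quantized CUDA input, the call is expected to dispatch here via the
// QuantizedCUDA key, and qy carries the same scale and zero point as qx.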

// Currently we support 4D and 3D input (qx) tensors, the latter of which is supported for
// legacy reasons. The first dimension of a 4D input tensor is the batch size.
// For a 3D tensor, there is no batch size dimension -- it can be viewed as a single batch.
// cudnn's 2D pooling operation requires the input and output to be 4D tensors, so we must cast
// any 3D tensors to 4D prior to using cudnn.
// This implementation currently uses the v7 cudnn APIs, as the v8 cudnn APIs are not yet available for
// pooling operations.
// Consult https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnPoolingForward for
// documentation on the APIs.
// Currently, it appears there is no cudnn support for dilated pooling -- we will
// submit a feature request for this with cudnn.
// TODO: ideally, we would like to use structured kernel support here so we do not have to repeat
// the input checks. However, that would require us to implement max_pool2d_with_indices_out_quantized_cuda
// based on how the dispatch table is currently constructed in native_functions.yaml. Currently,
// there is no support for producing indices with cudnn max pooling, so until that becomes available, this cannot be done.
Tensor quantized_max_pool2d_cudnn(
    const Tensor& qx,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    bool ceil_mode) {
#ifdef USE_CUDA
#if AT_CUDNN_ENABLED()
  check_maxpool2d_params(
      kernel_size,
      stride,
      padding,
      dilation);
  if (stride.empty()) {
    stride = kernel_size;
  }
  auto ndim = qx.dim();
  TORCH_CHECK(
      ndim == 3 || ndim == 4, "Expecting the input tensor to be of rank 3 or 4.");
  TORCH_CHECK(
      kernel_size.size() == 2,
      "quantized_max_pool2d_cudnn(): Expected kernel_size to be 2-dimensional: got ",
      kernel_size.size());
  TORCH_CHECK(
      stride.size() == 2,
      "quantized_max_pool2d_cudnn(): Expected stride to be 2-dimensional: got ",
      stride.size());
  TORCH_CHECK(
      dilation.size() == 2,
      "quantized_max_pool2d_cudnn(): Expected dilation to be 2-dimensional: got ",
      dilation.size());
  TORCH_CHECK(
      dilation[0] == 1 && dilation[1] == 1,
      "quantized_max_pool2d_cudnn(): Expected dilation=[1, 1] (cudnn does not currently support dilation[i] != 1), got ",
      dilation);
  TORCH_CHECK(
      padding.size() == 2,
      "quantized_max_pool2d_cudnn(): Expected padding to be 2-dimensional: got ",
      padding.size());

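  // Note: converting a 4D input to ChannelsLast below gives it NHWC strides, matching the
  // memory_format used for the cudnn tensor descriptors further down; a 3D input is instead
  // viewed as a single-batch 4D tensor, since cudnn's 2D pooling requires 4D tensors.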
  auto input = qx;
  if (ndim == 4) {
    input = qx.to(MemoryFormat::ChannelsLast);
  } else { // 3D
    std::vector<int64_t> new_sizes{1, qx.size(0), qx.size(1), qx.size(2)};
    input = qx.view(new_sizes);
  }
  int batch_size = input.size(0);
  int64_t inC = input.size(1);
  int64_t inH = input.size(2);
  int64_t inW = input.size(3);
  // Check output dimensions.
  int64_t padH = padding[0];
  int64_t padW = padding[1];
  int64_t kH = kernel_size[0];
  int64_t kW = kernel_size[1];
  int64_t strideH = stride[0];
  int64_t strideW = stride[1];
  TORCH_CHECK(
      kH > 0 && kW > 0,
      "quantized_max_pool2d_cudnn(): kernel_size should be greater than zero.");
  TORCH_CHECK(
      strideH > 0 && strideW > 0,
      "quantized_max_pool2d_cudnn(): strides should be greater than zero.");
  int64_t dilationH = dilation[0];
  int64_t dilationW = dilation[1];
  int64_t outC = inC;
  int64_t outH = pooling_output_shape(inH, kH, padH, strideH, dilationH, ceil_mode);
  int64_t outW = pooling_output_shape(inW, kW, padW, strideW, dilationW, ceil_mode);
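  // For reference, pooling_output_shape (see ATen/native/Pool.h) computes, in floor mode,
  //   out = (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1
  // with integer (floor) division, rounding up instead when ceil_mode is true.
  // e.g. inH = 8, kH = 2, padH = 0, strideH = 2, dilationH = 1 gives outH = 4.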
  TORCH_CHECK(outH > 0 && outW > 0,
              "Given input size: (",
              inC, "x", inH, "x", inW,
              "). Calculated output size: (",
              outC, "x", outH, "x", outW,
              "). Output size is too small.");

  std::vector<int64_t> output_shape;
  if (ndim == 3) {
    // cudnn requires 4D input and output for 2D pooling, so we prepend a dummy dimension
    // whose size represents the batch size (1)
    output_shape = {1, outC, outH, outW};
  } else {
    output_shape = {batch_size, outC, outH, outW};
  }
  auto qy = at::_empty_affine_quantized(
      output_shape,
      at::device(at::kCUDA).dtype(at::ScalarType::QInt8),
      input.q_scale(),
      input.q_zero_point(),
      (ndim == 4 ? MemoryFormat::ChannelsLast : MemoryFormat::Contiguous));

  cudnnHandle_t handle = getCudnnHandle();
  cudnnPoolingDescriptor_t poolingDesc = nullptr;
  AT_CUDNN_CHECK_WITH_SHAPES(cudnnCreatePoolingDescriptor(&poolingDesc));
  AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetPooling2dDescriptor(
      poolingDesc,
      CUDNN_POOLING_MAX_DETERMINISTIC,
      CUDNN_NOT_PROPAGATE_NAN,
      kernel_size[0], // kernel height
      kernel_size[1], // kernel width
      padding[0], // vertical padding
      padding[1], // horizontal padding
      stride[0], // vertical stride
      stride[1])); // horizontal stride

  float one{1};
  float zero{0.0};
  TensorDescriptor xDesc;
  at::MemoryFormat memory_format = (ndim == 4 ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous);
  xDesc.set(input, memory_format);
  TensorDescriptor yDesc;
  yDesc.set(qy, memory_format);
  cudnnPoolingForward(handle,
                      poolingDesc,
                      &one,
                      xDesc.desc(),
                      input.data_ptr<int8_t>(),
                      &zero,
                      yDesc.desc(),
                      qy.data_ptr<int8_t>());

  // recall we cast our input and output to 4D if qx was 3D, so we recast it back to 3D prior to returning
  return (ndim == 3 ? qy.view(std::vector<int64_t>(output_shape.begin() + 1, output_shape.end())) : qy);
#else // AT_CUDNN_ENABLED()
  AT_ERROR("at::native::quantized_max_pool2d_cudnn: ATen not compiled with cuDNN support");
  return Tensor{}; // never reached, placates the compiler
#endif // AT_CUDNN_ENABLED()
#else // USE_CUDA
  AT_ERROR("at::native::quantized_max_pool2d_cudnn: ATen not compiled with USE_CUDA support");
  return Tensor{}; // never reached, placates the compiler
#endif
}
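
// A minimal usage sketch for the cudnn max pool path above (an added note; shapes, scale, and
// zero point are illustrative assumptions). For a per-tensor qint8 CUDA input,
//   auto x  = at::rand({1, 3, 8, 8}, at::device(at::kCUDA));
//   auto qx = at::quantize_per_tensor(x, /*scale=*/0.1, /*zero_point=*/0, at::kQInt8);
//   auto qy = at::quantized_max_pool2d(qx, /*kernel_size=*/{2, 2}, /*stride=*/{2, 2},
//                                      /*padding=*/{0, 0}, /*dilation=*/{1, 1},
//                                      /*ceil_mode=*/false);
// is expected to reach quantized_max_pool2d_cudnn and return a {1, 3, 4, 4} qint8 tensor with
// the same scale and zero point as qx.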

// Keep the registry in the anonymous namespace.
namespace {
template <uint32_t kSpatialDim>
class QMaxPool_arr_args final {
 public:
  static Tensor run(
      const Tensor& qx,
      std::vector<int64_t> kernel_size,
      std::vector<int64_t> stride,
      std::vector<int64_t> padding,
      std::vector<int64_t> dilation,
      bool ceil_mode) {
    static_assert(kSpatialDim == 2, "quantized max pool is only valid for 2D");
    return quantized_max_pool2d_cudnn(qx, kernel_size, stride, padding,
                                      dilation, ceil_mode);
  }
};

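// With the registration below, the quantized::max_pool2d op (reachable from Python as
// torch.ops.quantized.max_pool2d) dispatches to QMaxPool_arr_args<2>::run, and thus to
// quantized_max_pool2d_cudnn above, for tensors carrying the QuantizedCUDA dispatch key.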
TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) {
  m.impl(TORCH_SELECTIVE_NAME("quantized::max_pool2d"), TORCH_FN(QMaxPool_arr_args<2>::run));
}

} // namespace
} // namespace at::native