#include <c10/util/Exception.h>
#ifdef USE_CUDA
#include <ATen/cuda/CUDAConfig.h>  // for the definition of AT_CUDNN_ENABLED

#if AT_CUDNN_ENABLED()
#include <ATen/cuda/Exceptions.h>
#include <ATen/cudnn/Descriptors.h>
#include <ATen/cudnn/Handle.h>
#include <ATen/cudnn/Types.h>
#endif // AT_CUDNN_ENABLED
#endif // USE_CUDA

#include <ATen/ATen.h>
#include <ATen/native/Pool.h>
#include <ATen/native/TensorIterator.h>
#include <c10/core/QScheme.h>
#include <c10/core/ScalarType.h>
#include <c10/util/ArrayRef.h>
#include <torch/library.h>

#include <vector>


namespace at::native {
namespace {
// TODO: This function is identical to the one in Pooling.cpp. It should be refactored
// into the quantized directory so the logic is not duplicated.
#ifdef USE_CUDA
#if AT_CUDNN_ENABLED()
void check_maxpool2d_params(
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation) {
  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
              "Expected 1d or 2d kernel size, got ", kernel_size.size());
  TORCH_CHECK(stride.empty() || stride.size() == 2,
              "Expected no strides or 2d strides, got ", stride.size());
  TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
              "Expected 1d or 2d padding, got ", padding.size());
  TORCH_CHECK(dilation.size() == 1 || dilation.size() == 2,
              "Expected 1d or 2d dilation, got ", dilation.size());
}
#endif
#endif
} // anonymous namespace

// The current implementation of quantized CUDA adaptive average pooling performs
// dequant -> fp32 adaptive average pooling -> quant, which is numerically equivalent to
// quantized adaptive average pooling. This is not the ideal implementation, as we would like
// to operate on the quantized values directly. We are currently blocked on that because cuDNN
// support for adaptive average pooling is anticipated in the 8.5.0 release; once that support
// is available, we should use it here directly (TODO).
Tensor adaptive_avg_pool2d_quantized_cuda(
    const at::Tensor& input,
    IntArrayRef output_size) {
  // TODO: re-enable these cudnn preprocessor guards (as in quantized_max_pool2d_cudnn below)
  // once this function is implemented with cudnn
#ifdef USE_CUDA
  // #if AT_CUDNN_ENABLED()
  // TODO: limit this to per tensor quantized tensors for now, though it should be easy to adapt
  // to per channel quantized tensors
  TORCH_CHECK(input.qscheme() == at::kPerTensorAffine, "adaptive_avg_pool2d_quantized_cuda only supports per tensor quantized tensors");
  auto input_fp32 = at::dequantize(input);
  auto result_fp32 = at::adaptive_avg_pool2d(input_fp32, output_size);
  return at::quantize_per_tensor(result_fp32, input.q_scale(), input.q_zero_point(), input.scalar_type());
#else // USE_CUDA
  AT_ERROR("at::native::adaptive_avg_pool2d_quantized_cuda: ATen not compiled with USE_CUDA support");
  return Tensor{}; // never reached, placates the compiler
#endif
}
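
// Usage sketch (illustrative only, not part of this file): assuming adaptive_avg_pool2d
// dispatches here for per-tensor quantized CUDA inputs, the dequant -> fp32 pool -> quant
// path above could be exercised roughly as follows; shapes, scale, and zero_point are
// made-up values.
//
//   at::Tensor x = at::rand({1, 3, 8, 8}, at::device(at::kCUDA).dtype(at::kFloat));
//   at::Tensor qx = at::quantize_per_tensor(x, /*scale=*/0.1, /*zero_point=*/10, at::kQInt8);
//   // The result is requantized with qx's scale and zero_point.
//   at::Tensor qy = at::adaptive_avg_pool2d(qx, {4, 4});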

// Currently we support 4D and 3D input (qx) tensors, the latter of which is supported for
// legacy reasons. The first dimension of a 4D input tensor is the batch size.
// A 3D tensor has no batch dimension -- it can be viewed as a single batch.
// cudnn's 2D pooling operation requires the input and output to be 4D tensors, so we must
// reshape any 3D tensor to 4D prior to using cudnn.
// This implementation currently uses the v7 cudnn APIs, as v8 cudnn APIs are not yet available
// for pooling operations.
// Consult https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnPoolingForward for
// documentation on the APIs.
// Currently, it appears there is no cudnn support for dilated pooling -- we will
// submit a feature request for this with cudnn.
// TODO: ideally, we would like to use structured kernel support here so we do not have to repeat
// the input checks, but that would require us to implement max_pool2d_with_indices_out_quantized_cuda
// based on how the dispatch table is currently constructed in native_functions.yaml. Currently,
// there is no support for producing indices with cudnn max pooling, so until that becomes
// available, this cannot be done.
Tensor quantized_max_pool2d_cudnn(
    const Tensor& qx,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    bool ceil_mode) {
#ifdef USE_CUDA
#if AT_CUDNN_ENABLED()
  check_maxpool2d_params(
      kernel_size,
      stride,
      padding,
      dilation);
  if (stride.empty()) {
    stride = kernel_size;
  }
  auto ndim = qx.dim();
  TORCH_CHECK(
      ndim == 3 || ndim == 4, "Expected an input tensor of rank 3 or 4.");
  TORCH_CHECK(
      kernel_size.size() == 2,
      "quantized_max_pool2d_cudnn(): Expected kernel_size to be 2-dimensional: got ",
      kernel_size.size());
  TORCH_CHECK(
      stride.size() == 2,
      "quantized_max_pool2d_cudnn(): Expected stride to be 2-dimensional: got ",
      stride.size());
  TORCH_CHECK(
      dilation.size() == 2,
      "quantized_max_pool2d_cudnn(): Expected dilation to be 2-dimensional: got ",
      dilation.size());
  TORCH_CHECK(
      dilation[0] == 1 && dilation[1] == 1,
      "quantized_max_pool2d_cudnn(): Expected dilation=[1, 1] (cudnn does not currently support dilation[i] != 1), got ",
      dilation);
  TORCH_CHECK(
      padding.size() == 2,
      "quantized_max_pool2d_cudnn(): Expected padding to be 2-dimensional: got ",
      padding.size());

  auto input = qx;
  if (ndim == 4) {
    input = qx.to(MemoryFormat::ChannelsLast);
  } else { // 3D
    std::vector<int64_t> new_sizes{1, qx.size(0), qx.size(1), qx.size(2)};
    input = qx.view(new_sizes);
  }
  int64_t batch_size = input.size(0);
  int64_t inC = input.size(1);
  int64_t inH = input.size(2);
  int64_t inW = input.size(3);
  // Check output dimensions.
  int64_t padH = padding[0];
  int64_t padW = padding[1];
  int64_t kH = kernel_size[0];
  int64_t kW = kernel_size[1];
  int64_t strideH = stride[0];
  int64_t strideW = stride[1];
  TORCH_CHECK(
      kH > 0 && kW > 0,
      "quantized_max_pool2d_cudnn(): kernel_size should be greater than zero.");
  TORCH_CHECK(
      strideH > 0 && strideW > 0,
      "quantized_max_pool2d_cudnn(): strides should be greater than zero.");
  int64_t dilationH = dilation[0];
  int64_t dilationW = dilation[1];
  int64_t outC = inC;
  int64_t outH = pooling_output_shape(inH, kH, padH, strideH, dilationH, ceil_mode);
  int64_t outW = pooling_output_shape(inW, kW, padW, strideW, dilationW, ceil_mode);
  TORCH_CHECK(outH > 0 && outW > 0,
              "Given input size: (",
              inC, "x", inH, "x", inW,
              "). Calculated output size: (",
              outC, "x", outH, "x", outW,
              "). Output size is too small.");
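  // For reference, pooling_output_shape (ATen/native/Pool.h) implements the conventional
  // pooling shape arithmetic, roughly out = floor((in + 2 * pad - dilation * (k - 1) - 1) / stride) + 1,
  // with ceil instead of floor when ceil_mode is true (adjusted so the last window starts
  // within the input or left padding). Example: inH = 32, kH = 2, padH = 0, strideH = 2,
  // dilationH = 1 gives outH = floor((32 + 0 - 1 - 1) / 2) + 1 = 16.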

  std::vector<int64_t> output_shape;
  if (ndim == 3) {
    // cudnn requires 4D input and output for 2D pooling, so we prepend a batch
    // dimension of size 1
    output_shape = {1, outC, outH, outW};
  } else {
    output_shape = {batch_size, outC, outH, outW};
  }
  auto qy = at::_empty_affine_quantized(
      output_shape,
      at::device(at::kCUDA).dtype(at::ScalarType::QInt8),
      input.q_scale(),
      input.q_zero_point(),
      (ndim == 4 ? MemoryFormat::ChannelsLast : MemoryFormat::Contiguous));

  cudnnHandle_t handle = getCudnnHandle();
  cudnnPoolingDescriptor_t poolingDesc = nullptr;
  AT_CUDNN_CHECK_WITH_SHAPES(cudnnCreatePoolingDescriptor(&poolingDesc));
  AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetPooling2dDescriptor(
      poolingDesc,
      CUDNN_POOLING_MAX_DETERMINISTIC,
      CUDNN_NOT_PROPAGATE_NAN,
      kernel_size[0], // kernel height
      kernel_size[1], // kernel width
      padding[0], // vertical padding
      padding[1], // horizontal padding
      stride[0], // vertical stride
      stride[1])); // horizontal stride

  float one{1};
  float zero{0.0};
  TensorDescriptor xDesc;
  at::MemoryFormat memory_format = (ndim == 4 ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous);
  xDesc.set(input, memory_format);
  TensorDescriptor yDesc;
  yDesc.set(qy, memory_format);
  // Wrap the forward call so cudnn errors surface as exceptions rather than being silently ignored.
  AT_CUDNN_CHECK_WITH_SHAPES(cudnnPoolingForward(handle,
                                                 poolingDesc,
                                                 &one,
                                                 xDesc.desc(),
                                                 input.data_ptr<int8_t>(),
                                                 &zero,
                                                 yDesc.desc(),
                                                 qy.data_ptr<int8_t>()));
  // Release the pooling descriptor now that the forward pass has completed.
  AT_CUDNN_CHECK_WITH_SHAPES(cudnnDestroyPoolingDescriptor(poolingDesc));

  // Recall we reshaped the input (and allocated the output) as 4D when qx was 3D,
  // so reshape the result back to 3D before returning.
  return (ndim == 3 ? qy.view(std::vector<int64_t>(output_shape.begin() + 1, output_shape.end())) : qy);
#else // AT_CUDNN_ENABLED()
  AT_ERROR("at::native::quantized_max_pool2d_cudnn: ATen not compiled with cuDNN support");
  return Tensor{}; // never reached, placates the compiler
#endif // AT_CUDNN_ENABLED()
#else // USE_CUDA
  AT_ERROR("at::native::quantized_max_pool2d_cudnn: ATen not compiled with USE_CUDA support");
  return Tensor{}; // never reached, placates the compiler
#endif // USE_CUDA
}
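
// Usage sketch (hypothetical values, illustrative only): calling quantized_max_pool2d_cudnn
// directly with a per-tensor affine qint8 CUDA tensor.
//
//   at::Tensor x = at::rand({2, 3, 32, 32}, at::device(at::kCUDA).dtype(at::kFloat));
//   at::Tensor qx = at::quantize_per_tensor(x, /*scale=*/0.05, /*zero_point=*/0, at::kQInt8);
//   // 2x2 kernel, stride 2, no padding, dilation 1 (cudnn requires dilation == 1 here):
//   at::Tensor qy = quantized_max_pool2d_cudnn(qx, {2, 2}, {2, 2}, {0, 0}, {1, 1},
//                                              /*ceil_mode=*/false);
//   // qy has shape {2, 3, 16, 16} and reuses qx's scale and zero_point.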

// Keep the registry in the anonymous namespace.
namespace {
template <uint32_t kSpatialDim>
class QMaxPool_arr_args final {
 public:
  static Tensor run(
      const Tensor& qx,
      std::vector<int64_t> kernel_size,
      std::vector<int64_t> stride,
      std::vector<int64_t> padding,
      std::vector<int64_t> dilation,
      bool ceil_mode) {
    static_assert(kSpatialDim == 2, "quantized max pool is only valid for 2D");
    return quantized_max_pool2d_cudnn(qx, kernel_size, stride, padding,
                                      dilation, ceil_mode);
  }
};

TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) {
  m.impl(TORCH_SELECTIVE_NAME("quantized::max_pool2d"), TORCH_FN(QMaxPool_arr_args<2>::run));
}
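
// Illustrative sketch of reaching the op registered above through the dispatcher from C++;
// the typed signature below (torch::List<int64_t> for the int[] arguments) is an assumption
// based on the standard quantized::max_pool2d schema. From Python this corresponds to
// torch.ops.quantized.max_pool2d on a quantized CUDA tensor.
//
//   static auto op = c10::Dispatcher::singleton()
//       .findSchemaOrThrow("quantized::max_pool2d", "")
//       .typed<at::Tensor(const at::Tensor&, torch::List<int64_t>, torch::List<int64_t>,
//                         torch::List<int64_t>, torch::List<int64_t>, bool)>();
//   at::Tensor qy = op.call(qx, {2, 2}, {2, 2}, {0, 0}, {1, 1}, /*ceil_mode=*/false);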

} // namespace
} // namespace at::native