xref: /aosp_15_r20/external/tensorflow/tensorflow/core/kernels/depthwise_conv_grad_op.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #define EIGEN_USE_THREADS
17 
18 #include <algorithm>
19 #include <cmath>
20 
21 #include "tensorflow/core/framework/bounds_check.h"
22 #include "tensorflow/core/framework/kernel_shape_util.h"
23 #include "tensorflow/core/framework/numeric_op.h"
24 #include "tensorflow/core/framework/op_kernel.h"
25 #include "tensorflow/core/framework/register_types.h"
26 #include "tensorflow/core/framework/tensor.h"
27 #include "tensorflow/core/framework/tensor_shape.h"
28 #include "tensorflow/core/framework/tensor_types.h"
29 #include "tensorflow/core/framework/types.h"
30 #include "tensorflow/core/kernels/cast_op.h"
31 #include "tensorflow/core/kernels/conv_grad_ops.h"
32 #include "tensorflow/core/kernels/depthwise_conv_op.h"
33 #include "tensorflow/core/lib/core/status.h"
34 #include "tensorflow/core/platform/logging.h"
35 #include "tensorflow/core/platform/types.h"
36 #include "tensorflow/core/util/determinism.h"
37 #include "tensorflow/core/util/padding.h"
38 #include "tensorflow/core/util/tensor_format.h"
39 #include "tensorflow/core/util/use_cudnn.h"
40 #include "tensorflow/core/util/work_sharder.h"
41 
42 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
43 
44 #if GOOGLE_CUDA
45 #include "third_party/gpus/cudnn/cudnn.h"
46 #endif
47 
48 #include "tensorflow/core/platform/stream_executor.h"
49 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
50 
51 namespace tensorflow {
52 
53 // Gradient operations for depthwise convolution.
54 
55 typedef Eigen::ThreadPoolDevice CPUDevice;
56 typedef Eigen::GpuDevice GPUDevice;
57 
58 // Common code between the two backward pass kernels: verifies that the
59 // dimensions all match and extracts the padded rows and columns.
60 #define EXTRACT_AND_VERIFY_DIMENSIONS(label)                                   \
61   const Tensor& out_backprop = context->input(2);                              \
62   OP_REQUIRES(                                                                 \
63       context, input_shape.dims() == 4,                                        \
64       errors::InvalidArgument(label, ": input must be 4-dimensional"));        \
65   OP_REQUIRES(                                                                 \
66       context, filter_shape.dims() == 4,                                       \
67       errors::InvalidArgument(label, ": filter must be 4-dimensional"));       \
68   OP_REQUIRES(                                                                 \
69       context, out_backprop.dims() == 4,                                       \
70       errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
71   const int64_t batch = input_shape.dim_size(0);                               \
72   OP_REQUIRES(                                                                 \
73       context, batch == out_backprop.dim_size(0),                              \
74       errors::InvalidArgument(                                                 \
75           label, ": input and out_backprop must have the same batch size"));   \
76   const int64_t input_rows_raw = GetTensorDim(input_shape, data_format_, 'H'); \
77   OP_REQUIRES(                                                                 \
78       context,                                                                 \
79       FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()),      \
80       errors::InvalidArgument("Input rows too large"));                        \
81   const int32 input_rows = static_cast<int32>(input_rows_raw);                 \
82   const int64_t input_cols_raw = GetTensorDim(input_shape, data_format_, 'W'); \
83   OP_REQUIRES(                                                                 \
84       context,                                                                 \
85       FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()),      \
86       errors::InvalidArgument("Input cols too large"));                        \
87   const int32 input_cols = static_cast<int32>(input_cols_raw);                 \
88   const int64_t filter_rows = filter_shape.dim_size(0);                        \
89   const int64_t filter_cols = filter_shape.dim_size(1);                        \
90   const int64_t output_rows_raw =                                              \
91       GetTensorDim(out_backprop.shape(), data_format_, 'H');                   \
92   OP_REQUIRES(                                                                 \
93       context,                                                                 \
94       FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()),     \
95       errors::InvalidArgument("Output rows too large"));                       \
96   const int32 output_rows = static_cast<int32>(output_rows_raw);               \
97   const int64_t output_cols_raw =                                              \
98       GetTensorDim(out_backprop.shape(), data_format_, 'W');                   \
99   OP_REQUIRES(                                                                 \
100       context,                                                                 \
101       FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()),     \
102       errors::InvalidArgument("Output cols too large"));                       \
103   const int32 output_cols = static_cast<int32>(output_cols_raw);               \
104   const int64_t in_depth = GetTensorDim(input_shape, data_format_, 'C');       \
105   OP_REQUIRES(context, in_depth == filter_shape.dim_size(2),                   \
106               errors::InvalidArgument(                                         \
107                   label, ": input and filter must have the same in_depth"));   \
108   const int64_t depth_multiplier = filter_shape.dim_size(3);                   \
109   const int64_t out_depth_raw =                                                \
110       GetTensorDim(out_backprop.shape(), data_format_, 'C');                   \
111   OP_REQUIRES(                                                                 \
112       context,                                                                 \
113       FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()),       \
114       errors::InvalidArgument("Output depth too large"));                      \
115   const int32 out_depth = static_cast<int32>(out_depth_raw);                   \
116   OP_REQUIRES(                                                                 \
117       context, (depth_multiplier * in_depth) == out_depth,                     \
118       errors::InvalidArgument(                                                 \
119           label, ": depth_multiplier * in_depth not equal to out_depth"));     \
120   const auto stride = stride_;                                                 \
121   int64_t out_rows = 0, out_cols = 0, pad_top = 0, pad_bottom = 0,             \
122           pad_left = 0, pad_right = 0;                                         \
123   if (padding_ == Padding::EXPLICIT) {                                         \
124     GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'H', &pad_top,  \
125                              &pad_bottom);                                     \
126     GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'W', &pad_left, \
127                              &pad_right);                                      \
128   }                                                                            \
129   OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(                        \
130                               input_rows, filter_rows, stride_, padding_,      \
131                               &out_rows, &pad_top, &pad_bottom));              \
132   OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(                        \
133                               input_cols, filter_cols, stride_, padding_,      \
134                               &out_cols, &pad_left, &pad_right));              \
135   OP_REQUIRES(                                                                 \
136       context, output_rows == out_rows,                                        \
137       errors::InvalidArgument(                                                 \
138           label, ": Number of rows of out_backprop doesn't match computed: ",  \
139           "actual = ", output_rows, ", computed = ", out_rows));               \
140   OP_REQUIRES(                                                                 \
141       context, output_cols == out_cols,                                        \
142       errors::InvalidArgument(                                                 \
143           label, ": Number of cols of out_backprop doesn't match computed: ",  \
144           "actual = ", output_cols, ", computed = ", out_cols));               \
145   DepthwiseArgs args;                                                          \
146   args.batch = batch;                                                          \
147   args.in_rows = input_rows;                                                   \
148   args.in_cols = input_cols;                                                   \
149   args.in_depth = in_depth;                                                    \
150   args.filter_rows = filter_rows;                                              \
151   args.filter_cols = filter_cols;                                              \
152   args.depth_multiplier = depth_multiplier;                                    \
153   args.stride = stride;                                                        \
154   args.pad_rows = pad_top;                                                     \
155   args.pad_cols = pad_left;                                                    \
156   args.out_rows = out_rows;                                                    \
157   args.out_cols = out_cols;                                                    \
158   args.out_depth = out_depth;                                                  \
159   VLOG(2) << "DepthwiseConv2d: " << label << " Input: [" << batch << ", "      \
160           << input_rows << ", " << input_cols << ", " << in_depth              \
161           << "]; Filter: [" << filter_rows << ", " << filter_cols << ", "      \
162           << in_depth << ", " << depth_multiplier << "]; stride = " << stride  \
163           << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left         \
164           << ", output: [" << batch << ", " << out_rows << ", " << out_cols    \
165           << ", " << out_depth << "]";
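// Illustrative sketch of the padding arithmetic above (not part of the
// kernel, values assumed): for SAME padding, GetWindowedOutputSizeVerbose
// computes roughly
//   out_rows  = ceil(input_rows / stride)
//   pad_total = max((out_rows - 1) * stride + filter_rows - input_rows, 0)
//   pad_top   = pad_total / 2;  pad_bottom = pad_total - pad_top
// e.g. input_rows = 5, filter_rows = 3, stride = 2
//   => out_rows = 3, pad_top = 1, pad_bottom = 1.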
166 
167 // Copies data from local region in 'out_backprop' into 'buffer'.
168 // The local region coordinates are calculated as the set of output points which
169 // used the input point ('in_r', 'in_c') as input during the forward pass.
170 // Rather than spatially reversing the filter, the input is reversed during
171 // the copy. The copied data is padded to vector register-width boundaries so
172 // that it is aligned for efficient traversal and vector multiply-add by the
173 // depthwise input kernel.
174 //
175 // EX:
176 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
177 //
178 //   'out_backprop': [batch, out_rows, out_cols, out_depth]
179 //
180 //     [a00, a01, a10, a11] [a20, a21, b00, b01]
181 //     [b10, b11, b20, b21] [...]
182 //     [e00, e01, e10, e11] [e20, e21, f00, f01]
183 //     [f10, f11, f20, f21] [...]
184 //
185 //   'buffer' (register boundaries shown):
186 //
187 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
188 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
189 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
190 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
191 //
192 template <typename T>
193 static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
194                                      const int64_t padded_filter_inner_dim_size,
195                                      const int64_t in_r, const int64_t in_c,
196                                      const T* out_backprop, T* buffer) {
197   typedef typename Eigen::internal::packet_traits<T>::type Packet;
198   static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
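  // Note: kPacketSize depends on the target SIMD width; e.g. for float a
  // packet typically holds 4 lanes under SSE and 8 under AVX (an assumption
  // about the build target, not something this kernel checks explicitly).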
199 
200   const int64_t stride = args.stride;
201   const int64_t filter_rows = args.filter_rows;
202   const int64_t filter_cols = args.filter_cols;
203   const int64_t pad_rows = args.pad_rows;
204   const int64_t pad_cols = args.pad_cols;
205   const int64_t out_rows = args.out_rows;
206   const int64_t out_cols = args.out_cols;
207 
208   // Calculate the output spatial region which used point (in_r, in_c) as input.
209   const int64_t out_r_start =
210       std::max(static_cast<int64_t>(0),
211                (in_r - filter_rows + pad_rows + stride) / stride);
212   const int64_t out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
213   const int64_t out_c_start =
214       std::max(static_cast<int64_t>(0),
215                (in_c - filter_cols + pad_cols + stride) / stride);
216   const int64_t out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);
217 
218   // Zero-pad 'buffer' if output region is smaller than filter spatial size.
219   const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
220   if ((out_r_end - out_r_start + 1) < args.filter_rows ||
221       (out_c_end - out_c_start + 1) < args.filter_cols) {
222     memset(buffer, 0,
223            filter_spatial_size * padded_filter_inner_dim_size * sizeof(T));
224   }
225 
226   // Calculate vectorized and scalar (residual) lengths for 'out_depth'.
227   const int64_t vectorized_size = (args.out_depth / kPacketSize) * kPacketSize;
228   const int64_t scalar_size = args.out_depth % kPacketSize;
229   const int64_t pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
230 
231   for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
232     const int64_t f_r = in_r + pad_rows - out_r * stride;
233     for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
234       const int64_t f_c = in_c + pad_cols - out_c * stride;
235       const int64_t buf_base =
236           (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
237       // Calculate index into 'out_backprop' for coordinate (out_r, out_c).
238       auto* out_bprop =
239           out_backprop + (out_r * args.out_cols + out_c) * args.out_depth;
240 
241       // Copy vectorized portion of inner dimension into 'buffer'.
242       for (int64_t d = 0; d < vectorized_size; d += kPacketSize) {
243         auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
244         Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
245       }
246       // Copy scalar portion of 'out_bprop' to 'buffer'.
247       for (int64_t d = 0; d < scalar_size; ++d) {
248         buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
249       }
250       // Pad to vector-register width (if needed).
251       for (int64_t d = 0; d < pad_size; ++d) {
252         buffer[buf_base + vectorized_size + scalar_size + d] =
253             static_cast<T>(0);
254       }
255     }
256   }
257 }
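// Worked example of the region bounds computed above (a sketch, values
// assumed): with in_rows = 7, filter_rows = 3, stride = 2, pad_rows = 1,
// out_rows = 4 and in_r = 3:
//   out_r_start = max(0, (3 - 3 + 1 + 2) / 2) = 1
//   out_r_end   = min(4 - 1, (3 + 1) / 2)     = 2
// i.e. only output rows 1 and 2 read input row 3 during the forward pass,
// so only those rows of 'out_backprop' contribute to its gradient.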
258 
259 // Computes the vectorized product of 'buffer' and 'filter' and stores
260 // result in 'output' at location computed from 'in_r' and 'in_c'.
261 // If depth_multiplier is > 1, the intermediate output is reduced along
262 // the depth_multiplier dimension.
263 //
264 // EX:
265 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
266 //   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
267 //
268 //   'buffer' [rows, cols, in_depth, depth_multiplier]
269 //
270 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
271 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
272 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
273 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
274 //
275 //   filter [rows, cols, in_depth, depth_multiplier]
276 //     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
277 //     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
278 //
279 //   First output register [in_depth, depth_multiplier]
280 //     [q00, q01, q10, q11] = ([f00, f01, f10, f11] x [u0, v0, w0, x0]) +
281 //                            ([e00, e01, e10, e11] x [u1, v1, w1, x1]) +
282 //                            ([b00, b01, b10, b11] x [u2, v2, w2, x2]) +
283 //                            ([a00, a01, a10, a11] x [u3, v3, w3, x3])
284 //
285 //   Reduction step along depth-multiplier dimension:
286 //
287 //     [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0]
288 //
289 
290 template <typename T>
291 static void ComputeBackpropInput(const DepthwiseArgs& args,
292                                  const int64_t padded_filter_inner_dim_size,
293                                  const int64_t in_r, const int64_t in_c,
294                                  const T* filter, const T* buffer,
295                                  T* out_buffer, T* output) {
296   typedef typename Eigen::internal::packet_traits<T>::type Packet;
297   static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
298 
299   const int64_t in_depth = args.in_depth;
300   const int64_t depth_multiplier = args.depth_multiplier;
301   const int64_t out_depth = args.out_depth;
302   const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
303 
304   // Calculate vectorized and scalar lengths of 'out_depth'.
305   const int64_t output_vectorized_size =
306       (out_depth / kPacketSize) * kPacketSize;
307   const int64_t output_scalar_size = out_depth % kPacketSize;
308 
309   // Calculate base index at which to begin writing output.
310   const int64_t base_output_index = (in_r * args.in_cols + in_c) * in_depth;
311 
312   // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is
313   // used to efficiently reduce output when 'depth_multiplier' > kPacketSize.
314   const int64_t dm_vectorized_size =
315       (depth_multiplier / kPacketSize) * kPacketSize;
316   const int64_t dm_scalar_size = depth_multiplier % kPacketSize;
317 
318   for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
319     // Reset accumulator.
320     auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
321     for (int j = 0; j < filter_spatial_size; ++j) {
322       // Calculate index.
323       const int64_t index = i + j * padded_filter_inner_dim_size;
324       // Load filter.
325       const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
326       // Load input.
327       const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
328       // Vector multiply-add.
329       vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
330     }
331     if (depth_multiplier == 1) {
332       // Write directly to the output.
333       Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
334     } else {
335       // Buffer output for subsequent reduction step.
336       Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
337     }
338   }
339 
340   if (output_scalar_size > 0) {
341     auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
342     for (int j = 0; j < filter_spatial_size; ++j) {
343       const int64_t index =
344           output_vectorized_size + j * padded_filter_inner_dim_size;
345       const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
346       const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
347       vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
348     }
349     // Load accumulator into an array and loop through output.
350     T out_buf[kPacketSize];
351     Eigen::internal::pstoreu<T>(out_buf, vaccum);
352     if (depth_multiplier == 1) {
353       // Write directly to the output.
354       for (int j = 0; j < output_scalar_size; ++j) {
355         output[base_output_index + output_vectorized_size + j] = out_buf[j];
356       }
357     } else {
358       // Buffer output for subsequent reduction step.
359       for (int j = 0; j < output_scalar_size; ++j) {
360         out_buffer[output_vectorized_size + j] = out_buf[j];
361       }
362     }
363   }
364 
365   // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'.
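  // Note: Eigen::internal::predux(v) used below returns the sum of all lanes
  // of packet 'v', so each vectorized iteration folds kPacketSize replicas of
  // the 'depth_multiplier' dimension into the running scalar 'accum'.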
366   if (depth_multiplier > 1) {
367     for (int64_t d = 0; d < in_depth; ++d) {
368       const int64_t index = d * args.depth_multiplier;
369       T accum = static_cast<T>(0);
370       for (int64_t dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
371         const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
372         accum += Eigen::internal::predux(v);
373       }
374       // Accumulate scalar portion of replicated output.
375       for (int64_t dm = 0; dm < dm_scalar_size; ++dm) {
376         accum += out_buffer[index + dm_vectorized_size + dm];
377       }
378       // Copy to output.
379       output[base_output_index + d] = accum;
380     }
381   }
382 }
383 
384 // Computes the depthwise conv2d backprop input of 'out_backprop' by
385 // 'depthwise_filter' and stores the result in 'in_backprop'.
386 template <typename T>
387 struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
388   typedef typename Eigen::internal::packet_traits<T>::type Packet;
389 
390   void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
391                   const T* out_backprop, const T* depthwise_filter,
392                   T* in_backprop, TensorFormat data_format) {
393     OP_REQUIRES(
394         ctx, data_format == FORMAT_NHWC,
395         errors::Unimplemented(
396             "Depthwise convolution on CPU is only supported for NHWC format"));
397 
398     static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
399 
400     // Pad 'depthwise_filter' to vector register width (if needed).
401     const bool pad_filter = (args.out_depth % kPacketSize) != 0;
402     Tensor padded_filter;
403     if (pad_filter) {
404       // Allocate space for padded filter.
405       const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
406       const int64_t padded_filter_inner_dim_size =
407           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
408       OP_REQUIRES_OK(
409           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
410                                   TensorShape({filter_spatial_size,
411                                                padded_filter_inner_dim_size}),
412                                   &padded_filter));
413       // Write out padded filter.
414       functor::DepthwiseFilterPadOp<T>()(
415           args, depthwise_filter, padded_filter.template flat<T>().data());
416     }
417     const T* filter_data =
418         pad_filter ? padded_filter.template flat<T>().data() : depthwise_filter;
419 
420     // Computes one shard of depthwise conv2d backprop input.
421     auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop](
422                      int64_t start, int64_t limit) {
423       static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
424 
425       const int64_t input_image_size =
426           args.in_rows * args.in_cols * args.in_depth;
427       const int64_t output_image_size =
428           args.out_rows * args.out_cols * args.out_depth;
429       const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
430       const int64_t padded_filter_inner_dim_size =
431           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
432 
433       // Allocate buffer to copy regions from 'out_backprop'.
434       Tensor out_bprop_buffer;
435       OP_REQUIRES_OK(
436           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
437                                   TensorShape({filter_spatial_size,
438                                                padded_filter_inner_dim_size}),
439                                   &out_bprop_buffer));
440       T* out_bprop_buf = out_bprop_buffer.template flat<T>().data();
441 
442       // Allocate buffer for intermediate results.
443       Tensor in_bprop_buffer;
444       OP_REQUIRES_OK(
445           ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
446                                   TensorShape({padded_filter_inner_dim_size}),
447                                   &in_bprop_buffer));
448       T* in_bprop_buf = in_bprop_buffer.template flat<T>().data();
449 
450       for (int64_t b = start; b < limit; ++b) {
451         for (int64_t in_r = 0; in_r < args.in_rows; ++in_r) {
452           for (int64_t in_c = 0; in_c < args.in_cols; ++in_c) {
453             // Populate 'out_bprop_buf' from local 'out_backprop' region.
454             CopyOutputBackpropRegion<T>(
455                 args, padded_filter_inner_dim_size, in_r, in_c,
456                 out_backprop + b * output_image_size, out_bprop_buf);
457 
458             // Compute depthwise backprop input.
459             ComputeBackpropInput<T>(args, padded_filter_inner_dim_size, in_r,
460                                     in_c, filter_data, out_bprop_buf,
461                                     in_bprop_buf,
462                                     in_backprop + b * input_image_size);
463           }
464         }
465       }
466     };
467 
468     const int64_t shard_cost = args.in_rows * args.in_cols * args.out_depth;
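    // The cost above is a rough per-image work estimate (proportional to the
    // number of input positions times the output depth); Shard uses it to
    // decide how many batch images each worker thread processes.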
469     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
470     Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
471           shard_cost, shard);
472   }
473 };
474 
475 template <typename T>
476 static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
477                                                 const T* out_backprop,
478                                                 const T* filter,
479                                                 T* in_backprop) {
480   // Naive for loop as a reference point without concerns about performance.
481   for (int b = 0; b < args.batch; ++b) {
482     for (int in_r = 0; in_r < args.in_rows; ++in_r) {
483       for (int in_c = 0; in_c < args.in_cols; ++in_c) {
484         for (int in_d = 0; in_d < args.in_depth; ++in_d) {
485           T sum = 0;
486           const int stride = args.stride;
487           const int out_d_start = in_d * args.depth_multiplier;
488           const int out_d_end = out_d_start + args.depth_multiplier;
489 
490           for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
491             const int out_r_start = std::max(
492                 0, (in_r - args.filter_rows + args.pad_rows + stride) / stride);
493             const int out_r_end =
494                 std::min(args.out_rows - 1, (in_r + args.pad_rows) / stride);
495 
496             for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
497               const int out_c_start = std::max(
498                   0,
499                   (in_c - args.filter_cols + args.pad_cols + stride) / stride);
500               const int out_c_end =
501                   std::min(args.out_cols - 1, (in_c + args.pad_cols) / stride);
502 
503               for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
504                 int f_r = in_r + args.pad_rows - out_r * stride;
505                 int f_c = in_c + args.pad_cols - out_c * stride;
506                 int filter_dm = out_d - out_d_start;
507                 int out_backprop_offset =
508                     out_d +
509                     args.out_depth *
510                         (out_c + args.out_cols * (out_r + args.out_rows * b));
511                 int filter_offset =
512                     filter_dm +
513                     args.depth_multiplier *
514                         (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
515                 sum +=
516                     out_backprop[out_backprop_offset] * filter[filter_offset];
517               }
518             }
519           }
520 
521           int in_backprop_offset =
522               in_d +
523               args.in_depth * (in_c + args.in_cols * (in_r + args.in_rows * b));
524           in_backprop[in_backprop_offset] = sum;
525         }
526       }
527     }
528   }
529 }
530 
531 // Extern template instantiated in conv_grad_input_ops.cc.
532 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, bfloat16>;
533 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>;
534 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, float>;
535 extern template struct LaunchConv2DBackpropInputOp<CPUDevice, double>;
536 
537 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
538 
539 // Extern template instantiated in conv_grad_input_ops.cc.
540 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
541 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
542 extern template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;
543 
544 // Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
545 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
546                                                           Eigen::half>;
547 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
548 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
549 
550 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
551 
552 // Kernel to compute the input backprop for depthwise convolution.
553 template <typename Device, class T>
554 class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
555  public:
556   explicit DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction* context)
557       : OpKernel(context) {
558     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
559     OP_REQUIRES(context, strides_.size() == 4,
560                 errors::InvalidArgument("Sliding window strides field must "
561                                         "specify 4 dimensions"));
562 
563     string data_format;
564     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
565     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
566                 errors::InvalidArgument("Invalid data format"));
567 
568     stride_ = GetTensorDim(strides_, data_format_, 'H');
569     const int64_t stride_w = GetTensorDim(strides_, data_format_, 'W');
570     const int64_t stride_n = GetTensorDim(strides_, data_format_, 'N');
571     const int64_t stride_c = GetTensorDim(strides_, data_format_, 'C');
572 
573     OP_REQUIRES(context, stride_ == stride_w,
574                 errors::InvalidArgument(
575                     "Current implementation only supports equal length "
576                     "strides in the row and column dimensions."));
577     OP_REQUIRES(
578         context, (stride_n == 1 && stride_c == 1),
579         errors::InvalidArgument("Current implementation does not yet support "
580                                 "strides in the batch and depth dimensions."));
581     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
582     OP_REQUIRES_OK(context,
583                    context->GetAttr("explicit_paddings", &explicit_paddings_));
584     OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
585                                               /*num_dims=*/4, data_format_));
586 
587     cudnn_use_autotune_ = CudnnUseAutotune();
588     dtype_ = DataTypeToEnum<T>::value;
589 #if CUDNN_VERSION >= 8000
590     // From the cuDNN release note 8.0: We’ve extended the fprop and dgrad
591     // NHWC depthwise kernels to support more combinations (filter
592     // sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
593     // to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
594     // good performance. (https://docs.nvidia.com/deeplearning/sdk/cudnn-
595     // release-notes/rel_8.html#rel_8)
596     use_cudnn_grouped_conv_ =
597         dtype_ == DT_HALF &&
598         ((data_format_ == FORMAT_NCHW && stride_ == 1 && stride_w == 1) ||
599          (data_format_ == FORMAT_NHWC && stride_ == stride_w &&
600           (stride_ == 1 || stride_ == 2)));
601 #elif CUDNN_VERSION >= 7603
602     // Use CuDNN grouped conv (input gradient) when stride = 1, input/output is
603     // NCHW and float16(half). See cudnn release note 7.6.3 (https://docs.nvidi
604     // a.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763).
605     use_cudnn_grouped_conv_ = dtype_ == DT_HALF &&
606                               data_format_ == FORMAT_NCHW && stride_ == 1 &&
607                               stride_w == 1;
608 #else
609     use_cudnn_grouped_conv_ = false;
610 #endif
611   }
612 
613   void Compute(OpKernelContext* context) override {
614     const Tensor& input_sizes = context->input(0);
615     const Tensor& filter = context->input(1);
616     OP_REQUIRES(
617         context, TensorShapeUtils::IsVector(input_sizes.shape()),
618         errors::InvalidArgument(
619             "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
620             input_sizes.dims()));
621     TensorShape input_shape;
622     const int32* in_sizes_data = input_sizes.template flat<int32>().data();
623 
624     for (int i = 0; i < input_sizes.NumElements(); ++i) {
625       OP_REQUIRES(context, in_sizes_data[i] >= 0,
626                   errors::InvalidArgument("Dimension ", i,
627                                           " of input_sizes must be >= 0"));
628       OP_REQUIRES_OK(context, input_shape.AddDimWithStatus(in_sizes_data[i]));
629     }
630     const TensorShape& filter_shape = filter.shape();
631     EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
632 
633     Tensor* in_backprop = nullptr;
634     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
635                                 {0}, 0, input_shape, &in_backprop));
636 
637     // If there is nothing to compute, return.
638     if (input_shape.num_elements() == 0) {
639       return;
640     }
641 
642     // If in_depth==1, this operation is just a standard convolution.
643     // Depthwise convolution is a special case of cuDNN's grouped convolution.
644     bool use_cudnn =
645         std::is_same<Device, GPUDevice>::value &&
646         (in_depth == 1 || (use_cudnn_grouped_conv_ &&
647                            ShouldCudnnGroupedConvolutionBeUsed(
648                                filter_rows, filter_cols, in_depth, out_depth)));
649 
650     VLOG(2) << "DepthwiseConv2dNativeBackpropInput: "
651             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
652             << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
653             << filter_cols << ", " << in_depth << ", " << depth_multiplier
654             << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
655             << ", " << out_depth << "], stride = " << stride_
656             << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left
657             << ", Use cuDNN: " << use_cudnn;
658 
659     if (use_cudnn) {
660       // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
661       //
662       //                  | TensorFlow       | cuDNN
663       // --------------------------------------------------------------------
664       // filter_out_depth | depth_multiplier | depth_multiplier * group_count
665       // filter_in_depth  | in_depth         | in_depth / group_count
666       //
667       // For depthwise convolution, we have group_count == in_depth.
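      //
      // e.g. (values illustrative) a depthwise filter of shape [3, 3, 8, 2]
      // (in_depth = 8, depth_multiplier = 2) is viewed as a grouped-conv
      // filter of shape [3, 3, 1, 16] with group_count = 8.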
668       int32_t filter_in_depth = 1;
669       TensorShape shape =
670           TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
671       Tensor reshaped_filter(/*type=*/dtype_);
672       OP_REQUIRES(
673           context, reshaped_filter.CopyFrom(filter, shape),
674           errors::Internal(
675               "Failed to reshape filter tensor for grouped convolution."));
676       // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
677       // conv is supported.
678       launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
679                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
680                 stride_, stride_, padding_, explicit_paddings_, in_backprop,
681                 data_format_);
682       return;
683     }
684 
685     auto out_backprop_ptr = out_backprop.template flat<T>().data();
686     auto filter_ptr = filter.template flat<T>().data();
687     auto in_backprop_ptr = in_backprop->template flat<T>().data();
688     LaunchDepthwiseConvBackpropInputOp<Device, T>()(
689         context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
690         data_format_);
691   }
692 
693  protected:
694   bool use_cudnn_grouped_conv_;
695 
696  private:
697   std::vector<int32> strides_;
698   Padding padding_;
699   std::vector<int64_t> explicit_paddings_;
700   TensorFormat data_format_;
701   int64_t stride_;
702 
703   // For in_depth == 1 and grouped convolutions.
704   LaunchConv2DBackpropInputOp<Device, T> launcher_;
705   bool cudnn_use_autotune_;
706   DataType dtype_;
707 
708   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
709 };
710 
711 #define REGISTER_CPU_KERNEL(T)                                       \
712   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
713                               .Device(DEVICE_CPU)                    \
714                               .TypeConstraint<T>("T"),               \
715                           DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>);
716 
717 TF_CALL_bfloat16(REGISTER_CPU_KERNEL);
718 TF_CALL_half(REGISTER_CPU_KERNEL);
719 TF_CALL_float(REGISTER_CPU_KERNEL);
720 #if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
721 TF_CALL_double(REGISTER_CPU_KERNEL);
722 #endif
723 #undef REGISTER_CPU_KERNEL
724 
725 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
726 
727 #define REGISTER_GPU_KERNEL(T)                                       \
728   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
729                               .Device(DEVICE_GPU)                    \
730                               .TypeConstraint<T>("T")                \
731                               .HostMemory("input_sizes"),            \
732                           DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>)
733 
734 TF_CALL_half(REGISTER_GPU_KERNEL);
735 TF_CALL_float(REGISTER_GPU_KERNEL);
736 TF_CALL_double(REGISTER_GPU_KERNEL);
737 #undef REGISTER_GPU_KERNEL
738 
739 #if CUDNN_VERSION >= 7000
740 template <typename T>
741 class DepthwiseConv2dGroupedConvBackpropInputOp
742     : public DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T> {
743  public:
744   DepthwiseConv2dGroupedConvBackpropInputOp(OpKernelConstruction* context)
745       : DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>(context) {
746     this->use_cudnn_grouped_conv_ = true;
747   }
748 };
749 
750 #define REGISTER_GROUPED_CONV_KERNEL(T)                              \
751   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \
752                               .Device(DEVICE_GPU)                    \
753                               .TypeConstraint<T>("T")                \
754                               .HostMemory("input_sizes")             \
755                               .Label("cudnn_grouped_convolution"),   \
756                           DepthwiseConv2dGroupedConvBackpropInputOp<T>)
757 
758 TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
759 TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
760 TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
761 #undef REGISTER_GROUPED_CONV_KERNEL
762 #endif  // CUDNN_VERSION
763 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
764 
765 // Kernels to compute the gradients of the filters for depthwise convolution.
766 
767 // Computes filter backprop using 'out_backprop' and 'input_buffer', storing the
768 // result in 'output_buffer' at an index computed from 'out_r' and 'out_c'.
769 //
770 // EX:
771 //   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
772 //   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
773 //
774 //   'input_buffer' [rows, cols, in_depth, depth_multiplier]
775 //
776 //     [f00, f01, f10, f11] [f20, f21, 0, 0]   in_row = 0, in_col = 0
777 //     [e00, e01, e10, e11] [e20, e21, 0, 0]   in_row = 0, in_col = 1
778 //     [b00, b01, b10, b11] [b20, b21, 0, 0]   in_row = 1, in_col = 0
779 //     [a00, a01, a10, a11] [a20, a21, 0, 0]   in_row = 1, in_col = 1
780 //
781 //   'out_backprop' [out_rows, out_cols, in_depth, depth_multiplier]
782 //
783 //     [q00, q01, q10, q11] [q20, q21, r00, r01]
784 //     [r10, r11, r20, r21] [s00, s01, s10, s11]
785 //     [s20, s21, t00, t01] [t10, t11, t20, t21]
786 //
787 //   First output register of 'filter_backprop'
788 //     [u0, v0, w0, x0] += ([f00, f01, f10, f11] x [q00, q01, q10, q11])
789 //
790 template <typename T>
791 static void ComputeBackpropFilter(const DepthwiseArgs& args,
792                                   const int64_t padded_out_depth_size,
793                                   const int64_t out_r, const int64_t out_c,
794                                   const T* out_backprop, const T* input_buffer,
795                                   T* output_buffer) {
796   typedef typename Eigen::internal::packet_traits<T>::type Packet;
797   static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
798   // Calculate vectorized size of 'padded_out_depth_size'.
799   const int64_t out_depth = args.out_depth;
800   const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
801   const int64_t output_vectorized_size =
802       (padded_out_depth_size / kPacketSize) * kPacketSize;
803   const int64_t base_output_index = (out_r * args.out_cols + out_c) * out_depth;
804   // Determine whether we can execute fast or slow code path.
805   const int64_t output_image_size =
806       args.out_rows * args.out_cols * args.out_depth;
807   const int64_t output_last_vector_index =
808       output_image_size - (filter_spatial_size * padded_out_depth_size);
809   const bool fast_path = base_output_index <= output_last_vector_index;
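  // Sketch (values assumed): with out_rows = out_cols = 4, out_depth = 3,
  // kPacketSize = 4 and a 2x2 filter, output_image_size = 48 and
  // padded_out_depth_size = 4, so output_last_vector_index = 48 - 16 = 32;
  // base indices <= 32 may use unmasked packet loads from 'out_backprop',
  // later ones fall back to the scalar copy into 'out_buf' below.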
810 
811   if (fast_path) {
812     // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can
813     // amortize the cost of 'output_buffer' load store in the loop below.
814     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
815       // Load vector register from 'out_backprop'.
816       const auto out_bprop_block =
817           Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
818       for (int j = 0; j < filter_spatial_size; ++j) {
819         const int64_t index = i + j * padded_out_depth_size;
820         // Load vector register from 'input_buffer'.
821         const auto input_block =
822             Eigen::internal::ploadu<Packet>(input_buffer + index);
823         // Load output block into vector register.
824         auto out_block_data = output_buffer + index;
825         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
826         // Vector multiply-add.
827         out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
828                                                    out_block);
829         // Store 'out_block' back to memory.
830         Eigen::internal::pstoreu<T>(out_block_data, out_block);
831       }
832     }
833   } else {
834     // Slow path (can't do vector reads from non-padded 'out_backprop').
835     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
836       // Calculate safe read size from 'out_backprop'.
837       const int64_t out_bprop_index = base_output_index + i;
838       const int64_t out_bprop_limit =
839           std::min(output_image_size, out_bprop_index + kPacketSize);
840       T out_buf[kPacketSize];
841       memset(&out_buf, 0, kPacketSize * sizeof(T));
842       const int64_t scalar_size = out_bprop_limit - out_bprop_index;
843       for (int64_t j = 0; j < scalar_size; ++j) {
844         out_buf[j] = out_backprop[out_bprop_index + j];
845       }
846       // Load vector register from 'out_buf'.
847       const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
848       for (int j = 0; j < filter_spatial_size; ++j) {
849         const int64_t index = i + j * padded_out_depth_size;
850         // Load vector register from 'input_buffer'.
851         const auto input_block =
852             Eigen::internal::ploadu<Packet>(input_buffer + index);
853         // Load output block into vector register.
854         auto out_block_data = output_buffer + index;
855         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
856         // Vector multiply-add.
857         out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block,
858                                                    out_block);
859         // Store 'out_block' back to memory.
860         Eigen::internal::pstoreu<T>(out_block_data, out_block);
861       }
862     }
863   }
864 }
865 
866 template <typename Device, typename T>
867 struct LaunchDepthwiseConvBackpropFilterOp;
868 
869 template <typename T>
870 struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
871   typedef typename Eigen::internal::packet_traits<T>::type Packet;
872 
873   void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
874                   const T* out_backprop, const T* input, T* filter_backprop,
875                   TensorFormat data_format) {
876     OP_REQUIRES(
877         ctx, data_format == FORMAT_NHWC,
878         errors::Unimplemented(
879             "Depthwise convolution on CPU is only supported for NHWC format"));
880 
881     static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
882 
883     const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
884     const int64_t padded_out_depth_size =
885         ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
886 
887     // Allocate output buffers for each image in 'batch' (padded to vector
888     // register boundaries).
889     Tensor output_buffer;
890     OP_REQUIRES_OK(
891         ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
892                                 TensorShape({args.batch, filter_spatial_size,
893                                              padded_out_depth_size}),
894                                 &output_buffer));
895     T* output_buffer_data = output_buffer.template flat<T>().data();
896 
897     // Computes one shard of depthwise conv2d backprop filter.
898     auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data](
899                      int64_t start, int64_t limit) {
900       static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
901       const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
902       const int64_t padded_out_depth_size =
903           ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
904 
905       // Allocate buffer for local input regions.
906       Tensor input_buffer;
907       OP_REQUIRES_OK(
908           ctx, ctx->allocate_temp(
909                    DataTypeToEnum<T>::value,
910                    TensorShape({filter_spatial_size, padded_out_depth_size}),
911                    &input_buffer));
912       T* input_buffer_data = input_buffer.template flat<T>().data();
913 
914       const int64_t input_image_size =
915           args.in_rows * args.in_cols * args.in_depth;
916       const int64_t output_image_size =
917           args.out_rows * args.out_cols * args.out_depth;
918       const int64_t padded_filter_size =
919           filter_spatial_size * padded_out_depth_size;
920 
921       for (int b = start; b < limit; ++b) {
922         // Initialize 'output_buffer' for 'b'.
923         auto* output_buffer = output_buffer_data + b * padded_filter_size;
924         memset(output_buffer, 0, padded_filter_size * sizeof(T));
925 
926         for (int out_r = 0; out_r < args.out_rows; ++out_r) {
927           for (int out_c = 0; out_c < args.out_cols; ++out_c) {
928             // Populate 'input_buffer_data' with data from local input region.
929             functor::DepthwiseInputCopyOp<T>()(
930                 args, padded_out_depth_size, out_r, out_c,
931                 input + b * input_image_size, input_buffer_data);
932             // Compute depthwise backprop filter.
933             ComputeBackpropFilter(args, padded_out_depth_size, out_r, out_c,
934                                   out_backprop + b * output_image_size,
935                                   input_buffer_data, output_buffer);
936           }
937         }
938       }
939     };
940     const int64_t shard_cost = args.out_rows * args.out_cols * args.out_depth;
941     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
942     Shard(worker_threads.num_threads, worker_threads.workers, args.batch,
943           shard_cost, shard);
944 
945     // Accumulate 'output_buffer' from each shard into 'filter_backprop'.
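    // Each batch image wrote to its own padded slice of 'output_buffer', so
    // the shards above never contend; this single-threaded loop is the only
    // place those per-image slices are combined into 'filter_backprop'.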
946     const int64_t out_depth = args.out_depth;
947     const int64_t vectorized_size = (out_depth / kPacketSize) * kPacketSize;
948     const int64_t scalar_size = out_depth - vectorized_size;
949     const int64_t padded_filter_size =
950         filter_spatial_size * padded_out_depth_size;
951     memset(filter_backprop, 0, filter_spatial_size * out_depth * sizeof(T));
952 
953     for (int64_t i = 0; i < filter_spatial_size; ++i) {
954       const int64_t buffer_base = i * padded_out_depth_size;
955       const int64_t output_base = i * out_depth;
956       // Write vectorized length of filter's inner dimension to output.
957       for (int64_t j = 0; j < vectorized_size; j += kPacketSize) {
958         // Load data from 'filter_backprop' into vector register.
959         auto out_block_data = filter_backprop + output_base + j;
960         auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
961         for (int b = 0; b < args.batch; ++b) {
962           // Load data from 'output_buffer' for 'b'.
963           const auto* output_buffer =
964               output_buffer_data + b * padded_filter_size;
965           const auto v =
966               Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j);
967           // Add 'v' to 'out_block'.
968           out_block = Eigen::internal::padd<Packet>(out_block, v);
969         }
970         // Store 'out_block' back to memory.
971         Eigen::internal::pstoreu<T>(out_block_data, out_block);
972       }
973       // Write scalar length of filter's inner dimension to output.
974       for (int64_t j = 0; j < scalar_size; ++j) {
975         for (int b = 0; b < args.batch; ++b) {
976           const auto* output_buffer =
977               output_buffer_data + b * padded_filter_size;
978           filter_backprop[output_base + vectorized_size + j] +=
979               output_buffer[buffer_base + vectorized_size + j];
980         }
981       }
982     }
983   }
984 };
985 
986 template <typename T>
987 static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
988                                                  const T* out_backprop,
989                                                  const T* input,
990                                                  T* filter_backprop) {
991   int num_filter_backprop = args.filter_rows * args.filter_cols *
992                             args.in_depth * args.depth_multiplier;
993   memset(filter_backprop, 0, num_filter_backprop * sizeof(T));
994   // Naive for loop as a reference point without concerns about performance.
995   for (int b = 0; b < args.batch; ++b) {
996     for (int out_r = 0; out_r < args.out_rows; ++out_r) {
997       for (int out_c = 0; out_c < args.out_cols; ++out_c) {
998         for (int out_d = 0; out_d < args.out_depth; ++out_d) {
999           const int in_d = out_d / args.depth_multiplier;
1000           const int dm = out_d % args.depth_multiplier;
1001           const int in_r_start = out_r * args.stride - args.pad_rows;
1002           const int in_c_start = out_c * args.stride - args.pad_cols;
1003 
1004           for (int f_r = 0; f_r < args.filter_rows; ++f_r) {
1005             for (int f_c = 0; f_c < args.filter_cols; ++f_c) {
1006               const int in_r = in_r_start + f_r;
1007               const int in_c = in_c_start + f_c;
1008 
1009               if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
1010                   in_c < args.in_cols) {
1011                 int out_backprop_offset =
1012                     out_d +
1013                     args.out_depth *
1014                         (out_c + args.out_cols * (out_r + args.out_rows * b));
1015                 int input_offset =
1016                     in_d +
1017                     args.in_depth *
1018                         (in_c + args.in_cols * (in_r + args.in_rows * b));
1019                 int filter_backprop_offset =
1020                     dm +
1021                     args.depth_multiplier *
1022                         (in_d + args.in_depth * (f_c + args.filter_cols * f_r));
1023                 filter_backprop[filter_backprop_offset] +=
1024                     input[input_offset] * out_backprop[out_backprop_offset];
1025               }
1026             }
1027           }
1028         }
1029       }
1030     }
1031   }
1032 }
1033 
1034 // Extern template instantiated in conv_grad_filter_ops.cc.
1035 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, bfloat16>;
1036 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, Eigen::half>;
1037 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, float>;
1038 extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, double>;
1039 
1040 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1041 
1042 // Extern template instantiated in conv_grad_filter_ops.cc.
1043 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, Eigen::half>;
1044 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, float>;
1045 extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, double>;
1046 
1047 // Extern template instantiated in depthwise_conv_op_gpu.cu.cc.
1048 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
1049                                                            Eigen::half>;
1050 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
1051 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
1052 
1053 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1054 
1055 // Kernel to compute the filter backprop for depthwise convolution.
1056 template <typename Device, class T>
1057 class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
1058  public:
1059   explicit DepthwiseConv2dNativeBackpropFilterOp(OpKernelConstruction* context)
1060       : OpKernel(context) {
1061     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
1062     OP_REQUIRES(context, strides_.size() == 4,
1063                 errors::InvalidArgument("Sliding window strides field must "
1064                                         "specify 4 dimensions"));
1065 
1066     string data_format;
1067     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
1068     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
1069                 errors::InvalidArgument("Invalid data format"));
1070 
1071     stride_ = GetTensorDim(strides_, data_format_, 'H');
1072     const int64_t stride_w = GetTensorDim(strides_, data_format_, 'W');
1073     const int64_t stride_n = GetTensorDim(strides_, data_format_, 'N');
1074     const int64_t stride_c = GetTensorDim(strides_, data_format_, 'C');
1075 
1076     OP_REQUIRES(context, stride_ == stride_w,
1077                 errors::InvalidArgument(
1078                     "Current implementation only supports equal length "
1079                     "strides in the row and column dimensions."));
1080     OP_REQUIRES(
1081         context, (stride_n == 1 && stride_c == 1),
1082         errors::InvalidArgument("Current implementation does not yet support "
1083                                 "strides in the batch and depth dimensions."));
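    // For example, with data_format = NHWC and strides = {1, 2, 2, 1}, the
    // checks above accept stride_ = 2 for both spatial dimensions and require
    // the batch and depth strides to be 1.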
1084     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
1085     OP_REQUIRES_OK(context,
1086                    context->GetAttr("explicit_paddings", &explicit_paddings_));
1087     OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
1088                                               /*num_dims=*/4, data_format_));
1089 
1090     cudnn_use_autotune_ = CudnnUseAutotune();
1091 
1092     if (std::is_same<T, bfloat16>::value) {
1093       dtype_ = DT_BFLOAT16;
1094     } else if (std::is_same<T, Eigen::half>::value) {
1095       dtype_ = DT_HALF;
1096     } else if (std::is_same<T, float>::value) {
1097       dtype_ = DT_FLOAT;
1098     } else if (std::is_same<T, double>::value) {
1099       dtype_ = DT_DOUBLE;
1100     } else {
1101       LOG(ERROR) << "Only bfloat16, half, float, and double are supported.";
1102     }
1103 #if CUDNN_VERSION >= 7603
1104     // Use cuDNN grouped conv (filter gradients) when the input/output type
1105     // is float16 (half). See the cuDNN 7.6.3 release notes:
1106     // https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763
1107     //
1108     // Grouped convolution was added to cuDNN in version 7.0.1, but
1109     // TensorFlow intentionally enables op-determinism through this path only
1110     // with cuDNN 7.6.3 and later, to avoid potential issues with earlier
1111     // versions of cuDNN.
1112     use_cudnn_grouped_conv_ = OpDeterminismRequired() || dtype_ == DT_HALF;
1113 #else
1114     use_cudnn_grouped_conv_ = false;
1115 #endif
1116   }
1117 
1118   void Compute(OpKernelContext* context) override {
1119     const Tensor& input = context->input(0);
1120     const Tensor& filter_sizes = context->input(1);
1121     OP_REQUIRES(
1122         context, TensorShapeUtils::IsVector(filter_sizes.shape()),
1123         errors::InvalidArgument(
1124             "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
1125             filter_sizes.dims()));
1126     TensorShape filter_shape;
1127     const int32* filter_sizes_data = filter_sizes.template flat<int32>().data();
1128     for (int i = 0; i < filter_sizes.NumElements(); ++i) {
1129       OP_REQUIRES(context, filter_sizes_data[i] >= 0,
1130                   errors::InvalidArgument("Dimension ", i,
1131                                           " of filter_sizes must be >= 0"));
1132       OP_REQUIRES_OK(context,
1133                      filter_shape.AddDimWithStatus(filter_sizes_data[i]));
1134     }
1135     const TensorShape& input_shape = input.shape();
1136 
1137     EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropFilter");
1138     Tensor* filter_backprop = nullptr;
1139     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
1140                                 {1}, 0, filter_shape, &filter_backprop));
1141 
1142     // If there is nothing to compute, return.
1143     if (out_backprop.shape().num_elements() == 0) {
1144       return;
1145     }
1146 
1147     // If in_depth==1, this operation is just a standard convolution.
1148     // Depthwise convolution is a special case of cuDNN's grouped convolution.
1149     bool use_cudnn = std::is_same<Device, GPUDevice>::value &&
1150                      (in_depth == 1 ||
1151                       (use_cudnn_grouped_conv_ &&
1152                        (ShouldCudnnGroupedConvolutionBeUsed(
1153                             filter_rows, filter_cols, in_depth, out_depth) ||
1154                         OpDeterminismRequired())));
1155 
1156     VLOG(2) << "DepthwiseConv2dNativeBackpropFilter: "
1157             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
1158             << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
1159             << filter_cols << ", " << in_depth << ", " << depth_multiplier
1160             << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
1161             << ", " << out_depth << "], stride = " << stride_
1162             << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left
1163             << ", Use cuDNN: " << use_cudnn;
1164 
1165     if (use_cudnn) {
1166       // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
1167       //
1168       //                  | TensorFlow       | cuDNN
1169       // --------------------------------------------------------------------
1170       // filter_out_depth | depth_multiplier | depth_multiplier * group_count
1171       // filter_in_depth  | in_depth         | in_depth / group_count
1172       //
1173       // For depthwise convolution, we have group_count == in_depth.
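      // For example, a TF filter gradient of shape [3, 3, 8, 2]
      // (in_depth = 8, depth_multiplier = 2) is viewed below as a cuDNN
      // grouped-convolution filter of shape [3, 3, 1, 16] with group_count = 8.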
1174       int32_t filter_in_depth = 1;
1175       TensorShape shape =
1176           TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
1177       Tensor reshaped_filter(/*type=*/dtype_);
1178       OP_REQUIRES(
1179           context, reshaped_filter.CopyFrom(*filter_backprop, shape),
1180           errors::Internal(
1181               "Failed to reshape filter tensor for grouped convolution."));
1182 
1183       // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
1184       // conv is supported.
1185       launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop,
1186                 input,
1187                 /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
1188                 padding_, explicit_paddings_, &reshaped_filter, data_format_);
1189       return;
1190     }
1191 
1192     // For GPU inputs with type half, we cast inputs to float and outputs back
1193     // to half, as half implementation is slow and does not use full precision
1194     // accumulation in some cases.
1195     constexpr bool cast_to_float = std::is_same<T, Eigen::half>::value &&
1196                                    std::is_same<Device, GPUDevice>::value;
1197     using U = typename std::conditional<cast_to_float, float, T>::type;
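    // On GPU with T == Eigen::half, U is float and the temporaries allocated
    // below hold the casted copies; otherwise U == T and the original tensors
    // are used unchanged.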
1198     Tensor casted_out_backprop = out_backprop;
1199     Tensor casted_input = input;
1200     Tensor casted_filter_backprop = *filter_backprop;
1201     const Device& device = context->template eigen_device<Device>();
1202     if (cast_to_float) {
1203       functor::CastFunctor<Device, float, Eigen::half> cast;
1204       OP_REQUIRES_OK(context,
1205                      context->allocate_temp(DT_FLOAT, out_backprop.shape(),
1206                                             &casted_out_backprop));
1207       cast(device, casted_out_backprop.template flat<float>(),
1208            out_backprop.template flat<Eigen::half>());
1209       OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, input.shape(),
1210                                                      &casted_input));
1211       cast(device, casted_input.template flat<float>(),
1212            input.template flat<Eigen::half>());
1213       OP_REQUIRES_OK(context,
1214                      context->allocate_temp(DT_FLOAT, filter_backprop->shape(),
1215                                             &casted_filter_backprop));
1216     }
1217 
1218     auto out_backprop_ptr = casted_out_backprop.template flat<U>().data();
1219     auto input_ptr = casted_input.template flat<U>().data();
1220     auto filter_backprop_ptr = casted_filter_backprop.template flat<U>().data();
1221     LaunchDepthwiseConvBackpropFilterOp<Device, U>()(
1222         context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr,
1223         data_format_);
1224 
1225     if (cast_to_float) {
1226       functor::CastFunctor<Device, Eigen::half, float> cast;
1227       const Tensor& casted_filter_backprop_const = casted_filter_backprop;
1228       cast(device, filter_backprop->template flat<Eigen::half>(),
1229            casted_filter_backprop_const.template flat<float>());
1230     }
1231   }
1232 
1233  protected:
1234   bool use_cudnn_grouped_conv_;
1235 
1236  private:
1237   std::vector<int32> strides_;
1238   Padding padding_;
1239   std::vector<int64_t> explicit_paddings_;
1240   TensorFormat data_format_;
1241   int64_t stride_;
1242 
1243   // For in_depth == 1 and grouped convolutions.
1244   LaunchConv2DBackpropFilterOp<Device, T> launcher_;
1245   bool cudnn_use_autotune_;
1246   DataType dtype_;
1247 
1248   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
1249 };
1250 
1251 #define REGISTER_CPU_KERNEL(T)                    \
1252   REGISTER_KERNEL_BUILDER(                        \
1253       Name("DepthwiseConv2dNativeBackpropFilter") \
1254           .Device(DEVICE_CPU)                     \
1255           .TypeConstraint<T>("T"),                \
1256       DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>);
1257 TF_CALL_bfloat16(REGISTER_CPU_KERNEL);
1258 TF_CALL_half(REGISTER_CPU_KERNEL);
1259 TF_CALL_float(REGISTER_CPU_KERNEL);
1260 #if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
1261 TF_CALL_double(REGISTER_CPU_KERNEL);
1262 #endif
1263 #undef REGISTER_CPU_KERNEL
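// For reference, TF_CALL_float(REGISTER_CPU_KERNEL) above expands to roughly:
//
//   REGISTER_KERNEL_BUILDER(
//       Name("DepthwiseConv2dNativeBackpropFilter")
//           .Device(DEVICE_CPU)
//           .TypeConstraint<float>("T"),
//       DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, float>);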
1264 
1265 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1266 #define REGISTER_GPU_KERNEL(T)                                        \
1267   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
1268                               .Device(DEVICE_GPU)                     \
1269                               .TypeConstraint<T>("T")                 \
1270                               .HostMemory("filter_sizes"),            \
1271                           DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>)
1272 
1273 TF_CALL_half(REGISTER_GPU_KERNEL);
1274 TF_CALL_float(REGISTER_GPU_KERNEL);
1275 TF_CALL_double(REGISTER_GPU_KERNEL);
1276 #undef REGISTER_GPU_KERNEL
1277 
1278 #if CUDNN_VERSION >= 7000
1279 template <typename T>
1280 class DepthwiseConv2dGroupedConvBackpropFilterOp
1281     : public DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T> {
1282  public:
1283   DepthwiseConv2dGroupedConvBackpropFilterOp(OpKernelConstruction* context)
1284       : DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>(context) {
1285     this->use_cudnn_grouped_conv_ = true;
1286   }
1287 };
1288 
1289 #define REGISTER_GROUPED_CONV_KERNEL(T)                               \
1290   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \
1291                               .Device(DEVICE_GPU)                     \
1292                               .TypeConstraint<T>("T")                 \
1293                               .HostMemory("filter_sizes")             \
1294                               .Label("cudnn_grouped_convolution"),    \
1295                           DepthwiseConv2dGroupedConvBackpropFilterOp<T>)
1296 
1297 TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
1298 TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
1299 TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
1300 #undef REGISTER_GROUPED_CONV_KERNEL
1301 #endif  // CUDNN_VERSION
1302 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
1303 
1304 }  // namespace tensorflow
1305