/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_
#define TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_

#include <vector>

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/kernels/avgpooling_op.h"
#include "tensorflow/core/kernels/maxpooling_op.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

// A helper class to manage sizes and shapes for pooling operations.
struct PoolParameters {
  // Updates context->status if there is an invalid input.
  // explicit_paddings has eight elements if padding==EXPLICIT, and zero
  // elements otherwise.
  PoolParameters(OpKernelContext* context, const std::vector<int32>& ksize,
                 const std::vector<int32>& stride, Padding padding,
                 std::vector<int64_t> explicit_paddings,
                 TensorFormat data_format, const TensorShape& tensor_in_shape);

  // Returns the shape of the output for "forward" pooling operations.
  TensorShape forward_output_shape();

  int depth;

  int tensor_in_cols;
  int tensor_in_rows;
  int tensor_in_batch;

  int window_rows;
  int window_cols;
  int depth_window;

  int row_stride;
  int col_stride;
  int depth_stride;

  int64_t out_height;
  int64_t out_width;
  int out_depth;

  int64_t pad_top;
  int64_t pad_bottom;
  int64_t pad_left;
  int64_t pad_right;

  int pad_depth;

  TensorFormat data_format;
};
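
// Illustrative usage (a sketch that mirrors the kernels below, not an API
// contract): construct PoolParameters from the attrs parsed in an OpKernel
// constructor, check that construction did not flag an error, then allocate
// the output from forward_output_shape():
//
//   PoolParameters params{context, ksize_, stride_, padding_,
//                         /*explicit_paddings=*/{}, FORMAT_NHWC,
//                         tensor_in.shape()};
//   if (!context->status().ok()) return;
//   Tensor* output = nullptr;
//   OP_REQUIRES_OK(context, context->allocate_output(
//                               0, params.forward_output_shape(), &output));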

// An implementation of MaxPooling (forward).
// TODO(yongtang): Remove MaxPoolingOp and use MaxPoolingV2Op;
//     QuantizedMaxPoolingOp depends on MaxPoolingOp, so keep it intact for now.
template <typename Device, typename T>
class MaxPoolingOp : public OpKernel {
 public:
  explicit MaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context, data_format_ == FORMAT_NHWC,
          errors::InvalidArgument("Default MaxPoolingOp only supports NHWC ",
                                  "on device type ",
                                  DeviceTypeString(context->device_type())));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    for (int i = 0; i < ksize_.size(); ++i) {
      OP_REQUIRES(context, ksize_[i] > 0,
                  errors::InvalidArgument("Sliding window ksize for dimension ",
                                          i, " was zero."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    if (padding_ == Padding::EXPLICIT) {
      OP_REQUIRES_OK(
          context, context->GetAttr("explicit_paddings", &explicit_paddings_));
    }
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    PoolParameters params{
        context,     ksize_,           stride_, padding_, explicit_paddings_,
        FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation.  A
      // relaxation of these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));
      OP_REQUIRES(
          context, padding_ != EXPLICIT,
          errors::Unimplemented("Depthwise max pooling does not support "
                                "explicit padding."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      // MaxPoolingOp is only called on the GPU when the eigen_tensor label
      // is used. In this case, explicit padding is not supported.
      if (std::is_same<Device, GPUDevice>::value &&
          padding_ == Padding::EXPLICIT) {
        context->SetStatus(errors::Unimplemented(
            "MaxPoolingOp does not support explicit padding."));
        return;
      }
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool which
  // does not handle all of the same options as SpatialMaxPool
  // (strict assumptions on no padding, stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in, const PoolParameters& params) {
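    // Views the flat input as a (depth_window x N) matrix so that each column
    // holds one depthwise pooling window, then reduces every column to its
    // maximum to fill the flattened output.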
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    if (output->NumElements() == 0) {
      return;
    }
    // On GPU, use Eigen's Spatial Max Pooling.  On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      functor::SpatialMaxPooling<Device, T>()(
          context->eigen_device<Device>(), output->tensor<T, 4>(),
          tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
          params.row_stride, params.col_stride, pt);
    } else {
      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols * params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The code below does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix and updates the corresponding column(s) in
      //    output_as_matrix with the max value.
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the input
              // vector projects to.
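              // Worked example (illustrative values): with window_rows = 3,
              // row_stride = 2 and pad_top = 1, input row h = 4 gives
              // hpad = 5, h_start = (5 - 3) / 2 + 1 = 2 and
              // h_end = min(5 / 2 + 1, out_height) = 3, so this row only
              // contributes to output row 2 (assuming out_height > 2).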
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows) ? 0
                                       : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end = std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols) ? 0
                                       : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end = std::min(wpad / col_stride + 1, out_width);
              // compute elementwise max
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
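      // Shard() parallelizes over the batch dimension; shard_cost is the
      // estimated per-image cost the work sharder uses to size its work units.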
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  std::vector<int64_t> explicit_paddings_;
  TensorFormat data_format_;
};

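// Launches the GPU max-pooling kernel (the NoMask variant) for qint8 inputs
// stored in NCHW_VECT_C layout; only the Eigen::GpuDevice specialization below
// is defined.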
template <typename Device>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <>
struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output) {
#if GOOGLE_CUDA
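    // NCHW_VECT_C stores the channel dimension in blocks of four qint8 values,
    // which is why the qint8 buffers below are reinterpreted as int32: the
    // CUDA kernel can then operate on four channels per 32-bit element.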
    bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()(
        reinterpret_cast<const int32*>(input.flat<qint8>().data()),
        params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols,
        params.depth, params.out_height, params.out_width, params.window_rows,
        params.window_cols, params.row_stride, params.col_stride,
        params.pad_top, params.pad_left,
        reinterpret_cast<int32*>(output->flat<qint8>().data()),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(errors::Internal(
          "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
    }
#else
    // ROCm TODO: add support for __vmaxs4 on ROCm.
    context->SetStatus(errors::Internal(
        "Failed launching LaunchMaxPoolingNoMask_NCHW_VECT_C"));
#endif  // GOOGLE_CUDA
  }
};
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Device, typename T>
class MaxPoolingV2Op : public OpKernel {
 public:
  explicit MaxPoolingV2Op(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    auto status = context->GetAttr("data_format", &data_format);
    if (status.ok()) {
      OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                  errors::InvalidArgument("Invalid data format"));
      OP_REQUIRES(
          context,
          data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW_VECT_C,
          errors::InvalidArgument(
              "MaxPoolingV2Op only supports NHWC or NCHW_VECT_C. Got: ",
              data_format));
    } else {
      data_format_ = FORMAT_NHWC;
    }
    if (context->num_inputs() == 1) {
      OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
      OP_REQUIRES(context, ksize_.size() == 4,
                  errors::InvalidArgument("Sliding window ksize field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
      OP_REQUIRES(context, stride_.size() == 4,
                  errors::InvalidArgument("Sliding window stride field must "
                                          "specify 4 dimensions"));
      OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                  errors::Unimplemented(
                      "Pooling is not yet supported on the batch dimension."));
    }
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    std::vector<int32> ksize = ksize_;
    std::vector<int32> stride = stride_;

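    // If ksize and strides were passed as inputs (rather than attrs), read
    // them from inputs 1 and 2 and use them in place of the attr values.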
    if (context->num_inputs() != 1) {
      const Tensor& tensor_ksize = context->input(1);
      auto value_ksize = tensor_ksize.flat<int32>();
      ksize.resize(tensor_ksize.shape().num_elements());
      std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());

      const Tensor& tensor_stride = context->input(2);
      auto value_stride = tensor_stride.flat<int32>();
      stride.resize(tensor_stride.shape().num_elements());
      std::copy_n(&value_stride(0), stride.size(), stride.begin());
    }

    OP_REQUIRES(context, ksize.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, stride.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, ksize[0] == 1 && stride[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    PoolParameters params{
        context,
        ksize,
        stride,
        padding_,
        /*explicit_paddings=*/{},
        data_format_,
        tensor_in.shape(),
    };
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(
                                0, params.forward_output_shape(), &output));

    if (params.depth_window > 1) {
      // Validate spec against the current implementation.  A
      // relaxation of these requirements would be ideal.
      OP_REQUIRES(context, params.depth % params.depth_window == 0,
                  errors::Unimplemented(
                      "Depthwise max pooling requires "
                      "the depth window to evenly divide the input depth."));
      OP_REQUIRES(
          context, params.depth_window == params.depth_stride,
          errors::Unimplemented("Depthwise max pooling requires "
                                "the depth window to equal the depth stride."));

      DepthwiseMaxPool(context, output, tensor_in, params);
    } else {
      SpatialMaxPool(context, output, tensor_in, params, padding_);
    }
  }

 private:
  // Single-threaded implementation of DepthwiseMaxPool which
  // does not handle all of the same options as SpatialMaxPool
  // (strict assumptions on no padding, stride).
  //
  // TODO(vrv): implement a more general depthwise-max pool that works
  // on GPU as well.
  void DepthwiseMaxPool(OpKernelContext* context, Tensor* output,
                        const Tensor& tensor_in, const PoolParameters& params) {
    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        in_by_pool(tensor_in.flat<T>().data(), params.depth_window,
                   tensor_in.NumElements() / params.depth_window);
    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> out_by_pool(
        output->flat<T>().data(), 1, output->NumElements());
    out_by_pool = in_by_pool.colwise().maxCoeff();
  }

  void SpatialMaxPool(OpKernelContext* context, Tensor* output,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    if (output->NumElements() == 0) {
      return;
    }
    // On GPU, use Eigen's Spatial Max Pooling.  On CPU, use an
    // EigenMatrix version that is currently faster than Eigen's
    // Spatial MaxPooling implementation.
    //
    // TODO(vrv): Remove this once we no longer need it.
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
    if (std::is_same<Device, GPUDevice>::value) {
      Eigen::PaddingType pt = BrainPadding2EigenPadding(padding);
      if (std::is_same<T, qint8>::value) {
        LaunchMaxPoolingNoMask_NCHW_VECT_C<GPUDevice>::launch(
            context, params, tensor_in, output);
      } else {
        functor::SpatialMaxPooling<Device, T>()(
            context->eigen_device<Device>(), output->tensor<T, 4>(),
            tensor_in.tensor<T, 4>(), params.window_rows, params.window_cols,
            params.row_stride, params.col_stride, pt);
      }
    } else
#endif
    {
      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          ConstEigenMatrixMap;
      typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
          EigenMatrixMap;

      ConstEigenMatrixMap in_mat(tensor_in.flat<T>().data(), params.depth,
                                 params.tensor_in_cols * params.tensor_in_rows *
                                     params.tensor_in_batch);
      EigenMatrixMap out_mat(
          output->flat<T>().data(), params.depth,
          params.out_width * params.out_height * params.tensor_in_batch);

      const DeviceBase::CpuWorkerThreads& worker_threads =
          *(context->device()->tensorflow_cpu_worker_threads());

      // The code below does the following:
      // 1. Flattens the input and output tensors into two dimensional arrays.
      //    tensor_in_as_matrix:
      //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
      //    output_as_matrix:
      //      depth by (out_width * out_height * tensor_in_batch)
      //
      // 2. Walks through the set of columns in the flattened
      //    tensor_in_as_matrix and updates the corresponding column(s) in
      //    output_as_matrix with the max value.
      auto shard = [&params, &in_mat, &out_mat](int64_t start, int64_t limit) {
        const int32_t in_rows = params.tensor_in_rows;
        const int32_t in_cols = params.tensor_in_cols;
        const int32_t pad_top = params.pad_top;
        const int32_t pad_left = params.pad_left;
        const int32_t window_rows = params.window_rows;
        const int32_t window_cols = params.window_cols;
        const int32_t row_stride = params.row_stride;
        const int32_t col_stride = params.col_stride;
        const int32_t out_height = params.out_height;
        const int32_t out_width = params.out_width;

        {
          // Initializes the output tensor with MIN<T>.
          const int32_t output_image_size =
              out_height * out_width * params.depth;
          EigenMatrixMap out_shard(out_mat.data() + start * output_image_size,
                                   1, (limit - start) * output_image_size);
          out_shard.setConstant(Eigen::NumTraits<T>::lowest());
        }

        for (int32_t b = start; b < limit; ++b) {
          const int32_t out_offset_batch = b * out_height;
          for (int32_t h = 0; h < in_rows; ++h) {
            for (int32_t w = 0; w < in_cols; ++w) {
              // (h_start, h_end) * (w_start, w_end) is the range that the input
              // vector projects to.
              const int32_t hpad = h + pad_top;
              const int32_t wpad = w + pad_left;
              const int32_t h_start =
                  (hpad < window_rows) ? 0
                                       : (hpad - window_rows) / row_stride + 1;
              const int32_t h_end = std::min(hpad / row_stride + 1, out_height);
              const int32_t w_start =
                  (wpad < window_cols) ? 0
                                       : (wpad - window_cols) / col_stride + 1;
              const int32_t w_end = std::min(wpad / col_stride + 1, out_width);
              // compute elementwise max
              const int32_t in_offset = (b * in_rows + h) * in_cols + w;
              for (int32_t ph = h_start; ph < h_end; ++ph) {
                const int32_t out_offset_base =
                    (out_offset_batch + ph) * out_width;
                for (int32_t pw = w_start; pw < w_end; ++pw) {
                  const int32_t out_offset = out_offset_base + pw;
                  out_mat.col(out_offset) =
                      out_mat.col(out_offset).cwiseMax(in_mat.col(in_offset));
                }
              }
            }
          }
        }
      };

      // TODO(andydavis) Consider sharding across batch x rows x cols.
      // TODO(andydavis) Consider a higher resolution shard cost model.
      const int64_t shard_cost =
          params.tensor_in_rows * params.tensor_in_cols * params.depth;
      Shard(worker_threads.num_threads, worker_threads.workers,
            params.tensor_in_batch, shard_cost, shard);
    }
  }

  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename Device, typename T>
void SpatialAvgPool(OpKernelContext* context, Tensor* output,
                    const Tensor& input, const PoolParameters& params,
                    const Padding& padding) {
  if (output->NumElements() == 0) {
    return;
  }
  typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      ConstEigenMatrixMap;
  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
      EigenMatrixMap;

  auto in_flat = input.flat<T>();
  auto out_flat = output->flat<T>();

  auto shard = [&params, &in_flat, &out_flat](int64_t start, int64_t limit) {
    // Calculates indices for this shard's chunk of work.
    const int64_t input_image_size =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    const int64_t output_image_size =
        params.out_width * params.out_height * params.depth;
    const int64_t shard_batch_size = limit - start;

    ConstEigenMatrixMap in_mat(
        in_flat.data() + start * input_image_size, params.depth,
        params.tensor_in_cols * params.tensor_in_rows * shard_batch_size);
    EigenMatrixMap out_mat(
        out_flat.data() + start * output_image_size, params.depth,
        params.out_width * params.out_height * shard_batch_size);
    Eigen::Matrix<T, Eigen::Dynamic, 1> out_count(out_mat.cols());
    out_count.setZero();

    // Initializes output to zero.
    out_mat.setZero();

    // The code below does the following:
    // 1. Flattens the input and output tensors into two dimensional arrays.
    //    tensor_in_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    output_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //
    // 2. Walks through the set of columns in the flattened
    //    tensor_in_as_matrix and updates the corresponding column(s) in
    //    output_as_matrix with the average value.
    for (int b = 0; b < shard_batch_size; ++b) {
      for (int h = 0; h < params.tensor_in_rows; ++h) {
        for (int w = 0; w < params.tensor_in_cols; ++w) {
          // (h_start, h_end) * (w_start, w_end) is the range that the input
          // vector projects to.
          const int hpad = h + params.pad_top;
          const int wpad = w + params.pad_left;
          const int h_start =
              (hpad < params.window_rows)
                  ? 0
                  : (hpad - params.window_rows) / params.row_stride + 1;
          const int h_end =
              std::min<int>(hpad / params.row_stride + 1, params.out_height);
          const int w_start =
              (wpad < params.window_cols)
                  ? 0
                  : (wpad - params.window_cols) / params.col_stride + 1;
          const int w_end =
              std::min<int>(wpad / params.col_stride + 1, params.out_width);
          const int in_offset =
              (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
          Eigen::DSizes<Eigen::DenseIndex, 2> in_indices(0, in_offset);
          for (int ph = h_start; ph < h_end; ++ph) {
            for (int pw = w_start; pw < w_end; ++pw) {
              const int out_offset =
                  (b * params.out_height + ph) * params.out_width + pw;
              out_mat.col(out_offset) += in_mat.col(in_offset);
              out_count(out_offset) += T(1);
            }
          }
        }
      }
    }

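    // Each output column now holds the sum over its (possibly padding-clipped)
    // window and out_count holds the number of contributing input columns, so
    // the division below yields an average over the valid elements only.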
    DCHECK_GT(out_count.minCoeff(), T(0));
    out_mat.array().rowwise() /= out_count.transpose().array();
  };

  const int64_t work_unit_size =
      params.tensor_in_rows * params.tensor_in_cols * params.depth;
  // NOTE: Constants in the calculation below were estimated based on
  // benchmarking. Nanoseconds/work_unit for the benchmarks ranged from 0.001
  // to 0.01, so the factor 0.01 (i.e. 1/100), with a floor of 10000, was
  // chosen to keep the work unit cost in the operating range in which it
  // empirically performed best.
  const int64_t work_unit_cost = std::max(int64_t{10000}, work_unit_size / 100);
  const DeviceBase::CpuWorkerThreads& worker_threads =
      *(context->device()->tensorflow_cpu_worker_threads());
  Shard(worker_threads.num_threads, worker_threads.workers,
        params.tensor_in_batch, work_unit_cost, shard);
}

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_POOLING_OPS_COMMON_H_