/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
#define TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include <tuple>
#include <unordered_map>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

// Get the Dnn workspace limit from the environment variable, which is in MB.
// Return the workspace memory limit in bytes. If no value is set, return the
// default value.
int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
                           int64_t default_value_in_bytes);
// Get the Dnn workspace limit from TF_CUDNN_WORKSPACE_LIMIT_IN_MB, or the
// default if the environment variable is not set.
int64 GetDnnWorkspaceLimitOrDefault();
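
// Illustrative sketch only (not part of this header's API): a convolution
// kernel typically resolves the limit once and hands it to a scratch
// allocator. The default value below is an assumed example:
//
//   int64_t workspace_limit =
//       GetDnnWorkspaceLimit("TF_CUDNN_WORKSPACE_LIMIT_IN_MB",
//                            1LL << 32);  // assumed 4GB default
//   DnnScratchAllocator scratch_allocator(workspace_limit, ctx);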

// A class that provides a scratch-space allocator for StreamExecutor cuDNN
// callbacks. TensorFlow is responsible for releasing the temporary buffers
// after the kernel finishes.
class DnnScratchAllocator : public se::ScratchAllocator {
 public:
  virtual ~DnnScratchAllocator() {}
  DnnScratchAllocator(int64_t memory_limit, OpKernelContext* context)
      : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
  int64 GetMemoryLimitInBytes() override { return memory_limit_; }
  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
      int64_t byte_size) override {
    Tensor temporary_memory;
    if (byte_size < 0) {
      return se::port::Status{se::port::error::INVALID_ARGUMENT,
                              "Requested negative byte size!"};
    }
    if (byte_size > memory_limit_) {
      return se::port::Status{se::port::error::UNAVAILABLE,
                              absl::StrCat("Requested memory size (", byte_size,
                                           ") exceeds the max memory limit (",
                                           memory_limit_, ").")};
    }
    AllocationAttributes allocation_attr;
    allocation_attr.retry_on_failure = false;
    Status allocation_status(context_->allocate_temp(
        DT_UINT8, TensorShape({byte_size}), &temporary_memory,
        AllocatorAttributes(), allocation_attr));
    if (!allocation_status.ok()) {
      return se::port::Status{
          se::port::error::UNAVAILABLE,
          absl::StrCat("Failed to allocate the requested memory size (",
                       byte_size, ").")};
    }
    // Hold references to the allocated tensors until the allocator is
    // destroyed.
    allocated_tensors_.push_back(temporary_memory);
    total_byte_size_ += byte_size;
    return se::port::StatusOr<se::DeviceMemory<uint8>>(
        AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                       temporary_memory.flat<uint8>().size()));
  }
  int64 TotalByteSize() { return total_byte_size_; }

 private:
  int64 memory_limit_;
  int64 total_byte_size_;
  OpKernelContext* context_;
  std::vector<Tensor> allocated_tensors_;
};
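
// Illustrative sketch of how a kernel might use DnnScratchAllocator; the
// variable names ('ctx', 'workspace_size') are assumptions for the example:
//
//   DnnScratchAllocator scratch_allocator(GetDnnWorkspaceLimitOrDefault(),
//                                         ctx);
//   auto scratch_or = scratch_allocator.AllocateBytes(workspace_size);
//   if (scratch_or.ok()) {
//     se::DeviceMemoryBase scratch = scratch_or.ValueOrDie();
//     // Pass 'scratch' to the cuDNN call; the backing tensors are released
//     // when 'scratch_allocator' goes out of scope.
//   }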

typedef Eigen::GpuDevice GPUDevice;

// Select an algorithm for the given convolution, either by running actual
// autotuning with a cache, or by falling back to a default if
// 'cudnn_use_autotune' is false.
template <typename T>
StatusOr<AutotuneEntry<se::dnn::FusedConvOp>> AutotuneFusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
        autotune_map,
    const ConvParameters& params, OpKernelContext* ctx,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& bias_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::ActivationMode activation_mode, double conv_input_scale,
    double side_input_scale, double leakyrelu_alpha,
    se::DeviceMemory<T> input_ptr, se::DeviceMemory<T> filter_ptr,
    se::DeviceMemory<T> output_ptr, se::DeviceMemory<T> bias_ptr,
    se::DeviceMemory<T> side_input_ptr, int64_t scratch_size);

template <typename T>
StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv(
    bool cudnn_use_autotune,
    AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
    const ConvParameters& conv_parameters, OpKernelContext* ctx,
    se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
    se::DeviceMemory<T> input_ptr, const se::dnn::FilterDescriptor& filter_desc,
    se::DeviceMemory<T> filter_ptr,
    const se::dnn::ConvolutionDescriptor& conv_desc,
    const se::dnn::BatchDescriptor& output_desc, se::DeviceMemory<T> output_ptr,
    int64_t scratch_size_limit);

// Returns a pointer to the 'primary' OpRunner and the allocated scratch memory
// if the scratch allocation succeeds; otherwise, a pointer to the
// 'no_scratch_fallback' runner and a null 'DeviceMemoryBase'.
template <typename Sig>
StatusOr<std::tuple<const se::dnn::OpRunner<Sig>*, se::DeviceMemoryBase>>
AllocateScratchOrFallback(se::ScratchAllocator* scratch_allocator,
                          const se::dnn::OpRunner<Sig>* primary,
                          const se::dnn::OpRunner<Sig>* no_scratch_fallback) {
  const se::dnn::OpRunner<Sig>* selected_runner = primary;

  auto workspace_size = selected_runner->GetWorkspaceSize();

  se::DeviceMemoryBase scratch_memory;
  if (workspace_size > 0) {
    auto scratch_or = scratch_allocator->AllocateBytes(workspace_size);
    if (scratch_or.ok()) {
      scratch_memory = scratch_or.ValueOrDie();
    } else if ((selected_runner = no_scratch_fallback)) {
      if (selected_runner->GetWorkspaceSize() > 0) {
        return errors::Internal(
            "No-scratch fallback runner requires nonzero scratch space");
      }
    } else {
      return errors::Unknown(
          "CUDNN failed to allocate the scratch space for the runner or to "
          "find a working no-scratch runner.");
    }
  }

  return std::make_tuple(selected_runner, scratch_memory);
}

template <typename T>
Status LaunchAutotunedConv(const AutotuneEntry<se::dnn::ConvOp>& autotune_entry,
                           DnnScratchAllocator* scratch_allocator,
                           se::dnn::ConvolutionKind kind, se::Stream* stream,
                           const se::dnn::BatchDescriptor& input_desc,
                           se::DeviceMemory<T> in_ptr,
                           const se::dnn::FilterDescriptor& filter_desc,
                           se::DeviceMemory<T> filter_ptr,
                           const se::dnn::ConvolutionDescriptor& conv_desc,
                           const se::dnn::BatchDescriptor& output_desc,
                           se::DeviceMemory<T> out_ptr) {
  if (!autotune_entry.is_algorithm_config()) {
    const auto& runners = autotune_entry.GetOpRunners();
    se::dnn::DataType element_type = se::dnn::ToDataType<T>::value;
    se::dnn::ConvOp::Config config{kind,       element_type, element_type,
                                   input_desc, filter_desc,  output_desc,
                                   conv_desc};
    TF_ASSIGN_OR_RETURN(auto* primary,
                        runners.primary->GetOrCreateRunner(config, stream));

    const se::dnn::ConvRunner* no_scratch_fallback = nullptr;
    if (runners.no_scratch_fallback) {
      TF_ASSIGN_OR_RETURN(
          no_scratch_fallback,
          runners.no_scratch_fallback->GetOrCreateRunner(config, stream));
    }

    TF_ASSIGN_OR_RETURN(auto runner_and_scratch,
                        AllocateScratchOrFallback<se::dnn::ConvOp::Signature>(
                            scratch_allocator, primary, no_scratch_fallback));
    auto& runner = *std::get<const se::dnn::ConvRunner*>(runner_and_scratch);
    return runner(stream, nullptr,
                  std::get<se::DeviceMemoryBase>(runner_and_scratch), in_ptr,
                  filter_ptr, out_ptr);
  } else {
    return stream->ConvolveWithAlgorithm(
        kind, input_desc, in_ptr, filter_desc, filter_ptr, output_desc, out_ptr,
        conv_desc, scratch_allocator, autotune_entry.GetAlgorithmConfig(),
        nullptr);
  }
}
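
// Illustrative end-to-end sketch of the intended call sequence in an unfused
// Conv2D-style kernel. Names such as 'ctx', 'stream', 'cudnn_use_autotune',
// the descriptors, and the device pointers are assumed to be set up by the
// caller; this is a usage sketch, not a prescribed implementation:
//
//   DnnScratchAllocator scratch_allocator(GetDnnWorkspaceLimitOrDefault(),
//                                         ctx);
//   TF_ASSIGN_OR_RETURN(
//       auto autotune_entry,
//       AutotuneUnfusedConv(cudnn_use_autotune, autotune_map, conv_parameters,
//                           ctx, se::dnn::ConvolutionKind::FORWARD, input_desc,
//                           input_ptr, filter_desc, filter_ptr, conv_desc,
//                           output_desc, output_ptr,
//                           scratch_allocator.GetMemoryLimitInBytes()));
//   TF_RETURN_IF_ERROR(LaunchAutotunedConv(
//       autotune_entry, &scratch_allocator, se::dnn::ConvolutionKind::FORWARD,
//       stream, input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
//       output_desc, output_ptr));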

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_