/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/kernels/gpu_utils.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include <cstdlib>
#include <cstring>
#include <iterator>
#include <sstream>
#include <tuple>

#include "google/protobuf/any.pb.h"
#include "absl/algorithm/container.h"
#include "absl/base/call_once.h"
#include "tensorflow/core/platform/logger.h"
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/protobuf/conv_autotuning.pb.h"
#include "tensorflow/core/util/determinism.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/proto/proto_utils.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#include "tensorflow/stream_executor/gpu/redzone_allocator.h"

namespace tensorflow {

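// Returns true if the user has disabled redzone checking by setting the
// environment variable TF_DISABLE_RZ_CHECK=1.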
bool RedzoneCheckDisabled() {
  const char* disable_rz_str = std::getenv("TF_DISABLE_RZ_CHECK");
  return disable_rz_str != nullptr && std::strcmp(disable_rz_str, "1") == 0;
}

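// Best-effort redzone wrapping: allocates a redzone-guarded buffer of the
// same size from `rz_allocator` and returns it. If the allocation fails
// (e.g. under memory pressure), logs a one-time warning and falls back to
// the original unguarded buffer instead of failing the op.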
se::DeviceMemoryBase WrapRedzoneBestEffort(se::RedzoneAllocator* rz_allocator,
                                           se::DeviceMemoryBase buffer) {
  if (RedzoneCheckDisabled()) {
    return buffer;
  }
  auto output_rz_or = rz_allocator->AllocateBytes(buffer.size());
  if (!output_rz_or.ok()) {
    static absl::once_flag rz_allocation_failure_logged;
    absl::call_once(rz_allocation_failure_logged, []() {
      LOG(WARNING) << "Failed to allocate memory for convolution redzone "
                   << "checking; skipping this check. This is benign and only "
                   << "means that we won't check cudnn for out-of-bounds reads "
                   << "and writes. This message will only be printed once.";
    });
    return buffer;
  }
  return se::DeviceMemoryBase(output_rz_or.ValueOrDie());
}

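// Verifies the redzones guarding `rz_allocator`'s allocations. On a detected
// out-of-bounds write, records a REDZONE_MODIFIED failure in
// `autotune_result` (so the offending algorithm is excluded from selection)
// and logs an error. If the check itself cannot run, logs a one-time warning
// and returns without modifying `autotune_result`.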
void CheckRedzones(const se::RedzoneAllocator& rz_allocator,
                   tensorflow::AutotuneResult* autotune_result) {
  if (RedzoneCheckDisabled()) {
    return;
  }
  se::port::StatusOr<se::RedzoneAllocator::RedzoneCheckStatus> rz_status =
      rz_allocator.CheckRedzones();
  if (!rz_status.ok()) {
    static absl::once_flag failure_logged;
    absl::call_once(failure_logged, [&]() {
      LOG(WARNING) << "Failed to check cudnn convolutions for out-of-bounds "
                   << "reads and writes with an error message: '"
                   << rz_status.status().error_message()
                   << "'; skipping this check. This only means that we won't "
                   << "check cudnn for out-of-bounds reads and writes. This "
                   << "message will only be printed once.";
    });
    return;
  }
  auto rz_check_status = rz_status.ValueOrDie();
  if (!rz_check_status.ok()) {
    auto* fail = autotune_result->mutable_failure();
    fail->set_msg(rz_check_status.RedzoneFailureMsg());
    fail->set_kind(AutotuneResult::REDZONE_MODIFIED);
    fail->set_buffer_address(
        reinterpret_cast<uint64>(rz_check_status.user_buffer_address));
    LOG(ERROR)
        << "Detected cudnn out-of-bounds write in convolution buffer! This is "
           "likely a cudnn bug. We will skip this algorithm in the future, but "
           "your GPU state may already be corrupted, leading to incorrect "
           "results. Within Google, no action is needed on your part. Outside "
           "of Google, please ensure you're running the latest version of "
           "cudnn. If that doesn't fix the problem, please file a bug with "
           "this full error message and we'll contact nvidia.";
    LOG(ERROR) << rz_check_status.RedzoneFailureMsg();
  }
}

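// Returns true if cuBLASLt-based GEMM has been requested via the environment
// variable TF_USE_CUBLASLT. The value is read once and cached for the
// lifetime of the process.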
bool EnableCublasLtGemm() {
  static const bool enable_cublaslt_gemm = [] {
    bool cublaslt_gemm = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_USE_CUBLASLT",
                                               /*default_val=*/false,
                                               &cublaslt_gemm));
    return cublaslt_gemm;
  }();
  return enable_cublaslt_gemm;
}

namespace {

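// Queries the loaded cuDNN version through the StreamExecutor's DNN support.
// Returns a zero-valued proto if no DNN backend or version is available.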
tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
  tensorflow::CudnnVersion cudnn_version;
  if (auto* dnn = stream_executor->AsDnn()) {
    se::port::StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
    if (version_or.ok()) {
      const auto& version = version_or.ValueOrDie();
      cudnn_version.set_major(version.major_version());
      cudnn_version.set_minor(version.minor_version());
      cudnn_version.set_patch(version.patch());
    }
  }
  return cudnn_version;
}

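// Reads the CUDA compute capability of the device backing `stream_executor`
// into a ComputeCapability proto.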
tensorflow::ComputeCapability GetComputeCapability(
    se::StreamExecutor* stream_executor) {
  tensorflow::ComputeCapability cc_proto;
  se::CudaComputeCapability cc =
      stream_executor->GetDeviceDescription().cuda_compute_capability();
  cc_proto.set_major(cc.major);
  cc_proto.set_minor(cc.minor);
  return cc_proto;
}

}  // namespace

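// Serializes the convolution parameters, buffer addresses, GPU and library
// versions, and per-algorithm autotune results into an AutotuningLog proto,
// dumps it at VLOG level 2, and submits it to the Logger singleton.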
void LogConvAutotuneResults(se::dnn::ConvolutionKind kind,
                            se::dnn::DataType element_type,
                            se::DeviceMemoryBase input_buffer,
                            se::DeviceMemoryBase filter_buffer,
                            se::DeviceMemoryBase output_buffer,
                            const se::dnn::BatchDescriptor& input_desc,
                            const se::dnn::FilterDescriptor& filter_desc,
                            const se::dnn::BatchDescriptor& output_desc,
                            const se::dnn::ConvolutionDescriptor& conv_desc,
                            se::StreamExecutor* stream_exec,
                            absl::Span<const AutotuneResult> results) {
  AutotuningLog log;
  {
    ConvolutionProto instr;
    instr.set_kind(kind);
    *instr.mutable_input() = input_desc.ToProto(element_type);
    *instr.mutable_filter() = filter_desc.ToProto(element_type);
    *instr.mutable_output() = output_desc.ToProto(element_type);
    *instr.mutable_conv_desc() = conv_desc.ToProto();
    instr.set_conv_scale(1);
    instr.set_side_value_scale(0);
    instr.set_input_address(reinterpret_cast<uint64>(input_buffer.opaque()));
    instr.set_filter_address(reinterpret_cast<uint64>(filter_buffer.opaque()));
    instr.set_output_address(reinterpret_cast<uint64>(output_buffer.opaque()));
    log.mutable_instr()->PackFrom(std::move(instr));
  }
  *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec);
  *log.mutable_compute_capability() = GetComputeCapability(stream_exec);
  log.set_device_pci_bus_id(stream_exec->GetDeviceDescription().pci_bus_id());
  {
    string blas_version;
    if (auto* blas = stream_exec->AsBlas()) {
      if (blas->GetVersion(&blas_version).ok()) {
        log.set_blas_version(blas_version);
      }
    }
  }
  for (const auto& result : results) {
    *log.add_results() = result;
  }
  VLOG(2) << log.DebugString();
  Logger::GetSingleton()->LogProto(log);
}

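// Same as LogConvAutotuneResults, but for fused convolution+bias+activation:
// additionally records the bias and side-input buffer addresses, the conv
// and side-input scales, and the activation mode.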
void LogFusedConvForwardAutotuneResults(
    se::dnn::DataType element_type, se::DeviceMemoryBase input_buffer,
    se::DeviceMemoryBase filter_buffer, se::DeviceMemoryBase output_buffer,
    se::DeviceMemoryBase bias_buffer, se::DeviceMemoryBase side_input_buffer,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale,
    double side_value_scale, se::dnn::ActivationMode activation_mode,
    se::StreamExecutor* stream_exec, absl::Span<const AutotuneResult> results) {
  AutotuningLog log;
  {
    ConvolutionProto instr;
    instr.set_kind(se::dnn::ConvolutionKind::FORWARD_BIAS_ACTIVATION);
    *instr.mutable_input() = input_desc.ToProto(element_type);
    *instr.mutable_filter() = filter_desc.ToProto(element_type);
    *instr.mutable_output() = output_desc.ToProto(element_type);
    *instr.mutable_conv_desc() = conv_desc.ToProto();
    instr.set_conv_scale(conv_scale);
    instr.set_side_value_scale(side_value_scale);
    instr.set_activation(activation_mode);
    instr.set_input_address(reinterpret_cast<uint64>(input_buffer.opaque()));
    instr.set_filter_address(reinterpret_cast<uint64>(filter_buffer.opaque()));
    instr.set_output_address(reinterpret_cast<uint64>(output_buffer.opaque()));
    instr.set_bias_address(reinterpret_cast<uint64>(bias_buffer.opaque()));
    instr.set_side_input_address(
        reinterpret_cast<uint64>(side_input_buffer.opaque()));
    log.mutable_instr()->PackFrom(std::move(instr));
  }
  *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec);
  *log.mutable_compute_capability() = GetComputeCapability(stream_exec);
  log.set_device_pci_bus_id(stream_exec->GetDeviceDescription().pci_bus_id());
  {
    string blas_version;
    if (auto* blas = stream_exec->AsBlas()) {
      if (blas->GetVersion(&blas_version).ok()) {
        log.set_blas_version(blas_version);
      }
    }
  }
  for (const auto& result : results) {
    *log.add_results() = result;
  }
  VLOG(2) << log.DebugString();
  Logger::GetSingleton()->LogProto(log);
}

namespace {
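// Returns the indices of (fastest successful algorithm, fastest successful
// zero-scratch algorithm) in `results`; the second index is -1 if no
// zero-scratch algorithm succeeded. When op determinism is required, the
// first successful algorithm is chosen instead of the fastest, with no
// zero-scratch fallback. Fails with NotFound if every algorithm reported a
// failure.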
StatusOr<std::tuple<int, int>> BestCudnnConvAlgorithmIndices(
    absl::Span<const AutotuneResult> results) {
  auto compare_run_times = [](const AutotuneResult& lhs,
                              const AutotuneResult& rhs) {
    return proto_utils::FromDurationProto(lhs.run_time()) <
           proto_utils::FromDurationProto(rhs.run_time());
  };
  int idx = -1;
  int idx_no_scratch = -1;
  for (int i = 0; i < results.size(); i++) {
    if (!results[i].has_failure()) {
      if (OpDeterminismRequired()) {
        // When determinism is enabled, choose the first working algorithm,
        // and don't choose a no_scratch algorithm.
        idx = i;
        break;
      }
      if (idx == -1 || compare_run_times(results[i], results[idx])) {
        idx = i;
      }
      if (results[i].scratch_bytes() == 0 &&
          (idx_no_scratch == -1 ||
           compare_run_times(results[i], results[idx_no_scratch]))) {
        idx_no_scratch = i;
      }
    }
  }

  if (idx == -1) {
    std::ostringstream msg;
    msg << "No algorithm worked!  Error messages:";
    // TODO(awpr): identify the algorithm as part of this error message, too.
    for (const auto& result : results) {
      msg << "\n  " << result.failure().msg();
    }
    return errors::NotFound(msg.str());
  }

  return std::make_tuple(idx, idx_no_scratch);
}
}  // namespace

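// Builds an AlgorithmConfig holding the fastest successful algorithm from
// `results` (with its scratch requirement) and, if one succeeded, the
// fastest zero-scratch algorithm as the no-scratch fallback.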
StatusOr<se::dnn::AlgorithmConfig> BestCudnnConvAlgorithm(
    absl::Span<const AutotuneResult> results) {
  int idx;
  int idx_no_scratch;
  TF_ASSIGN_OR_RETURN(std::tie(idx, idx_no_scratch),
                      BestCudnnConvAlgorithmIndices(results));
  VLOG(2) << "fastest algorithm: "
          << proto_utils::FromDurationProto(results[idx].run_time())
          << " with algo " << results[idx].algorithm().algo_id()
          << ", workspace bytes " << results[idx].scratch_bytes();

  se::dnn::AlgorithmConfig result(
      se::dnn::AlgorithmDesc(results[idx].algorithm()),
      results[idx].scratch_bytes());

  if (idx_no_scratch != -1) {
    result.set_algorithm_no_scratch(
        se::dnn::AlgorithmDesc(results[idx_no_scratch].algorithm()));
  }
  return result;
}

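// Like the AlgorithmConfig overload above, but selects among op runners:
// `runners` must parallel `results` one-to-one. Returns an AutotuneEntry
// holding the fastest runner and, when a distinct zero-scratch runner
// exists, that runner as the no-scratch fallback.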
template <typename Op>
StatusOr<AutotuneEntry<Op>> BestCudnnConvAlgorithm(
    absl::Span<const AutotuneResult> results,
    std::vector<
        std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>>>
        runners) {
  if (runners.size() != results.size()) {
    return errors::Internal(
        "Mismatched size of autotune results and runners vectors.");
  }
  int idx;
  int idx_no_scratch;
  TF_ASSIGN_OR_RETURN(std::tie(idx, idx_no_scratch),
                      BestCudnnConvAlgorithmIndices(results));
  VLOG(2) << "fastest algorithm: "
          << proto_utils::FromDurationProto(results[idx].run_time())
          << " with algo " << runners[idx]->ToString() << ", workspace bytes "
          << results[idx].scratch_bytes();
  return AutotuneEntry<Op>::FromOpRunners(
      std::move(runners[idx]), idx_no_scratch == -1 || idx_no_scratch == idx
                                   ? nullptr
                                   : std::move(runners[idx_no_scratch]));
}

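// Explicit instantiations for the unfused and fused convolution signatures
// used by the GPU convolution kernels.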
template StatusOr<AutotuneEntry<se::dnn::ConvOp>>
BestCudnnConvAlgorithm<se::dnn::ConvOp>(
    absl::Span<const AutotuneResult> results,
    std::vector<
        std::unique_ptr<const se::dnn::OpRunner<se::dnn::ConvSignature>>>
        runners);

template StatusOr<AutotuneEntry<se::dnn::FusedConvOp>>
BestCudnnConvAlgorithm<se::dnn::FusedConvOp>(
    absl::Span<const AutotuneResult> results,
    std::vector<
        std::unique_ptr<const se::dnn::OpRunner<se::dnn::FusedConvSignature>>>
        runners);

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM