/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/kernels/gpu_utils.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include <cstdlib>
#include <cstring>
#include <iterator>
#include <sstream>
#include <tuple>

#include "google/protobuf/any.pb.h"
#include "absl/algorithm/container.h"
#include "absl/base/call_once.h"
#include "tensorflow/core/platform/logger.h"
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/core/protobuf/conv_autotuning.pb.h"
#include "tensorflow/core/util/determinism.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/proto/proto_utils.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#include "tensorflow/stream_executor/gpu/redzone_allocator.h"

namespace tensorflow {

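// Returns true when the TF_DISABLE_RZ_CHECK environment variable is set to
// "1", in which case redzone (out-of-bounds) checking is skipped.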
bool RedzoneCheckDisabled() {
  const char* disable_rz_str = std::getenv("TF_DISABLE_RZ_CHECK");
  return disable_rz_str != nullptr && std::strcmp(disable_rz_str, "1") == 0;
}

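// Attempts to re-allocate `buffer` through the redzone allocator so that
// cudnn out-of-bounds reads and writes can be detected. If redzone checking
// is disabled or the allocation fails, returns the original buffer unchanged
// and logs a warning once.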
se::DeviceMemoryBase WrapRedzoneBestEffort(se::RedzoneAllocator* rz_allocator,
                                           se::DeviceMemoryBase buffer) {
  if (RedzoneCheckDisabled()) {
    return buffer;
  }
  auto output_rz_or = rz_allocator->AllocateBytes(buffer.size());
  if (!output_rz_or.ok()) {
    static absl::once_flag rz_allocation_failure_logged;
    absl::call_once(rz_allocation_failure_logged, []() {
      LOG(WARNING) << "Failed to allocate memory for convolution redzone "
                   << "checking; skipping this check. This is benign and only "
                   << "means that we won't check cudnn for out-of-bounds reads "
                   << "and writes. This message will only be printed once.";
    });
    return buffer;
  }
  return se::DeviceMemoryBase(output_rz_or.ValueOrDie());
}

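// Checks the redzones guarding the autotuned convolution's buffers. If a
// redzone has been modified, records a REDZONE_MODIFIED failure on
// `autotune_result` and logs an error.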
void CheckRedzones(const se::RedzoneAllocator& rz_allocator,
                   tensorflow::AutotuneResult* autotune_result) {
  if (RedzoneCheckDisabled()) {
    return;
  }
  se::port::StatusOr<se::RedzoneAllocator::RedzoneCheckStatus> rz_status =
      rz_allocator.CheckRedzones();
  if (!rz_status.ok()) {
    static absl::once_flag failure_logged;
    absl::call_once(failure_logged, [&]() {
      LOG(WARNING) << "Failed to check cudnn convolutions for out-of-bounds "
                   << "reads and writes with an error message: '"
                   << rz_status.status().error_message()
                   << "'; skipping this check. This only means that we won't "
                   << "check cudnn for out-of-bounds reads and writes. This "
                   << "message will only be printed once.";
    });
    return;
  }
  auto rz_check_status = rz_status.ValueOrDie();
  if (!rz_check_status.ok()) {
    auto* fail = autotune_result->mutable_failure();
    fail->set_msg(rz_check_status.RedzoneFailureMsg());
    fail->set_kind(AutotuneResult::REDZONE_MODIFIED);
    fail->set_buffer_address(
        reinterpret_cast<uint64>(rz_check_status.user_buffer_address));
    LOG(ERROR)
        << "Detected cudnn out-of-bounds write in convolution buffer! This is "
           "likely a cudnn bug. We will skip this algorithm in the future, but "
           "your GPU state may already be corrupted, leading to incorrect "
           "results. Within Google, no action is needed on your part. Outside "
           "of Google, please ensure you're running the latest version of "
           "cudnn. If that doesn't fix the problem, please file a bug with "
           "this full error message and we'll contact nvidia.";
    LOG(ERROR) << rz_check_status.RedzoneFailureMsg();
  }
}

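// Returns true if the TF_USE_CUBLASLT environment variable enables
// cuBLASLt-based GEMMs; the value is read once and cached.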
bool EnableCublasLtGemm() {
  static const bool enable_cublaslt_gemm = [] {
    bool cublaslt_gemm = false;
    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_USE_CUBLASLT",
                                               /*default_val=*/false,
                                               &cublaslt_gemm));
    return cublaslt_gemm;
  }();
  return enable_cublaslt_gemm;
}

namespace {

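// Returns the loaded cudnn version as a CudnnVersion proto; fields are left
// at their defaults if the version cannot be queried.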
tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
  tensorflow::CudnnVersion cudnn_version;
  if (auto* dnn = stream_executor->AsDnn()) {
    se::port::StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
    if (version_or.ok()) {
      const auto& version = version_or.ValueOrDie();
      cudnn_version.set_major(version.major_version());
      cudnn_version.set_minor(version.minor_version());
      cudnn_version.set_patch(version.patch());
    }
  }
  return cudnn_version;
}

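// Returns the device's CUDA compute capability as a ComputeCapability proto.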
tensorflow::ComputeCapability GetComputeCapability(
    se::StreamExecutor* stream_executor) {
  tensorflow::ComputeCapability cc_proto;
  se::CudaComputeCapability cc =
      stream_executor->GetDeviceDescription().cuda_compute_capability();
  cc_proto.set_major(cc.major);
  cc_proto.set_minor(cc.minor);
  return cc_proto;
}

}  // namespace

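// Packs the convolution parameters, device information, and autotuning
// results into an AutotuningLog proto and sends it to the logger.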
void LogConvAutotuneResults(se::dnn::ConvolutionKind kind,
                            se::dnn::DataType element_type,
                            se::DeviceMemoryBase input_buffer,
                            se::DeviceMemoryBase filter_buffer,
                            se::DeviceMemoryBase output_buffer,
                            const se::dnn::BatchDescriptor& input_desc,
                            const se::dnn::FilterDescriptor& filter_desc,
                            const se::dnn::BatchDescriptor& output_desc,
                            const se::dnn::ConvolutionDescriptor& conv_desc,
                            se::StreamExecutor* stream_exec,
                            absl::Span<const AutotuneResult> results) {
  AutotuningLog log;
  {
    ConvolutionProto instr;
    instr.set_kind(kind);
    *instr.mutable_input() = input_desc.ToProto(element_type);
    *instr.mutable_filter() = filter_desc.ToProto(element_type);
    *instr.mutable_output() = output_desc.ToProto(element_type);
    *instr.mutable_conv_desc() = conv_desc.ToProto();
    instr.set_conv_scale(1);
    instr.set_side_value_scale(0);
    instr.set_input_address(reinterpret_cast<uint64>(input_buffer.opaque()));
    instr.set_filter_address(reinterpret_cast<uint64>(filter_buffer.opaque()));
    instr.set_output_address(reinterpret_cast<uint64>(output_buffer.opaque()));
    log.mutable_instr()->PackFrom(std::move(instr));
  }
  *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec);
  *log.mutable_compute_capability() = GetComputeCapability(stream_exec);
  log.set_device_pci_bus_id(stream_exec->GetDeviceDescription().pci_bus_id());
  {
    string blas_version;
    if (auto* blas = stream_exec->AsBlas()) {
      if (blas->GetVersion(&blas_version).ok()) {
        log.set_blas_version(blas_version);
      }
    }
  }
  for (const auto& result : results) {
    *log.add_results() = result;
  }
  VLOG(2) << log.DebugString();
  Logger::GetSingleton()->LogProto(log);
}

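// Same as LogConvAutotuneResults, but for fused convolution + bias +
// activation: additionally records the bias and side-input addresses, the
// conv/side-input scales, and the activation mode.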
void LogFusedConvForwardAutotuneResults(
    se::dnn::DataType element_type, se::DeviceMemoryBase input_buffer,
    se::DeviceMemoryBase filter_buffer, se::DeviceMemoryBase output_buffer,
    se::DeviceMemoryBase bias_buffer, se::DeviceMemoryBase side_input_buffer,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale,
    double side_value_scale, se::dnn::ActivationMode activation_mode,
    se::StreamExecutor* stream_exec, absl::Span<const AutotuneResult> results) {
  AutotuningLog log;
  {
    ConvolutionProto instr;
    instr.set_kind(se::dnn::ConvolutionKind::FORWARD_BIAS_ACTIVATION);
    *instr.mutable_input() = input_desc.ToProto(element_type);
    *instr.mutable_filter() = filter_desc.ToProto(element_type);
    *instr.mutable_output() = output_desc.ToProto(element_type);
    *instr.mutable_conv_desc() = conv_desc.ToProto();
    instr.set_conv_scale(conv_scale);
    instr.set_side_value_scale(side_value_scale);
    instr.set_activation(activation_mode);
    instr.set_input_address(reinterpret_cast<uint64>(input_buffer.opaque()));
    instr.set_filter_address(reinterpret_cast<uint64>(filter_buffer.opaque()));
    instr.set_output_address(reinterpret_cast<uint64>(output_buffer.opaque()));
    instr.set_bias_address(reinterpret_cast<uint64>(bias_buffer.opaque()));
    instr.set_side_input_address(
        reinterpret_cast<uint64>(side_input_buffer.opaque()));
    log.mutable_instr()->PackFrom(std::move(instr));
  }
  *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec);
  *log.mutable_compute_capability() = GetComputeCapability(stream_exec);
  log.set_device_pci_bus_id(stream_exec->GetDeviceDescription().pci_bus_id());
  {
    string blas_version;
    if (auto* blas = stream_exec->AsBlas()) {
      if (blas->GetVersion(&blas_version).ok()) {
        log.set_blas_version(blas_version);
      }
    }
  }
  for (const auto& result : results) {
    *log.add_results() = result;
  }
  VLOG(2) << log.DebugString();
  Logger::GetSingleton()->LogProto(log);
}

namespace {
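// Returns the indices of (fastest algorithm, fastest zero-scratch algorithm)
// among `results`, or NotFound if every result reports a failure. When
// deterministic ops are required, the first working algorithm is chosen and
// the no-scratch index is left as -1.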
StatusOr<std::tuple<int, int>> BestCudnnConvAlgorithmIndices(
    absl::Span<const AutotuneResult> results) {
  auto compare_run_times = [](const AutotuneResult& lhs,
                              const AutotuneResult& rhs) {
    return proto_utils::FromDurationProto(lhs.run_time()) <
           proto_utils::FromDurationProto(rhs.run_time());
  };
  int idx = -1;
  int idx_no_scratch = -1;
  for (int i = 0; i < results.size(); i++) {
    if (!results[i].has_failure()) {
      if (OpDeterminismRequired()) {
        // When determinism is enabled, choose first working algorithm, and
        // don't choose a no_scratch algorithm.
        idx = i;
        break;
      }
      if (idx == -1 || compare_run_times(results[i], results[idx])) {
        idx = i;
      }
      if (results[i].scratch_bytes() == 0 &&
          (idx_no_scratch == -1 ||
           compare_run_times(results[i], results[idx_no_scratch]))) {
        idx_no_scratch = i;
      }
    }
  }

  if (idx == -1) {
    std::ostringstream msg;
    msg << "No algorithm worked! Error messages:";
    // TODO(awpr): identify the algorithm as part of this error message, too.
    for (const auto& result : results) {
      msg << "\n " << result.failure().msg();
    }
    return errors::NotFound(msg.str());
  }

  return std::make_tuple(idx, idx_no_scratch);
}
}  // namespace

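// Builds an AlgorithmConfig from the fastest successful autotune result,
// including a no-scratch fallback algorithm when one was found.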
StatusOr<se::dnn::AlgorithmConfig> BestCudnnConvAlgorithm(
    absl::Span<const AutotuneResult> results) {
  int idx;
  int idx_no_scratch;
  TF_ASSIGN_OR_RETURN(std::tie(idx, idx_no_scratch),
                      BestCudnnConvAlgorithmIndices(results));
  VLOG(2) << "fastest algorithm: "
          << proto_utils::FromDurationProto(results[idx].run_time())
          << " with algo " << results[idx].algorithm().algo_id()
          << ", workspace bytes " << results[idx].scratch_bytes();

  se::dnn::AlgorithmConfig result(
      se::dnn::AlgorithmDesc(results[idx].algorithm()),
      results[idx].scratch_bytes());

  if (idx_no_scratch != -1) {
    result.set_algorithm_no_scratch(
        se::dnn::AlgorithmDesc(results[idx_no_scratch].algorithm()));
  }
  return result;
}

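// Runner-based variant: selects the fastest runner (and, if distinct, the
// fastest no-scratch runner) and wraps them in an AutotuneEntry. `results`
// and `runners` must have the same size.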
template <typename Op>
StatusOr<AutotuneEntry<Op>> BestCudnnConvAlgorithm(
    absl::Span<const AutotuneResult> results,
    std::vector<
        std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>>>
        runners) {
  if (runners.size() != results.size()) {
    return errors::Internal(
        "Mismatched size of autotune results and runners vectors.");
  }
  int idx;
  int idx_no_scratch;
  TF_ASSIGN_OR_RETURN(std::tie(idx, idx_no_scratch),
                      BestCudnnConvAlgorithmIndices(results));
  VLOG(2) << "fastest algorithm: "
          << proto_utils::FromDurationProto(results[idx].run_time())
          << " with algo " << runners[idx]->ToString() << ", workspace bytes "
          << results[idx].scratch_bytes();
  return AutotuneEntry<Op>::FromOpRunners(
      std::move(runners[idx]), idx_no_scratch == -1 || idx_no_scratch == idx
                                   ? nullptr
                                   : std::move(runners[idx_no_scratch]));
}

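// Explicit instantiations for the convolution and fused-convolution runners.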
template StatusOr<AutotuneEntry<se::dnn::ConvOp>>
BestCudnnConvAlgorithm<se::dnn::ConvOp>(
    absl::Span<const AutotuneResult> results,
    std::vector<
        std::unique_ptr<const se::dnn::OpRunner<se::dnn::ConvSignature>>>
        runners);

template StatusOr<AutotuneEntry<se::dnn::FusedConvOp>>
BestCudnnConvAlgorithm<se::dnn::FusedConvOp>(
    absl::Span<const AutotuneResult> results,
    std::vector<
        std::unique_ptr<const se::dnn::OpRunner<se::dnn::FusedConvSignature>>>
        runners);

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM