xref: /aosp_15_r20/external/tensorflow/tensorflow/compiler/xla/stream_executor/gpu/gpu_diagnostics.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
17 #define TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
18 
19 #include <tuple>
20 
21 #include "tensorflow/compiler/xla/stream_executor/lib/statusor.h"
22 #include "tensorflow/compiler/xla/stream_executor/platform/port.h"
23 
24 namespace stream_executor {
25 namespace gpu {
26 
27 // A driver version as three integer components, e.g. DriverVersion{346, 3, 4}.
28 using DriverVersion = std::tuple<int, int, int>;
29 
30 // FIXME: These functions are in the stream_executor::cuda namespace for now.
31 // They will move to the stream_executor::gpu namespace in the near future.
32 //
33 //// Converts a parsed driver version to string form.
34 // string DriverVersionToString(DriverVersion version);
35 //
36 //// Converts a parsed driver version or status value to natural string form.
37 // string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
38 //
39 //// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
40 // port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
41 
42 class Diagnostician {
43  public:
44   // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
45   // not initializing).
46   //
47   // Note: if we're running on a machine that has no GPUs, we don't want to
48   // produce very much log spew: ideally nothing beyond a message along the
49   // lines of "looks like there's no CUDA kernel module running" should be
50   // emitted in that case.
51   //
52   // Note: we use non-Google-File:: API here because we may be called before
53   // InitGoogle has completed.
54   static void LogDiagnosticInformation();
55 
56   // Given the driver version file contents, finds the kernel module version and
57   // returns it as a DriverVersion tuple, wrapped in a StatusOr.
58   //
59   // This is solely used for more informative log messages when the user is
60   // running on a machine that happens to have a libcuda/kernel driver mismatch.
61   static port::StatusOr<DriverVersion> FindKernelModuleVersion(
62       const std::string& driver_version_file_contents);
63 
64   // Extracts the kernel driver version from the current host.
65   static port::StatusOr<DriverVersion> FindKernelDriverVersion();
66 
67   // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
68   // driver-interfacing DSO version; returns it as a StatusOr<DriverVersion>.
69   static port::StatusOr<DriverVersion> FindDsoVersion();
70 
71   // Logs information about the kernel driver version and userspace driver
72   // library version.
73   static void LogDriverVersionInformation();
74 
75  private:
76   // Given the DSO version and the kernel driver version (each possibly an
77   // error status), compares them, warning the user in the case of
78   // incompatibility.
79   //
80   // This is solely used for more informative log messages when the user is
81   // running on a machine that happens to have a libcuda/kernel driver mismatch.
82   static void WarnOnDsoKernelMismatch(
83       port::StatusOr<DriverVersion> dso_version,
84       port::StatusOr<DriverVersion> kernel_version);
85 
86   // Logs information about the dev nodes present on this machine: their
87   // existence, permissions, accessibility from this uid/gid.
88   static void LogDevNodeDiagnosticInformation();
89   // Presumably the path of dev node number dev_node_ordinal -- TODO confirm.
90   static std::string GetDevNodePath(int dev_node_ordinal);
91 
92   SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
93 };
94 
95 }  // namespace gpu
96 }  // namespace stream_executor
97 
98 #endif  // TENSORFLOW_COMPILER_XLA_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
99