xref: /aosp_15_r20/external/tensorflow/tensorflow/core/profiler/utils/hardware_type_utils.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
17 
18 #include "absl/strings/match.h"
19 #include "tensorflow/core/platform/logging.h"
20 #include "tensorflow/core/platform/types.h"
21 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
22 #include "tensorflow/core/profiler/utils/xplane_schema.h"
23 
24 namespace tensorflow {
25 namespace profiler {
26 namespace {
27 
28 // Get theoretical upperbound of single precision FMA throughput of the GPU per
29 // cycle per streaming multiprocessor.
30 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions__throughput-native-arithmetic-instructions
GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities & device_cap)31 uint32 GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities& device_cap) {
32   if (device_cap.device_vendor() == kDeviceVendorNvidia) {
33     uint32 n_fp32_cores = 0;
34     uint32 n_tc_cores = 0;
35     switch (device_cap.compute_capability().major()) {
36       case 2:
37         // Fermi
38         n_fp32_cores = 32;
39         break;
40       case 3:
41         // Kepler
42         n_fp32_cores = 192;
43         break;
44       case 5:
45         // Maxwell
46         n_fp32_cores = 128;
47         break;
48       case 6:
49         // Pascal
50         if (device_cap.compute_capability().minor() > 0) {
51           // Pascal SM61/62
52           n_fp32_cores = 128;
53         } else {
54           // Pascal SM60
55           n_fp32_cores = 64;
56         }
57         break;
58       case 7:
59         // Volta and Turing
60         n_fp32_cores = 64;
61         n_tc_cores = 8;
62         break;
63       case 8:
64         // Ampere
65         if (device_cap.compute_capability().minor() >= 6) {
66           // Ampere SM86
67           n_fp32_cores = 128;
68         } else {
69           // Ampere SM80
70           n_fp32_cores = 64;
71         }
72         n_tc_cores = 4;
73         break;
74       default:
75         LOG(ERROR) << "Invalid GPU compute capability.";
76         break;
77     }
78     // GPU TensorCore can execute 64 FMAs per cycle.
79     // https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
80     return n_fp32_cores + n_tc_cores * 64;
81   } else if (device_cap.device_vendor() == kDeviceVendorAMD) {
82     uint32_t n_xdlops = 0;
83     uint32_t n_fp32_cores = 0;
84 
85     if (device_cap.compute_capability().major() <= 9) {
86       n_fp32_cores = 64;
87     } else {
88       n_fp32_cores = 32;
89     }
90     // TODO(rocm-profiler): verify with new devices
91     return n_fp32_cores + n_xdlops * 1;
92   } else {
93     LOG(ERROR) << "Unknown device vendor " << device_cap.device_vendor();
94     return 0;
95   }
96 }
97 
98 }  // namespace
99 
GetFlopMaxThroughputPerSM(const DeviceCapabilities & device_cap)100 double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap) {
101   // One FMA = 2 floating point operations, one multiply and one add.
102   return GetFmaMaxThroughputPerSMPerCycle(device_cap) * 2 *
103          device_cap.clock_rate_in_ghz();
104 }
105 
GpuModelName(const DeviceCapabilities & device_cap)106 absl::string_view GpuModelName(const DeviceCapabilities& device_cap) {
107   if (device_cap.device_vendor() == kDeviceVendorNvidia) {
108     switch (device_cap.compute_capability().major()) {
109       case 2:
110         return "Nvidia GPU (Fermi)";
111       case 3:
112         return "Nvidia GPU (Kepler)";
113       case 5:
114         return "Nvidia GPU (Maxwell)";
115       case 6:
116         return "Nvidia GPU (Pascal)";
117       case 7:
118         if (device_cap.compute_capability().minor() < 5) {
119           return "Nvidia GPU (Volta)";
120         } else {
121           return "Nvidia GPU (Turing)";
122         }
123       case 8:
124         return "Nvidia GPU (Ampere)";
125       default:
126         return "Nvidia GPU";
127     }
128   } else if (device_cap.device_vendor() == kDeviceVendorAMD) {
129     switch (device_cap.compute_capability().major()) {
130       case 9:
131         return "AMD GPU - gfx-9XX series";
132       case 10:
133         return "AMD GPU - gfx-10XX series";
134       case 11:
135         return "AMD GPU - gfx-11XX series";
136       default:
137         return "AMD GPU";
138     }
139   } else {
140     LOG(ERROR) << "Unknown device vendor " << device_cap.device_vendor();
141     return "";
142   }
143 }
144 
ParseHardwareType(absl::string_view device_type)145 HardwareType ParseHardwareType(absl::string_view device_type) {
146   if (absl::StrContains(device_type, "GPU")) return HardwareType::GPU;
147   if (device_type == "CPU") return HardwareType::CPU_ONLY;
148   if (absl::StrContains(device_type, "TPU")) return HardwareType::TPU;
149   return HardwareType::UNKNOWN_HARDWARE;
150 }
151 
HasDevice(HardwareType x)152 bool HasDevice(HardwareType x) { return x > tensorflow::profiler::CPU_ONLY; }
153 
154 }  // namespace profiler
155 }  // namespace tensorflow
156