1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/core/profiler/utils/hardware_type_utils.h" 17 18 #include "absl/strings/match.h" 19 #include "tensorflow/core/platform/logging.h" 20 #include "tensorflow/core/platform/types.h" 21 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" 22 #include "tensorflow/core/profiler/utils/xplane_schema.h" 23 24 namespace tensorflow { 25 namespace profiler { 26 namespace { 27 28 // Get theoretical upperbound of single precision FMA throughput of the GPU per 29 // cycle per streaming multiprocessor. 30 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions__throughput-native-arithmetic-instructions GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities & device_cap)31uint32 GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities& device_cap) { 32 if (device_cap.device_vendor() == kDeviceVendorNvidia) { 33 uint32 n_fp32_cores = 0; 34 uint32 n_tc_cores = 0; 35 switch (device_cap.compute_capability().major()) { 36 case 2: 37 // Fermi 38 n_fp32_cores = 32; 39 break; 40 case 3: 41 // Kepler 42 n_fp32_cores = 192; 43 break; 44 case 5: 45 // Maxwell 46 n_fp32_cores = 128; 47 break; 48 case 6: 49 // Pascal 50 if (device_cap.compute_capability().minor() > 0) { 51 // Pascal SM61/62 52 n_fp32_cores = 128; 53 } else { 54 // Pascal SM60 55 n_fp32_cores = 64; 56 } 57 break; 58 case 7: 59 // Volta and Turing 60 n_fp32_cores = 64; 61 n_tc_cores = 8; 62 break; 63 case 8: 64 // Ampere 65 if (device_cap.compute_capability().minor() >= 6) { 66 // Ampere SM86 67 n_fp32_cores = 128; 68 } else { 69 // Ampere SM80 70 n_fp32_cores = 64; 71 } 72 n_tc_cores = 4; 73 break; 74 default: 75 LOG(ERROR) << "Invalid GPU compute capability."; 76 break; 77 } 78 // GPU TensorCore can execute 64 FMAs per cycle. 79 // https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/ 80 return n_fp32_cores + n_tc_cores * 64; 81 } else if (device_cap.device_vendor() == kDeviceVendorAMD) { 82 uint32_t n_xdlops = 0; 83 uint32_t n_fp32_cores = 0; 84 85 if (device_cap.compute_capability().major() <= 9) { 86 n_fp32_cores = 64; 87 } else { 88 n_fp32_cores = 32; 89 } 90 // TODO(rocm-profiler): verify with new devices 91 return n_fp32_cores + n_xdlops * 1; 92 } else { 93 LOG(ERROR) << "Unknown device vendor " << device_cap.device_vendor(); 94 return 0; 95 } 96 } 97 98 } // namespace 99 GetFlopMaxThroughputPerSM(const DeviceCapabilities & device_cap)100double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap) { 101 // One FMA = 2 floating point operations, one multiply and one add. 102 return GetFmaMaxThroughputPerSMPerCycle(device_cap) * 2 * 103 device_cap.clock_rate_in_ghz(); 104 } 105 GpuModelName(const DeviceCapabilities & device_cap)106absl::string_view GpuModelName(const DeviceCapabilities& device_cap) { 107 if (device_cap.device_vendor() == kDeviceVendorNvidia) { 108 switch (device_cap.compute_capability().major()) { 109 case 2: 110 return "Nvidia GPU (Fermi)"; 111 case 3: 112 return "Nvidia GPU (Kepler)"; 113 case 5: 114 return "Nvidia GPU (Maxwell)"; 115 case 6: 116 return "Nvidia GPU (Pascal)"; 117 case 7: 118 if (device_cap.compute_capability().minor() < 5) { 119 return "Nvidia GPU (Volta)"; 120 } else { 121 return "Nvidia GPU (Turing)"; 122 } 123 case 8: 124 return "Nvidia GPU (Ampere)"; 125 default: 126 return "Nvidia GPU"; 127 } 128 } else if (device_cap.device_vendor() == kDeviceVendorAMD) { 129 switch (device_cap.compute_capability().major()) { 130 case 9: 131 return "AMD GPU - gfx-9XX series"; 132 case 10: 133 return "AMD GPU - gfx-10XX series"; 134 case 11: 135 return "AMD GPU - gfx-11XX series"; 136 default: 137 return "AMD GPU"; 138 } 139 } else { 140 LOG(ERROR) << "Unknown device vendor " << device_cap.device_vendor(); 141 return ""; 142 } 143 } 144 ParseHardwareType(absl::string_view device_type)145HardwareType ParseHardwareType(absl::string_view device_type) { 146 if (absl::StrContains(device_type, "GPU")) return HardwareType::GPU; 147 if (device_type == "CPU") return HardwareType::CPU_ONLY; 148 if (absl::StrContains(device_type, "TPU")) return HardwareType::TPU; 149 return HardwareType::UNKNOWN_HARDWARE; 150 } 151 HasDevice(HardwareType x)152bool HasDevice(HardwareType x) { return x > tensorflow::profiler::CPU_ONLY; } 153 154 } // namespace profiler 155 } // namespace tensorflow 156