syntax = "proto3";

package tensorflow.profiler;

import "tensorflow/core/profiler/protobuf/diagnostics.proto";
import "tensorflow/core/profiler/protobuf/kernel_stats.proto";
import "tensorflow/core/profiler/protobuf/op_metrics.proto";
import "tensorflow/core/profiler/protobuf/steps_db.proto";
import "tensorflow/core/profiler/protobuf/tf_function.proto";

// Performance environment, e.g., the peak performance capabilities of the
// device.
message PerfEnv {
  // Peak performance of a TPU core or a GPU in TFLOP/s.
  double peak_tera_flops_per_second = 1;
  // Peak memory bandwidth of a TPU core or a GPU in GiB/s.
  // NOTE(review): the field name says "giga_bytes" while this comment says
  // GiB; confirm which unit producers actually write.
  double peak_hbm_bw_giga_bytes_per_second = 2;
  // The ridge point of the roofline model in FLOP/Byte (i.e., the minimum
  // operational intensity required to achieve maximum performance).
  double ridge_point = 3;
}

// Result proto for host-independent job information.
message HostIndependentJobInfoResult {
  // The change-list number of this build.
  int64 change_list = 1;
  // The time of this build (nanoseconds since the Unix epoch).
  int64 build_time = 2;
  // The target of this build.
  string build_target = 3;
  // Profiling duration (in ms).
  uint32 profile_duration_ms = 4;
}

// Result proto for host-dependent job information.
message HostDependentJobInfoResult {
  // The ID of the host on which the job was run.
  string host_id = 1;
  // The command line used to run the job.
  string command_line = 2;
  // The start time of this run (nanoseconds since the Unix epoch).
  int64 start_time = 3;
  // BNS address specified by the client at the time of the profiling request.
  string bns_address = 4;
  // Profiling start walltime (in ns).
  uint64 profile_time_ns = 5;
}

// System topology, which describes the number of chips in a pod
// and the connectivity style.
message SystemTopology {
  // The X, Y, and Z dimensions of this topology. 0 means that dimension does
  // not exist.
  int64 x_dimension = 1;
  int64 y_dimension = 2;
  int64 z_dimension = 3;
  // The number of expected bad chips in this system.
  int64 num_expected_reduced_chips = 4;
}

// The run environment of a profiling session.
// Next ID: 13
message RunEnvironment {
  // Number of hosts used.
  int32 host_count = 1;
  // Number of tasks used.
  int32 task_count = 2;
  // Distinct hostnames seen.
  // NOTE(review): presumably used as a set keyed by hostname; the meaning of
  // the bool value is not documented here -- confirm against writers.
  map<string, bool> hostnames = 3;
  // The type of device used.
  string device_type = 4;
  // The number of device cores used.
  // In the TPU case, this corresponds to the number of TPU cores.
  // In the GPU case, this corresponds to the number of GPUs (not the number
  // of SMs).
  int32 device_core_count = 5;
  // Host-independent information about this job.
  HostIndependentJobInfoResult host_independent_job_info = 7;
  // Host-dependent information about this job.
  repeated HostDependentJobInfoResult host_dependent_job_info = 8;
  // The number of replicas, corresponds to input parallelism.
  // If there is no model parallelism, replica_count = device_core_count.
  int32 replica_count = 9;
  // The number of cores used for a single replica, e.g. model parallelism.
  // If there is no model parallelism, then num_cores_per_replica = 1.
  int32 num_cores_per_replica = 10;
  // The chip interconnection topology.
  SystemTopology topology = 11;
  // Host trace level.
  uint32 host_trace_level = 12;
  // Field number 6 is reserved and must not be reused.
  reserved 6;
}

// Identifying details of a single core.
// Next ID: 7
message CoreDetails {
  // Name of the host.
  string hostname = 1;
  // Unique within host, TPU core only.
  uint32 device_ordinal = 2;
  // Unique within chip per core type.
  uint32 core_num = 3;
  // Unique within host.
  uint32 local_chip_id = 4;
  // Unique within mesh.
  uint32 global_chip_id = 5;
  // Unique within mesh, TPU core only.
  uint32 global_core_id = 6;
}

// Metrics based on hardware performance counters.
message PerformanceCounterResult {
  // Overall matrix unit utilization in percentage.
  double matrix_unit_utilization_percent = 1;
}

// Operator statistics.
// Next ID: 14
message OpStats {
  // The database for the op metrics collected from the host over the entire
  // profiling session including incomplete steps.
  OpMetricsDb host_op_metrics_db = 1;
  // The database for the op metrics collected from the device over the entire
  // profiling session including incomplete steps.
  OpMetricsDb device_op_metrics_db = 2;
  // The result for the HLO-metric database over the complete steps only.
  OpMetricsDb hlo_metrics_db_complete_steps_only = 10;
  // Performance environment of the op metrics collected.
  PerfEnv perf_env = 3;
  // The database of step sequences.
  StepDatabaseResult step_db = 4;
  // The run environment of this profiling session.
  RunEnvironment run_environment = 5;
  // Kernel stats results from all GPUs.
  KernelStatsDb kernel_stats_db = 6;
  // Statistics for all tf-functions.
  TfFunctionDb tf_function_db = 8;
  // A map from core ID to details.
  map<uint32, CoreDetails> core_id_to_details = 11;
  // Error and warning messages for diagnosing profiling issues.
  Diagnostics diagnostics = 9;
  // A map from program ID to program name.
  map<uint64, string> program_id_to_name_map = 12;
  // Performance counters.
  PerformanceCounterResult performance_counter_result = 13;
  // Field number 7 is reserved and must not be reused.
  reserved 7;
}