1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ 17 #define TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ 18 19 #include "absl/container/flat_hash_map.h" 20 #include "tensorflow/core/framework/dataset_options.pb.h" 21 #include "tensorflow/core/lib/monitoring/counter.h" 22 #include "tensorflow/core/lib/monitoring/gauge.h" 23 #include "tensorflow/core/platform/env.h" 24 #include "tensorflow/core/platform/statusor.h" 25 #include "tensorflow/core/platform/types.h" 26 #include "tensorflow/core/protobuf/data_service.pb.h" 27 28 namespace tensorflow { 29 namespace metrics { 30 31 // Records that a tf.data.Dataset executed by the program used autotuning. 32 // 33 // The `name` argument identifies the Dataset type (e.g. "ParallelMap"). 34 void RecordTFDataAutotune(const string& name); 35 36 // Returns a counter that can be used to record the number of bytes produced by 37 // a tf.data.Dataset. 38 // 39 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). 40 monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name); 41 42 // Returns a counter that can be used to record the number of bytes produced by 43 // a tf.data.Dataset. 44 // 45 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). 46 monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name); 47 48 // Returns a counter than can be used to record the number of bytes read from 49 // the filesystem by a tf.data.Dataset source. 50 // 51 // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset"). 52 // 53 // TODO(jsimsa): Remove this now that we have GetTFDataBytesConsumedCounter? 54 monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name); 55 56 // Returns a counter than can be used to record the number of elements produced 57 // by a tf.data.Dataset. 58 // 59 // The `name` argument identifies the Dataset type (e.g. "Batch" or "Map"). 60 monitoring::CounterCell* GetTFDataElementsCounter(const string& name); 61 62 // Returns a gauge than can be used to record the performance model information. 63 // 64 // The `id` argument represents the (unique) model ID. 65 monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge( 66 const string& id); 67 68 // Records the number of bytes fetched from tf.data.Dataset iterator. 69 void RecordTFDataBytesFetched(int64_t num_bytes); 70 71 // Records the number of times tf.data experiment is applied to input pipelines. 72 void RecordTFDataExperiment(const string& name); 73 74 // Records the time (in microseconds) spent in a single invocation of 75 // `ItertatorResource::GetNext()`. 76 void RecordTFDataGetNextDuration(uint64 duration_us); 77 78 // Records the histogram of ratios of tf.data autotune algorithm used RAM over 79 // the ram budget. 80 void RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio); 81 82 // Records the histogram of ratios of tf.data autotune algorithm max buffer 83 // bytes over the ram budget. 84 void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio); 85 86 // Records the number of times each tf.data fingerprint is used 87 // to measure duplicate pre-processing. 88 // 89 // The `name` argument identifies the Dataset graph fingerprint, 90 // created using GraphHash(). 91 void RecordTFDataFingerprint(const string& name); 92 93 // Records the time (in microseconds) during which `IteratorResource` was busy 94 // processing at least one `GetNext()` request. 95 void RecordTFDataIteratorBusy(uint64 duration_us); 96 97 // Records the time (in microseconds) between `IteratorResource` receiving the 98 // first `GetNext()` request and responding to the last `GetNext()` request. 99 void RecordTFDataIteratorLifetime(uint64 duration_us); 100 101 // Records the time histogram (in microseconds) between `IteratorResource` 102 // responding to a `GetNext()` request and receiving the next `GetNext()` 103 // request. 104 void RecordTFDataIteratorGap(uint64 duration_us); 105 106 // Records the number of independent graph changes resulting from the 107 // application of a tf.data optimization. 108 // 109 // The `name` argument identifies the optimization (e.g. "noop_elimination"). 110 void RecordTFDataOptimization(const string& name, int64_t num_changes); 111 112 // Records that a tf.data service worker has been created. 113 void RecordTFDataServiceWorkerCreated(); 114 115 // Records that a tf.data service job has been created. 116 void RecordTFDataServiceJobsCreated( 117 const tensorflow::data::ProcessingModeDef& processing_mode, 118 bool is_coordinated_read); 119 120 // Records tf.data service iterators created by clients. 121 void RecordTFDataServiceClientIterators( 122 int64_t worker_uid, tensorflow::data::DeploymentMode deployment_mode, 123 const tensorflow::data::ProcessingModeDef& processing_mode, 124 bool is_coordinated_read); 125 126 // Records tf.data service cross-trainer cache queries. 127 void RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit); 128 129 // Records tf.data service cross-trainer cache memory usage in bytes. 130 void RecordTFDataServiceCrossTrainerCacheSizeBytes(size_t bytes); 131 132 // Records the file name read by a tf.data Dataset. 133 // 134 // The `name` argument identifies the Dataset type (e.g. "TFRecordDataset"). 135 void RecordTFDataFilename(const string& name, const string& filename); 136 137 // Records statistics of tf.data auto sharding. 138 // 139 // The `id` is a unique identifier of the input pipeline. The `policy` 140 // identifies the auto-sharding policy used, the `num_workers` identifies the 141 // number of workers, and `num_replicas` identifies the number of replicas. 142 void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy, 143 int64 num_workers, int64 num_replicas); 144 145 // Records statistics of whether we can rewrite batch size in tf.data auto 146 // sharding. 147 // 148 // The `id` is a unique identifier of the input pipeline. The `eligible` 149 // indicates whether the input pipeline is eligible for the rewrite. The 150 // `ineligible_reason` is the reason if the input pipeline is ineligible. 151 void RecordTFDataAutoShardRewriteBatchSize( 152 bool eligible, const std::vector<string>& ineligible_reason); 153 154 // Records the number of times each tf.data autotuning algorithm stopping 155 // criterion is met. 156 void RecordTFDataAutotuneStoppingCriteria(const string& name); 157 158 // Records parsing of dense tensor features. 159 void RecordParseDenseFeature(int64_t num_features); 160 161 // Records parsing of sparse tensor features. 162 void RecordParseSparseFeature(int64_t num_features); 163 164 // Records parsing of ragged tensor features. 165 void RecordParseRaggedFeature(int64_t num_features); 166 167 // Records the size of input/output tensors in bytes. 168 void RecordGraphInputTensors(const size_t size); 169 void RecordGraphOutputTensors(const size_t size); 170 171 // Records the number of cores requested by graphs with XLA SPMD enabled. 172 void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica); 173 174 void UpdateGraphExecTime(const uint64 running_time_usecs); 175 void UpdateGraphPendingQueueLength(uint64 len); 176 177 // Records that one output of an op of type `op_name` was unused. 178 void RecordUnusedOutput(const string& op_name); 179 180 // Updates the metrics stored about time spent building graphs. 181 // 182 // By "GraphBuild", we refer to building a client graph, which is a sub-graph of 183 // the full graph, induced by a set of options. In particular, these options 184 // include the feeds and fetches requested. 185 // 186 // This includes time spent: 187 // * optimizing the graphs with Grappler 188 // * pruning the sub-graph (unless the place_pruned_graph option is set) 189 // 190 // When executing eagerly, this will not record any activity. 191 // 192 // TODO(jtkeeling): Should we record building/optimizing tf.functions? 193 void UpdateGraphBuildTime(const uint64 running_time_usecs); 194 195 // Records the status of a graph passing through various states/stages of 196 // TfMlirGraphOptimizationPass processing using 197 // tf_metadata.tf_mlir_update_graph_optimization_pass_state_counter metric. 198 // 'pass_state' identifies the state of the pass 199 // (or "PassState" metric field) and 'processing_state' refers to the stage 200 // in the process the graph is at (or "ProcessingState" metric field). 201 void UpdateTfMlirGraphOptimizationPassStateCounter( 202 const std::string& pass_state, const std::string& processing_state); 203 204 // Records the activity of the first phase of the mlir bridge using the 205 // tf_metadata.tf_mlir_bridge_first_phase_count metric. 206 // device_type: tpu, cpu, gpu, etc. 207 // bridge_version: v1 compat, v2, etc. 208 // fallback_enabled: true if fallback will happen, false if not 209 // result: outcome of bridge (success, failure, disabled, invalid_graph, etc.) 210 void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type, 211 const std::string& bridge_version, 212 bool fallback_enabled, 213 const std::string& result); 214 215 // Records the activity per op using the 216 // tf_metadata.tf_mlir_bridge_graph_analysis_per_op. 217 // op_name: the name of op. 218 // construction_context: eager, session, Not tracked. 219 // is_single_core_inference_mode: true, false. 220 // unsupported_reason: the reason why the graph is not supported in MLIR-based 221 // bridge, like invalid graph, has unsupported ops, etc. 222 // has_unsupported_features: true indicates MLIR-based bridge is disabled, 223 // false indicates MLIR-based bridge is enabled. 224 225 void UpdateTfMlirBridgeGraphAnalysisPerOp( 226 const std::string& op_name, const std::string& construction_context, 227 bool is_single_core_inference_mode, const std::string& num_replicas, 228 const std::string& num_cores_per_replica, const std::string& use_tpu, 229 const std::string& allow_soft_placement, 230 const std::string& use_spmd_for_xla_partitioning, 231 const std::string& unsupported_reason, bool has_unsupported_features); 232 233 // Convenience class allowing RAII style of reporting for a monitoring::Counter. 234 template <int NumLabels> 235 class ScopedCounter final { 236 public: ScopedCounter(monitoring::Counter<NumLabels> * const counter,const std::array<std::string,NumLabels> & labels)237 ScopedCounter(monitoring::Counter<NumLabels>* const counter, 238 const std::array<std::string, NumLabels>& labels) 239 : counter_(counter), labels_(labels) { 240 Init(); 241 } 242 243 // Report counter and stop it. Counter needs to be reset to perform 244 // next measurement. ReportAndStop()245 void ReportAndStop() { 246 if (started_) { 247 started_ = false; 248 ReportInternal(std::make_index_sequence<NumLabels>()); 249 } 250 } 251 252 // Start the measurement with the new set of labels. Reset(const std::array<std::string,NumLabels> & labels)253 void Reset(const std::array<std::string, NumLabels>& labels) { 254 labels_ = labels; 255 Init(); 256 } 257 258 // Start the measurement with the existing set of labels. Reset()259 void Reset() { Init(); } 260 261 // Returns duration of the current interval in case the timer has started. 262 // Returns nullopt otherwise. DurationMicroSec()263 absl::optional<uint64> DurationMicroSec() const { 264 return started_ ? absl::optional<uint64>( 265 accumulated_time_ + 266 tensorflow::Env::Default()->NowMicros() - start_time_) 267 : absl::nullopt; 268 } 269 270 // Temporarily stop the timer, but keep accumulated time. AccumulateAndStop()271 void AccumulateAndStop() { 272 if (started_) { 273 accumulated_time_ = tensorflow::Env::Default()->NowMicros() - start_time_; 274 started_ = false; 275 } 276 } 277 278 // Start previously stopped timer. Start()279 void Start() { 280 if (started_) return; 281 282 // Keep previously accumulated time if any. 283 start_time_ = tensorflow::Env::Default()->NowMicros(); 284 started_ = true; 285 } 286 ~ScopedCounter()287 ~ScopedCounter() { ReportAndStop(); } 288 289 private: 290 template <std::size_t... S> ReportInternal(std::index_sequence<S...>)291 void ReportInternal(std::index_sequence<S...>) { 292 uint64 time_interval = 293 tensorflow::Env::Default()->NowMicros() - start_time_; 294 time_interval += accumulated_time_; 295 if (time_interval > 0) { 296 counter_->GetCell(labels_[S]...)->IncrementBy(time_interval); 297 } 298 } 299 Init()300 void Init() { 301 start_time_ = tensorflow::Env::Default()->NowMicros(); 302 started_ = true; 303 accumulated_time_ = 0; 304 } 305 306 monitoring::Counter<NumLabels>* counter_; 307 std::array<std::string, NumLabels> labels_; 308 bool started_{false}; 309 uint64 start_time_; 310 uint64 accumulated_time_; 311 }; 312 313 // Returns a counter used to capture timing metrics for graph optimization 314 // passes. 315 monitoring::Counter<2>* GetGraphOptimizationCounter(); 316 317 // Updates metrics for time to distribute variables to all TPU hosts. 318 void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs); 319 320 // Updates the metrics stored about time XLA spents compiling graphs. 321 void UpdateXlaCompilationTime(const uint64 compilation_time_usecs); 322 323 // Updates the metrics stored about time BFC allocator spents during delay. 324 void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs); 325 326 // Increments (by 1) a simple integer counter that is exposed for testing. 327 void IncrementTestCounter(const string& name, const string& label); 328 329 // Read-only access to a counter for testing. 330 const monitoring::CounterCell* TestCounter(const string& name, 331 const string& label); 332 333 // Read-only wrapper for a TestCounter to track increments between calls. 334 class TestDelta { 335 public: 336 TestDelta(const string& name, const string& label); 337 void Reset(); 338 int64 Get(); 339 340 private: 341 const monitoring::CounterCell* cell_; 342 int64 last_value_; 343 }; 344 void UpdateTpuErrorCounter(const string& op, const string& error_type); 345 void UpdateEagerClientErrorCounter(const string& error_source, 346 const string& error_type); 347 348 } // namespace metrics 349 } // namespace tensorflow 350 351 #endif // TENSORFLOW_CORE_FRAMEWORK_METRICS_H_ 352