xref: /aosp_15_r20/external/tensorflow/tensorflow/core/framework/metrics.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
17 #define TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
18 
19 #include "absl/container/flat_hash_map.h"
20 #include "tensorflow/core/framework/dataset_options.pb.h"
21 #include "tensorflow/core/lib/monitoring/counter.h"
22 #include "tensorflow/core/lib/monitoring/gauge.h"
23 #include "tensorflow/core/platform/env.h"
24 #include "tensorflow/core/platform/statusor.h"
25 #include "tensorflow/core/platform/types.h"
26 #include "tensorflow/core/protobuf/data_service.pb.h"
27 
28 namespace tensorflow {
29 namespace metrics {
30 
// Records that a tf.data.Dataset executed by the program used autotuning.
//
// The `name` argument identifies the Dataset type (e.g. "ParallelMap").
void RecordTFDataAutotune(const string& name);

// Returns a counter that can be used to record the number of bytes consumed by
// a tf.data.Dataset.
//
// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
monitoring::CounterCell* GetTFDataBytesConsumedCounter(const string& name);

// Returns a counter that can be used to record the number of bytes produced by
// a tf.data.Dataset.
//
// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
monitoring::CounterCell* GetTFDataBytesProducedCounter(const string& name);

// Returns a counter that can be used to record the number of bytes read from
// the filesystem by a tf.data.Dataset source.
//
// The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
//
// TODO(jsimsa): Remove this now that we have GetTFDataBytesConsumedCounter?
monitoring::CounterCell* GetTFDataBytesReadCounter(const string& name);

// Returns a counter that can be used to record the number of elements produced
// by a tf.data.Dataset.
//
// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
monitoring::CounterCell* GetTFDataElementsCounter(const string& name);

// Returns a gauge that can be used to record the performance model
// information.
//
// The `id` argument represents the (unique) model ID.
monitoring::GaugeCell<std::function<std::string()>>* GetTFDataModelGauge(
    const string& id);

// Records the number of bytes fetched from a tf.data.Dataset iterator.
void RecordTFDataBytesFetched(int64_t num_bytes);

// Records the number of times a tf.data experiment is applied to input
// pipelines.
void RecordTFDataExperiment(const string& name);

// Records the time (in microseconds) spent in a single invocation of
// `IteratorResource::GetNext()`.
void RecordTFDataGetNextDuration(uint64 duration_us);

// Records the histogram of ratios of tf.data autotune algorithm used RAM over
// the RAM budget.
void RecordTFDataAutotuneUsedRamBudgetRatio(const double ratio);

// Records the histogram of ratios of tf.data autotune algorithm max buffer
// bytes over the RAM budget.
void RecordTFDataAutotuneMaxBufferBudgetRatio(const double ratio);

// Records the number of times each tf.data fingerprint is used
// to measure duplicate pre-processing.
//
// The `name` argument identifies the Dataset graph fingerprint,
// created using GraphHash().
void RecordTFDataFingerprint(const string& name);

// Records the time (in microseconds) during which `IteratorResource` was busy
// processing at least one `GetNext()` request.
void RecordTFDataIteratorBusy(uint64 duration_us);

// Records the time (in microseconds) between `IteratorResource` receiving the
// first `GetNext()` request and responding to the last `GetNext()` request.
void RecordTFDataIteratorLifetime(uint64 duration_us);

// Records the time histogram (in microseconds) between `IteratorResource`
// responding to a `GetNext()` request and receiving the next `GetNext()`
// request.
void RecordTFDataIteratorGap(uint64 duration_us);

// Records the number of independent graph changes resulting from the
// application of a tf.data optimization.
//
// The `name` argument identifies the optimization (e.g. "noop_elimination").
void RecordTFDataOptimization(const string& name, int64_t num_changes);

// Records that a tf.data service worker has been created.
void RecordTFDataServiceWorkerCreated();

// Records that a tf.data service job has been created.
void RecordTFDataServiceJobsCreated(
    const tensorflow::data::ProcessingModeDef& processing_mode,
    bool is_coordinated_read);

// Records tf.data service iterators created by clients.
void RecordTFDataServiceClientIterators(
    int64_t worker_uid, tensorflow::data::DeploymentMode deployment_mode,
    const tensorflow::data::ProcessingModeDef& processing_mode,
    bool is_coordinated_read);

// Records tf.data service cross-trainer cache queries; `cache_hit` indicates
// whether the query hit the cache.
void RecordTFDataServiceCrossTrainerCacheQuery(bool cache_hit);

// Records tf.data service cross-trainer cache memory usage in bytes.
void RecordTFDataServiceCrossTrainerCacheSizeBytes(size_t bytes);
131 
// Records the file name read by a tf.data Dataset.
//
// The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
void RecordTFDataFilename(const string& name, const string& filename);

// Records statistics of tf.data auto sharding.
//
// The `id` is a unique identifier of the input pipeline. The `policy`
// identifies the auto-sharding policy used, the `num_workers` identifies the
// number of workers, and `num_replicas` identifies the number of replicas.
void RecordTFDataAutoShard(const string& id, data::AutoShardPolicy policy,
                           int64 num_workers, int64 num_replicas);

// Records statistics of whether we can rewrite batch size in tf.data auto
// sharding.
//
// The `eligible` argument indicates whether the input pipeline is eligible for
// the rewrite. The `ineligible_reason` lists the reasons if the input pipeline
// is ineligible.
void RecordTFDataAutoShardRewriteBatchSize(
    bool eligible, const std::vector<string>& ineligible_reason);

// Records the number of times each tf.data autotuning algorithm stopping
// criterion is met.
void RecordTFDataAutotuneStoppingCriteria(const string& name);

// Records parsing of dense tensor features.
void RecordParseDenseFeature(int64_t num_features);

// Records parsing of sparse tensor features.
void RecordParseSparseFeature(int64_t num_features);

// Records parsing of ragged tensor features.
void RecordParseRaggedFeature(int64_t num_features);

// Records the size of input/output tensors in bytes.
void RecordGraphInputTensors(const size_t size);
void RecordGraphOutputTensors(const size_t size);

// Records the number of cores requested by graphs with XLA SPMD enabled.
void RecordTPUXlaSpmdCoresPerReplica(int64_t cores_per_replica);

// Updates the metric tracking graph execution time, in microseconds.
void UpdateGraphExecTime(const uint64 running_time_usecs);
// Updates the metric tracking the pending-queue length for graph execution.
void UpdateGraphPendingQueueLength(uint64 len);

// Records that one output of an op of type `op_name` was unused.
void RecordUnusedOutput(const string& op_name);

// Updates the metrics stored about time spent building graphs.
//
// By "GraphBuild", we refer to building a client graph, which is a sub-graph
// of the full graph, induced by a set of options. In particular, these options
// include the feeds and fetches requested.
//
// This includes time spent:
//   * optimizing the graphs with Grappler
//   * pruning the sub-graph (unless the place_pruned_graph option is set)
//
// When executing eagerly, this will not record any activity.
//
// TODO(jtkeeling): Should we record building/optimizing tf.functions?
void UpdateGraphBuildTime(const uint64 running_time_usecs);

// Records the status of a graph passing through various states/stages of
// TfMlirGraphOptimizationPass processing using the
// tf_metadata.tf_mlir_update_graph_optimization_pass_state_counter metric.
// `pass_state` identifies the state of the pass
// (or "PassState" metric field) and `processing_state` refers to the stage
// in the process the graph is at (or "ProcessingState" metric field).
void UpdateTfMlirGraphOptimizationPassStateCounter(
    const std::string& pass_state, const std::string& processing_state);

// Records the activity of the first phase of the MLIR bridge using the
// tf_metadata.tf_mlir_bridge_first_phase_count metric.
// `device_type`: tpu, cpu, gpu, etc.
// `bridge_version`: v1 compat, v2, etc.
// `fallback_enabled`: true if fallback will happen, false if not.
// `result`: outcome of bridge (success, failure, disabled, invalid_graph,
// etc.).
void UpdateTfMlirBridgeFirstPhaseCounter(const std::string& device_type,
                                         const std::string& bridge_version,
                                         bool fallback_enabled,
                                         const std::string& result);

// Records the activity per op using the
// tf_metadata.tf_mlir_bridge_graph_analysis_per_op metric.
// `op_name`: the name of the op.
// `construction_context`: eager, session, or not tracked.
// `is_single_core_inference_mode`: true or false.
// `unsupported_reason`: the reason why the graph is not supported in the
// MLIR-based bridge, like invalid graph, has unsupported ops, etc.
// `has_unsupported_features`: true indicates the MLIR-based bridge is
// disabled, false indicates the MLIR-based bridge is enabled.
void UpdateTfMlirBridgeGraphAnalysisPerOp(
    const std::string& op_name, const std::string& construction_context,
    bool is_single_core_inference_mode, const std::string& num_replicas,
    const std::string& num_cores_per_replica, const std::string& use_tpu,
    const std::string& allow_soft_placement,
    const std::string& use_spmd_for_xla_partitioning,
    const std::string& unsupported_reason, bool has_unsupported_features);
232 
233 // Convenience class allowing RAII style of reporting for a monitoring::Counter.
234 template <int NumLabels>
235 class ScopedCounter final {
236  public:
ScopedCounter(monitoring::Counter<NumLabels> * const counter,const std::array<std::string,NumLabels> & labels)237   ScopedCounter(monitoring::Counter<NumLabels>* const counter,
238                 const std::array<std::string, NumLabels>& labels)
239       : counter_(counter), labels_(labels) {
240     Init();
241   }
242 
243   // Report counter and stop it. Counter needs to be reset to perform
244   // next measurement.
ReportAndStop()245   void ReportAndStop() {
246     if (started_) {
247       started_ = false;
248       ReportInternal(std::make_index_sequence<NumLabels>());
249     }
250   }
251 
252   // Start the measurement with the new set of labels.
Reset(const std::array<std::string,NumLabels> & labels)253   void Reset(const std::array<std::string, NumLabels>& labels) {
254     labels_ = labels;
255     Init();
256   }
257 
258   // Start the measurement with the existing set of labels.
Reset()259   void Reset() { Init(); }
260 
261   // Returns duration of the current interval in case the timer has started.
262   // Returns nullopt otherwise.
DurationMicroSec()263   absl::optional<uint64> DurationMicroSec() const {
264     return started_ ? absl::optional<uint64>(
265                           accumulated_time_ +
266                           tensorflow::Env::Default()->NowMicros() - start_time_)
267                     : absl::nullopt;
268   }
269 
270   // Temporarily stop the timer, but keep accumulated time.
AccumulateAndStop()271   void AccumulateAndStop() {
272     if (started_) {
273       accumulated_time_ = tensorflow::Env::Default()->NowMicros() - start_time_;
274       started_ = false;
275     }
276   }
277 
278   // Start previously stopped timer.
Start()279   void Start() {
280     if (started_) return;
281 
282     // Keep previously accumulated time if any.
283     start_time_ = tensorflow::Env::Default()->NowMicros();
284     started_ = true;
285   }
286 
~ScopedCounter()287   ~ScopedCounter() { ReportAndStop(); }
288 
289  private:
290   template <std::size_t... S>
ReportInternal(std::index_sequence<S...>)291   void ReportInternal(std::index_sequence<S...>) {
292     uint64 time_interval =
293         tensorflow::Env::Default()->NowMicros() - start_time_;
294     time_interval += accumulated_time_;
295     if (time_interval > 0) {
296       counter_->GetCell(labels_[S]...)->IncrementBy(time_interval);
297     }
298   }
299 
Init()300   void Init() {
301     start_time_ = tensorflow::Env::Default()->NowMicros();
302     started_ = true;
303     accumulated_time_ = 0;
304   }
305 
306   monitoring::Counter<NumLabels>* counter_;
307   std::array<std::string, NumLabels> labels_;
308   bool started_{false};
309   uint64 start_time_;
310   uint64 accumulated_time_;
311 };
312 
// Returns a counter used to capture timing metrics for graph optimization
// passes.
monitoring::Counter<2>* GetGraphOptimizationCounter();

// Updates metrics for time to distribute variables to all TPU hosts.
void UpdateTpuVariableDistributionTime(const uint64 distribution_time_usecs);

// Updates the metrics stored about time XLA spends compiling graphs.
void UpdateXlaCompilationTime(const uint64 compilation_time_usecs);

// Updates the metrics stored about time the BFC allocator spends during delay.
void UpdateBfcAllocatorDelayTime(const uint64 delay_usecs);

// Increments (by 1) a simple integer counter that is exposed for testing.
void IncrementTestCounter(const string& name, const string& label);

// Read-only access to a counter for testing.
const monitoring::CounterCell* TestCounter(const string& name,
                                           const string& label);
332 
// Read-only wrapper for a TestCounter to track increments between calls.
class TestDelta {
 public:
  // `name` and `label` identify the underlying test counter (see TestCounter).
  TestDelta(const string& name, const string& label);
  // Re-baselines the delta so that subsequent Get() calls measure growth from
  // this point.
  void Reset();
  // Returns the counter's growth relative to the baseline.
  // NOTE(review): exact semantics are defined in the .cc file — confirm there.
  int64 Get();

 private:
  const monitoring::CounterCell* cell_;  // Observed counter cell; not owned.
  int64 last_value_;                     // Baseline value for the delta.
};
// Updates the metric counting TPU-related errors, keyed by the op that failed
// and the error type.
void UpdateTpuErrorCounter(const string& op, const string& error_type);
// Updates the metric counting eager-client errors, keyed by error source and
// error type.
void UpdateEagerClientErrorCounter(const string& error_source,
                                   const string& error_type);
347 
348 }  // namespace metrics
349 }  // namespace tensorflow
350 
351 #endif  // TENSORFLOW_CORE_FRAMEWORK_METRICS_H_
352