xref: /aosp_15_r20/external/tensorflow/tensorflow/cc/saved_model/metrics.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/cc/saved_model/metrics.h"
17 
18 #include <string>
19 
20 #include "tensorflow/core/lib/monitoring/counter.h"
21 #include "tensorflow/core/lib/monitoring/sampler.h"
22 
23 namespace tensorflow {
24 namespace metrics {
25 
26 namespace {
27 
28 // Counter that tracks total number and `write_version` of SavedModels written.
29 auto* saved_model_write_counter = monitoring::Counter<1>::New(
30     "/tensorflow/core/saved_model/write/count",
31     "The number of SavedModels successfully written.", "write_version");
32 
33 // Counter that tracks total number and `write_version` of SavedModels read.
34 auto* saved_model_read_counter = monitoring::Counter<1>::New(
35     "/tensorflow/core/saved_model/read/count",
36     "The number of SavedModels successfully loaded.", "write_version");
37 
38 // Counter that tracks number of calls for each SavedModel write API. Summing
39 // across "api_label" is not expected to equal the ".../write/count" cell value
40 // because programs can invoke more than one API to save a single SM and
41 // because the API may error out before successfully writing a SM.
42 auto* saved_model_write_api = monitoring::Counter<1>::New(
43     "/tensorflow/core/saved_model/write/api",
44     "The API used to write the SavedModel.", "api_label");
45 
46 // Counter that tracks number of calls for each SavedModel read API. Summing
47 // across "api_label" is not expected to equal the ".../read/count" cell value
48 // because programs can invoke more than one API to load a single SM and
49 // because the API may error out before successfully reading a SM.
50 auto* saved_model_read_api = monitoring::Counter<1>::New(
51     "/tensorflow/core/saved_model/read/api",
52     "The API used to load the SavedModel.", "api_label");
53 
54 // Distribution of checkpoint write durations.
55 auto* checkpoint_write_durations = monitoring::Sampler<1>::New(
56     {
57         "/tensorflow/core/checkpoint/write/write_durations",  // Metric name.
58         "Distribution of the wall time duration in microseconds of the "
59         "checkpoint write operation.",  // Metric description.
60         "api_label"                     // Cell label.
61     },
62     // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
63     monitoring::Buckets::Exponential(1000, 1.5, 41));
64 
65 // Distribution of checkpoint read durations.
66 auto* checkpoint_read_durations = monitoring::Sampler<1>::New(
67     {
68         "/tensorflow/core/checkpoint/read/read_durations",  // Metric name.
69         "Distribution of the wall time duration in microseconds of the "
70         "checkpoint read operation.",  // Metric description.
71         "api_label"                    // Cell label.
72     },
73     // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
74     monitoring::Buckets::Exponential(1000, 1.5, 41));
75 
76 // Distribution of async checkpoint write durations.
77 auto* async_checkpoint_write_durations = monitoring::Sampler<1>::New(
78     {
79         "/tensorflow/core/checkpoint/write/async_write_durations",  // Metric
80                                                                     // name.
81         "Distribution of the wall time duration in microseconds of the async "
82         "checkpoint write operation",  // Metric description.
83         "api_label"                    // Cell label.
84     },
85     // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
86     monitoring::Buckets::Exponential(1000, 1.5, 41));
87 
88 // Counter that accumulates total time elapsed between module import time and
89 // the last successful Checkpoint write prior to job pre-emption or completion.
90 auto* checkpoint_training_time_saved = monitoring::Counter<1>::New(
91     "/tensorflow/core/checkpoint/write/training_time_saved",
92     "Total time in microseconds elapsed between two consecutive write "
93     "operations in a single job or between Checkpoint construction and the "
94     "first write operation.",
95     "api_label");
96 
97 // Counter that records filesize (MB) of written checkpoint. Contains two cells:
98 // (api_label, filesize). Cardinality should not be an issue as the filesize
99 // should be equal among all checkpoints written per job.
100 auto* checkpoint_size = monitoring::Counter<2>::New(
101     "/tensorflow/core/checkpoint/write/checkpoint_size",
102     "Size of checkpoint (.index and sharded data files), rounded to the "
103     "nearest 100 MB.",
104     "api_label", "filesize");
105 
106 }  // namespace
107 
SavedModelWrite(absl::string_view write_version)108 monitoring::CounterCell& SavedModelWrite(absl::string_view write_version) {
109   return *saved_model_write_counter->GetCell(std::string(write_version));
110 }
111 
SavedModelRead(absl::string_view write_version)112 monitoring::CounterCell& SavedModelRead(absl::string_view write_version) {
113   return *saved_model_read_counter->GetCell(std::string(write_version));
114 }
115 
SavedModelWriteApi(absl::string_view api_label)116 monitoring::CounterCell& SavedModelWriteApi(absl::string_view api_label) {
117   return *saved_model_write_api->GetCell(std::string(api_label));
118 }
119 
SavedModelReadApi(absl::string_view api_label)120 monitoring::CounterCell& SavedModelReadApi(absl::string_view api_label) {
121   return *saved_model_read_api->GetCell(std::string(api_label));
122 }
123 
CheckpointReadDuration(absl::string_view api_label)124 monitoring::SamplerCell& CheckpointReadDuration(absl::string_view api_label) {
125   return *checkpoint_read_durations->GetCell(std::string(api_label));
126 }
127 
CheckpointWriteDuration(absl::string_view api_label)128 monitoring::SamplerCell& CheckpointWriteDuration(absl::string_view api_label) {
129   return *checkpoint_write_durations->GetCell(std::string(api_label));
130 }
131 
AsyncCheckpointWriteDuration(absl::string_view api_label)132 monitoring::SamplerCell& AsyncCheckpointWriteDuration(
133     absl::string_view api_label) {
134   return *async_checkpoint_write_durations->GetCell(std::string(api_label));
135 }
136 
TrainingTimeSaved(absl::string_view api_label)137 monitoring::CounterCell& TrainingTimeSaved(absl::string_view api_label) {
138   return *checkpoint_training_time_saved->GetCell(std::string(api_label));
139 }
140 
CheckpointSize(absl::string_view api_label,int64_t filesize)141 monitoring::CounterCell& CheckpointSize(absl::string_view api_label,
142                                         int64_t filesize) {
143   return *checkpoint_size->GetCell(std::string(api_label),
144                                    std::to_string(filesize));
145 }
146 
147 }  // namespace metrics
148 }  // namespace tensorflow
149