1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/cc/saved_model/metrics.h"
17
18 #include <string>
19
20 #include "tensorflow/core/lib/monitoring/counter.h"
21 #include "tensorflow/core/lib/monitoring/sampler.h"
22
23 namespace tensorflow {
24 namespace metrics {
25
26 namespace {
27
28 // Counter that tracks total number and `write_version` of SavedModels written.
29 auto* saved_model_write_counter = monitoring::Counter<1>::New(
30 "/tensorflow/core/saved_model/write/count",
31 "The number of SavedModels successfully written.", "write_version");
32
33 // Counter that tracks total number and `write_version` of SavedModels read.
34 auto* saved_model_read_counter = monitoring::Counter<1>::New(
35 "/tensorflow/core/saved_model/read/count",
36 "The number of SavedModels successfully loaded.", "write_version");
37
38 // Counter that tracks number of calls for each SavedModel write API. Summing
39 // across "api_label" is not expected to equal the ".../write/count" cell value
40 // because programs can invoke more than one API to save a single SM and
41 // because the API may error out before successfully writing a SM.
42 auto* saved_model_write_api = monitoring::Counter<1>::New(
43 "/tensorflow/core/saved_model/write/api",
44 "The API used to write the SavedModel.", "api_label");
45
46 // Counter that tracks number of calls for each SavedModel read API. Summing
47 // across "api_label" is not expected to equal the ".../read/count" cell value
48 // because programs can invoke more than one API to load a single SM and
49 // because the API may error out before successfully reading a SM.
50 auto* saved_model_read_api = monitoring::Counter<1>::New(
51 "/tensorflow/core/saved_model/read/api",
52 "The API used to load the SavedModel.", "api_label");
53
54 // Distribution of checkpoint write durations.
55 auto* checkpoint_write_durations = monitoring::Sampler<1>::New(
56 {
57 "/tensorflow/core/checkpoint/write/write_durations", // Metric name.
58 "Distribution of the wall time duration in microseconds of the "
59 "checkpoint write operation.", // Metric description.
60 "api_label" // Cell label.
61 },
62 // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
63 monitoring::Buckets::Exponential(1000, 1.5, 41));
64
65 // Distribution of checkpoint read durations.
66 auto* checkpoint_read_durations = monitoring::Sampler<1>::New(
67 {
68 "/tensorflow/core/checkpoint/read/read_durations", // Metric name.
69 "Distribution of the wall time duration in microseconds of the "
70 "checkpoint read operation.", // Metric description.
71 "api_label" // Cell label.
72 },
73 // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
74 monitoring::Buckets::Exponential(1000, 1.5, 41));
75
76 // Distribution of async checkpoint write durations.
77 auto* async_checkpoint_write_durations = monitoring::Sampler<1>::New(
78 {
79 "/tensorflow/core/checkpoint/write/async_write_durations", // Metric
80 // name.
81 "Distribution of the wall time duration in microseconds of the async "
82 "checkpoint write operation", // Metric description.
83 "api_label" // Cell label.
84 },
85 // Scale of 1000, growth factor of 1.5 with upper bound of ~184 minutes.
86 monitoring::Buckets::Exponential(1000, 1.5, 41));
87
88 // Counter that accumulates total time elapsed between module import time and
89 // the last successful Checkpoint write prior to job pre-emption or completion.
90 auto* checkpoint_training_time_saved = monitoring::Counter<1>::New(
91 "/tensorflow/core/checkpoint/write/training_time_saved",
92 "Total time in microseconds elapsed between two consecutive write "
93 "operations in a single job or between Checkpoint construction and the "
94 "first write operation.",
95 "api_label");
96
97 // Counter that records filesize (MB) of written checkpoint. Contains two cells:
98 // (api_label, filesize). Cardinality should not be an issue as the filesize
99 // should be equal among all checkpoints written per job.
100 auto* checkpoint_size = monitoring::Counter<2>::New(
101 "/tensorflow/core/checkpoint/write/checkpoint_size",
102 "Size of checkpoint (.index and sharded data files), rounded to the "
103 "nearest 100 MB.",
104 "api_label", "filesize");
105
106 } // namespace
107
SavedModelWrite(absl::string_view write_version)108 monitoring::CounterCell& SavedModelWrite(absl::string_view write_version) {
109 return *saved_model_write_counter->GetCell(std::string(write_version));
110 }
111
SavedModelRead(absl::string_view write_version)112 monitoring::CounterCell& SavedModelRead(absl::string_view write_version) {
113 return *saved_model_read_counter->GetCell(std::string(write_version));
114 }
115
SavedModelWriteApi(absl::string_view api_label)116 monitoring::CounterCell& SavedModelWriteApi(absl::string_view api_label) {
117 return *saved_model_write_api->GetCell(std::string(api_label));
118 }
119
SavedModelReadApi(absl::string_view api_label)120 monitoring::CounterCell& SavedModelReadApi(absl::string_view api_label) {
121 return *saved_model_read_api->GetCell(std::string(api_label));
122 }
123
CheckpointReadDuration(absl::string_view api_label)124 monitoring::SamplerCell& CheckpointReadDuration(absl::string_view api_label) {
125 return *checkpoint_read_durations->GetCell(std::string(api_label));
126 }
127
CheckpointWriteDuration(absl::string_view api_label)128 monitoring::SamplerCell& CheckpointWriteDuration(absl::string_view api_label) {
129 return *checkpoint_write_durations->GetCell(std::string(api_label));
130 }
131
AsyncCheckpointWriteDuration(absl::string_view api_label)132 monitoring::SamplerCell& AsyncCheckpointWriteDuration(
133 absl::string_view api_label) {
134 return *async_checkpoint_write_durations->GetCell(std::string(api_label));
135 }
136
TrainingTimeSaved(absl::string_view api_label)137 monitoring::CounterCell& TrainingTimeSaved(absl::string_view api_label) {
138 return *checkpoint_training_time_saved->GetCell(std::string(api_label));
139 }
140
CheckpointSize(absl::string_view api_label,int64_t filesize)141 monitoring::CounterCell& CheckpointSize(absl::string_view api_label,
142 int64_t filesize) {
143 return *checkpoint_size->GetCell(std::string(api_label),
144 std::to_string(filesize));
145 }
146
147 } // namespace metrics
148 } // namespace tensorflow
149