/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"

#include <sstream>
#include <utility>
#include <vector>

#include "google/protobuf/any.pb.h"
#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
#include "tensorflow/core/profiler/utils/event_span.h"
#include "tensorflow/core/profiler/utils/timespan.h"

namespace tensorflow {
namespace profiler {

// Local core id should start from 1.
const uint32 kDefaultGpuLocalCoreId = 1;

namespace {

// Converts from StepDetails to StepInfoResult.
StepInfoResult ConvertStepDetailsToStepInfo(bool has_device, int64_t step_num,
                                            const StepDetails& step_details) {
  GenericStepBreakdown generic;
  Timespan step_time = step_details.StepTime();
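  // type_ps maps each event type to its total duration (in ps) within this
  // step.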
  auto& type_ps = *(generic.mutable_type_ps());
  uint64 total_event_duration = 0;
  for (const auto& event : step_details.Events()) {
    // Ignore event duration outside the step marker.
    uint64 event_duration = step_time.OverlappedDurationPs(event.span);
    type_ps[event.type] += event_duration;
    total_event_duration += event_duration;
  }
  if (total_event_duration < step_time.duration_ps()) {
    // Some time in the step is not associated with any event. Classify it as
    // "unknown time".
    type_ps[UNKNOWN_TIME] += step_time.duration_ps() - total_event_duration;
  }
  // Determines whether this step is well-formed: a step with a device must
  // contain device compute; a host-only step must contain host compute.
  bool well_formed_step = has_device ? (type_ps.contains(DEVICE_COMPUTE_16) ||
                                        type_ps.contains(DEVICE_COMPUTE_32))
                                     : type_ps.contains(HOST_COMPUTE);
  StepInfoResult step_info;
  step_info.mutable_step_breakdown()->PackFrom(generic);
  if (well_formed_step) {
    step_info.set_step_num(step_num);
    step_info.set_step_name(step_details.StepName());
    step_info.set_begin_ps(step_time.begin_ps());
    step_info.set_duration_ps(step_time.duration_ps());
  } else {
    // For a non-well-formed step, set its duration to 0 so that the caller of
    // this function will ignore it.
    step_info.set_duration_ps(0);
  }
  return step_info;
}

string DebugGenericStepBreakdown(const GenericStepBreakdown& generic) {
  std::ostringstream out;
  uint64 total_ps = 0;
  const auto& type_ps_map = generic.type_ps();
  for (const auto& type_ps : type_ps_map) {
    total_ps += type_ps.second;
  }
  out << "Total ps = " << total_ps << std::endl;
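  // Walks the event types in descending enum order, skipping any type that
  // has no recorded time in this breakdown.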
  for (int type = LAST_EVENT_TYPE; type >= 0; --type) {
    const auto* ps = gtl::FindOrNull(type_ps_map, type);
    if (ps == nullptr) continue;
    double percent = (*ps * 100.0) / total_ps;
    auto event_type = static_cast<EventType>(type);
    out << PrintEventType(event_type) << ": " << percent << "%"
        << ", ps = " << *ps << std::endl;
  }
  return out.str();
}

string DebugStepInfo(const StepInfoResult& step_info) {
  std::ostringstream out;
  out << "step_num=" << step_info.step_num()
      << ", duration_ps=" << step_info.duration_ps()
      << ", begin_ps=" << step_info.begin_ps() << std::endl;
  GenericStepBreakdown generic;
  if (step_info.step_breakdown().UnpackTo(&generic)) {
    out << "Generic step breakdown:" << std::endl;
    out << DebugGenericStepBreakdown(generic) << std::endl;
  } else {
    out << step_info.step_breakdown().DebugString() << std::endl;
  }
  return out.str();
}

}  // namespace

StepDatabaseResult ConvertStepEventsToStepDb(
    bool has_device, bool maybe_drop_incomplete_steps,
    const StepEvents& nonoverlapped_step_events) {
  StepDatabaseResult step_db;
  // Gets sorted step numbers.
  std::vector<int64_t> step_numbers;
  step_numbers.reserve(nonoverlapped_step_events.size());
  for (const auto& step_events : nonoverlapped_step_events) {
    step_numbers.push_back(step_events.first);
  }
  absl::c_sort(step_numbers);
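  // Converts each step to a PerCoreStepInfo, in increasing step-number order;
  // non-well-formed steps (duration 0) are skipped.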
  for (const auto& step : step_numbers) {
    const auto* step_details = gtl::FindOrNull(nonoverlapped_step_events, step);
    if (step_details == nullptr) continue;
    StepInfoResult step_info =
        ConvertStepDetailsToStepInfo(has_device, step, *step_details);
    if (step_info.duration_ps() == 0)
      continue;  // Do not include non-well-formed steps.
    PerCoreStepInfo per_core_step_info;
    per_core_step_info.set_step_num(step);
    // When we generated StepEvents, we already merged events from all device
    // cores and CPU threads on this host into a single event stream, so we
    // can no longer separate them. Simply assign all events to Core-0.
    (*per_core_step_info.mutable_step_info_per_core())[kDefaultGpuLocalCoreId] =
        std::move(step_info);
    VLOG(2) << std::endl
            << "step_id: " << step << ", step_info:" << std::endl
            << DebugStepInfo((
                   *per_core_step_info
                        .mutable_step_info_per_core())[kDefaultGpuLocalCoreId]);
    // Populates the collective ops information.
    auto& collectives = *per_core_step_info.mutable_all_reduce_db_per_core();
    for (const auto& it : step_details->Collectives()) {
      collectives[it.first] = it.second;
    }
    // Populates the device transfer stats for this step.
    auto& device_memory_transfers =
        *per_core_step_info.mutable_device_memory_transfers();
    for (const auto& dma : step_details->DeviceMemoryTransfers()) {
      *device_memory_transfers.Add() = dma;
    }
    // The remaining fields in PerCoreStepInfo are not filled.
    *step_db.add_step_sequence() = per_core_step_info;
  }

  // If we are in sampling mode and have collected enough steps, drop the
  // (potentially incomplete) first and last steps. (Sometimes CUPTI
  // instrumentation also prolongs the first step.)
  constexpr int kDropIncompleteStepThreshold = 5;
  if (maybe_drop_incomplete_steps &&
      step_db.step_sequence_size() > kDropIncompleteStepThreshold) {
    step_db.mutable_step_sequence()->erase(
        step_db.mutable_step_sequence()->begin());  // Drops the first step.
    step_db.mutable_step_sequence()->RemoveLast();  // Drops the last step.
  }
  return step_db;
}

}  // namespace profiler
}  // namespace tensorflow