/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/convert/xplane_to_op_stats.h"

#include <string>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/match.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
#include "tensorflow/core/profiler/convert/op_stats_combiner.h"
#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_step_events.h"
#include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h"
#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
#include "tensorflow/core/profiler/protobuf/tf_function.pb.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
#include "tensorflow/core/profiler/utils/device_caps_utils.h"
#include "tensorflow/core/profiler/utils/event_span.h"
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
#include "tensorflow/core/profiler/utils/math_utils.h"
#include "tensorflow/core/profiler/utils/step_intersection.h"
#include "tensorflow/core/profiler/utils/tf_op_utils.h"
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
#include "tensorflow/core/profiler/utils/tpu_xplane_utils.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"
#include "tensorflow/core/profiler/utils/xplane_visitor.h"

namespace tensorflow {
namespace profiler {
namespace {

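// Returns the single hostname recorded in `space`, or "localhost" if none is
// present.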
std::string Hostname(const XSpace& space) {
  if (space.hostnames().empty()) return "localhost";
  DCHECK_EQ(space.hostnames_size(), 1);
  const std::string& hostname = space.hostnames(0);
  // This shouldn't be a taskname in host:port format.
  DCHECK(!absl::StrContains(hostname, ':'));
  return hostname;
}

}  // namespace

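// Builds a PerfEnv from the given peak compute and memory-bandwidth numbers.
// The ridge point is the roofline-model knee: peak FLOPS divided by peak HBM
// bandwidth, i.e. the operational intensity (in FLOPs/byte) above which a
// workload becomes compute-bound rather than memory-bound.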
PerfEnv MakePerfEnv(double peak_tera_flops_per_second,
                    double peak_hbm_bw_giga_bytes_per_second) {
  PerfEnv result;
  result.set_peak_tera_flops_per_second(peak_tera_flops_per_second);
  result.set_peak_hbm_bw_giga_bytes_per_second(
      peak_hbm_bw_giga_bytes_per_second);
  result.set_ridge_point(TeraToGiga(peak_tera_flops_per_second) /
                         peak_hbm_bw_giga_bytes_per_second);
  return result;
}

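// Derives a PerfEnv for the device that produced `device_plane`. For non-TPU
// (GPU) planes, peak FLOPS and memory bandwidth are computed from the device
// capabilities; for TPU planes, they are read from stats recorded on the
// plane itself.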
PerfEnv GetPerfEnvFromXPlane(const XPlane& device_plane) {
  DeviceCapabilities cap = GetDeviceCaps(device_plane);
  if (!absl::StartsWith(device_plane.name(), kTpuPlanePrefix)) {
    return MakePerfEnv(
        GigaToTera(GetFlopMaxThroughputPerSM(cap)) * cap.num_cores(),
        UniToGiga(cap.memory_bandwidth()));
  } else {
    XPlaneVisitor visitor = CreateTfXPlaneVisitor(&device_plane);
    auto peak_tera_flops_per_second =
        visitor.GetStat(StatType::kDevCapPeakTeraflopsPerSecond);
    auto peak_hbm_bw_giga_bytes_per_second =
        visitor.GetStat(StatType::kDevCapPeakHbmBwGigabytesPerSecond);
    return MakePerfEnv(peak_tera_flops_per_second->DoubleValue(),
                       peak_hbm_bw_giga_bytes_per_second->DoubleValue());
  }
}

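// Populates `env` with host, device-type, and core-count information derived
// from the planes present in `space`.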
void SetRunEnvironment(const XSpace& space, RunEnvironment* env) {
  // Currently, we only support profiling one host and one program.
  env->set_host_count(1);
  env->set_task_count(1);
  env->mutable_hostnames()->insert({Hostname(space), true});

  std::vector<const XPlane*> gpu_planes =
      FindPlanesWithPrefix(space, kGpuPlanePrefix);
  if (!gpu_planes.empty()) {
    absl::string_view gpu_model =
        GpuModelName(GetDeviceCaps(*gpu_planes.front()));
    if (!gpu_model.empty()) {
      env->set_device_type(std::string(gpu_model));
    } else {
      env->set_device_type("GPU");
    }
    env->set_device_core_count(gpu_planes.size());
  } else if (std::vector<const XPlane*> tpu_planes =
                 FindTensorCorePlanes(space);
             !tpu_planes.empty()) {
    XPlaneVisitor visitor = CreateTfXPlaneVisitor(tpu_planes.at(0));
    auto xstat = visitor.GetStat(StatType::kDeviceTypeString);
    if (xstat.has_value()) {
      env->set_device_type(std::string(xstat->StrOrRefValue()));
    }
    env->set_device_core_count(tpu_planes.size());
  } else {
    env->set_device_type("CPU");
    env->set_device_core_count(0);
  }
}

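// Copies any errors and warnings recorded in `space` into the diagnostics of
// `op_stats`, de-duplicating repeated messages.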
void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
                                         OpStats* op_stats) {
  if (!space.errors().empty()) {
    absl::flat_hash_set<std::string> unique_errors;
    unique_errors.insert(space.errors().begin(), space.errors().end());
    *op_stats->mutable_diagnostics()->mutable_errors() = {unique_errors.begin(),
                                                          unique_errors.end()};
  }
  if (!space.warnings().empty()) {
    absl::flat_hash_set<std::string> unique_warnings;
    unique_warnings.insert(space.warnings().begin(), space.warnings().end());
    *op_stats->mutable_diagnostics()->mutable_warnings() = {
        unique_warnings.begin(), unique_warnings.end()};
  }
}

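// Converts a single XSpace into an OpStats proto, generating only the
// databases enabled in `options`. A minimal usage sketch (assuming `space`
// holds an already-collected profile):
//
//   OpStatsOptions options;
//   options.generate_op_metrics_db = true;
//   options.generate_step_db = true;
//   OpStats op_stats = ConvertXSpaceToOpStats(space, options);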
OpStats ConvertXSpaceToOpStats(const XSpace& space,
                               const OpStatsOptions& options) {
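  // Prefer TPU (TensorCore) planes; if none are present, fall back to GPU
  // planes.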
  std::vector<const XPlane*> device_planes = FindTensorCorePlanes(space);
  bool is_gpu = device_planes.empty();
  if (is_gpu) {
    device_planes = FindPlanesWithPrefix(space, kGpuPlanePrefix);
  }

  OpStats op_stats;
  StepEvents step_events;
  PropagateXSpaceDiagnosticsToOpStats(space, &op_stats);
  // Convert device planes.
  OpMetricsDbCombiner op_metrics_db_combiner(
      op_stats.mutable_device_op_metrics_db());
  SetRunEnvironment(space, op_stats.mutable_run_environment());

  KernelReportMap reports;

  // TODO(b/161942993) parallelize XPlane processing per thread.
  for (const XPlane* device_trace : device_planes) {
    if (options.generate_op_metrics_db) {
      if (!op_stats.has_perf_env()) {
        *op_stats.mutable_perf_env() = GetPerfEnvFromXPlane(*device_trace);
      }
      if (is_gpu) {
        OpMetricsDb device_op_metrics_db =
            ConvertDeviceTraceXPlaneToOpMetricsDb(*device_trace);
        op_metrics_db_combiner.Combine(device_op_metrics_db);
      } else {
        XPlane aggregated_xplane;
        AggregateXPlane(*device_trace, aggregated_xplane);
        OpMetricsDb device_op_metrics_db =
            ConvertTpuDeviceTraceXPlaneToOpMetricsDb(aggregated_xplane);
        op_metrics_db_combiner.Combine(device_op_metrics_db);
      }
    }
    if (options.generate_step_db) {
      StepEvents device_step_events =
          ConvertDeviceTraceXPlaneToStepEvents(*device_trace);
      CombineStepEvents(device_step_events, &step_events);
    }
    if (options.generate_kernel_stats_db) {
      ConvertDeviceTraceXPlaneToKernelReports(*device_trace,
                                              /*on_kernel_fn=*/{}, &reports);
    }
  }

  // Copy the top-K kernel reports by duration into the kernel stats DB.
  if (options.generate_kernel_stats_db) {
    CopyTopKDurationKernelReportsToDb(reports,
                                      op_stats.mutable_kernel_stats_db());
  }

  bool has_device = !device_planes.empty();
  // Convert a host plane.
  const XPlane* host_plane = FindPlaneWithName(space, kHostThreadsPlaneName);
  if (host_plane) {
    if (options.generate_op_metrics_db) {
      *op_stats.mutable_host_op_metrics_db() =
          ConvertHostThreadsXPlaneToOpMetricsDb(*host_plane);
    }
    if (options.generate_step_db) {
      const StepEvents* device_step_events =
          has_device ? &step_events : nullptr;
      StepEvents host_step_events =
          ConvertHostThreadsXPlaneToStepEvents(*host_plane, device_step_events);
      CombineStepEvents(host_step_events, &step_events);
    }
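    // Propagate matrix-unit (MXU) utilization if the host plane recorded it.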
    XPlaneVisitor visitor = CreateTfXPlaneVisitor(host_plane);
    auto stat = visitor.GetStat(StatType::kMatrixUnitUtilizationPercent);
    if (stat.has_value()) {
      op_stats.mutable_performance_counter_result()
          ->set_matrix_unit_utilization_percent(stat->DoubleValue());
    }
  }
  if (options.generate_step_db) {
    StepEvents nonoverlapped_step_events =
        ToNonOverlappedStepEvents(step_events);
    *op_stats.mutable_step_db() = ConvertStepEventsToStepDb(
        has_device, options.maybe_drop_incomplete_steps,
        nonoverlapped_step_events);
    *op_stats.mutable_device_op_metrics_db()->mutable_precision_stats() =
        ComputePrecisionStats(nonoverlapped_step_events);
  }

  // TODO(bvandermoon): Add the TPU equivalent for setting core details
  // hostname.
  if (is_gpu) {
    CoreDetails& details =
        (*op_stats.mutable_core_id_to_details())[kDefaultGpuLocalCoreId];
    details.set_hostname(Hostname(space));
  }
  return op_stats;
}

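// Converts multiple XSpaces (e.g., one per host) into a single combined
// OpStats. Each XSpace is converted independently, and the results are merged
// over the intersection of their steps.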
Status ConvertMultiXSpacesToCombinedOpStats(const std::vector<XSpace>& xspaces,
                                            const OpStatsOptions& options,
                                            OpStats* combined_op_stats) {
  // A shortcut code path for a single XSpace. There is no need to merge
  // OpStats if there is only a single XSpace.
  if (xspaces.size() == 1) {
    *combined_op_stats = ConvertXSpaceToOpStats(xspaces[0], options);
    return OkStatus();
  }

  // Read multiple XSpaces and convert to multiple OpStats.
  std::vector<OpStats> all_op_stats;
  all_op_stats.reserve(xspaces.size());
  for (const XSpace& xspace : xspaces) {
    all_op_stats.push_back(ConvertXSpaceToOpStats(xspace, options));
  }

  // Combine OpStats.
  std::vector<OpStatsInfo> all_op_stats_info;
  all_op_stats_info.reserve(all_op_stats.size());
  for (int i = 0; i < all_op_stats.size(); i++) {
    all_op_stats_info.emplace_back(
        &all_op_stats[i],
        ParseHardwareType(all_op_stats[i].run_environment().device_type()), i);
  }

  // Do not limit the maximum number of steps during the merge of OpStats.
  StepIntersection step_intersection =
      ComputeStepIntersectionToMergeOpStats(all_op_stats_info, kuint32max);
  CombineAllOpStats(all_op_stats_info, step_intersection, combined_op_stats);

  return OkStatus();
}

}  // namespace profiler
}  // namespace tensorflow