/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/convert/xplane_to_op_stats.h"

#include <string>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/match.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
#include "tensorflow/core/profiler/convert/op_stats_combiner.h"
#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_step_events.h"
#include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h"
#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
#include "tensorflow/core/profiler/protobuf/tf_function.pb.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
#include "tensorflow/core/profiler/utils/device_caps_utils.h"
#include "tensorflow/core/profiler/utils/event_span.h"
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
#include "tensorflow/core/profiler/utils/math_utils.h"
#include "tensorflow/core/profiler/utils/step_intersection.h"
#include "tensorflow/core/profiler/utils/tf_op_utils.h"
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
#include "tensorflow/core/profiler/utils/tpu_xplane_utils.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"
#include "tensorflow/core/profiler/utils/xplane_visitor.h"

namespace tensorflow {
namespace profiler {
namespace {

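// Returns the single hostname recorded in the XSpace, or "localhost" if none
// was recorded.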
std::string Hostname(const XSpace& space) {
  if (space.hostnames().empty()) return "localhost";
  DCHECK_EQ(space.hostnames_size(), 1);
  const std::string& hostname = space.hostnames(0);
  // This shouldn't be a taskname in host:port format.
  DCHECK(!absl::StrContains(hostname, ':'));
  return hostname;
}

}  // namespace

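// Builds a PerfEnv from the given peak compute and memory-bandwidth numbers.
// The ridge point is the flop-per-byte ratio at which the roofline model
// switches from memory-bound to compute-bound.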
PerfEnv MakePerfEnv(double peak_tera_flops_per_second,
                    double peak_hbm_bw_giga_bytes_per_second) {
  PerfEnv result;
  result.set_peak_tera_flops_per_second(peak_tera_flops_per_second);
  result.set_peak_hbm_bw_giga_bytes_per_second(
      peak_hbm_bw_giga_bytes_per_second);
  result.set_ridge_point(TeraToGiga(peak_tera_flops_per_second) /
                         peak_hbm_bw_giga_bytes_per_second);
  return result;
}

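// Derives the PerfEnv from a device plane: non-TPU (e.g., GPU) planes use the
// device capabilities, while TPU planes read the peak flops and HBM bandwidth
// stats recorded on the plane.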
PerfEnv GetPerfEnvFromXPlane(const XPlane& device_plane) {
  DeviceCapabilities cap = GetDeviceCaps(device_plane);
  if (!absl::StartsWith(device_plane.name(), kTpuPlanePrefix)) {
    return MakePerfEnv(
        GigaToTera(GetFlopMaxThroughputPerSM(cap)) * cap.num_cores(),
        UniToGiga(cap.memory_bandwidth()));
  } else {
    XPlaneVisitor visitor = CreateTfXPlaneVisitor(&device_plane);
    auto peak_tera_flops_per_second =
        visitor.GetStat(StatType::kDevCapPeakTeraflopsPerSecond);
    auto peak_hbm_bw_giga_bytes_per_second =
        visitor.GetStat(StatType::kDevCapPeakHbmBwGigabytesPerSecond);
    return MakePerfEnv(peak_tera_flops_per_second->DoubleValue(),
                       peak_hbm_bw_giga_bytes_per_second->DoubleValue());
  }
}

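// Populates the RunEnvironment with the hostname, device type, and device
// core count found in the XSpace (GPU, TPU, or CPU-only).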
void SetRunEnvironment(const XSpace& space, RunEnvironment* env) {
  // Currently, we only support profiling one host and one program.
  env->set_host_count(1);
  env->set_task_count(1);
  env->mutable_hostnames()->insert({Hostname(space), true});

  std::vector<const XPlane*> gpu_planes =
      FindPlanesWithPrefix(space, kGpuPlanePrefix);
  if (!gpu_planes.empty()) {
    absl::string_view gpu_model =
        GpuModelName(GetDeviceCaps(*gpu_planes.front()));
    if (!gpu_model.empty()) {
      env->set_device_type(std::string(gpu_model));
    } else {
      env->set_device_type("GPU");
    }
    env->set_device_core_count(gpu_planes.size());
  } else if (std::vector<const XPlane*> tpu_planes =
                 FindTensorCorePlanes(space);
             !tpu_planes.empty()) {
    XPlaneVisitor visitor = CreateTfXPlaneVisitor(tpu_planes.at(0));
    auto xstat = visitor.GetStat(StatType::kDeviceTypeString);
    if (xstat.has_value()) {
      env->set_device_type(std::string(xstat->StrOrRefValue()));
    }
    env->set_device_core_count(tpu_planes.size());
  } else {
    env->set_device_type("CPU");
    env->set_device_core_count(0);
  }
}

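// Copies the de-duplicated errors and warnings from the XSpace into the
// OpStats diagnostics.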
void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
                                         OpStats* op_stats) {
  if (!space.errors().empty()) {
    absl::flat_hash_set<std::string> unique_errors;
    unique_errors.insert(space.errors().begin(), space.errors().end());
    *op_stats->mutable_diagnostics()->mutable_errors() = {unique_errors.begin(),
                                                          unique_errors.end()};
  }
  if (!space.warnings().empty()) {
    absl::flat_hash_set<std::string> unique_warnings;
    unique_warnings.insert(space.warnings().begin(), space.warnings().end());
    *op_stats->mutable_diagnostics()->mutable_warnings() = {
        unique_warnings.begin(), unique_warnings.end()};
  }
}

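// Converts a single XSpace into OpStats, generating the op metrics, step, and
// kernel stats databases requested in `options`.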
OpStats ConvertXSpaceToOpStats(const XSpace& space,
                               const OpStatsOptions& options) {
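  // Prefer TPU TensorCore planes; if none are present, treat the profile as
  // GPU and look for GPU device planes instead.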
  std::vector<const XPlane*> device_planes = FindTensorCorePlanes(space);
  bool is_gpu = device_planes.empty();
  if (is_gpu) {
    device_planes = FindPlanesWithPrefix(space, kGpuPlanePrefix);
  }

  OpStats op_stats;
  StepEvents step_events;
  PropagateXSpaceDiagnosticsToOpStats(space, &op_stats);
  // Convert device planes.
  OpMetricsDbCombiner op_metrics_db_combiner(
      op_stats.mutable_device_op_metrics_db());
  SetRunEnvironment(space, op_stats.mutable_run_environment());

  KernelReportMap reports;

  // TODO(b/161942993) parallelize XPlane processing per thread.
  for (const XPlane* device_trace : device_planes) {
    if (options.generate_op_metrics_db) {
      if (!op_stats.has_perf_env()) {
        *op_stats.mutable_perf_env() = GetPerfEnvFromXPlane(*device_trace);
      }
      if (is_gpu) {
        OpMetricsDb device_op_metrics_db =
            ConvertDeviceTraceXPlaneToOpMetricsDb(*device_trace);
        op_metrics_db_combiner.Combine(device_op_metrics_db);
      } else {
        XPlane aggregated_xplane;
        AggregateXPlane(*device_trace, aggregated_xplane);
        OpMetricsDb device_op_metrics_db =
            ConvertTpuDeviceTraceXPlaneToOpMetricsDb(aggregated_xplane);
        op_metrics_db_combiner.Combine(device_op_metrics_db);
      }
    }
    if (options.generate_step_db) {
      StepEvents device_step_events =
          ConvertDeviceTraceXPlaneToStepEvents(*device_trace);
      CombineStepEvents(device_step_events, &step_events);
    }
    if (options.generate_kernel_stats_db) {
      ConvertDeviceTraceXPlaneToKernelReports(*device_trace,
                                              /*on_kernel_fn=*/{}, &reports);
    }
  }

  // Combine into reports.
  if (options.generate_kernel_stats_db) {
    CopyTopKDurationKernelReportsToDb(reports,
                                      op_stats.mutable_kernel_stats_db());
  }

  bool has_device = !device_planes.empty();
  // Convert a host plane.
  const XPlane* host_plane = FindPlaneWithName(space, kHostThreadsPlaneName);
  if (host_plane) {
    if (options.generate_op_metrics_db) {
      *op_stats.mutable_host_op_metrics_db() =
          ConvertHostThreadsXPlaneToOpMetricsDb(*host_plane);
    }
    if (options.generate_step_db) {
      const StepEvents* device_step_events =
          has_device ? &step_events : nullptr;
      StepEvents host_step_events =
          ConvertHostThreadsXPlaneToStepEvents(*host_plane, device_step_events);
      CombineStepEvents(host_step_events, &step_events);
    }
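    // Record the matrix-unit utilization counter if the host plane carries it.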
    XPlaneVisitor visitor = CreateTfXPlaneVisitor(host_plane);
    auto stat = visitor.GetStat(StatType::kMatrixUnitUtilizationPercent);
    if (stat.has_value()) {
      op_stats.mutable_performance_counter_result()
          ->set_matrix_unit_utilization_percent(stat->DoubleValue());
    }
  }
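  // Flatten overlapping step events before building the step database and
  // computing precision stats.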
  if (options.generate_step_db) {
    StepEvents nonoverlapped_step_events =
        ToNonOverlappedStepEvents(step_events);
    *op_stats.mutable_step_db() = ConvertStepEventsToStepDb(
        has_device, options.maybe_drop_incomplete_steps,
        nonoverlapped_step_events);
    *op_stats.mutable_device_op_metrics_db()->mutable_precision_stats() =
        ComputePrecisionStats(nonoverlapped_step_events);
  }

  // TODO(bvandermoon): Add the TPU equivalent for setting core details hostname
  if (is_gpu) {
    CoreDetails& details =
        (*op_stats.mutable_core_id_to_details())[kDefaultGpuLocalCoreId];
    details.set_hostname(Hostname(space));
  }
  return op_stats;
}

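// Converts each XSpace to OpStats and merges them into a single OpStats,
// aligning steps across hosts via their step intersection.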
Status ConvertMultiXSpacesToCombinedOpStats(const std::vector<XSpace>& xspaces,
                                            const OpStatsOptions& options,
                                            OpStats* combined_op_stats) {
  // A shortcut code path for a single XSpace. There is no need to merge
  // OpStats if there is only a single XSpace.
  if (xspaces.size() == 1) {
    *combined_op_stats = ConvertXSpaceToOpStats(xspaces[0], options);
    return OkStatus();
  }

  // Read multiple XSpaces and convert to multiple OpStats.
  std::vector<OpStats> all_op_stats;
  all_op_stats.reserve(xspaces.size());
  for (const XSpace& xspace : xspaces) {
    all_op_stats.push_back(ConvertXSpaceToOpStats(xspace, options));
  }

  // Combine OpStats.
  std::vector<OpStatsInfo> all_op_stats_info;
  all_op_stats_info.reserve(all_op_stats.size());
  for (int i = 0; i < all_op_stats.size(); i++) {
    all_op_stats_info.emplace_back(
        &all_op_stats[i],
        ParseHardwareType(all_op_stats[i].run_environment().device_type()), i);
  }

  // Do not limit the maximum number of steps during the merge of OpStats.
  StepIntersection step_intersection =
      ComputeStepIntersectionToMergeOpStats(all_op_stats_info, kuint32max);
  CombineAllOpStats(all_op_stats_info, step_intersection, combined_op_stats);

  return OkStatus();
}

}  // namespace profiler
}  // namespace tensorflow