/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/utils/xplane_schema.h"

#include <cstdint>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/utils/tf_op_utils.h"
namespace tensorflow {
namespace profiler {

const absl::string_view kHostThreadsPlaneName = "/host:CPU";
const absl::string_view kGpuPlanePrefix = "/device:GPU:";
const absl::string_view kTpuPlanePrefix = "/device:TPU:";
const char kTpuPlaneRegex[] = {"/device:TPU:[0-9]*$"};
// TODO(b/195582092): change it to /device:custom once all literals are
// migrated.
const absl::string_view kCustomPlanePrefix = "/device:CUSTOM:";

const absl::string_view kTpuRuntimePlaneName = "/host:TPU-runtime";
const absl::string_view kCuptiDriverApiPlaneName = "/host:CUPTI";
const absl::string_view kRoctracerApiPlaneName = "/host:ROCTRACER";
const absl::string_view kMetadataPlaneName = "/host:metadata";
const absl::string_view kTFStreamzPlaneName = "/host:tfstreamz";
const absl::string_view kPythonTracerPlaneName = "/host:python-tracer";

const absl::string_view kStepLineName = "Steps";
const absl::string_view kTensorFlowNameScopeLineName = "TensorFlow Name Scope";
const absl::string_view kTensorFlowOpLineName = "TensorFlow Ops";
const absl::string_view kXlaModuleLineName = "XLA Modules";
const absl::string_view kXlaOpLineName = "XLA Ops";
const absl::string_view kXlaAsyncOpLineName = "Async XLA Ops";
const absl::string_view kKernelLaunchLineName = "Launch Stats";
const absl::string_view kSourceLineName = "Source code";

const absl::string_view kDeviceVendorNvidia = "Nvidia";
const absl::string_view kDeviceVendorAMD = "AMD";

namespace {

constexpr int kNumHostEventTypes =
    HostEventType::kLastHostEventType - HostEventType::kFirstHostEventType + 1;

constexpr int kNumStatTypes =
    StatType::kLastStatType - StatType::kFirstStatType + 1;

using HostEventTypeMap = absl::flat_hash_map<absl::string_view, HostEventType>;
using HostEventTypeStrMap =
    absl::flat_hash_map<HostEventType, absl::string_view>;
using StatTypeMap = absl::flat_hash_map<absl::string_view, StatType>;
using StatTypeStrMap = absl::flat_hash_map<StatType, absl::string_view>;

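// Returns the map from host event name (as recorded by TraceMe) to
// HostEventType. The map is built on first use and held in a function-local
// static pointer, so it is never destroyed and stays valid during shutdown.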
const HostEventTypeMap& GetHostEventTypeMap() {
  static auto* host_event_type_map = new HostEventTypeMap({
      {"UnknownHostEventType", kUnknownHostEventType},
      {"TraceContext", kTraceContext},
      {"SessionRun", kSessionRun},
      {"FunctionRun", kFunctionRun},
      {"RunGraph", kRunGraph},
      {"RunGraphDone", kRunGraphDone},
      {"TfOpRun", kTfOpRun},
      {"EagerExecute", kEagerKernelExecute},
      {"ExecutorState::Process", kExecutorStateProcess},
      {"ExecutorDoneCallback", kExecutorDoneCallback},
      {"MemoryAllocation", kMemoryAllocation},
      {"MemoryDeallocation", kMemoryDeallocation},
      // Performance counter related.
      {"RemotePerfCounter", kRemotePerf},
      // tf.data captured function events.
      {"InstantiatedCapturedFunction::Run", kTfDataCapturedFunctionRun},
      {"InstantiatedCapturedFunction::RunWithBorrowedArgs",
       kTfDataCapturedFunctionRunWithBorrowedArgs},
      {"InstantiatedCapturedFunction::RunInstantiated",
       kTfDataCapturedFunctionRunInstantiated},
      {"InstantiatedCapturedFunction::RunAsync",
       kTfDataCapturedFunctionRunAsync},
      // Loop ops.
      {"ParallelForOp", kParallelForOp},
      {"ForeverOp", kForeverOp},
      {"WhileOp-EvalCond", kWhileOpEvalCond},
      {"WhileOp-StartBody", kWhileOpStartBody},
      {"ForOp", kForOp},
      // tf.data related.
      {"IteratorGetNextOp::DoCompute", kIteratorGetNextOp},
      {"IteratorGetNextAsOptionalOp::DoCompute", kIteratorGetNextAsOptionalOp},
      {"Iterator", kIterator},
      {"Iterator::Prefetch::Generator", kDeviceInputPipelineSecondIterator},
      {"PrefetchProduce", kPrefetchProduce},
      {"PrefetchConsume", kPrefetchConsume},
      {"ParallelInterleaveProduce", kParallelInterleaveProduce},
      {"ParallelInterleaveConsume", kParallelInterleaveConsume},
      {"ParallelInterleaveInitializeInput",
       kParallelInterleaveInitializedInput},
      {"ParallelMapProduce", kParallelMapProduce},
      {"ParallelMapConsume", kParallelMapConsume},
      {"MapAndBatchProduce", kMapAndBatchProduce},
      {"MapAndBatchConsume", kMapAndBatchConsume},
      {"ParseExampleProduce", kParseExampleProduce},
      {"ParseExampleConsume", kParseExampleConsume},
      {"ParallelBatchProduce", kParallelBatchProduce},
      {"ParallelBatchConsume", kParallelBatchConsume},
      // Batching related.
      {"BatchingSessionRun", kBatchingSessionRun},
      {"ProcessBatch", kProcessBatch},
      {"ConcatInputTensors", kConcatInputTensors},
      {"MergeInputTensors", kMergeInputTensors},
      {"ScheduleWithoutSplit", kScheduleWithoutSplit},
      {"ScheduleWithSplit", kScheduleWithSplit},
      {"ScheduleWithEagerSplit", kScheduleWithEagerSplit},
      {"ASBSQueue::Schedule", kASBSQueueSchedule},
      // TFRT related.
      {"TfrtModelRun", kTfrtModelRun},
      // JAX related.
      {"LocalExecutable::ExecuteOnLocalDevices", kExecuteOnLocalDevices},
      // GPU related.
      {"KernelLaunch", kKernelLaunch},
      {"KernelExecute", kKernelExecute},
      // TPU related.
      {"EnqueueRequestLocked", kEnqueueRequestLocked},
      {"RunProgramRequest", kRunProgramRequest},
      {"HostCallbackRequest", kHostCallbackRequest},
      {"TransferH2DRequest", kTransferH2DRequest},
      {"TransferPreprocessedH2DRequest", kTransferPreprocessedH2DRequest},
      {"TransferD2HRequest", kTransferD2HRequest},
      {"OnDeviceSendRequest", kOnDeviceSendRequest},
      {"OnDeviceRecvRequest", kOnDeviceRecvRequest},
      {"OnDeviceSendRecvLocalRequest", kOnDeviceSendRecvLocalRequest},
      {"CustomWait", kCustomWait},
      {"OnDeviceSendRequestMulti", kOnDeviceSendRequestMulti},
      {"OnDeviceRecvRequestMulti", kOnDeviceRecvRequestMulti},
      {"PjrtAsyncWait", kPjrtAsyncWait},
      {"DoEnqueueProgram", kDoEnqueueProgram},
      {"DoEnqueueContinuationProgram", kDoEnqueueContinuationProgram},
      {"WriteHbm", kWriteHbm},
      {"ReadHbm", kReadHbm},
      {"TpuExecuteOp", kTpuExecuteOp},
      {"CompleteCallbacks", kCompleteCallbacks},
      {"TPUPartitionedCallOp-InitializeVarOnTPU",
       kTpuPartitionedCallOpInitializeVarOnTpu},
      {"TPUPartitionedCallOp-ExecuteRemote",
       kTpuPartitionedCallOpExecuteRemote},
      {"TPUPartitionedCallOp-ExecuteLocal", kTpuPartitionedCallOpExecuteLocal},
      {"Linearize", kLinearize},
      {"Delinearize", kDelinearize},
      {"TransferBufferFromDevice-FastPath", kTransferBufferFromDeviceFastPath},
      {"tpu::System::TransferToDevice=>IssueEvent",
       kTransferToDeviceIssueEvent},
      {"tpu::System::TransferToDevice=>IssueEvent=>Done",
       kTransferToDeviceDone},
      {"tpu::System::TransferFromDevice=>IssueEvent",
       kTransferFromDeviceIssueEvent},
      {"tpu::System::TransferFromDevice=>IssueEvent=>Done",
       kTransferFromDeviceDone},
      {"tpu::System::Execute", kTpuSystemExecute},
  });
  DCHECK_EQ(host_event_type_map->size(), kNumHostEventTypes);
  return *host_event_type_map;
}

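// Returns the map from stat name (XStat metadata name) to StatType. Like the
// event-type map above, it is built on first use and never destroyed.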
const StatTypeMap& GetStatTypeMap() {
  static auto* stat_type_map = new StatTypeMap({
      {"UnknownStatType", kUnknownStatType},
      // TraceMe arguments.
      {"id", kStepId},
      {"device_ordinal", kDeviceOrdinal},
      {"chip_ordinal", kChipOrdinal},
      {"node_ordinal", kNodeOrdinal},
      {"model_id", kModelId},
      {"queue_addr", kQueueAddr},
      {"queue_id", kQueueId},
      {"request_id", kRequestId},
      {"run_id", kRunId},
      {"replica_id", kReplicaId},
      {"graph_type", kGraphType},
      {"step_num", kStepNum},
      {"iter_num", kIterNum},
      {"index_on_host", kIndexOnHost},
      {"allocator_name", kAllocatorName},
      {"bytes_reserved", kBytesReserved},
      {"bytes_allocated", kBytesAllocated},
      {"bytes_available", kBytesAvailable},
      {"fragmentation", kFragmentation},
      {"peak_bytes_in_use", kPeakBytesInUse},
      {"requested_bytes", kRequestedBytes},
      {"allocation_bytes", kAllocationBytes},
      {"addr", kAddress},
      {"region_type", kRegionType},
      {"data_type", kDataType},
      {"shape", kTensorShapes},
      {"layout", kTensorLayout},
      {"kpi_name", kKpiName},
      {"kpi_value", kKpiValue},
      {"element_id", kElementId},
      {"parent_id", kParentId},
      // XPlane semantics related.
      {"_pt", kProducerType},
      {"_ct", kConsumerType},
      {"_p", kProducerId},
      {"_c", kConsumerId},
      {"_r", kIsRoot},
      {"_a", kIsAsync},
      // Device trace arguments.
      {"device_id", kDeviceId},
      {"device_type_string", kDeviceTypeString},
      {"context_id", kContextId},
      {"correlation_id", kCorrelationId},
      {"memcpy_details", kMemcpyDetails},
      {"memalloc_details", kMemallocDetails},
      {"MemFree_details", kMemFreeDetails},
      {"Memset_details", kMemsetDetails},
      {"MemoryResidency_details", kMemoryResidencyDetails},
      {"kernel_details", kKernelDetails},
      {"nvtx_range", kNVTXRange},
      {"stream", kStream},
      // Stats added when processing traces.
      {"group_id", kGroupId},
      {"flow", kFlow},
      {"step_name", kStepName},
      {"tf_op", kTfOp},
      {"hlo_op", kHloOp},
      {"hlo_category", kHloCategory},
      {"hlo_module", kHloModule},
      {"program_id", kProgramId},
      {"equation", kEquation},
      {"is_eager", kIsEager},
      {"is_func", kIsFunc},
      {"tf_function_call", kTfFunctionCall},
      {"tracing_count", kTfFunctionTracingCount},
      {"flops", kFlops},
      {"bytes_accessed", kBytesAccessed},
      {"source", kSourceInfo},
      {"model_name", kModelName},
      {"model_version", kModelVersion},
      {"bytes_transferred", kBytesTransferred},
      {"queue", kDmaQueue},
      // Performance counter related.
      {"Raw Value", kRawValue},
      {"Scaled Value", kScaledValue},
      {"Thread Id", kThreadId},
      {"matrix_unit_utilization_percent", kMatrixUnitUtilizationPercent},
      // XLA metadata map related.
      {"Hlo Proto", kHloProto},
      // Device capability related.
      {"clock_rate", kDevCapClockRateKHz},
      {"core_count", kDevCapCoreCount},
      {"memory_bandwidth", kDevCapMemoryBandwidth},
      {"memory_size", kDevCapMemorySize},
      {"compute_cap_major", kDevCapComputeCapMajor},
      {"compute_cap_minor", kDevCapComputeCapMinor},
      {"peak_teraflops_per_second", kDevCapPeakTeraflopsPerSecond},
      {"peak_hbm_bw_gigabytes_per_second", kDevCapPeakHbmBwGigabytesPerSecond},
      {"device_vendor", kDevVendor},
      // Batching related.
      {"batch_size_after_padding", kBatchSizeAfterPadding},
      {"padding_amount", kPaddingAmount},
      {"batching_input_task_size", kBatchingInputTaskSize},
      // GPU related metrics.
      {"theoretical_occupancy_pct", kTheoreticalOccupancyPct},
      {"occupancy_min_grid_size", kOccupancyMinGridSize},
      {"occupancy_suggested_block_size", kOccupancySuggestedBlockSize},
      // Aggregated stats.
      {"self_duration_ps", kSelfDurationPs},
      {"min_duration_ps", kMinDurationPs},
      {"max_iteration_num", kMaxIterationNum},
      {"device_type", kDeviceType},
      {"uses_megacore", kUsesMegaCore},
      {"symbol_id", kSymbolId},
      {"tf_op_name", kTfOpName},
      {"dma_stall_duration_ps", kDmaStallDurationPs},
  });
  DCHECK_EQ(stat_type_map->size(), kNumStatTypes);
  return *stat_type_map;
}

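// Reverse map: HostEventType -> event name. Built once from
// GetHostEventTypeMap() via gtl::ReverseMap, so it covers every registered
// event type.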
const HostEventTypeStrMap& GetHostEventTypeStrMap() {
  static auto* host_event_type_str_map = new HostEventTypeStrMap(
      gtl::ReverseMap<HostEventTypeStrMap>(GetHostEventTypeMap()));
  return *host_event_type_str_map;
}

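// Reverse map: StatType -> stat name, built once from GetStatTypeMap().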
const StatTypeStrMap& GetStatTypeStrMap() {
  static auto* stat_type_str_map =
      new StatTypeStrMap(gtl::ReverseMap<StatTypeStrMap>(GetStatTypeMap()));
  return *stat_type_str_map;
}

}  // namespace

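// Returns the name of a known HostEventType. The lookup uses .at(), so
// event_type must be one of the registered enum values.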
absl::string_view GetHostEventTypeStr(HostEventType event_type) {
  return GetHostEventTypeStrMap().at(event_type);
}

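// Returns the HostEventType for event_name, or absl::nullopt if the name is
// not registered. For example, FindHostEventType("TraceContext") yields
// HostEventType::kTraceContext.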
absl::optional<int64_t> FindHostEventType(absl::string_view event_name) {
  if (auto event_type = gtl::FindOrNull(GetHostEventTypeMap(), event_name)) {
    return *event_type;
  }
  return absl::nullopt;
}

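// Classifies an event whose name is a TF op fullname: TensorFlow ops map to
// kTfOpRun, tf.data ops map to kIterator, anything else to absl::nullopt.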
absl::optional<int64_t> FindTfOpEventType(absl::string_view event_name) {
  // TF op names.
  Category category = ParseTfOpFullname(event_name).category;
  switch (category) {
    case Category::kTensorFlow:
      return HostEventType::kTfOpRun;
    case Category::kTfData:
      return HostEventType::kIterator;
    default:
      return absl::nullopt;
  }
}

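// Returns the name of a known StatType; stat_type must be a registered value.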
absl::string_view GetStatTypeStr(StatType stat_type) {
  return GetStatTypeStrMap().at(stat_type);
}

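// Returns the StatType for stat_name, or absl::nullopt if the name is not
// registered. For example, FindStatType("group_id") yields StatType::kGroupId.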
absl::optional<int64_t> FindStatType(absl::string_view stat_name) {
  if (auto stat_type = gtl::FindOrNull(GetStatTypeMap(), stat_name)) {
    return *stat_type;
  }
  return absl::nullopt;
}

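// Returns true if the event type is one treated as internal to the profiler:
// memory (de)allocation events and tf.data produce/consume bookkeeping events.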
bool IsInternalEvent(absl::optional<int64_t> event_type) {
  // TODO(b/162102421): Introduce a prefix for internal event names.
  if (!event_type.has_value()) return false;
  switch (*event_type) {
    case HostEventType::kMemoryAllocation:
    case HostEventType::kMemoryDeallocation:
    case HostEventType::kPrefetchProduce:
    case HostEventType::kPrefetchConsume:
    case HostEventType::kParallelInterleaveProduce:
    case HostEventType::kParallelInterleaveConsume:
    case HostEventType::kParallelInterleaveInitializedInput:
    case HostEventType::kParallelMapProduce:
    case HostEventType::kParallelMapConsume:
    case HostEventType::kMapAndBatchProduce:
    case HostEventType::kMapAndBatchConsume:
    case HostEventType::kParseExampleProduce:
    case HostEventType::kParseExampleConsume:
      return true;
    default:
      return false;
  }
}

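// Returns true if the stat type is one treated as internal to the profiler,
// e.g. the producer/consumer flow-linking stats and raw cost stats below.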
bool IsInternalStat(absl::optional<int64_t> stat_type) {
  // TODO(b/162102421): Introduce a prefix for internal stat names.
  if (!stat_type.has_value()) return false;
  switch (*stat_type) {
    case StatType::kKernelDetails:
    case StatType::kProducerType:
    case StatType::kProducerId:
    case StatType::kConsumerType:
    case StatType::kConsumerId:
    case StatType::kIsRoot:
    case StatType::kFlops:
    case StatType::kBytesAccessed:
    case StatType::kProgramId:
    case StatType::kSymbolId:
      return true;
    default:
      return false;
  }
}

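// Monotonically increasing counter that XFlow uses to generate unique flow
// ids (see xplane_schema.h); the atomic makes id generation thread-safe.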
/*static*/ std::atomic<uint64_t> XFlow::next_flow_id_(0);

}  // namespace profiler
}  // namespace tensorflow