xref: /aosp_15_r20/external/tensorflow/tensorflow/core/profiler/utils/xplane_schema.cc (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/utils/xplane_schema.h"
17 
18 #include <cstdint>
19 
20 #include "absl/container/flat_hash_map.h"
21 #include "absl/strings/string_view.h"
22 #include "absl/types/optional.h"
23 #include "tensorflow/core/lib/gtl/map_util.h"
24 #include "tensorflow/core/platform/logging.h"
25 #include "tensorflow/core/platform/types.h"
26 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
27 
28 namespace tensorflow {
29 namespace profiler {
30 
31 const absl::string_view kHostThreadsPlaneName = "/host:CPU";
32 const absl::string_view kGpuPlanePrefix = "/device:GPU:";
33 const absl::string_view kTpuPlanePrefix = "/device:TPU:";
34 const char kTpuPlaneRegex[] = {"/device:TPU:[0-9]*$"};
35 // TODO(b/195582092): change it to /device:custom once all literals are
36 // migrated.
37 const absl::string_view kCustomPlanePrefix = "/device:CUSTOM:";
38 
39 const absl::string_view kTpuRuntimePlaneName = "/host:TPU-runtime";
40 const absl::string_view kCuptiDriverApiPlaneName = "/host:CUPTI";
41 const absl::string_view kRoctracerApiPlaneName = "/host:ROCTRACER";
42 const absl::string_view kMetadataPlaneName = "/host:metadata";
43 const absl::string_view kTFStreamzPlaneName = "/host:tfstreamz";
44 const absl::string_view kPythonTracerPlaneName = "/host:python-tracer";
45 
46 const absl::string_view kStepLineName = "Steps";
47 const absl::string_view kTensorFlowNameScopeLineName = "TensorFlow Name Scope";
48 const absl::string_view kTensorFlowOpLineName = "TensorFlow Ops";
49 const absl::string_view kXlaModuleLineName = "XLA Modules";
50 const absl::string_view kXlaOpLineName = "XLA Ops";
51 const absl::string_view kXlaAsyncOpLineName = "Async XLA Ops";
52 const absl::string_view kKernelLaunchLineName = "Launch Stats";
53 const absl::string_view kSourceLineName = "Source code";
54 
55 const absl::string_view kDeviceVendorNvidia = "Nvidia";
56 const absl::string_view kDeviceVendorAMD = "AMD";
57 
58 namespace {
59 
60 constexpr int kNumHostEventTypes =
61     HostEventType::kLastHostEventType - HostEventType::kFirstHostEventType + 1;
62 
63 constexpr int kNumStatTypes =
64     StatType::kLastStatType - StatType::kFirstStatType + 1;
65 
66 using HostEventTypeMap = absl::flat_hash_map<absl::string_view, HostEventType>;
67 using HostEventTypeStrMap =
68     absl::flat_hash_map<HostEventType, absl::string_view>;
69 using StatTypeMap = absl::flat_hash_map<absl::string_view, StatType>;
70 using StatTypeStrMap = absl::flat_hash_map<StatType, absl::string_view>;
71 
GetHostEventTypeMap()72 const HostEventTypeMap& GetHostEventTypeMap() {
73   static auto* host_event_type_map = new HostEventTypeMap({
74       {"UnknownHostEventType", kUnknownHostEventType},
75       {"TraceContext", kTraceContext},
76       {"SessionRun", kSessionRun},
77       {"FunctionRun", kFunctionRun},
78       {"RunGraph", kRunGraph},
79       {"RunGraphDone", kRunGraphDone},
80       {"TfOpRun", kTfOpRun},
81       {"EagerExecute", kEagerKernelExecute},
82       {"ExecutorState::Process", kExecutorStateProcess},
83       {"ExecutorDoneCallback", kExecutorDoneCallback},
84       {"MemoryAllocation", kMemoryAllocation},
85       {"MemoryDeallocation", kMemoryDeallocation},
86       // Performance counter related.
87       {"RemotePerfCounter", kRemotePerf},
88       // tf data captured function events.
89       {"InstantiatedCapturedFunction::Run", kTfDataCapturedFunctionRun},
90       {"InstantiatedCapturedFunction::RunWithBorrowedArgs",
91        kTfDataCapturedFunctionRunWithBorrowedArgs},
92       {"InstantiatedCapturedFunction::RunInstantiated",
93        kTfDataCapturedFunctionRunInstantiated},
94       {"InstantiatedCapturedFunction::RunAsync",
95        kTfDataCapturedFunctionRunAsync},
96       // Loop ops.
97       {"ParallelForOp", kParallelForOp},
98       {"ForeverOp", kForeverOp},
99       {"WhileOp-EvalCond", kWhileOpEvalCond},
100       {"WhileOp-StartBody", kWhileOpStartBody},
101       {"ForOp", kForOp},
102       // tf.data related.
103       {"IteratorGetNextOp::DoCompute", kIteratorGetNextOp},
104       {"IteratorGetNextAsOptionalOp::DoCompute", kIteratorGetNextAsOptionalOp},
105       {"Iterator", kIterator},
106       {"Iterator::Prefetch::Generator", kDeviceInputPipelineSecondIterator},
107       {"PrefetchProduce", kPrefetchProduce},
108       {"PrefetchConsume", kPrefetchConsume},
109       {"ParallelInterleaveProduce", kParallelInterleaveProduce},
110       {"ParallelInterleaveConsume", kParallelInterleaveConsume},
111       {"ParallelInterleaveInitializeInput",
112        kParallelInterleaveInitializedInput},
113       {"ParallelMapProduce", kParallelMapProduce},
114       {"ParallelMapConsume", kParallelMapConsume},
115       {"MapAndBatchProduce", kMapAndBatchProduce},
116       {"MapAndBatchConsume", kMapAndBatchConsume},
117       {"ParseExampleProduce", kParseExampleProduce},
118       {"ParseExampleConsume", kParseExampleConsume},
119       {"ParallelBatchProduce", kParallelBatchProduce},
120       {"ParallelBatchConsume", kParallelBatchConsume},
121       // Batching related.
122       {"BatchingSessionRun", kBatchingSessionRun},
123       {"ProcessBatch", kProcessBatch},
124       {"ConcatInputTensors", kConcatInputTensors},
125       {"MergeInputTensors", kMergeInputTensors},
126       {"ScheduleWithoutSplit", kScheduleWithoutSplit},
127       {"ScheduleWithSplit", kScheduleWithSplit},
128       {"ScheduleWithEagerSplit", kScheduleWithEagerSplit},
129       {"ASBSQueue::Schedule", kASBSQueueSchedule},
130       // TFRT related.
131       {"TfrtModelRun", kTfrtModelRun},
132       // JAX related.
133       {"LocalExecutable::ExecuteOnLocalDevices", kExecuteOnLocalDevices},
134       // GPU related.
135       {"KernelLaunch", kKernelLaunch},
136       {"KernelExecute", kKernelExecute},
137       // TPU related.
138       {"EnqueueRequestLocked", kEnqueueRequestLocked},
139       {"RunProgramRequest", kRunProgramRequest},
140       {"HostCallbackRequest", kHostCallbackRequest},
141       {"TransferH2DRequest", kTransferH2DRequest},
142       {"TransferPreprocessedH2DRequest", kTransferPreprocessedH2DRequest},
143       {"TransferD2HRequest", kTransferD2HRequest},
144       {"OnDeviceSendRequest", kOnDeviceSendRequest},
145       {"OnDeviceRecvRequest", kOnDeviceRecvRequest},
146       {"OnDeviceSendRecvLocalRequest", kOnDeviceSendRecvLocalRequest},
147       {"CustomWait", kCustomWait},
148       {"OnDeviceSendRequestMulti", kOnDeviceSendRequestMulti},
149       {"OnDeviceRecvRequestMulti", kOnDeviceRecvRequestMulti},
150       {"PjrtAsyncWait", kPjrtAsyncWait},
151       {"DoEnqueueProgram", kDoEnqueueProgram},
152       {"DoEnqueueContinuationProgram", kDoEnqueueContinuationProgram},
153       {"WriteHbm", kWriteHbm},
154       {"ReadHbm", kReadHbm},
155       {"TpuExecuteOp", kTpuExecuteOp},
156       {"CompleteCallbacks", kCompleteCallbacks},
157       {"TPUPartitionedCallOp-InitializeVarOnTPU",
158        kTpuPartitionedCallOpInitializeVarOnTpu},
159       {"TPUPartitionedCallOp-ExecuteRemote",
160        kTpuPartitionedCallOpExecuteRemote},
161       {"TPUPartitionedCallOp-ExecuteLocal", kTpuPartitionedCallOpExecuteLocal},
162       {"Linearize", kLinearize},
163       {"Delinearize", kDelinearize},
164       {"TransferBufferFromDevice-FastPath", kTransferBufferFromDeviceFastPath},
165       {"tpu::System::TransferToDevice=>IssueEvent",
166        kTransferToDeviceIssueEvent},
167       {"tpu::System::TransferToDevice=>IssueEvent=>Done",
168        kTransferToDeviceDone},
169       {"tpu::System::TransferFromDevice=>IssueEvent",
170        kTransferFromDeviceIssueEvent},
171       {"tpu::System::TransferFromDevice=>IssueEvent=>Done",
172        kTransferFromDeviceDone},
173       {"tpu::System::Execute", kTpuSystemExecute},
174   });
175   DCHECK_EQ(host_event_type_map->size(), kNumHostEventTypes);
176   return *host_event_type_map;
177 }
178 
GetStatTypeMap()179 const StatTypeMap& GetStatTypeMap() {
180   static auto* stat_type_map = new StatTypeMap({
181       {"UnknownStatType", kUnknownStatType},
182       // TraceMe arguments.
183       {"id", kStepId},
184       {"device_ordinal", kDeviceOrdinal},
185       {"chip_ordinal", kChipOrdinal},
186       {"node_ordinal", kNodeOrdinal},
187       {"model_id", kModelId},
188       {"queue_addr", kQueueAddr},
189       {"queue_id", kQueueId},
190       {"request_id", kRequestId},
191       {"run_id", kRunId},
192       {"replica_id", kReplicaId},
193       {"graph_type", kGraphType},
194       {"step_num", kStepNum},
195       {"iter_num", kIterNum},
196       {"index_on_host", kIndexOnHost},
197       {"allocator_name", kAllocatorName},
198       {"bytes_reserved", kBytesReserved},
199       {"bytes_allocated", kBytesAllocated},
200       {"bytes_available", kBytesAvailable},
201       {"fragmentation", kFragmentation},
202       {"peak_bytes_in_use", kPeakBytesInUse},
203       {"requested_bytes", kRequestedBytes},
204       {"allocation_bytes", kAllocationBytes},
205       {"addr", kAddress},
206       {"region_type", kRegionType},
207       {"data_type", kDataType},
208       {"shape", kTensorShapes},
209       {"layout", kTensorLayout},
210       {"kpi_name", kKpiName},
211       {"kpi_value", kKpiValue},
212       {"element_id", kElementId},
213       {"parent_id", kParentId},
214       // XPlane semantics related.
215       {"_pt", kProducerType},
216       {"_ct", kConsumerType},
217       {"_p", kProducerId},
218       {"_c", kConsumerId},
219       {"_r", kIsRoot},
220       {"_a", kIsAsync},
221       // Device trace arguments.
222       {"device_id", kDeviceId},
223       {"device_type_string", kDeviceTypeString},
224       {"context_id", kContextId},
225       {"correlation_id", kCorrelationId},
226       {"memcpy_details", kMemcpyDetails},
227       {"memalloc_details", kMemallocDetails},
228       {"MemFree_details", kMemFreeDetails},
229       {"Memset_details", kMemsetDetails},
230       {"MemoryResidency_details", kMemoryResidencyDetails},
231       {"kernel_details", kKernelDetails},
232       {"nvtx_range", kNVTXRange},
233       {"stream", kStream},
234       // Stats added when processing traces.
235       {"group_id", kGroupId},
236       {"flow", kFlow},
237       {"step_name", kStepName},
238       {"tf_op", kTfOp},
239       {"hlo_op", kHloOp},
240       {"hlo_category", kHloCategory},
241       {"hlo_module", kHloModule},
242       {"program_id", kProgramId},
243       {"equation", kEquation},
244       {"is_eager", kIsEager},
245       {"is_func", kIsFunc},
246       {"tf_function_call", kTfFunctionCall},
247       {"tracing_count", kTfFunctionTracingCount},
248       {"flops", kFlops},
249       {"bytes_accessed", kBytesAccessed},
250       {"source", kSourceInfo},
251       {"model_name", kModelName},
252       {"model_version", kModelVersion},
253       {"bytes_transferred", kBytesTransferred},
254       {"queue", kDmaQueue},
255       // Performance counter related.
256       {"Raw Value", kRawValue},
257       {"Scaled Value", kScaledValue},
258       {"Thread Id", kThreadId},
259       {"matrix_unit_utilization_percent", kMatrixUnitUtilizationPercent},
260       // XLA metadata map related.
261       {"Hlo Proto", kHloProto},
262       // Device capability related.
263       {"clock_rate", kDevCapClockRateKHz},
264       {"core_count", kDevCapCoreCount},
265       {"memory_bandwidth", kDevCapMemoryBandwidth},
266       {"memory_size", kDevCapMemorySize},
267       {"compute_cap_major", kDevCapComputeCapMajor},
268       {"compute_cap_minor", kDevCapComputeCapMinor},
269       {"peak_teraflops_per_second", kDevCapPeakTeraflopsPerSecond},
270       {"peak_hbm_bw_gigabytes_per_second", kDevCapPeakHbmBwGigabytesPerSecond},
271       {"device_vendor", kDevVendor},
272       // Batching related.
273       {"batch_size_after_padding", kBatchSizeAfterPadding},
274       {"padding_amount", kPaddingAmount},
275       {"batching_input_task_size", kBatchingInputTaskSize},
276       // GPU related metrics.
277       {"theoretical_occupancy_pct", kTheoreticalOccupancyPct},
278       {"occupancy_min_grid_size", kOccupancyMinGridSize},
279       {"occupancy_suggested_block_size", kOccupancySuggestedBlockSize},
280       // Aggregrated Stat
281       {"self_duration_ps", kSelfDurationPs},
282       {"min_duration_ps", kMinDurationPs},
283       {"max_iteration_num", kMaxIterationNum},
284       {"device_type", kDeviceType},
285       {"uses_megacore", kUsesMegaCore},
286       {"symbol_id", kSymbolId},
287       {"hlo_category", kHloCategory},
288       {"tf_op_name", kTfOpName},
289       {"dma_stall_duration_ps", kDmaStallDurationPs},
290   });
291   DCHECK_EQ(stat_type_map->size(), kNumStatTypes);
292   return *stat_type_map;
293 }
294 
GetHostEventTypeStrMap()295 const HostEventTypeStrMap& GetHostEventTypeStrMap() {
296   static auto* host_event_type_str_map = new HostEventTypeStrMap(
297       gtl::ReverseMap<HostEventTypeStrMap>(GetHostEventTypeMap()));
298   return *host_event_type_str_map;
299 }
300 
GetStatTypeStrMap()301 const StatTypeStrMap& GetStatTypeStrMap() {
302   static auto* stat_type_str_map =
303       new StatTypeStrMap(gtl::ReverseMap<StatTypeStrMap>(GetStatTypeMap()));
304   return *stat_type_str_map;
305 }
306 
307 }  // namespace
308 
GetHostEventTypeStr(HostEventType event_type)309 absl::string_view GetHostEventTypeStr(HostEventType event_type) {
310   return GetHostEventTypeStrMap().at(event_type);
311 }
312 
FindHostEventType(absl::string_view event_name)313 absl::optional<int64_t> FindHostEventType(absl::string_view event_name) {
314   if (auto event_type = gtl::FindOrNull(GetHostEventTypeMap(), event_name)) {
315     return *event_type;
316   }
317   return absl::nullopt;
318 }
319 
FindTfOpEventType(absl::string_view event_name)320 absl::optional<int64_t> FindTfOpEventType(absl::string_view event_name) {
321   // TF op names.
322   Category category = ParseTfOpFullname(event_name).category;
323   switch (category) {
324     case Category::kTensorFlow:
325       return HostEventType::kTfOpRun;
326     case Category::kTfData:
327       return HostEventType::kIterator;
328     default:
329       return absl::nullopt;
330   }
331 }
332 
GetStatTypeStr(StatType stat_type)333 absl::string_view GetStatTypeStr(StatType stat_type) {
334   return GetStatTypeStrMap().at(stat_type);
335 }
336 
FindStatType(absl::string_view stat_name)337 absl::optional<int64_t> FindStatType(absl::string_view stat_name) {
338   if (auto stat_type = gtl::FindOrNull(GetStatTypeMap(), stat_name)) {
339     return *stat_type;
340   }
341   return absl::nullopt;
342 }
343 
IsInternalEvent(absl::optional<int64_t> event_type)344 bool IsInternalEvent(absl::optional<int64_t> event_type) {
345   // TODO(b/162102421): Introduce a prefix for internal event names.
346   if (!event_type.has_value()) return false;
347   switch (*event_type) {
348     case HostEventType::kMemoryAllocation:
349     case HostEventType::kMemoryDeallocation:
350     case HostEventType::kPrefetchProduce:
351     case HostEventType::kPrefetchConsume:
352     case HostEventType::kParallelInterleaveProduce:
353     case HostEventType::kParallelInterleaveConsume:
354     case HostEventType::kParallelInterleaveInitializedInput:
355     case HostEventType::kParallelMapProduce:
356     case HostEventType::kParallelMapConsume:
357     case HostEventType::kMapAndBatchProduce:
358     case HostEventType::kMapAndBatchConsume:
359     case HostEventType::kParseExampleProduce:
360     case HostEventType::kParseExampleConsume:
361       return true;
362     default:
363       return false;
364   }
365 }
366 
IsInternalStat(absl::optional<int64_t> stat_type)367 bool IsInternalStat(absl::optional<int64_t> stat_type) {
368   // TODO(b/162102421): Introduce a prefix for internal stat names.
369   if (!stat_type.has_value()) return false;
370   switch (*stat_type) {
371     case StatType::kKernelDetails:
372     case StatType::kProducerType:
373     case StatType::kProducerId:
374     case StatType::kConsumerType:
375     case StatType::kConsumerId:
376     case StatType::kIsRoot:
377     case StatType::kFlops:
378     case StatType::kBytesAccessed:
379     case StatType::kProgramId:
380     case StatType::kSymbolId:
381       return true;
382     default:
383       return false;
384   }
385 }
386 
387 /*static*/ std::atomic<uint64_t> XFlow::next_flow_id_(0);
388 
389 }  // namespace profiler
390 }  // namespace tensorflow
391