xref: /aosp_15_r20/external/tensorflow/tensorflow/core/profiler/utils/xplane_schema.h (revision b6fb3261f9314811a0f4371741dbb8839866f948)
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_
17 #define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_
18 
19 #include <atomic>
20 #include <cstdint>
21 #include <string>
22 
23 #include "absl/hash/hash.h"
24 #include "absl/strings/match.h"
25 #include "absl/strings/str_cat.h"
26 #include "absl/strings/string_view.h"
27 #include "absl/types/optional.h"
28 #include "tensorflow/core/platform/logging.h"
29 #include "tensorflow/core/platform/macros.h"
30 #include "tensorflow/core/platform/types.h"
31 #include "tensorflow/core/profiler/lib/context_types.h"
32 
33 namespace tensorflow {
34 namespace profiler {
35 
// Well-known XPlane/XLine names. All constants below are declared here and
// defined in the corresponding .cc file (presumably xplane_schema.cc).

// Name of XPlane that contains TraceMe events.
TF_CONST_INIT extern const absl::string_view kHostThreadsPlaneName;
// Name prefix of XPlane that contains GPU events.
TF_CONST_INIT extern const absl::string_view kGpuPlanePrefix;
// Name prefix of XPlane that contains TPU events.
TF_CONST_INIT extern const absl::string_view kTpuPlanePrefix;
// Regex for XPlanes that contain TensorCore planes.
TF_CONST_INIT extern const char kTpuPlaneRegex[];
// Name prefix of XPlane that contains custom device events.
TF_CONST_INIT extern const absl::string_view kCustomPlanePrefix;
// Name prefix of XPlane that contains TPU runtime events.
TF_CONST_INIT extern const absl::string_view kTpuRuntimePlaneName;
// Name of XPlane that contains CUPTI driver API generated events.
TF_CONST_INIT extern const absl::string_view kCuptiDriverApiPlaneName;
// Name of XPlane that contains Roctracer API generated events.
TF_CONST_INIT extern const absl::string_view kRoctracerApiPlaneName;
// Name of XPlane that contains profile metadata such as XLA debug info.
TF_CONST_INIT extern const absl::string_view kMetadataPlaneName;
// Name of XPlane that contains KPI related metrics (TF streamz).
TF_CONST_INIT extern const absl::string_view kTFStreamzPlaneName;
// Name of XPlane that contains events from python tracer.
TF_CONST_INIT extern const absl::string_view kPythonTracerPlaneName;

// Names of XLines that contain ML-level events.
TF_CONST_INIT extern const absl::string_view kStepLineName;
TF_CONST_INIT extern const absl::string_view kTensorFlowNameScopeLineName;
TF_CONST_INIT extern const absl::string_view kTensorFlowOpLineName;
TF_CONST_INIT extern const absl::string_view kXlaModuleLineName;
TF_CONST_INIT extern const absl::string_view kXlaOpLineName;
TF_CONST_INIT extern const absl::string_view kXlaAsyncOpLineName;
TF_CONST_INIT extern const absl::string_view kKernelLaunchLineName;
TF_CONST_INIT extern const absl::string_view kSourceLineName;

// GPU device vendors.
TF_CONST_INIT extern const absl::string_view kDeviceVendorNvidia;
TF_CONST_INIT extern const absl::string_view kDeviceVendorAMD;
72 
// Interesting event types (i.e., TraceMe names).
// NOTE: enumerator order defines the numeric event type ids used throughout
// the profiler; append new values just before kLastHostEventType and keep
// kLastHostEventType pointing at the final enumerator.
enum HostEventType {
  kFirstHostEventType = 0,
  kUnknownHostEventType = kFirstHostEventType,
  kTraceContext,
  kSessionRun,
  kFunctionRun,
  kRunGraph,
  kRunGraphDone,
  kTfOpRun,
  kEagerKernelExecute,
  kExecutorStateProcess,
  kExecutorDoneCallback,
  kMemoryAllocation,
  kMemoryDeallocation,
  // Performance counter related.
  kRemotePerf,
  // tf.data captured function events.
  kTfDataCapturedFunctionRun,
  kTfDataCapturedFunctionRunWithBorrowedArgs,
  kTfDataCapturedFunctionRunInstantiated,
  kTfDataCapturedFunctionRunAsync,
  // Loop ops.
  kParallelForOp,
  kForeverOp,
  kWhileOpEvalCond,
  kWhileOpStartBody,
  kForOp,
  // tf.data related.
  kIteratorGetNextOp,
  kIteratorGetNextAsOptionalOp,
  kIterator,
  kDeviceInputPipelineSecondIterator,
  kPrefetchProduce,
  kPrefetchConsume,
  kParallelInterleaveProduce,
  kParallelInterleaveConsume,
  kParallelInterleaveInitializedInput,
  kParallelMapProduce,
  kParallelMapConsume,
  kMapAndBatchProduce,
  kMapAndBatchConsume,
  kParseExampleProduce,
  kParseExampleConsume,
  kParallelBatchProduce,
  kParallelBatchConsume,
  // Batching related.
  kBatchingSessionRun,
  kProcessBatch,
  kConcatInputTensors,
  kMergeInputTensors,
  kScheduleWithoutSplit,
  kScheduleWithSplit,
  kScheduleWithEagerSplit,
  kASBSQueueSchedule,
  // TFRT related.
  kTfrtModelRun,
  // JAX related.
  kExecuteOnLocalDevices,
  // GPU related.
  kKernelLaunch,
  kKernelExecute,
  // TPU related.
  kEnqueueRequestLocked,
  kRunProgramRequest,
  kHostCallbackRequest,
  kTransferH2DRequest,
  kTransferPreprocessedH2DRequest,
  kTransferD2HRequest,
  kOnDeviceSendRequest,
  kOnDeviceRecvRequest,
  kOnDeviceSendRecvLocalRequest,
  kCustomWait,
  kOnDeviceSendRequestMulti,
  kOnDeviceRecvRequestMulti,
  kPjrtAsyncWait,
  kDoEnqueueProgram,
  kDoEnqueueContinuationProgram,
  kWriteHbm,
  kReadHbm,
  kTpuExecuteOp,
  kCompleteCallbacks,
  kTransferToDeviceIssueEvent,
  kTransferToDeviceDone,
  kTransferFromDeviceIssueEvent,
  kTransferFromDeviceDone,
  kTpuSystemExecute,
  kTpuPartitionedCallOpInitializeVarOnTpu,
  kTpuPartitionedCallOpExecuteRemote,
  kTpuPartitionedCallOpExecuteLocal,
  kLinearize,
  kDelinearize,
  kTransferBufferFromDeviceFastPath,
  kLastHostEventType = kTransferBufferFromDeviceFastPath,
};
168 
// Interesting stat types (i.e., XStat names).
// NOTE: enumerator order defines the numeric stat type ids; append new values
// just before kLastStatType and keep kLastStatType pointing at the final
// enumerator.
enum StatType {
  kFirstStatType = 0,
  kUnknownStatType = kFirstStatType,
  // TraceMe arguments.
  kStepId,
  kDeviceOrdinal,
  kChipOrdinal,
  kNodeOrdinal,
  kModelId,
  kQueueId,
  kQueueAddr,
  kRequestId,
  kRunId,
  kReplicaId,
  kGraphType,
  kStepNum,
  kIterNum,
  kIndexOnHost,
  kAllocatorName,
  kBytesReserved,
  kBytesAllocated,
  kBytesAvailable,
  kFragmentation,
  kPeakBytesInUse,
  kRequestedBytes,
  kAllocationBytes,
  kAddress,
  kRegionType,
  kDataType,
  kTensorShapes,
  kTensorLayout,
  kKpiName,
  kKpiValue,
  kElementId,
  kParentId,
  // XPlane semantics related.
  kProducerType,
  kConsumerType,
  kProducerId,
  kConsumerId,
  kIsRoot,
  kIsAsync,
  // Device trace arguments.
  kDeviceId,
  kDeviceTypeString,
  kContextId,
  kCorrelationId,
  // TODO(b/176137043): These "details" should differentiate between activity
  // and API event sources.
  kMemcpyDetails,
  kMemallocDetails,
  kMemFreeDetails,
  kMemsetDetails,
  kMemoryResidencyDetails,
  kNVTXRange,
  kKernelDetails,
  kStream,
  // Stats added when processing traces.
  kGroupId,
  kFlow,
  kStepName,
  kTfOp,
  kHloOp,
  kHloCategory,
  kHloModule,
  kProgramId,
  kEquation,
  kIsEager,
  kIsFunc,
  kTfFunctionCall,
  kTfFunctionTracingCount,
  kFlops,
  kBytesAccessed,
  kSourceInfo,
  kModelName,
  kModelVersion,
  kBytesTransferred,
  kDmaQueue,
  // Performance counter related.
  kRawValue,
  kScaledValue,
  kThreadId,
  kMatrixUnitUtilizationPercent,
  // XLA metadata map related.
  kHloProto,
  // Device capability related.
  kDevCapClockRateKHz,
  kDevCapCoreCount,
  kDevCapMemoryBandwidth,
  kDevCapMemorySize,
  kDevCapComputeCapMajor,
  kDevCapComputeCapMinor,
  kDevCapPeakTeraflopsPerSecond,
  kDevCapPeakHbmBwGigabytesPerSecond,
  kDevVendor,
  // Batching related.
  kBatchSizeAfterPadding,
  kPaddingAmount,
  kBatchingInputTaskSize,
  // GPU occupancy metrics.
  kTheoreticalOccupancyPct,
  kOccupancyMinGridSize,
  kOccupancySuggestedBlockSize,
  // Aggregated stats.
  kSelfDurationPs,
  kMinDurationPs,
  kMaxIterationNum,
  kDeviceType,
  kUsesMegaCore,
  kSymbolId,
  kTfOpName,
  kDmaStallDurationPs,
  kLastStatType = kDmaStallDurationPs
};
283 
TpuPlaneName(int32_t device_ordinal)284 inline std::string TpuPlaneName(int32_t device_ordinal) {
285   return absl::StrCat(kTpuPlanePrefix, device_ordinal);
286 }
287 
GpuPlaneName(int32_t device_ordinal)288 inline std::string GpuPlaneName(int32_t device_ordinal) {
289   return absl::StrCat(kGpuPlanePrefix, device_ordinal);
290 }
291 
// Returns the canonical TraceMe name for the given host event type.
absl::string_view GetHostEventTypeStr(HostEventType event_type);

// Returns true if `event_name` matches the canonical name of `event_type`.
bool IsHostEventType(HostEventType event_type, absl::string_view event_name);
295 
IsHostEventType(HostEventType event_type,absl::string_view event_name)296 inline bool IsHostEventType(HostEventType event_type,
297                             absl::string_view event_name) {
298   return GetHostEventTypeStr(event_type) == event_name;
299 }
300 
// Returns the HostEventType whose canonical name is `event_name`, or
// absl::nullopt if the name is not a known host event.
absl::optional<int64_t> FindHostEventType(absl::string_view event_name);

// Like FindHostEventType, but matches `event_name` against TF op event names
// (presumably names emitted for TF ops — see the .cc for the exact matching).
absl::optional<int64_t> FindTfOpEventType(absl::string_view event_name);

// Returns the canonical XStat name for the given stat type.
absl::string_view GetStatTypeStr(StatType stat_type);

// Returns true if `stat_name` matches the canonical name of `stat_type`.
bool IsStatType(StatType stat_type, absl::string_view stat_name);
308 
IsStatType(StatType stat_type,absl::string_view stat_name)309 inline bool IsStatType(StatType stat_type, absl::string_view stat_name) {
310   return GetStatTypeStr(stat_type) == stat_name;
311 }
312 
// Returns the StatType whose canonical name is `stat_name`, or absl::nullopt
// if the name is not a known stat.
absl::optional<int64_t> FindStatType(absl::string_view stat_name);

// Returns true if the given event shouldn't be shown in the trace viewer.
bool IsInternalEvent(absl::optional<int64_t> event_type);

// Returns true if the given stat shouldn't be shown in the trace viewer.
bool IsInternalStat(absl::optional<int64_t> stat_type);
320 
321 // Support for flow events:
322 // This class enables encoding/decoding the flow id and direction, stored as
323 // XStat value. The flow id are limited to 56 bits.
324 class XFlow {
325  public:
326   enum FlowDirection {
327     kFlowUnspecified = 0x0,
328     kFlowIn = 0x1,
329     kFlowOut = 0x2,
330     kFlowInOut = 0x3,
331   };
332 
333   XFlow(uint64_t flow_id, FlowDirection direction,
334         ContextType category = ContextType::kGeneric) {
335     DCHECK_NE(direction, kFlowUnspecified);
336     encoded_.parts.direction = direction;
337     encoded_.parts.flow_id = flow_id;
338     encoded_.parts.category = static_cast<uint64_t>(category);
339   }
340 
341   // Encoding
ToStatValue()342   uint64 ToStatValue() const { return encoded_.whole; }
343 
344   // Decoding
FromStatValue(uint64_t encoded)345   static XFlow FromStatValue(uint64_t encoded) { return XFlow(encoded); }
346 
347   /* NOTE: absl::HashOf is not consistent across processes (some process level
348    * salt is added), even different executions of the same program.
349    * However we are not tracking cross-host flows, i.e. A single flow's
350    * participating events are from the same XSpace. On the other hand,
351    * events from the same XSpace is always processed in the same profiler
352    * process. Flows from different hosts are unlikely to collide because of
353    * 2^56 hash space. Therefore, we can consider this is good for now. We should
354    * revisit the hash function when cross-hosts flows became more popular.
355    */
356   template <typename... Args>
GetFlowId(Args &&...args)357   static uint64_t GetFlowId(Args&&... args) {
358     return absl::HashOf(std::forward<Args>(args)...) & kFlowMask;
359   }
360 
Id()361   uint64_t Id() const { return encoded_.parts.flow_id; }
Category()362   ContextType Category() const {
363     return GetSafeContextType(encoded_.parts.category);
364   }
Direction()365   FlowDirection Direction() const {
366     return FlowDirection(encoded_.parts.direction);
367   }
368 
GetUniqueId()369   static uint64_t GetUniqueId() {  // unique in current process.
370     return next_flow_id_.fetch_add(1);
371   }
372 
373  private:
XFlow(uint64_t encoded)374   explicit XFlow(uint64_t encoded) { encoded_.whole = encoded; }
375   static constexpr uint64_t kFlowMask = (1ULL << 56) - 1;
376   static std::atomic<uint64_t> next_flow_id_;
377 
378   union {
379     // Encoded representation.
380     uint64_t whole;
381     struct {
382       uint64_t direction : 2;
383       uint64_t flow_id : 56;
384       uint64_t category : 6;
385     } parts;
386   } encoded_ ABSL_ATTRIBUTE_PACKED;
387 
388   static_assert(sizeof(encoded_) == sizeof(uint64_t), "Must be 64 bits.");
389 };
390 
391 }  // namespace profiler
392 }  // namespace tensorflow
393 
394 #endif  // TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_
395