1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_
17 #define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_
18
19 #include <atomic>
20 #include <cstdint>
21 #include <string>
22
23 #include "absl/hash/hash.h"
24 #include "absl/strings/match.h"
25 #include "absl/strings/str_cat.h"
26 #include "absl/strings/string_view.h"
27 #include "absl/types/optional.h"
28 #include "tensorflow/core/platform/logging.h"
29 #include "tensorflow/core/platform/macros.h"
30 #include "tensorflow/core/platform/types.h"
31 #include "tensorflow/core/profiler/lib/context_types.h"
32
33 namespace tensorflow {
34 namespace profiler {
35
// Name of XPlane that contains TraceMe events.
TF_CONST_INIT extern const absl::string_view kHostThreadsPlaneName;
// Name prefix of XPlane that contains GPU events.
TF_CONST_INIT extern const absl::string_view kGpuPlanePrefix;
// Name prefix of XPlane that contains TPU events.
TF_CONST_INIT extern const absl::string_view kTpuPlanePrefix;
// Regex for XPlanes that contain TensorCore planes.
TF_CONST_INIT extern const char kTpuPlaneRegex[];
// Name prefix of XPlane that contains custom device events.
TF_CONST_INIT extern const absl::string_view kCustomPlanePrefix;
// Name prefix of XPlane that contains TPU runtime events.
TF_CONST_INIT extern const absl::string_view kTpuRuntimePlaneName;
// Name of XPlane that contains CUPTI driver API generated events.
TF_CONST_INIT extern const absl::string_view kCuptiDriverApiPlaneName;
// Name of XPlane that contains Roctracer API generated events.
TF_CONST_INIT extern const absl::string_view kRoctracerApiPlaneName;
// Name of XPlane that contains profile metadata such as XLA debug info.
TF_CONST_INIT extern const absl::string_view kMetadataPlaneName;
// Name of XPlane that contains KPI related metrics (TF streamz counters).
TF_CONST_INIT extern const absl::string_view kTFStreamzPlaneName;
// Name of XPlane that contains events from the Python tracer.
TF_CONST_INIT extern const absl::string_view kPythonTracerPlaneName;

// Names of XLines that contain ML-level events.
TF_CONST_INIT extern const absl::string_view kStepLineName;
TF_CONST_INIT extern const absl::string_view kTensorFlowNameScopeLineName;
TF_CONST_INIT extern const absl::string_view kTensorFlowOpLineName;
TF_CONST_INIT extern const absl::string_view kXlaModuleLineName;
TF_CONST_INIT extern const absl::string_view kXlaOpLineName;
TF_CONST_INIT extern const absl::string_view kXlaAsyncOpLineName;
TF_CONST_INIT extern const absl::string_view kKernelLaunchLineName;
TF_CONST_INIT extern const absl::string_view kSourceLineName;

// GPU device vendors.
TF_CONST_INIT extern const absl::string_view kDeviceVendorNvidia;
TF_CONST_INIT extern const absl::string_view kDeviceVendorAMD;
72
73 // Interesting event types (i.e., TraceMe names).
// NOTE: enumerator values are assigned sequentially and may be persisted in
// serialized profiles; append new event types before kLastHostEventType and
// do not reorder existing entries.
enum HostEventType {
  kFirstHostEventType = 0,
  kUnknownHostEventType = kFirstHostEventType,
  kTraceContext,
  kSessionRun,
  kFunctionRun,
  kRunGraph,
  kRunGraphDone,
  kTfOpRun,
  kEagerKernelExecute,
  kExecutorStateProcess,
  kExecutorDoneCallback,
  kMemoryAllocation,
  kMemoryDeallocation,
  // Performance counter related.
  kRemotePerf,
  // tf.data captured function events.
  kTfDataCapturedFunctionRun,
  kTfDataCapturedFunctionRunWithBorrowedArgs,
  kTfDataCapturedFunctionRunInstantiated,
  kTfDataCapturedFunctionRunAsync,
  // Loop ops.
  kParallelForOp,
  kForeverOp,
  kWhileOpEvalCond,
  kWhileOpStartBody,
  kForOp,
  // tf.data related.
  kIteratorGetNextOp,
  kIteratorGetNextAsOptionalOp,
  kIterator,
  kDeviceInputPipelineSecondIterator,
  kPrefetchProduce,
  kPrefetchConsume,
  kParallelInterleaveProduce,
  kParallelInterleaveConsume,
  kParallelInterleaveInitializedInput,
  kParallelMapProduce,
  kParallelMapConsume,
  kMapAndBatchProduce,
  kMapAndBatchConsume,
  kParseExampleProduce,
  kParseExampleConsume,
  kParallelBatchProduce,
  kParallelBatchConsume,
  // Batching related.
  kBatchingSessionRun,
  kProcessBatch,
  kConcatInputTensors,
  kMergeInputTensors,
  kScheduleWithoutSplit,
  kScheduleWithSplit,
  kScheduleWithEagerSplit,
  kASBSQueueSchedule,
  // TFRT related.
  kTfrtModelRun,
  // JAX related.
  kExecuteOnLocalDevices,
  // GPU related.
  kKernelLaunch,
  kKernelExecute,
  // TPU related.
  kEnqueueRequestLocked,
  kRunProgramRequest,
  kHostCallbackRequest,
  kTransferH2DRequest,
  kTransferPreprocessedH2DRequest,
  kTransferD2HRequest,
  kOnDeviceSendRequest,
  kOnDeviceRecvRequest,
  kOnDeviceSendRecvLocalRequest,
  kCustomWait,
  kOnDeviceSendRequestMulti,
  kOnDeviceRecvRequestMulti,
  kPjrtAsyncWait,
  kDoEnqueueProgram,
  kDoEnqueueContinuationProgram,
  kWriteHbm,
  kReadHbm,
  kTpuExecuteOp,
  kCompleteCallbacks,
  kTransferToDeviceIssueEvent,
  kTransferToDeviceDone,
  kTransferFromDeviceIssueEvent,
  kTransferFromDeviceDone,
  kTpuSystemExecute,
  kTpuPartitionedCallOpInitializeVarOnTpu,
  kTpuPartitionedCallOpExecuteRemote,
  kTpuPartitionedCallOpExecuteLocal,
  kLinearize,
  kDelinearize,
  kTransferBufferFromDeviceFastPath,
  // Must alias the last real event type above.
  kLastHostEventType = kTransferBufferFromDeviceFastPath,
};
168
// Interesting stat types (i.e., XStat names attached to events).
// NOTE: enumerator values are assigned sequentially and may be persisted in
// serialized profiles; append new stat types before kLastStatType and do not
// reorder existing entries.
enum StatType {
  kFirstStatType = 0,
  kUnknownStatType = kFirstStatType,
  // TraceMe arguments.
  kStepId,
  kDeviceOrdinal,
  kChipOrdinal,
  kNodeOrdinal,
  kModelId,
  kQueueId,
  kQueueAddr,
  kRequestId,
  kRunId,
  kReplicaId,
  kGraphType,
  kStepNum,
  kIterNum,
  kIndexOnHost,
  kAllocatorName,
  kBytesReserved,
  kBytesAllocated,
  kBytesAvailable,
  kFragmentation,
  kPeakBytesInUse,
  kRequestedBytes,
  kAllocationBytes,
  kAddress,
  kRegionType,
  kDataType,
  kTensorShapes,
  kTensorLayout,
  kKpiName,
  kKpiValue,
  kElementId,
  kParentId,
  // XPlane semantics related.
  kProducerType,
  kConsumerType,
  kProducerId,
  kConsumerId,
  kIsRoot,
  kIsAsync,
  // Device trace arguments.
  kDeviceId,
  kDeviceTypeString,
  kContextId,
  kCorrelationId,
  // TODO(b/176137043): These "details" should differentiate between activity
  // and API event sources.
  kMemcpyDetails,
  kMemallocDetails,
  kMemFreeDetails,
  kMemsetDetails,
  kMemoryResidencyDetails,
  kNVTXRange,
  kKernelDetails,
  kStream,
  // Stats added when processing traces.
  kGroupId,
  kFlow,
  kStepName,
  kTfOp,
  kHloOp,
  kHloCategory,
  kHloModule,
  kProgramId,
  kEquation,
  kIsEager,
  kIsFunc,
  kTfFunctionCall,
  kTfFunctionTracingCount,
  kFlops,
  kBytesAccessed,
  kSourceInfo,
  kModelName,
  kModelVersion,
  kBytesTransferred,
  kDmaQueue,
  // Performance counter related.
  kRawValue,
  kScaledValue,
  kThreadId,
  kMatrixUnitUtilizationPercent,
  // XLA metadata map related.
  kHloProto,
  // Device capability related.
  kDevCapClockRateKHz,
  kDevCapCoreCount,
  kDevCapMemoryBandwidth,
  kDevCapMemorySize,
  kDevCapComputeCapMajor,
  kDevCapComputeCapMinor,
  kDevCapPeakTeraflopsPerSecond,
  kDevCapPeakHbmBwGigabytesPerSecond,
  kDevVendor,
  // Batching related.
  kBatchSizeAfterPadding,
  kPaddingAmount,
  kBatchingInputTaskSize,
  // GPU occupancy metrics.
  kTheoreticalOccupancyPct,
  kOccupancyMinGridSize,
  kOccupancySuggestedBlockSize,
  // Aggregated stats.
  kSelfDurationPs,
  kMinDurationPs,
  kMaxIterationNum,
  kDeviceType,
  kUsesMegaCore,
  kSymbolId,
  kTfOpName,
  kDmaStallDurationPs,
  // Must alias the last real stat type above.
  kLastStatType = kDmaStallDurationPs
};
283
TpuPlaneName(int32_t device_ordinal)284 inline std::string TpuPlaneName(int32_t device_ordinal) {
285 return absl::StrCat(kTpuPlanePrefix, device_ordinal);
286 }
287
GpuPlaneName(int32_t device_ordinal)288 inline std::string GpuPlaneName(int32_t device_ordinal) {
289 return absl::StrCat(kGpuPlanePrefix, device_ordinal);
290 }
291
// Returns the canonical TraceMe event name for the given host event type.
absl::string_view GetHostEventTypeStr(HostEventType event_type);

// Returns true if `event_name` is the canonical name of `event_type`.
bool IsHostEventType(HostEventType event_type, absl::string_view event_name);
295
IsHostEventType(HostEventType event_type,absl::string_view event_name)296 inline bool IsHostEventType(HostEventType event_type,
297 absl::string_view event_name) {
298 return GetHostEventTypeStr(event_type) == event_name;
299 }
300
// Returns the HostEventType value for `event_name`, or nullopt if it is not a
// known host event name.
absl::optional<int64_t> FindHostEventType(absl::string_view event_name);

// Returns the event type for a TF op name, or nullopt if not recognized.
absl::optional<int64_t> FindTfOpEventType(absl::string_view event_name);

// Returns the canonical XStat name for the given stat type.
absl::string_view GetStatTypeStr(StatType stat_type);

// Returns true if `stat_name` is the canonical name of `stat_type`.
bool IsStatType(StatType stat_type, absl::string_view stat_name);
308
IsStatType(StatType stat_type,absl::string_view stat_name)309 inline bool IsStatType(StatType stat_type, absl::string_view stat_name) {
310 return GetStatTypeStr(stat_type) == stat_name;
311 }
312
// Returns the StatType value for `stat_name`, or nullopt if it is not a known
// stat name.
absl::optional<int64_t> FindStatType(absl::string_view stat_name);

// Returns true if the given event shouldn't be shown in the trace viewer.
bool IsInternalEvent(absl::optional<int64_t> event_type);

// Returns true if the given stat shouldn't be shown in the trace viewer.
bool IsInternalStat(absl::optional<int64_t> stat_type);
320
321 // Support for flow events:
322 // This class enables encoding/decoding the flow id and direction, stored as
323 // XStat value. The flow id are limited to 56 bits.
324 class XFlow {
325 public:
326 enum FlowDirection {
327 kFlowUnspecified = 0x0,
328 kFlowIn = 0x1,
329 kFlowOut = 0x2,
330 kFlowInOut = 0x3,
331 };
332
333 XFlow(uint64_t flow_id, FlowDirection direction,
334 ContextType category = ContextType::kGeneric) {
335 DCHECK_NE(direction, kFlowUnspecified);
336 encoded_.parts.direction = direction;
337 encoded_.parts.flow_id = flow_id;
338 encoded_.parts.category = static_cast<uint64_t>(category);
339 }
340
341 // Encoding
ToStatValue()342 uint64 ToStatValue() const { return encoded_.whole; }
343
344 // Decoding
FromStatValue(uint64_t encoded)345 static XFlow FromStatValue(uint64_t encoded) { return XFlow(encoded); }
346
347 /* NOTE: absl::HashOf is not consistent across processes (some process level
348 * salt is added), even different executions of the same program.
349 * However we are not tracking cross-host flows, i.e. A single flow's
350 * participating events are from the same XSpace. On the other hand,
351 * events from the same XSpace is always processed in the same profiler
352 * process. Flows from different hosts are unlikely to collide because of
353 * 2^56 hash space. Therefore, we can consider this is good for now. We should
354 * revisit the hash function when cross-hosts flows became more popular.
355 */
356 template <typename... Args>
GetFlowId(Args &&...args)357 static uint64_t GetFlowId(Args&&... args) {
358 return absl::HashOf(std::forward<Args>(args)...) & kFlowMask;
359 }
360
Id()361 uint64_t Id() const { return encoded_.parts.flow_id; }
Category()362 ContextType Category() const {
363 return GetSafeContextType(encoded_.parts.category);
364 }
Direction()365 FlowDirection Direction() const {
366 return FlowDirection(encoded_.parts.direction);
367 }
368
GetUniqueId()369 static uint64_t GetUniqueId() { // unique in current process.
370 return next_flow_id_.fetch_add(1);
371 }
372
373 private:
XFlow(uint64_t encoded)374 explicit XFlow(uint64_t encoded) { encoded_.whole = encoded; }
375 static constexpr uint64_t kFlowMask = (1ULL << 56) - 1;
376 static std::atomic<uint64_t> next_flow_id_;
377
378 union {
379 // Encoded representation.
380 uint64_t whole;
381 struct {
382 uint64_t direction : 2;
383 uint64_t flow_id : 56;
384 uint64_t category : 6;
385 } parts;
386 } encoded_ ABSL_ATTRIBUTE_PACKED;
387
388 static_assert(sizeof(encoded_) == sizeof(uint64_t), "Must be 64 bits.");
389 };
390
391 } // namespace profiler
392 } // namespace tensorflow
393
394 #endif // TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_
395