/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef INTEL_PERF_H
#define INTEL_PERF_H

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#include "compiler/glsl/list.h"
#include "dev/intel_device_info.h"
#include "util/bitscan.h"
#include "util/bitset.h"
#include "util/hash_table.h"
#include "util/ralloc.h"

#define INTEL_PERF_MAX_METRIC_SETS (1500)

#ifdef __cplusplus
extern "C" {
#endif

struct intel_perf_config;
struct intel_perf_query_info;

#define INTEL_PERF_INVALID_CTX_ID (0xffffffff)

enum ENUM_PACKED intel_perf_counter_type {
   INTEL_PERF_COUNTER_TYPE_EVENT,
   INTEL_PERF_COUNTER_TYPE_DURATION_NORM,
   INTEL_PERF_COUNTER_TYPE_DURATION_RAW,
   INTEL_PERF_COUNTER_TYPE_THROUGHPUT,
   INTEL_PERF_COUNTER_TYPE_RAW,
   INTEL_PERF_COUNTER_TYPE_TIMESTAMP,
};

enum ENUM_PACKED intel_perf_counter_data_type {
   INTEL_PERF_COUNTER_DATA_TYPE_BOOL32,
   INTEL_PERF_COUNTER_DATA_TYPE_UINT32,
   INTEL_PERF_COUNTER_DATA_TYPE_UINT64,
   INTEL_PERF_COUNTER_DATA_TYPE_FLOAT,
   INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE,
};

enum ENUM_PACKED intel_perf_counter_units {
   /* size */
   INTEL_PERF_COUNTER_UNITS_BYTES,
   INTEL_PERF_COUNTER_UNITS_GBPS,

   /* frequency */
   INTEL_PERF_COUNTER_UNITS_HZ,

   /* time */
   INTEL_PERF_COUNTER_UNITS_NS,
   INTEL_PERF_COUNTER_UNITS_US,

   /**/
   INTEL_PERF_COUNTER_UNITS_PIXELS,
   INTEL_PERF_COUNTER_UNITS_TEXELS,
   INTEL_PERF_COUNTER_UNITS_THREADS,
   INTEL_PERF_COUNTER_UNITS_PERCENT,

   /* events */
   INTEL_PERF_COUNTER_UNITS_MESSAGES,
   INTEL_PERF_COUNTER_UNITS_NUMBER,
   INTEL_PERF_COUNTER_UNITS_CYCLES,
   INTEL_PERF_COUNTER_UNITS_EVENTS,
   INTEL_PERF_COUNTER_UNITS_UTILIZATION,

   /**/
   INTEL_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES,
   INTEL_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES,
   INTEL_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES,
   INTEL_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE,

   INTEL_PERF_COUNTER_UNITS_MAX
};

struct intel_pipeline_stat {
   uint32_t reg;
   uint32_t numerator;
   uint32_t denominator;
};

/*
 * The largest OA formats we can use include:
 * For Haswell:
 *   1 timestamp, 45 A counters, 8 B counters and 8 C counters.
 * For Gfx8+:
 *   1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
 * For Xe2:
 *   1 timestamp, 1 clock, 64 PEC counters
 *
 * Plus 2 PERF_CNT registers and 1 RPSTAT register.
 */
#define MAX_OA_REPORT_COUNTERS (2 + 64 + 3)
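/* Worked out from the Xe2 case above, which bounds the size:
 * (1 timestamp + 1 clock) + 64 PEC counters + (2 PERF_CNT + 1 RPSTAT)
 * = 2 + 64 + 3 = 69 accumulator slots.
 */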

/*
 * We currently allocate only one page for pipeline statistics queries. Here
 * we derive the maximum number of counters for that amount.
 */
#define STATS_BO_SIZE               4096
#define STATS_BO_END_OFFSET_BYTES   (STATS_BO_SIZE / 2)
#define MAX_STAT_COUNTERS           (STATS_BO_END_OFFSET_BYTES / 8)
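/* Worked out (assuming begin snapshots occupy the first half of the BO and
 * end snapshots the second): 4096 / 2 = 2048 bytes per snapshot set, and at
 * 8 bytes per 64-bit counter that gives 2048 / 8 = 256 counters at most.
 */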

struct intel_perf_query_result {
   /**
    * Storage for the final accumulated OA counters.
    */
   uint64_t accumulator[MAX_OA_REPORT_COUNTERS];

   /**
    * Hw ID used by the context on which the query was running.
    */
   uint32_t hw_id;

   /**
    * Number of reports accumulated to produce the results.
    */
   uint32_t reports_accumulated;

   /**
    * Frequency in the slices of the GT at the beginning and end of the
    * query.
    */
   uint64_t slice_frequency[2];

   /**
    * Frequency in the unslice of the GT at the beginning and end of the
    * query.
    */
   uint64_t unslice_frequency[2];

   /**
    * Frequency of the whole GT at the beginning and end of the query.
    */
   uint64_t gt_frequency[2];

   /**
    * Timestamp at the beginning of the query.
    */
   uint64_t begin_timestamp;

   /**
    * Timestamp at the end of the query.
    */
   uint64_t end_timestamp;

   /**
    * Whether the query was interrupted by another workload (aka preemption).
    */
   bool query_disjoint;
};

typedef uint64_t (*intel_counter_read_uint64_t)(struct intel_perf_config *perf,
                                                const struct intel_perf_query_info *query,
                                                const struct intel_perf_query_result *results);

typedef float (*intel_counter_read_float_t)(struct intel_perf_config *perf,
                                            const struct intel_perf_query_info *query,
                                            const struct intel_perf_query_result *results);

struct intel_perf_query_counter {
   const char *name;
   const char *desc;
   const char *symbol_name;
   const char *category;
   enum intel_perf_counter_type type;
   enum intel_perf_counter_data_type data_type;
   enum intel_perf_counter_units units;
   size_t offset;

   union {
      intel_counter_read_uint64_t oa_counter_max_uint64;
      intel_counter_read_float_t  oa_counter_max_float;
   };

   union {
      intel_counter_read_uint64_t oa_counter_read_uint64;
      intel_counter_read_float_t  oa_counter_read_float;
      struct intel_pipeline_stat pipeline_stat;
   };
};

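/* Illustrative sketch (not part of the upstream API): a counter value is
 * located in a query's result buffer using the counter's offset and
 * data_type. The helper name is hypothetical; it assumes `data` holds the
 * query's accumulated results.
 */
static inline uint64_t
intel_perf_example_read_uint64_counter(const struct intel_perf_query_counter *counter,
                                       const void *data)
{
   uint64_t value = 0;

   /* Only UINT64 counters are handled in this sketch. */
   if (counter->data_type == INTEL_PERF_COUNTER_DATA_TYPE_UINT64)
      memcpy(&value, (const uint8_t *)data + counter->offset, sizeof(value));

   return value;
}
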
struct intel_perf_query_register_prog {
   uint32_t reg;
   uint32_t val;
};

/* Register programming for a given query */
struct intel_perf_registers {
   const struct intel_perf_query_register_prog *flex_regs;
   uint32_t n_flex_regs;

   const struct intel_perf_query_register_prog *mux_regs;
   uint32_t n_mux_regs;

   const struct intel_perf_query_register_prog *b_counter_regs;
   uint32_t n_b_counter_regs;
};

struct intel_perf_query_info {
   struct intel_perf_config *perf;

   enum intel_perf_query_type {
      INTEL_PERF_QUERY_TYPE_OA,
      INTEL_PERF_QUERY_TYPE_RAW,
      INTEL_PERF_QUERY_TYPE_PIPELINE,
   } kind;
   const char *name;
   const char *symbol_name;
   const char *guid;
   struct intel_perf_query_counter *counters;
   int n_counters;
   int max_counters;
   size_t data_size;

   /* OA specific */
   uint64_t oa_metrics_set_id;
   uint64_t oa_format; /* KMD value */

   /* For indexing into the accumulator[] ... */
   int gpu_time_offset;
   int gpu_clock_offset;
   int a_offset;
   int b_offset;
   int c_offset;
   int perfcnt_offset;
   int rpstat_offset;
   int pec_offset;

   struct intel_perf_registers config;
};

/* When not using the MI_RPC command, this structure describes the list of
 * register offsets as well as their storage location so that they can be
 * stored through a series of MI_SRM commands and accumulated with
 * intel_perf_query_result_accumulate_fields().
 */
struct intel_perf_query_field_layout {
   /* Alignment for the layout */
   uint32_t alignment;

   /* Size of the whole layout */
   uint32_t size;

   uint32_t n_fields;

   struct intel_perf_query_field {
      /* MMIO location of this register */
      uint32_t mmio_offset;

      /* Location of this register in the storage */
      uint16_t location;

      /* Type of register, for accumulation (see intel_perf_query_info:*_offset
       * fields)
       */
      enum intel_perf_query_field_type {
         INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_A,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_PEC,
      } type;

      /* Index of register in the given type (for instance A31 or B2,
       * etc...)
       */
      uint8_t index;

      /* 4, 8 or 256 */
      uint16_t size;

      /* If not 0, mask to apply to the register value. */
      uint64_t mask;
   } *fields;
};

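/* Illustrative sketch (not part of the upstream API): reading one register
 * field back out of a snapshot laid out according to
 * intel_perf_query_field_layout. The helper name is hypothetical; MI_RPC
 * fields (size 256) are not handled here.
 */
static inline uint64_t
intel_perf_example_read_field(const struct intel_perf_query_field *field,
                              const void *snapshot)
{
   uint64_t value = 0;

   if (field->size == 4) {
      uint32_t value32;
      memcpy(&value32, (const uint8_t *)snapshot + field->location, sizeof(value32));
      value = value32;
   } else if (field->size == 8) {
      memcpy(&value, (const uint8_t *)snapshot + field->location, sizeof(value));
   }

   /* A non-zero mask restricts which bits of the register are valid. */
   if (field->mask)
      value &= field->mask;

   return value;
}
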
struct intel_perf_query_counter_info {
   struct intel_perf_query_counter *counter;

   BITSET_DECLARE(query_mask, INTEL_PERF_MAX_METRIC_SETS);

   /**
    * Each counter can be a part of many groups, each time at a different
    * index. This struct stores one of those locations.
    */
   struct {
      int group_idx; /* query/group number */
      int counter_idx; /* index inside of query/group */
   } location;
};

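/* Illustrative sketch (not part of the upstream API): query_mask above is a
 * bitset indexed by metric set (query), so whether a counter is exposed by a
 * given query can be tested with the bitset helpers. The helper name is
 * hypothetical.
 */
static inline bool
intel_perf_example_counter_in_query(const struct intel_perf_query_counter_info *counter_info,
                                    int query_idx)
{
   return BITSET_TEST(counter_info->query_mask, query_idx);
}
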
enum intel_perf_features {
   INTEL_PERF_FEATURE_HOLD_PREEMPTION = (1 << 0),
   INTEL_PERF_FEATURE_GLOBAL_SSEU = (1 << 1),
   /* Whether i915 has DRM_I915_QUERY_PERF_CONFIG support. */
   INTEL_PERF_FEATURE_QUERY_PERF = (1 << 2),
};

struct intel_perf_config {
   /* Have extended metrics been enabled */
   bool enable_all_metrics;

   enum intel_perf_features features_supported;

   /* Number of bits to shift the OA timestamp values by to match the ring
    * timestamp.
    */
   int oa_timestamp_shift;

   /* Mask of bits valid from the OA report (for instance you might have the
    * lower 31 bits [30:0] of the timestamp value). This is useful if you want
    * to recombine a full timestamp value captured from the CPU with OA
    * timestamps captured on the device but that only include 31 bits of data.
    */
   uint64_t oa_timestamp_mask;

   /* Powergating configuration for running the query.
    * Only used in i915, struct drm_i915_gem_context_param_sseu.
    */
   void *sseu;

   struct intel_perf_query_info *queries;
   int n_queries;

   struct intel_perf_query_counter_info *counter_infos;
   int n_counters;

   struct intel_perf_query_field_layout query_layout;
   size_t oa_sample_size;

   /* Variables referenced in the XML metadata for OA performance
    * counters, e.g. in the normalization equations.
    *
    * All uint64_t for consistent operand types in generated code
    */
   struct {
      uint64_t n_eus;               /** $EuCoresTotalCount */
      uint64_t n_eu_slices;         /** $EuSlicesTotalCount */
      uint64_t n_eu_sub_slices;     /** $EuSubslicesTotalCount */
      uint64_t n_eu_slice0123;      /** $EuDualSubslicesSlice0123Count */
      uint64_t slice_mask;          /** $SliceMask */
      uint64_t subslice_mask;       /** $SubsliceMask */
      uint64_t gt_min_freq;         /** $GpuMinFrequency */
      uint64_t gt_max_freq;         /** $GpuMaxFrequency */
      bool     query_mode;          /** $QueryMode */
   } sys_vars;

   const struct intel_device_info *devinfo;

   /* OA metric sets, indexed by GUID, as known by Mesa at build time, to
    * cross-reference with the GUIDs of configs advertised by the kernel at
    * runtime
    */
   struct hash_table *oa_metrics_table;

   /* When MDAPI hasn't configured the metric we need to use by the time the
    * query begins, this OA metric is used as a fallback.
    */
   uint64_t fallback_raw_oa_metric;

   /* Location of the device's sysfs entry. */
   char sysfs_dev_dir[256];

   struct {
      void *(*bo_alloc)(void *bufmgr, const char *name, uint64_t size);
      void (*bo_unreference)(void *bo);
      void *(*bo_map)(void *ctx, void *bo, unsigned flags);
      void (*bo_unmap)(void *bo);
      bool (*batch_references)(void *batch, void *bo);
      void (*bo_wait_rendering)(void *bo);
      int (*bo_busy)(void *bo);
      void (*emit_stall_at_pixel_scoreboard)(void *ctx);
      void (*emit_mi_report_perf_count)(void *ctx,
                                        void *bo,
                                        uint32_t offset_in_bytes,
                                        uint32_t report_id);
      void (*batchbuffer_flush)(void *ctx,
                                const char *file, int line);
      void (*store_register_mem)(void *ctx, void *bo, uint32_t reg, uint32_t reg_size, uint32_t offset);
   } vtbl;
};

struct intel_perf_counter_pass {
   struct intel_perf_query_info *query;
   struct intel_perf_query_counter *counter;
};

enum intel_perf_record_type {
   INTEL_PERF_RECORD_TYPE_SAMPLE = 1,
   INTEL_PERF_RECORD_TYPE_OA_REPORT_LOST = 2,
   INTEL_PERF_RECORD_TYPE_OA_BUFFER_LOST = 3,
   INTEL_PERF_RECORD_TYPE_COUNTER_OVERFLOW = 4,
   INTEL_PERF_RECORD_TYPE_MMIO_TRG_Q_FULL = 5,
   INTEL_PERF_RECORD_TYPE_MAX,
};

struct intel_perf_record_header {
   uint32_t type; /* enum intel_perf_record_type */
   uint16_t pad;
   uint16_t size;
};

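/* Illustrative sketch (not part of the upstream API): walking the records
 * returned by a stream read. Each record is assumed to start with an
 * intel_perf_record_header whose size field covers the header plus its
 * payload. The helper name is hypothetical.
 */
static inline uint32_t
intel_perf_example_count_sample_records(const uint8_t *buffer, size_t buffer_len)
{
   uint32_t n_samples = 0;
   size_t offset = 0;

   while (offset + sizeof(struct intel_perf_record_header) <= buffer_len) {
      const struct intel_perf_record_header *header =
         (const struct intel_perf_record_header *)(buffer + offset);

      /* Stop on a malformed or truncated record. */
      if (header->size < sizeof(*header) || offset + header->size > buffer_len)
         break;

      if (header->type == INTEL_PERF_RECORD_TYPE_SAMPLE)
         n_samples++;

      offset += header->size;
   }

   return n_samples;
}
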
/** Initialize the intel_perf_config object for a given device.
 *
 *    include_pipeline_statistics : Whether to add a pipeline statistics query
 *                                  intel_perf_query_info object
 *
 *    use_register_snapshots : Whether the queries should include counters
 *                             that rely on register snapshots using command
 *                             streamer instructions (not possible when using
 *                             only the OA buffer data).
 */
void intel_perf_init_metrics(struct intel_perf_config *perf_cfg,
                             const struct intel_device_info *devinfo,
                             int drm_fd,
                             bool include_pipeline_statistics,
                             bool use_register_snapshots);

/** Query i915 for a metric id using guid.
 */
bool intel_perf_load_metric_id(struct intel_perf_config *perf_cfg,
                               const char *guid,
                               uint64_t *metric_id);

/** Load a configuration's content from i915 using a guid.
 */
struct intel_perf_registers *intel_perf_load_configuration(struct intel_perf_config *perf_cfg,
                                                           int fd, const char *guid);

/** Store a configuration into i915 using guid and return a new metric id.
 *
 * If guid is NULL, then a generated one will be provided by hashing the
 * content of the configuration.
 */
uint64_t intel_perf_store_configuration(struct intel_perf_config *perf_cfg, int fd,
                                        const struct intel_perf_registers *config,
                                        const char *guid);
void intel_perf_remove_configuration(struct intel_perf_config *perf_cfg, int fd,
                                     uint64_t config_id);

static inline unsigned
intel_perf_query_counter_info_first_query(const struct intel_perf_query_counter_info *counter_info)
{
   return BITSET_FFS(counter_info->query_mask);
}

/** Read the slice/unslice frequency from 2 OA reports and store them into
 *  result.
 */
void intel_perf_query_result_read_frequencies(struct intel_perf_query_result *result,
                                              const struct intel_device_info *devinfo,
                                              const uint32_t *start,
                                              const uint32_t *end);

/** Store the GT frequency as reported by the RPSTAT register.
 */
void intel_perf_query_result_read_gt_frequency(struct intel_perf_query_result *result,
                                               const struct intel_device_info *devinfo,
                                               const uint32_t start,
                                               const uint32_t end);

/** Store PERFCNT register values.
 */
void intel_perf_query_result_read_perfcnts(struct intel_perf_query_result *result,
                                           const struct intel_perf_query_info *query,
                                           const uint64_t *start,
                                           const uint64_t *end);

/** Accumulate the delta between 2 OA reports into result for a given query.
 */
void intel_perf_query_result_accumulate(struct intel_perf_query_result *result,
                                        const struct intel_perf_query_info *query,
                                        const uint32_t *start,
                                        const uint32_t *end);

/** Read the timestamp value in a report.
 */
uint64_t intel_perf_report_timestamp(const struct intel_perf_query_info *query,
                                     const struct intel_device_info *devinfo,
                                     const uint32_t *report);

/** Accumulate the delta between 2 snapshots of OA perf registers (layout
 * should match the description specified through intel_perf_query_field_layout).
 */
void intel_perf_query_result_accumulate_fields(struct intel_perf_query_result *result,
                                               const struct intel_perf_query_info *query,
                                               const void *start,
                                               const void *end,
                                               bool no_oa_accumulate);

void intel_perf_query_result_clear(struct intel_perf_query_result *result);

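/* Illustrative sketch (not part of the upstream API): accumulating a single
 * begin/end pair of OA reports into a result using the helpers declared
 * above. The helper name is hypothetical; real users typically also fold in
 * intermediate reports from the OA buffer.
 */
static inline void
intel_perf_example_accumulate_pair(struct intel_perf_query_result *result,
                                   const struct intel_perf_query_info *query,
                                   const struct intel_device_info *devinfo,
                                   const uint32_t *begin_report,
                                   const uint32_t *end_report)
{
   intel_perf_query_result_clear(result);
   intel_perf_query_result_read_frequencies(result, devinfo, begin_report, end_report);
   intel_perf_query_result_accumulate(result, query, begin_report, end_report);
}
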
/** Debug helper printing out query data.
 */
void intel_perf_query_result_print_fields(const struct intel_perf_query_info *query,
                                          const void *data);

static inline size_t
intel_perf_query_counter_get_size(const struct intel_perf_query_counter *counter)
{
   switch (counter->data_type) {
   case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
      return sizeof(uint32_t);
   case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
      return sizeof(uint32_t);
   case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
      return sizeof(uint64_t);
   case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
      return sizeof(float);
   case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
      return sizeof(double);
   default:
      unreachable("invalid counter data type");
   }
}

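/* Illustrative sketch (not part of the upstream API): the relationship
 * between counter offsets, intel_perf_query_counter_get_size() and a query's
 * data_size. The helper name is hypothetical.
 */
static inline size_t
intel_perf_example_query_data_size(const struct intel_perf_query_info *query)
{
   size_t size = 0;

   for (int i = 0; i < query->n_counters; i++) {
      const struct intel_perf_query_counter *counter = &query->counters[i];
      size_t end = counter->offset + intel_perf_query_counter_get_size(counter);

      if (end > size)
         size = end;
   }

   return size;
}
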
static inline struct intel_perf_config *
intel_perf_new(void *ctx)
{
   struct intel_perf_config *perf = rzalloc(ctx, struct intel_perf_config);
   return perf;
}

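/* Illustrative sketch (not part of the upstream API): typical creation and
 * initialization flow. The helper name is hypothetical; mem_ctx is any ralloc
 * context owned by the caller, devinfo and drm_fd come from the driver.
 */
static inline struct intel_perf_config *
intel_perf_example_create(void *mem_ctx,
                          const struct intel_device_info *devinfo,
                          int drm_fd)
{
   struct intel_perf_config *perf = intel_perf_new(mem_ctx);

   intel_perf_init_metrics(perf, devinfo, drm_fd,
                           true /* include_pipeline_statistics */,
                           true /* use_register_snapshots */);

   return perf;
}
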
void intel_perf_free(struct intel_perf_config *perf_cfg);

uint64_t intel_perf_get_oa_format(struct intel_perf_config *perf_cfg);

/** Whether we have the ability to hold off preemption on a batch so we don't
 * have to look at the OA buffer to subtract unrelated workloads off the
 * values captured through MI_* commands.
 */
static inline bool
intel_perf_has_hold_preemption(const struct intel_perf_config *perf)
{
   return perf->features_supported & INTEL_PERF_FEATURE_HOLD_PREEMPTION;
}

/** Whether we have the ability to lock EU array power configuration for the
 * duration of the performance recording. This is useful on Gfx11 where the HW
 * architecture requires half the EU for particular workloads.
 */
static inline bool
intel_perf_has_global_sseu(const struct intel_perf_config *perf)
{
   return perf->features_supported & INTEL_PERF_FEATURE_GLOBAL_SSEU;
}

uint32_t intel_perf_get_n_passes(struct intel_perf_config *perf,
                                 const uint32_t *counter_indices,
                                 uint32_t counter_indices_count,
                                 struct intel_perf_query_info **pass_queries);
void intel_perf_get_counters_passes(struct intel_perf_config *perf,
                                    const uint32_t *counter_indices,
                                    uint32_t counter_indices_count,
                                    struct intel_perf_counter_pass *counter_pass);

int intel_perf_stream_open(struct intel_perf_config *perf_config, int drm_fd,
                           uint32_t ctx_id, uint64_t metrics_set_id,
                           uint64_t period_exponent, bool hold_preemption,
                           bool enable);
int intel_perf_stream_read_samples(struct intel_perf_config *perf_config,
                                   int perf_stream_fd, uint8_t *buffer,
                                   size_t buffer_len);
int intel_perf_stream_set_state(struct intel_perf_config *perf_config,
                                int perf_stream_fd, bool enable);
int intel_perf_stream_set_metrics_id(struct intel_perf_config *perf_config,
                                     int perf_stream_fd, uint64_t metrics_set_id);

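/* Illustrative sketch (not part of the upstream API): opening a sampling
 * stream, enabling it and reading back buffered reports with the helpers
 * declared above. The helper name is hypothetical; passing
 * INTEL_PERF_INVALID_CTX_ID is assumed to request a stream not tied to a
 * specific context. Error handling, the choice of metrics_set_id and
 * period_exponent, and closing the stream fd are left to the caller.
 */
static inline int
intel_perf_example_read_stream_once(struct intel_perf_config *perf, int drm_fd,
                                    uint64_t metrics_set_id,
                                    uint64_t period_exponent,
                                    uint8_t *buffer, size_t buffer_len)
{
   /* Open the stream disabled, then enable it explicitly before reading. */
   int stream_fd = intel_perf_stream_open(perf, drm_fd, INTEL_PERF_INVALID_CTX_ID,
                                          metrics_set_id, period_exponent,
                                          false /* hold_preemption */,
                                          false /* enable */);
   if (stream_fd < 0)
      return stream_fd;

   intel_perf_stream_set_state(perf, stream_fd, true /* enable */);

   return intel_perf_stream_read_samples(perf, stream_fd, buffer, buffer_len);
}
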
#ifdef __cplusplus
} // extern "C"
#endif

#endif /* INTEL_PERF_H */