/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_debug.h"
#include "ac_gpu_info.h"
#include "ac_pm4.h"

#include "sid.h"

#include <string.h>
#include <stdlib.h>

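/* Helpers for building PM4 SET_*_REG packets (including the *_PAIRS and
 * *_PAIRS_PACKED variants) into the dword buffer of an ac_pm4_state.
 */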
static bool
opcode_is_pairs(unsigned opcode)
{
   return opcode == PKT3_SET_CONTEXT_REG_PAIRS ||
          opcode == PKT3_SET_SH_REG_PAIRS ||
          opcode == PKT3_SET_UCONFIG_REG_PAIRS;
}

static bool
opcode_is_pairs_packed(unsigned opcode)
{
   return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED ||
          opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
          opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N;
}

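/* Registers that ac_pm4_set_reg() must write through ac_pm4_set_privileged_reg()
 * (a COPY_DATA packet) instead of a plain SET_* packet.
 */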
static bool
is_privileged_reg(const struct ac_pm4_state *state, unsigned reg)
{
   const struct radeon_info *info = state->info;

   if (info->gfx_level >= GFX10 && info->gfx_level <= GFX10_3)
      return reg == R_008D04_SQ_THREAD_TRACE_BUF0_SIZE ||
             reg == R_008D00_SQ_THREAD_TRACE_BUF0_BASE ||
             reg == R_008D14_SQ_THREAD_TRACE_MASK ||
             reg == R_008D18_SQ_THREAD_TRACE_TOKEN_MASK ||
             reg == R_008D1C_SQ_THREAD_TRACE_CTRL;

   if (info->gfx_level >= GFX6 && info->gfx_level <= GFX8)
      return reg == R_009100_SPI_CONFIG_CNTL;

   return false;
}

static unsigned
pairs_packed_opcode_to_regular(unsigned opcode)
{
   switch (opcode) {
   case PKT3_SET_CONTEXT_REG_PAIRS_PACKED:
      return PKT3_SET_CONTEXT_REG;
   case PKT3_SET_SH_REG_PAIRS_PACKED:
      return PKT3_SET_SH_REG;
   default:
      unreachable("invalid packed opcode");
   }
}

static unsigned
regular_opcode_to_pairs(struct ac_pm4_state *state, unsigned opcode)
{
   const struct radeon_info *info = state->info;

   switch (opcode) {
   case PKT3_SET_CONTEXT_REG:
      return info->has_set_context_pairs_packed ? PKT3_SET_CONTEXT_REG_PAIRS_PACKED :
             info->has_set_context_pairs ? PKT3_SET_CONTEXT_REG_PAIRS : opcode;
   case PKT3_SET_SH_REG:
      return info->has_set_sh_pairs_packed ? PKT3_SET_SH_REG_PAIRS_PACKED :
             info->has_set_sh_pairs ? PKT3_SET_SH_REG_PAIRS : opcode;
   case PKT3_SET_UCONFIG_REG:
      return info->has_set_uconfig_pairs ? PKT3_SET_UCONFIG_REG_PAIRS : opcode;
   }

   return opcode;
}

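/* Dword layout of a *_PAIRS_PACKED packet, as assumed by the helpers below:
 *
 *   dw[0]        PKT3 header
 *   dw[1]        total register count (always even; padded in ac_pm4_cmd_end)
 *   dw[2 + 3*n]  register dword offsets 2n (low 16 bits) and 2n+1 (high 16 bits)
 *   dw[3 + 3*n]  value of register 2n
 *   dw[4 + 3*n]  value of register 2n+1
 */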
static bool
packed_next_is_reg_offset_pair(struct ac_pm4_state *state)
{
   return (state->ndw - state->last_pm4) % 3 == 2;
}

static bool
packed_next_is_reg_value1(struct ac_pm4_state *state)
{
   return (state->ndw - state->last_pm4) % 3 == 1;
}

static bool
packed_prev_is_reg_value0(struct ac_pm4_state *state)
{
   return packed_next_is_reg_value1(state);
}

static unsigned
get_packed_reg_dw_offsetN(struct ac_pm4_state *state, unsigned index)
{
   unsigned i = state->last_pm4 + 2 + (index / 2) * 3;
   assert(i < state->ndw);
   return (state->pm4[i] >> ((index % 2) * 16)) & 0xffff;
}

static unsigned
get_packed_reg_valueN_idx(struct ac_pm4_state *state, unsigned index)
{
   unsigned i = state->last_pm4 + 2 + (index / 2) * 3 + 1 + (index % 2);
   assert(i < state->ndw);
   return i;
}

static unsigned
get_packed_reg_valueN(struct ac_pm4_state *state, unsigned index)
{
   return state->pm4[get_packed_reg_valueN_idx(state, index)];
}

static unsigned
get_packed_reg_count(struct ac_pm4_state *state)
{
   int body_size = state->ndw - state->last_pm4 - 2;
   assert(body_size > 0 && body_size % 3 == 0);
   return (body_size / 3) * 2;
}

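/* Post-process the last packet. For *_PAIRS_PACKED packets this rewrites runs
 * of consecutive registers as a regular SET_* packet (which is shorter), or
 * switches SET_SH_REG_PAIRS_PACKED to the *_N variant when it fits. With
 * debug_sqtt it also records where SPI_SHADER_PGM_LO_* is written.
 */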
void
ac_pm4_finalize(struct ac_pm4_state *state)
{
   if (opcode_is_pairs_packed(state->last_opcode)) {
      unsigned reg_count = get_packed_reg_count(state);
      unsigned reg_dw_offset0 = get_packed_reg_dw_offsetN(state, 0);

      if (state->packed_is_padded)
         reg_count--;

      bool all_consecutive = true;

      /* If the whole packed SET packet only sets consecutive registers, rewrite the packet
       * to be unpacked to make it shorter.
       *
       * This also eliminates the invalid scenario when the packed SET packet sets only
       * 2 registers and the register offsets are equal due to padding.
       */
      for (unsigned i = 1; i < reg_count; i++) {
         if (reg_dw_offset0 != get_packed_reg_dw_offsetN(state, i) - i) {
            all_consecutive = false;
            break;
         }
      }

      if (all_consecutive) {
         assert(state->ndw - state->last_pm4 == 2 + 3 * (reg_count + state->packed_is_padded) / 2);
         state->pm4[state->last_pm4] = PKT3(pairs_packed_opcode_to_regular(state->last_opcode),
                                            reg_count, 0);
         state->pm4[state->last_pm4 + 1] = reg_dw_offset0;
         for (unsigned i = 0; i < reg_count; i++)
            state->pm4[state->last_pm4 + 2 + i] = get_packed_reg_valueN(state, i);
         state->ndw = state->last_pm4 + 2 + reg_count;
         state->last_opcode = PKT3_SET_SH_REG;
      } else {
         /* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
         if (state->debug_sqtt &&
             (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
              state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N)) {
            if (state->packed_is_padded)
               reg_count++; /* Add this back because we only need to record the last write. */

            for (int i = reg_count - 1; i >= 0; i--) {
               unsigned reg_offset = SI_SH_REG_OFFSET + get_packed_reg_dw_offsetN(state, i) * 4;

               if (strstr(ac_get_register_name(state->info->gfx_level,
                                               state->info->family, reg_offset),
                          "SPI_SHADER_PGM_LO_")) {
                  state->spi_shader_pgm_lo_reg = reg_offset;
                  break;
               }
            }
         }

         /* If it's a packed SET_SH packet, use the *_N variant when possible. */
         if (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED && reg_count <= 14) {
            state->pm4[state->last_pm4] &= PKT3_IT_OPCODE_C;
            state->pm4[state->last_pm4] |= PKT3_IT_OPCODE_S(PKT3_SET_SH_REG_PAIRS_PACKED_N);
         }
      }
   }

   if (state->debug_sqtt && state->last_opcode == PKT3_SET_SH_REG) {
      /* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
      unsigned reg_count = PKT_COUNT_G(state->pm4[state->last_pm4]);
      unsigned reg_base_offset = SI_SH_REG_OFFSET + state->pm4[state->last_pm4 + 1] * 4;

      for (unsigned i = 0; i < reg_count; i++) {
         if (strstr(ac_get_register_name(state->info->gfx_level,
                                         state->info->family, reg_base_offset + i * 4),
                    "SPI_SHADER_PGM_LO_")) {
            state->spi_shader_pgm_lo_reg = reg_base_offset + i * 4;

            break;
         }
      }
   }
}

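/* Begin a new PKT3 packet: finalize the previous one and reserve one dword
 * for the header, which ac_pm4_cmd_end() fills in with the real count.
 */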
void
ac_pm4_cmd_begin(struct ac_pm4_state *state, unsigned opcode)
{
   ac_pm4_finalize(state);

   assert(state->max_dw);
   assert(state->ndw < state->max_dw);
   assert(opcode <= 254);
   state->last_opcode = opcode;
   state->last_pm4 = state->ndw++;
   state->packed_is_padded = false;
}

void
ac_pm4_cmd_add(struct ac_pm4_state *state, uint32_t dw)
{
   assert(state->max_dw);
   assert(state->ndw < state->max_dw);
   state->pm4[state->ndw++] = dw;
   state->last_opcode = 255; /* invalid opcode */
}

static bool
need_reset_filter_cam(const struct ac_pm4_state *state)
{
   const struct radeon_info *info = state->info;

   /* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
   if (!state->is_compute_queue &&
       (opcode_is_pairs(state->last_opcode) ||
        opcode_is_pairs_packed(state->last_opcode)))
      return true;

   const uint32_t last_reg = state->last_reg << 2;

   if (info->gfx_level >= GFX11 && !state->is_compute_queue &&
       (last_reg + CIK_UCONFIG_REG_OFFSET == R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE ||
        last_reg + CIK_UCONFIG_REG_OFFSET == R_0367A0_SQ_THREAD_TRACE_BUF0_BASE ||
        last_reg + CIK_UCONFIG_REG_OFFSET == R_0367B4_SQ_THREAD_TRACE_MASK ||
        last_reg + CIK_UCONFIG_REG_OFFSET == R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK ||
        last_reg + CIK_UCONFIG_REG_OFFSET == R_0367B0_SQ_THREAD_TRACE_CTRL))
      return true;

   return false;
}

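/* Close the current packet: write the PKT3 header with the final dword count,
 * and for packed variants pad to an even register count and store that count
 * in the second dword.
 */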
void
ac_pm4_cmd_end(struct ac_pm4_state *state, bool predicate)
{
   unsigned count;
   count = state->ndw - state->last_pm4 - 2;
   /* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
   bool reset_filter_cam = need_reset_filter_cam(state);

   state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate) |
                                 PKT3_RESET_FILTER_CAM_S(reset_filter_cam);

   if (opcode_is_pairs_packed(state->last_opcode)) {
      if (packed_prev_is_reg_value0(state)) {
         /* Duplicate the first register at the end to make the number of registers aligned to 2. */
         ac_pm4_set_reg_custom(state, get_packed_reg_dw_offsetN(state, 0) * 4,
                               get_packed_reg_valueN(state, 0),
                               state->last_opcode, 0);
         state->packed_is_padded = true;
      }

      state->pm4[state->last_pm4 + 1] = get_packed_reg_count(state);
   }
}

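/* Emit one register write using the given SET_* opcode. The write is appended
 * to the previous packet when the opcode allows it (always for the *_PAIRS*
 * variants, only for consecutive register offsets otherwise). "reg" is the
 * byte offset relative to the start of the opcode's register range.
 */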
void
ac_pm4_set_reg_custom(struct ac_pm4_state *state, unsigned reg, uint32_t val,
                      unsigned opcode, unsigned idx)
{
   bool is_packed = opcode_is_pairs_packed(opcode);
   reg >>= 2;

   assert(state->max_dw);
   assert(state->ndw + 2 <= state->max_dw);

   if (is_packed) {
      assert(idx == 0);

      if (opcode != state->last_opcode) {
         ac_pm4_cmd_begin(state, opcode); /* reserve space for the header */
         state->ndw++; /* reserve space for the register count, it will be set at the end */
      }
   } else if (opcode_is_pairs(opcode)) {
      assert(idx == 0);

      if (opcode != state->last_opcode)
         ac_pm4_cmd_begin(state, opcode);

      state->pm4[state->ndw++] = reg;
   } else if (opcode != state->last_opcode || reg != (state->last_reg + 1) ||
              idx != state->last_idx) {
      ac_pm4_cmd_begin(state, opcode);
      state->pm4[state->ndw++] = reg | (idx << 28);
   }

   assert(reg <= UINT16_MAX);
   state->last_reg = reg;
   state->last_idx = idx;

   if (is_packed) {
      if (state->packed_is_padded) {
         /* The packet is padded, which means the first register is written redundantly again
          * at the end. Remove it, so that we can replace it with this register.
          */
         state->packed_is_padded = false;
         state->ndw--;
      }

      if (packed_next_is_reg_offset_pair(state)) {
         state->pm4[state->ndw++] = reg;
      } else if (packed_next_is_reg_value1(state)) {
         /* Set the second register offset in the high 16 bits. */
         state->pm4[state->ndw - 2] &= 0x0000ffff;
         state->pm4[state->ndw - 2] |= reg << 16;
      }
   }

   state->pm4[state->ndw++] = val;
   ac_pm4_cmd_end(state, false);
}

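/* Privileged registers (see is_privileged_reg) are written with a COPY_DATA
 * packet targeting the perf destination instead of a SET_CONFIG_REG packet.
 */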
static void
ac_pm4_set_privileged_reg(struct ac_pm4_state *state, unsigned reg, uint32_t val)
{
   assert(reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END);

   ac_pm4_cmd_add(state, PKT3(PKT3_COPY_DATA, 4, 0));
   ac_pm4_cmd_add(state, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_PERF));
   ac_pm4_cmd_add(state, val);
   ac_pm4_cmd_add(state, 0); /* unused */
   ac_pm4_cmd_add(state, reg >> 2);
   ac_pm4_cmd_add(state, 0); /* unused */
}

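/* Set a register given its full byte offset. The offset selects the SET_*
 * opcode (config, SH, context, or uconfig), and privileged registers are
 * routed through ac_pm4_set_privileged_reg().
 */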
void ac_pm4_set_reg(struct ac_pm4_state *state, unsigned reg, uint32_t val)
{
   const unsigned original_reg = reg;
   unsigned opcode;

   if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
      opcode = PKT3_SET_CONFIG_REG;
      reg -= SI_CONFIG_REG_OFFSET;

   } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
      opcode = PKT3_SET_SH_REG;
      reg -= SI_SH_REG_OFFSET;

   } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
      opcode = PKT3_SET_CONTEXT_REG;
      reg -= SI_CONTEXT_REG_OFFSET;

   } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
      opcode = PKT3_SET_UCONFIG_REG;
      reg -= CIK_UCONFIG_REG_OFFSET;

   } else {
      fprintf(stderr, "mesa: Invalid register offset %08x!\n", reg);
      return;
   }

   if (is_privileged_reg(state, original_reg)) {
      ac_pm4_set_privileged_reg(state, original_reg, val);
   } else {
      opcode = regular_opcode_to_pairs(state, opcode);

      ac_pm4_set_reg_custom(state, reg, val, opcode, 0);
   }
}

void
ac_pm4_set_reg_idx3(struct ac_pm4_state *state, unsigned reg, uint32_t val)
{
   if (state->info->uses_kernel_cu_mask) {
      assert(state->info->gfx_level >= GFX10);
      ac_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3);
   } else {
      ac_pm4_set_reg(state, reg, val);
   }
}

void
ac_pm4_clear_state(struct ac_pm4_state *state, const struct radeon_info *info,
                   bool debug_sqtt, bool is_compute_queue)
{
   state->info = info;
   state->debug_sqtt = debug_sqtt;
   state->ndw = 0;
   state->is_compute_queue = is_compute_queue;

   if (!state->max_dw)
      state->max_dw = ARRAY_SIZE(state->pm4);
}

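/* Allocate a state whose trailing pm4[] array holds at least max_dw dwords;
 * the allocation is extended past sizeof(*pm4) for the extra dwords.
 */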
struct ac_pm4_state *
ac_pm4_create_sized(const struct radeon_info *info, bool debug_sqtt,
                    unsigned max_dw, bool is_compute_queue)
{
   struct ac_pm4_state *pm4;
   unsigned size;

   max_dw = MAX2(max_dw, ARRAY_SIZE(pm4->pm4));

   size = sizeof(*pm4) + 4 * (max_dw - ARRAY_SIZE(pm4->pm4));

   pm4 = (struct ac_pm4_state *)calloc(1, size);
   if (pm4) {
      pm4->max_dw = max_dw;
      ac_pm4_clear_state(pm4, info, debug_sqtt, is_compute_queue);
   }
   return pm4;
}

void
ac_pm4_free_state(struct ac_pm4_state *state)
{
   if (!state)
      return;

   free(state);
}
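/* A minimal usage sketch (illustrative only; "gpu_info" stands for a
 * struct radeon_info obtained elsewhere, and the register/value are made up):
 *
 *    struct ac_pm4_state *pm4 =
 *       ac_pm4_create_sized(&gpu_info, false, 64, false);
 *    if (pm4) {
 *       ac_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1, rsrc1);
 *       ac_pm4_finalize(pm4);
 *       ...emit pm4->pm4[0..pm4->ndw-1] into a command buffer...
 *       ac_pm4_free_state(pm4);
 *    }
 */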
434