/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */
6
#include "ac_debug.h"
#include "ac_gpu_info.h"
#include "ac_pm4.h"

#include "sid.h"

#include <stdlib.h>
#include <string.h>
15
16 static bool
opcode_is_pairs(unsigned opcode)17 opcode_is_pairs(unsigned opcode)
18 {
19 return opcode == PKT3_SET_CONTEXT_REG_PAIRS ||
20 opcode == PKT3_SET_SH_REG_PAIRS ||
21 opcode == PKT3_SET_UCONFIG_REG_PAIRS;
22 }
23
24 static bool
opcode_is_pairs_packed(unsigned opcode)25 opcode_is_pairs_packed(unsigned opcode)
26 {
27 return opcode == PKT3_SET_CONTEXT_REG_PAIRS_PACKED ||
28 opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
29 opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N;
30 }
31
32 static bool
is_privileged_reg(const struct ac_pm4_state * state,unsigned reg)33 is_privileged_reg(const struct ac_pm4_state *state, unsigned reg)
34 {
35 const struct radeon_info *info = state->info;
36
37 if (info->gfx_level >= GFX10 && info->gfx_level <= GFX10_3)
38 return reg == R_008D04_SQ_THREAD_TRACE_BUF0_SIZE ||
39 reg == R_008D00_SQ_THREAD_TRACE_BUF0_BASE ||
40 reg == R_008D14_SQ_THREAD_TRACE_MASK ||
41 reg == R_008D18_SQ_THREAD_TRACE_TOKEN_MASK ||
42 reg == R_008D1C_SQ_THREAD_TRACE_CTRL;
43
44 if (info->gfx_level >= GFX6 && info->gfx_level <= GFX8)
45 return reg == R_009100_SPI_CONFIG_CNTL;
46
47 return false;
48 }
49
50 static unsigned
pairs_packed_opcode_to_regular(unsigned opcode)51 pairs_packed_opcode_to_regular(unsigned opcode)
52 {
53 switch (opcode) {
54 case PKT3_SET_CONTEXT_REG_PAIRS_PACKED:
55 return PKT3_SET_CONTEXT_REG;
56 case PKT3_SET_SH_REG_PAIRS_PACKED:
57 return PKT3_SET_SH_REG;
58 default:
59 unreachable("invalid packed opcode");
60 }
61 }
62
63 static unsigned
regular_opcode_to_pairs(struct ac_pm4_state * state,unsigned opcode)64 regular_opcode_to_pairs(struct ac_pm4_state *state, unsigned opcode)
65 {
66 const struct radeon_info *info = state->info;
67
68 switch (opcode) {
69 case PKT3_SET_CONTEXT_REG:
70 return info->has_set_context_pairs_packed ? PKT3_SET_CONTEXT_REG_PAIRS_PACKED :
71 info->has_set_context_pairs ? PKT3_SET_CONTEXT_REG_PAIRS : opcode;
72 case PKT3_SET_SH_REG:
73 return info->has_set_sh_pairs_packed ? PKT3_SET_SH_REG_PAIRS_PACKED :
74 info->has_set_sh_pairs ? PKT3_SET_SH_REG_PAIRS : opcode;
75 case PKT3_SET_UCONFIG_REG:
76 return info->has_set_uconfig_pairs ? PKT3_SET_UCONFIG_REG_PAIRS : opcode;
77 }
78
79 return opcode;
80 }
81
82 static bool
packed_next_is_reg_offset_pair(struct ac_pm4_state * state)83 packed_next_is_reg_offset_pair(struct ac_pm4_state *state)
84 {
85 return (state->ndw - state->last_pm4) % 3 == 2;
86 }
87
88 static bool
packed_next_is_reg_value1(struct ac_pm4_state * state)89 packed_next_is_reg_value1(struct ac_pm4_state *state)
90 {
91 return (state->ndw - state->last_pm4) % 3 == 1;
92 }
93
94 static bool
packed_prev_is_reg_value0(struct ac_pm4_state * state)95 packed_prev_is_reg_value0(struct ac_pm4_state *state)
96 {
97 return packed_next_is_reg_value1(state);
98 }
99
100 static unsigned
get_packed_reg_dw_offsetN(struct ac_pm4_state * state,unsigned index)101 get_packed_reg_dw_offsetN(struct ac_pm4_state *state, unsigned index)
102 {
103 unsigned i = state->last_pm4 + 2 + (index / 2) * 3;
104 assert(i < state->ndw);
105 return (state->pm4[i] >> ((index % 2) * 16)) & 0xffff;
106 }
107
108 static unsigned
get_packed_reg_valueN_idx(struct ac_pm4_state * state,unsigned index)109 get_packed_reg_valueN_idx(struct ac_pm4_state *state, unsigned index)
110 {
111 unsigned i = state->last_pm4 + 2 + (index / 2) * 3 + 1 + (index % 2);
112 assert(i < state->ndw);
113 return i;
114 }
115
116 static unsigned
get_packed_reg_valueN(struct ac_pm4_state * state,unsigned index)117 get_packed_reg_valueN(struct ac_pm4_state *state, unsigned index)
118 {
119 return state->pm4[get_packed_reg_valueN_idx(state, index)];
120 }
121
122 static unsigned
get_packed_reg_count(struct ac_pm4_state * state)123 get_packed_reg_count(struct ac_pm4_state *state)
124 {
125 int body_size = state->ndw - state->last_pm4 - 2;
126 assert(body_size > 0 && body_size % 3 == 0);
127 return (body_size / 3) * 2;
128 }
129
130 void
ac_pm4_finalize(struct ac_pm4_state * state)131 ac_pm4_finalize(struct ac_pm4_state *state)
132 {
133 if (opcode_is_pairs_packed(state->last_opcode)) {
134 unsigned reg_count = get_packed_reg_count(state);
135 unsigned reg_dw_offset0 = get_packed_reg_dw_offsetN(state, 0);
136
137 if (state->packed_is_padded)
138 reg_count--;
139
140 bool all_consecutive = true;
141
142 /* If the whole packed SET packet only sets consecutive registers, rewrite the packet
143 * to be unpacked to make it shorter.
144 *
145 * This also eliminates the invalid scenario when the packed SET packet sets only
146 * 2 registers and the register offsets are equal due to padding.
147 */
148 for (unsigned i = 1; i < reg_count; i++) {
149 if (reg_dw_offset0 != get_packed_reg_dw_offsetN(state, i) - i) {
150 all_consecutive = false;
151 break;
152 }
153 }
154
155 if (all_consecutive) {
156 assert(state->ndw - state->last_pm4 == 2 + 3 * (reg_count + state->packed_is_padded) / 2);
157 state->pm4[state->last_pm4] = PKT3(pairs_packed_opcode_to_regular(state->last_opcode),
158 reg_count, 0);
159 state->pm4[state->last_pm4 + 1] = reg_dw_offset0;
160 for (unsigned i = 0; i < reg_count; i++)
161 state->pm4[state->last_pm4 + 2 + i] = get_packed_reg_valueN(state, i);
162 state->ndw = state->last_pm4 + 2 + reg_count;
163 state->last_opcode = PKT3_SET_SH_REG;
164 } else {
165 /* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
166 if (state->debug_sqtt &&
167 (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED ||
168 state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED_N)) {
169 if (state->packed_is_padded)
170 reg_count++; /* Add this back because we only need to record the last write. */
171
172 for (int i = reg_count - 1; i >= 0; i--) {
173 unsigned reg_offset = SI_SH_REG_OFFSET + get_packed_reg_dw_offsetN(state, i) * 4;
174
175 if (strstr(ac_get_register_name(state->info->gfx_level,
176 state->info->family, reg_offset),
177 "SPI_SHADER_PGM_LO_")) {
178 state->spi_shader_pgm_lo_reg = reg_offset;
179 break;
180 }
181 }
182 }
183
184 /* If it's a packed SET_SH packet, use the *_N variant when possible. */
185 if (state->last_opcode == PKT3_SET_SH_REG_PAIRS_PACKED && reg_count <= 14) {
186 state->pm4[state->last_pm4] &= PKT3_IT_OPCODE_C;
187 state->pm4[state->last_pm4] |= PKT3_IT_OPCODE_S(PKT3_SET_SH_REG_PAIRS_PACKED_N);
188 }
189 }
190 }
191
192 if (state->debug_sqtt && state->last_opcode == PKT3_SET_SH_REG) {
193 /* Set reg_va_low_idx to where the shader address is stored in the pm4 state. */
194 unsigned reg_count = PKT_COUNT_G(state->pm4[state->last_pm4]);
195 unsigned reg_base_offset = SI_SH_REG_OFFSET + state->pm4[state->last_pm4 + 1] * 4;
196
197 for (unsigned i = 0; i < reg_count; i++) {
198 if (strstr(ac_get_register_name(state->info->gfx_level,
199 state->info->family, reg_base_offset + i * 4),
200 "SPI_SHADER_PGM_LO_")) {
201 state->spi_shader_pgm_lo_reg = reg_base_offset + i * 4;
202
203 break;
204 }
205 }
206 }
207 }
208
209 void
ac_pm4_cmd_begin(struct ac_pm4_state * state,unsigned opcode)210 ac_pm4_cmd_begin(struct ac_pm4_state *state, unsigned opcode)
211 {
212 ac_pm4_finalize(state);
213
214 assert(state->max_dw);
215 assert(state->ndw < state->max_dw);
216 assert(opcode <= 254);
217 state->last_opcode = opcode;
218 state->last_pm4 = state->ndw++;
219 state->packed_is_padded = false;
220 }
221
222 void
ac_pm4_cmd_add(struct ac_pm4_state * state,uint32_t dw)223 ac_pm4_cmd_add(struct ac_pm4_state *state, uint32_t dw)
224 {
225 assert(state->max_dw);
226 assert(state->ndw < state->max_dw);
227 state->pm4[state->ndw++] = dw;
228 state->last_opcode = 255; /* invalid opcode */
229 }
230
231 static bool
need_reset_filter_cam(const struct ac_pm4_state * state)232 need_reset_filter_cam(const struct ac_pm4_state *state)
233 {
234 const struct radeon_info *info = state->info;
235
236 /* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
237 if (!state->is_compute_queue &&
238 (opcode_is_pairs(state->last_opcode) ||
239 opcode_is_pairs_packed(state->last_opcode)))
240 return true;
241
242 const uint32_t last_reg = state->last_reg << 2;
243
244 if (info->gfx_level >= GFX11 && !state->is_compute_queue &&
245 (last_reg + CIK_UCONFIG_REG_OFFSET == R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE ||
246 last_reg + CIK_UCONFIG_REG_OFFSET == R_0367A0_SQ_THREAD_TRACE_BUF0_BASE ||
247 last_reg + CIK_UCONFIG_REG_OFFSET == R_0367B4_SQ_THREAD_TRACE_MASK ||
248 last_reg + CIK_UCONFIG_REG_OFFSET == R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK ||
249 last_reg + CIK_UCONFIG_REG_OFFSET == R_0367B0_SQ_THREAD_TRACE_CTRL))
250 return true;
251
252 return false;
253 }
254
/* End the current packet: write the PKT3 header now that the body size is
 * known, and pad packed SET packets to an even register count.
 */
void
ac_pm4_cmd_end(struct ac_pm4_state *state, bool predicate)
{
   unsigned count;
   count = state->ndw - state->last_pm4 - 2;
   /* All SET_*_PAIRS* packets on the gfx queue must set RESET_FILTER_CAM. */
   bool reset_filter_cam = need_reset_filter_cam(state);

   state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate) |
                                 PKT3_RESET_FILTER_CAM_S(reset_filter_cam);

   if (opcode_is_pairs_packed(state->last_opcode)) {
      if (packed_prev_is_reg_value0(state)) {
         /* Duplicate the first register at the end to make the number of registers aligned to 2.
          * Note: this re-enters ac_pm4_cmd_end() via ac_pm4_set_reg_custom(); the recursion
          * terminates because the body is then a whole number of 3-dword groups.
          */
         ac_pm4_set_reg_custom(state, get_packed_reg_dw_offsetN(state, 0) * 4,
                               get_packed_reg_valueN(state, 0),
                               state->last_opcode, 0);
         state->packed_is_padded = true;
      }

      /* The first body dword of a packed packet holds the register count. */
      state->pm4[state->last_pm4 + 1] = get_packed_reg_count(state);
   }
}
278
/* Emit a register write with an explicit opcode and index field.
 *
 * Consecutive calls merge into the previous packet when possible:
 * - packed SET_*_PAIRS_PACKED packets accumulate (offset pair, value0, value1)
 *   3-dword groups as long as the opcode stays the same,
 * - unpacked SET_*_PAIRS packets always append an (offset, value) pair,
 * - regular SET_*_REG packets merge only when this register immediately
 *   follows the previously written one with the same opcode and index.
 *
 * "reg" is a byte offset relative to the base of the opcode's register space.
 */
void
ac_pm4_set_reg_custom(struct ac_pm4_state *state, unsigned reg, uint32_t val,
                      unsigned opcode, unsigned idx)
{
   bool is_packed = opcode_is_pairs_packed(opcode);
   reg >>= 2; /* from here on, reg is a dword offset */

   assert(state->max_dw);
   assert(state->ndw + 2 <= state->max_dw);

   if (is_packed) {
      assert(idx == 0);

      if (opcode != state->last_opcode) {
         ac_pm4_cmd_begin(state, opcode); /* reserve space for the header */
         state->ndw++; /* reserve space for the register count, it will be set at the end */
      }
   } else if (opcode_is_pairs(opcode)) {
      assert(idx == 0);

      if (opcode != state->last_opcode)
         ac_pm4_cmd_begin(state, opcode);

      /* Unpacked pairs: each register carries its own offset dword. */
      state->pm4[state->ndw++] = reg;
   } else if (opcode != state->last_opcode || reg != (state->last_reg + 1) ||
              idx != state->last_idx) {
      /* Regular SET packet: start a new one unless this write extends the
       * previous consecutive run with the same opcode and index.
       */
      ac_pm4_cmd_begin(state, opcode);
      state->pm4[state->ndw++] = reg | (idx << 28);
   }

   assert(reg <= UINT16_MAX);
   state->last_reg = reg;
   state->last_idx = idx;

   if (is_packed) {
      if (state->packed_is_padded) {
         /* The packet is padded, which means the first register is written redundantly again
          * at the end. Remove it, so that we can replace it with this register.
          */
         state->packed_is_padded = false;
         state->ndw--;
      }

      if (packed_next_is_reg_offset_pair(state)) {
         state->pm4[state->ndw++] = reg;
      } else if (packed_next_is_reg_value1(state)) {
         /* Set the second register offset in the high 16 bits. */
         state->pm4[state->ndw - 2] &= 0x0000ffff;
         state->pm4[state->ndw - 2] |= reg << 16;
      }
   }

   state->pm4[state->ndw++] = val;
   ac_pm4_cmd_end(state, false);
}
334
335 static void
ac_pm4_set_privileged_reg(struct ac_pm4_state * state,unsigned reg,uint32_t val)336 ac_pm4_set_privileged_reg(struct ac_pm4_state *state, unsigned reg, uint32_t val)
337 {
338 assert(reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END);
339
340 ac_pm4_cmd_add(state, PKT3(PKT3_COPY_DATA, 4, 0));
341 ac_pm4_cmd_add(state, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_PERF));
342 ac_pm4_cmd_add(state, val);
343 ac_pm4_cmd_add(state, 0); /* unused */
344 ac_pm4_cmd_add(state, reg >> 2);
345 ac_pm4_cmd_add(state, 0); /* unused */
346 }
347
ac_pm4_set_reg(struct ac_pm4_state * state,unsigned reg,uint32_t val)348 void ac_pm4_set_reg(struct ac_pm4_state *state, unsigned reg, uint32_t val)
349 {
350 const unsigned original_reg = reg;
351 unsigned opcode;
352
353 if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
354 opcode = PKT3_SET_CONFIG_REG;
355 reg -= SI_CONFIG_REG_OFFSET;
356
357 } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
358 opcode = PKT3_SET_SH_REG;
359 reg -= SI_SH_REG_OFFSET;
360
361 } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
362 opcode = PKT3_SET_CONTEXT_REG;
363 reg -= SI_CONTEXT_REG_OFFSET;
364
365 } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
366 opcode = PKT3_SET_UCONFIG_REG;
367 reg -= CIK_UCONFIG_REG_OFFSET;
368
369 } else {
370 fprintf(stderr, "mesa: Invalid register offset %08x!\n", reg);
371 return;
372 }
373
374 if (is_privileged_reg(state, original_reg)) {
375 ac_pm4_set_privileged_reg(state, original_reg, val);
376 } else {
377 opcode = regular_opcode_to_pairs(state, opcode);
378
379 ac_pm4_set_reg_custom(state, reg, val, opcode, 0);
380 }
381 }
382
383 void
ac_pm4_set_reg_idx3(struct ac_pm4_state * state,unsigned reg,uint32_t val)384 ac_pm4_set_reg_idx3(struct ac_pm4_state *state, unsigned reg, uint32_t val)
385 {
386 if (state->info->uses_kernel_cu_mask) {
387 assert(state->info->gfx_level >= GFX10);
388 ac_pm4_set_reg_custom(state, reg - SI_SH_REG_OFFSET, val, PKT3_SET_SH_REG_INDEX, 3);
389 } else {
390 ac_pm4_set_reg(state, reg, val);
391 }
392 }
393
394 void
ac_pm4_clear_state(struct ac_pm4_state * state,const struct radeon_info * info,bool debug_sqtt,bool is_compute_queue)395 ac_pm4_clear_state(struct ac_pm4_state *state, const struct radeon_info *info,
396 bool debug_sqtt, bool is_compute_queue)
397 {
398 state->info = info;
399 state->debug_sqtt = debug_sqtt;
400 state->ndw = 0;
401 state->is_compute_queue = is_compute_queue;
402
403 if (!state->max_dw)
404 state->max_dw = ARRAY_SIZE(state->pm4);
405 }
406
407 struct ac_pm4_state *
ac_pm4_create_sized(const struct radeon_info * info,bool debug_sqtt,unsigned max_dw,bool is_compute_queue)408 ac_pm4_create_sized(const struct radeon_info *info, bool debug_sqtt,
409 unsigned max_dw, bool is_compute_queue)
410 {
411 struct ac_pm4_state *pm4;
412 unsigned size;
413
414 max_dw = MAX2(max_dw, ARRAY_SIZE(pm4->pm4));
415
416 size = sizeof(*pm4) + 4 * (max_dw - ARRAY_SIZE(pm4->pm4));
417
418 pm4 = (struct ac_pm4_state *)calloc(1, size);
419 if (pm4) {
420 pm4->max_dw = max_dw;
421 ac_pm4_clear_state(pm4, info, debug_sqtt, is_compute_queue);
422 }
423 return pm4;
424 }
425
/* Free a pm4 state created by ac_pm4_create_sized(). NULL is allowed. */
void
ac_pm4_free_state(struct ac_pm4_state *state)
{
   /* free(NULL) is a no-op, so no explicit NULL check is needed. */
   free(state);
}
434