/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BLORP_GENX_EXEC_BRW_H
#define BLORP_GENX_EXEC_BRW_H

#include "blorp_priv.h"
#include "dev/intel_device_info.h"
#include "common/intel_compute_slm.h"
#include "common/intel_sample_positions.h"
#include "common/intel_l3_config.h"
#include "genxml/gen_macros.h"
#include "intel/compiler/brw_compiler.h"

/**
 * This file provides the blorp pipeline setup and execution functionality.
 * It defines the following function:
 *
 * static void
 * blorp_exec(struct blorp_context *blorp, void *batch_data,
 *            const struct blorp_params *params);
 *
 * It is the job of whoever includes this header to wrap this in something
 * to get an externally visible symbol.
 *
 * In order for the blorp_exec function to work, the driver must provide
 * implementations of the following static helper functions.
 */
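
/* For illustration, a driver typically includes this header from a
 * per-generation source file that first defines the static helpers declared
 * below and then wraps blorp_exec in an exported symbol. A minimal
 * hypothetical sketch (the wrapper name is illustrative only):
 *
 *    #include "blorp_genX_exec_brw.h"
 *
 *    void
 *    genX(drv_blorp_exec)(struct blorp_context *blorp, void *batch_data,
 *                         const struct blorp_params *params)
 *    {
 *       blorp_exec(blorp, batch_data, params);
 *    }
 */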

static void *
blorp_emit_dwords(struct blorp_batch *batch, unsigned n);

static uint64_t
blorp_emit_reloc(struct blorp_batch *batch,
                 void *location, struct blorp_address address, uint32_t delta);

static void
blorp_measure_start(struct blorp_batch *batch,
                    const struct blorp_params *params);

static void
blorp_measure_end(struct blorp_batch *batch,
                  const struct blorp_params *params);

static void *
blorp_alloc_dynamic_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset);

UNUSED static void *
blorp_alloc_general_state(struct blorp_batch *batch,
                          uint32_t size,
                          uint32_t alignment,
                          uint32_t *offset);

static uint32_t
blorp_get_dynamic_state(struct blorp_batch *batch,
                        enum blorp_dynamic_state name);

static void *
blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                          struct blorp_address *addr);
static void
blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
                                           const struct blorp_address *addrs,
                                           uint32_t *sizes,
                                           unsigned num_vbs);

UNUSED static struct blorp_address
blorp_get_workaround_address(struct blorp_batch *batch);

static bool
blorp_alloc_binding_table(struct blorp_batch *batch, unsigned num_entries,
                          unsigned state_size, unsigned state_alignment,
                          uint32_t *bt_offset, uint32_t *surface_offsets,
                          void **surface_maps);

static uint32_t
blorp_binding_table_offset_to_pointer(struct blorp_batch *batch,
                                      uint32_t offset);

static void
blorp_flush_range(struct blorp_batch *batch, void *start, size_t size);

static void
blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset,
                    struct blorp_address address, uint32_t delta);

static uint64_t
blorp_get_surface_address(struct blorp_batch *batch,
                          struct blorp_address address);

#if GFX_VER < 10
static struct blorp_address
blorp_get_surface_base_address(struct blorp_batch *batch);
#endif

static const struct intel_l3_config *
blorp_get_l3_config(struct blorp_batch *batch);

static void
blorp_pre_emit_urb_config(struct blorp_batch *batch,
                          struct intel_urb_config *urb_config);

static void
blorp_emit_urb_config(struct blorp_batch *batch,
                      struct intel_urb_config *urb_config);

static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params);

static void
blorp_emit_pre_draw(struct blorp_batch *batch,
                    const struct blorp_params *params);
static void
blorp_emit_post_draw(struct blorp_batch *batch,
                     const struct blorp_params *params);

static inline unsigned
brw_blorp_get_urb_length(const struct brw_wm_prog_data *prog_data)
{
   if (prog_data == NULL)
      return 1;

   /* From the BSpec: 3D Pipeline - Strips and Fans - 3DSTATE_SBE
    *
    *    read_length = ceiling((max_source_attr+1)/2)
    */
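   /* For example (illustrative numbers): a program with
    * num_varying_inputs = 3 gives MAX2((3 + 1) / 2, 1) = 2, i.e. two
    * 256-bit URB rows are read, each holding a pair of vec4 attributes.
    */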
   return MAX2((prog_data->num_varying_inputs + 1) / 2, 1);
}

/***** BEGIN blorp_exec implementation ******/

static uint64_t
_blorp_combine_address(struct blorp_batch *batch, void *location,
                       struct blorp_address address, uint32_t delta)
{
   if (address.buffer == NULL) {
      return address.offset + delta;
   } else {
      return blorp_emit_reloc(batch, location, address, delta);
   }
}

#define __gen_address_type struct blorp_address
#define __gen_user_data struct blorp_batch
#define __gen_combine_address _blorp_combine_address

#include "genxml/genX_pack.h"
#include "common/intel_genX_state_brw.h"

#define _blorp_cmd_length(cmd) cmd ## _length
#define _blorp_cmd_length_bias(cmd) cmd ## _length_bias
#define _blorp_cmd_header(cmd) cmd ## _header
#define _blorp_cmd_pack(cmd) cmd ## _pack

#define blorp_emit(batch, cmd, name)                              \
   for (struct cmd name = { _blorp_cmd_header(cmd) },             \
        *_dst = blorp_emit_dwords(batch, _blorp_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1);                        \
        _blorp_cmd_pack(cmd)(batch, (void *)_dst, &name),         \
        _dst = NULL)

#define blorp_emitn(batch, cmd, n, ...) ({                  \
      uint32_t *_dw = blorp_emit_dwords(batch, n);          \
      if (_dw) {                                            \
         struct cmd template = {                            \
            _blorp_cmd_header(cmd),                         \
            .DWordLength = n - _blorp_cmd_length_bias(cmd), \
            __VA_ARGS__                                     \
         };                                                 \
         _blorp_cmd_pack(cmd)(batch, _dw, &template);       \
      }                                                     \
      _dw ? _dw + 1 : NULL; /* Array starts at dw[1] */     \
   })
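
/* For illustration: blorp_emit() stack-allocates the packable struct,
 * reserves space in the batch, and packs the struct into that space when
 * the loop body exits, so fields are simply assigned in the body:
 *
 *    blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
 *       mask.SampleMask = (1 << num_samples) - 1;
 *    }
 *
 * blorp_emitn() is the variable-length variant; see the
 * 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS emission below for
 * real uses.
 */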

#define STRUCT_ZERO(S) ({ struct S t; memset(&t, 0, sizeof(t)); t; })

#define blorp_context_upload_dynamic(context, state, name,            \
                                     align, dynamic_name)             \
   for (struct state name = STRUCT_ZERO(state), *_dst = &name;        \
        _dst != NULL;                                                 \
        ({                                                            \
           uint32_t _dw[_blorp_cmd_length(state)];                    \
           _blorp_cmd_pack(state)(NULL, (void *)_dw, &name);          \
           context->upload_dynamic_state(context, _dw,                \
                                         _blorp_cmd_length(state) * 4,\
                                         align, dynamic_name);        \
           _dst = NULL;                                               \
        }))

#define blorp_emit_dynamic(batch, state, name, align, offset)          \
   for (struct state name = STRUCT_ZERO(state),                        \
        *_dst = blorp_alloc_dynamic_state(batch,                       \
                                          _blorp_cmd_length(state) * 4,\
                                          align, offset);              \
        __builtin_expect(_dst != NULL, 1);                             \
        _blorp_cmd_pack(state)(batch, (void *)_dst, &name),            \
        blorp_flush_range(batch, _dst, _blorp_cmd_length(state) * 4),  \
        _dst = NULL)
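
/* For illustration, emitting a dynamic state structure mirrors the
 * CC_VIEWPORT emission later in this file:
 *
 *    uint32_t vp_offset;
 *    blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &vp_offset) {
 *       vp.MinimumDepth = 0.0;
 *       vp.MaximumDepth = 1.0;
 *    }
 *
 * On loop exit the structure is packed into the freshly allocated dynamic
 * state, the range is flushed, and vp_offset holds the offset to hand to
 * the corresponding pointer command.
 */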

/* 3DSTATE_URB
 * 3DSTATE_URB_VS
 * 3DSTATE_URB_HS
 * 3DSTATE_URB_DS
 * 3DSTATE_URB_GS
 *
 * Assign the entire URB to the VS. Even though the VS is disabled, URB
 * space is still needed because the clipper loads the VUEs from the URB.
 * From the Sandybridge PRM, Volume 2, Part 1, Section 3DSTATE,
 * Dword 1.15:0 "VS Number of URB Entries":
 *
 *    This field is always used (even if VS Function Enable is DISABLED).
 *
 * The warning below appears in the PRM (Section 3DSTATE_URB), but we can
 * safely ignore it because this batch contains only one draw call.
 *
 *    Because of URB corruption caused by allocating a previous GS unit
 *    URB entry to the VS unit, software is required to send a “GS NULL
 *    Fence” (Send URB fence with VS URB size == 1 and GS URB size == 0)
 *    plus a dummy DRAW call before any case where VS will be taking over
 *    GS URB space.
 *
 * If 3DSTATE_URB_VS is emitted, then the others must be as well. From the
 * Ivybridge PRM, Volume 2 Part 1, section 1.7.1 3DSTATE_URB_VS:
 *
 *    3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
 *    programmed in order for the programming of this state to be
 *    valid.
 */
static void
emit_urb_config(struct blorp_batch *batch,
                const struct blorp_params *params,
                UNUSED enum intel_urb_deref_block_size *deref_block_size)
{
   /* Once the vertex fetcher has written full VUE entries with a complete
    * header, the space requirement per vertex is as follows (in bytes):
    *
    *     Header    Position   Program constants
    *   +--------+------------+-------------------+
    *   |   16   |     16     |      n x 16       |
    *   +--------+------------+-------------------+
    *
    * where 'n' stands for the number of varying inputs expressed as vec4s.
    */
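   /* For example (illustrative): with two varying inputs,
    * total_needed = 16 + 16 + 2 * 16 = 64 bytes, so vs_entry_size below is
    * DIV_ROUND_UP(64, 64) = 1 unit.
    */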
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;
   const unsigned total_needed = 16 + 16 + num_varyings * 16;

   /* The URB size is expressed in units of 64 bytes (512 bits) */
   const unsigned vs_entry_size = DIV_ROUND_UP(total_needed, 64);

   struct intel_urb_config urb_cfg = {
      .size = { vs_entry_size, 1, 1, 1 },
   };

   bool constrained;
   intel_get_urb_config(batch->blorp->compiler->brw->devinfo,
                        blorp_get_l3_config(batch),
                        false, false, &urb_cfg,
                        deref_block_size, &constrained);

   /* Tell drivers about the config. */
   blorp_pre_emit_urb_config(batch, &urb_cfg);

   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
      blorp_emit(batch, GENX(3DSTATE_URB_VS), urb) {
         urb._3DCommandSubOpcode += i;
         urb.VSURBStartingAddress = urb_cfg.start[i];
         urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1;
         urb.VSNumberofURBEntries = urb_cfg.entries[i];
      }
   }

   if (batch->blorp->config.use_mesh_shading) {
#if GFX_VERx10 >= 125
      blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_MESH), zero);
      blorp_emit(batch, GENX(3DSTATE_URB_ALLOC_TASK), zero);
#endif
   }
}

static void
blorp_emit_memcpy(struct blorp_batch *batch,
                  struct blorp_address dst,
                  struct blorp_address src,
                  uint32_t size);

static void
blorp_emit_vertex_data(struct blorp_batch *batch,
                       const struct blorp_params *params,
                       struct blorp_address *addr,
                       uint32_t *size)
{
   const float vertices[] = {
      /* v0 */ (float)params->x1, (float)params->y1, params->z,
      /* v1 */ (float)params->x0, (float)params->y1, params->z,
      /* v2 */ (float)params->x0, (float)params->y0, params->z,
   };

   void *data = blorp_alloc_vertex_buffer(batch, sizeof(vertices), addr);
   if (data == NULL)
      return;
   memcpy(data, vertices, sizeof(vertices));
   *size = sizeof(vertices);
   blorp_flush_range(batch, data, *size);
}

static void
blorp_emit_input_varying_data(struct blorp_batch *batch,
                              const struct blorp_params *params,
                              struct blorp_address *addr,
                              uint32_t *size)
{
   const unsigned vec4_size_in_bytes = 4 * sizeof(float);
   const unsigned max_num_varyings =
      DIV_ROUND_UP(sizeof(params->wm_inputs), vec4_size_in_bytes);
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;

   *size = 16 + num_varyings * vec4_size_in_bytes;

   const uint32_t *const inputs_src = (const uint32_t *)&params->wm_inputs;
   void *data = blorp_alloc_vertex_buffer(batch, *size, addr);
   if (data == NULL)
      return;
   uint32_t *inputs = data;

   /* Copy in the VS inputs */
   assert(sizeof(params->vs_inputs) == 16);
   memcpy(inputs, &params->vs_inputs, sizeof(params->vs_inputs));
   inputs += 4;

   if (params->wm_prog_data) {
      /* Walk over the attribute slots, determine if the attribute is used
       * by the program, and when necessary copy the values from the input
       * storage to the vertex data buffer.
       */
      for (unsigned i = 0; i < max_num_varyings; i++) {
         const gl_varying_slot attr = VARYING_SLOT_VAR0 + i;

         const int input_index = wm_prog_data->urb_setup[attr];
         if (input_index < 0)
            continue;

         memcpy(inputs, inputs_src + i * 4, vec4_size_in_bytes);

         inputs += 4;
      }
   }

   blorp_flush_range(batch, data, *size);

   if (params->dst_clear_color_as_input) {
      /* In this case, the clear color isn't known statically and instead
       * comes in through an indirect which we have to copy into the vertex
       * buffer before we execute the 3DPRIMITIVE. We already copied the
       * value of params->wm_inputs.clear_color into the vertex buffer in
       * the loop above. Now we emit code to stomp it from the GPU with the
       * actual clear color value.
       */
      assert(num_varyings == 1);

      /* The clear color is the first thing after the header */
      struct blorp_address clear_color_input_addr = *addr;
      clear_color_input_addr.offset += 16;

      const unsigned clear_color_size =
         GFX_VER < 10 ? batch->blorp->isl_dev->ss.clear_value_size : 4 * 4;
      blorp_emit_memcpy(batch, clear_color_input_addr,
                        params->dst.clear_color_addr,
                        clear_color_size);
   }
}

static void
blorp_fill_vertex_buffer_state(struct GENX(VERTEX_BUFFER_STATE) *vb,
                               unsigned idx,
                               struct blorp_address addr, uint32_t size,
                               uint32_t stride)
{
   vb[idx].VertexBufferIndex = idx;
   vb[idx].BufferStartingAddress = addr;
   vb[idx].BufferPitch = stride;
   vb[idx].MOCS = addr.mocs;
   vb[idx].AddressModifyEnable = true;
   vb[idx].BufferSize = size;

#if GFX_VER >= 12
   vb[idx].L3BypassDisable = true;
#endif
}

static void
blorp_emit_vertex_buffers(struct blorp_batch *batch,
                          const struct blorp_params *params)
{
   struct GENX(VERTEX_BUFFER_STATE) vb[2] = {};
   const uint32_t num_vbs = ARRAY_SIZE(vb);

   struct blorp_address addrs[2] = {};
   uint32_t sizes[2] = {};
   blorp_emit_vertex_data(batch, params, &addrs[0], &sizes[0]);
   if (sizes[0] == 0)
      return;
   blorp_fill_vertex_buffer_state(vb, 0, addrs[0], sizes[0],
                                  3 * sizeof(float));

   blorp_emit_input_varying_data(batch, params, &addrs[1], &sizes[1]);
   blorp_fill_vertex_buffer_state(vb, 1, addrs[1], sizes[1], 0);

   blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, sizes, num_vbs);

   const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
   uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
   if (!dw)
      return;

   for (unsigned i = 0; i < num_vbs; i++) {
      GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
      dw += GENX(VERTEX_BUFFER_STATE_length);
   }
}

static void
blorp_emit_vertex_elements(struct blorp_batch *batch,
                           const struct blorp_params *params)
{
   struct brw_wm_prog_data *wm_prog_data = params->wm_prog_data;
   const unsigned num_varyings =
      wm_prog_data ? wm_prog_data->num_varying_inputs : 0;
   const unsigned num_elements = 2 + num_varyings;

   struct GENX(VERTEX_ELEMENT_STATE) ve[num_elements];
   memset(ve, 0, num_elements * sizeof(*ve));

   /* Set up the VBO for the rectangle primitive.
    *
    * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
    * vertices. The vertices reside in screen space with DirectX
    * coordinates (that is, (0, 0) is the upper left corner).
    *
    *   v2 ------ implied
    *    |        |
    *    |        |
    *   v1 ----- v0
    *
    * Since the VS is disabled, the clipper loads each VUE directly from
    * the URB. This is controlled by the 3DSTATE_VERTEX_BUFFERS and
    * 3DSTATE_VERTEX_ELEMENTS packets below. The VUE contents are as
    * follows:
    *   dw0: Reserved, MBZ.
    *   dw1: Render Target Array Index. Below, the vertex fetcher is
    *        programmed to fill this with the primitive instance
    *        identifier, which is used for layered clears. All other
    *        renders have only one instance, so the value is effectively
    *        zero.
    *   dw2: Viewport Index. The HiZ op disables viewport mapping and
    *        scissoring, so set the dword to 0.
    *   dw3: Point Width: The HiZ op does not emit the POINTLIST primitive,
    *        so set the dword to 0.
    *   dw4: Vertex Position X.
    *   dw5: Vertex Position Y.
    *   dw6: Vertex Position Z.
    *   dw7: Vertex Position W.
    *
    *   dw8: Flat vertex input 0
    *   dw9: Flat vertex input 1
    *   ...
    *   dwn: Flat vertex input n - 8
    *
    * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
    * "Vertex URB Entry (VUE) Formats".
    *
    * Only vertex position X and Y are going to be variable; Z is fixed to
    * zero and W to one. Header words dw0, dw2, and dw3 are zero. There is
    * no need to include the fixed values in the vertex buffer, since the
    * vertex fetcher can be instructed to fill vertex elements with
    * constant values of one and zero instead of reading them from the
    * buffer. Flat inputs are program constants that are not interpolated;
    * moreover, their values are the same between vertices.
    *
    * See the vertex element setup below.
    */
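   /* To make this concrete (illustrative): for a blit with a single flat
    * input, vertex buffer 0 holds three (x, y, z) positions while vertex
    * buffer 1 holds the 16-byte vs_inputs block followed by one vec4 of
    * flat data, which the varying elements below read starting at
    * offset 16.
    */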
   unsigned slot = 0;

   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 1,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,

      /* From Gfx8 onwards, the hardware is no longer instructed to
       * overwrite components using an element specifier. Instead there is
       * a separate 3DSTATE_VF_SGVS (System Generated Value Setup) state
       * packet for it.
       */
      .Component1Control = VFCOMP_STORE_0,
      .Component2Control = VFCOMP_STORE_0,
      .Component3Control = VFCOMP_STORE_0,
   };
   slot++;

   ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
      .VertexBufferIndex = 0,
      .Valid = true,
      .SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT,
      .SourceElementOffset = 0,
      .Component0Control = VFCOMP_STORE_SRC,
      .Component1Control = VFCOMP_STORE_SRC,
      .Component2Control = VFCOMP_STORE_SRC,
      .Component3Control = VFCOMP_STORE_1_FP,
   };
   slot++;

   for (unsigned i = 0; i < num_varyings; ++i) {
      ve[slot] = (struct GENX(VERTEX_ELEMENT_STATE)) {
         .VertexBufferIndex = 1,
         .Valid = true,
         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
         .SourceElementOffset = 16 + i * 4 * sizeof(float),
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_SRC,
         .Component2Control = VFCOMP_STORE_SRC,
         .Component3Control = VFCOMP_STORE_SRC,
      };
      slot++;
   }

   const unsigned num_dwords =
      1 + GENX(VERTEX_ELEMENT_STATE_length) * num_elements;
   uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_ELEMENTS), num_dwords);
   if (!dw)
      return;

   for (unsigned i = 0; i < num_elements; i++) {
      GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw, &ve[i]);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

   blorp_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
      vf.StatisticsEnable = false;
   }

   /* Overwrite the Render Target Array Index (2nd dword) in the VUE header
    * with the primitive instance identifier. This is used for layered
    * clears.
    */
   blorp_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs) {
      sgvs.InstanceIDEnable = true;
      sgvs.InstanceIDComponentNumber = COMP_1;
      sgvs.InstanceIDElementOffset = 0;
   }

#if GFX_VER >= 11
   blorp_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif

   for (unsigned i = 0; i < num_elements; i++) {
      blorp_emit(batch, GENX(3DSTATE_VF_INSTANCING), vf) {
         vf.VertexElementIndex = i;
         vf.InstancingEnable = false;
      }
   }

   blorp_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_RECTLIST;
   }
}

/* 3DSTATE_VIEWPORT_STATE_POINTERS */
static uint32_t
blorp_emit_cc_viewport(struct blorp_batch *batch)
{
   uint32_t cc_vp_offset;

   /* Somehow reusing CC_VIEWPORT on Gfx9 is causing issues:
    * https://gitlab.freedesktop.org/mesa/mesa/-/issues/11647
    */
   if (GFX_VER != 9 && batch->blorp->config.use_cached_dynamic_states) {
      cc_vp_offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_CC_VIEWPORT);
   } else {
      blorp_emit_dynamic(batch, GENX(CC_VIEWPORT), vp, 32, &cc_vp_offset) {
         vp.MinimumDepth = batch->blorp->config.use_unrestricted_depth_range ?
                           -FLT_MAX : 0.0;
         vp.MaximumDepth = batch->blorp->config.use_unrestricted_depth_range ?
                           FLT_MAX : 1.0;
      }
   }

   blorp_emit(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), vsp) {
      vsp.CCViewportPointer = cc_vp_offset;
   }

   return cc_vp_offset;
}

static uint32_t
blorp_emit_sampler_state(struct blorp_batch *batch)
{
   uint32_t offset;
   blorp_emit_dynamic(batch, GENX(SAMPLER_STATE), sampler, 32, &offset) {
      sampler.MipModeFilter = MIPFILTER_NONE;
      sampler.MagModeFilter = MAPFILTER_LINEAR;
      sampler.MinModeFilter = MAPFILTER_LINEAR;
      sampler.MinLOD = 0;
      sampler.MaxLOD = 0;
      sampler.TCXAddressControlMode = TCM_CLAMP;
      sampler.TCYAddressControlMode = TCM_CLAMP;
      sampler.TCZAddressControlMode = TCM_CLAMP;
      sampler.MaximumAnisotropy = RATIO21;
      sampler.RAddressMinFilterRoundingEnable = true;
      sampler.RAddressMagFilterRoundingEnable = true;
      sampler.VAddressMinFilterRoundingEnable = true;
      sampler.VAddressMagFilterRoundingEnable = true;
      sampler.UAddressMinFilterRoundingEnable = true;
      sampler.UAddressMagFilterRoundingEnable = true;
      sampler.NonnormalizedCoordinateEnable = true;
   }

   return offset;
}

UNUSED static uint32_t
blorp_emit_sampler_state_ps(struct blorp_batch *batch)
{
   uint32_t offset = batch->blorp->config.use_cached_dynamic_states ?
      blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_SAMPLER) :
      blorp_emit_sampler_state(batch);

   blorp_emit(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_PS), ssp) {
      ssp.PointertoPSSamplerState = offset;
   }

   return offset;
}

/* What follows is the code for setting up a "pipeline". */

static void
blorp_emit_vs_config(struct blorp_batch *batch,
                     const struct blorp_params *params)
{
   struct brw_vs_prog_data *vs_prog_data = params->vs_prog_data;
   assert(!vs_prog_data || GFX_VER < 11 ||
          vs_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);

   blorp_emit(batch, GENX(3DSTATE_VS), vs) {
      if (vs_prog_data) {
         vs.Enable = true;

         vs.KernelStartPointer = params->vs_prog_kernel;

         vs.DispatchGRFStartRegisterForURBData =
            vs_prog_data->base.base.dispatch_grf_start_reg;
         vs.VertexURBEntryReadLength =
            vs_prog_data->base.urb_read_length;
         vs.VertexURBEntryReadOffset = 0;

         vs.MaximumNumberofThreads =
            batch->blorp->isl_dev->info->max_vs_threads - 1;

         assert(vs_prog_data->base.dispatch_mode == INTEL_DISPATCH_MODE_SIMD8);
#if GFX_VER < 20
         vs.SIMD8DispatchEnable = true;
#endif
      }
   }
}

static void
blorp_emit_sf_config(struct blorp_batch *batch,
                     const struct blorp_params *params,
                     UNUSED enum intel_urb_deref_block_size urb_deref_block_size)
{
   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;

   /* 3DSTATE_SF
    *
    * Disable ViewportTransformEnable (dw2.1)
    *
    * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
    * Primitives Overview":
    *
    *    RECTLIST: Viewport Mapping must be DISABLED (as is typical with
    *    the use of screen-space coordinates).
    *
    * A solid rectangle must be rendered, so set FrontFaceFillMode (dw2.4:3)
    * and BackFaceFillMode (dw2.5:6) to SOLID(0).
    *
    * From the Sandy Bridge PRM, Volume 2, Part 1, Section
    * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
    *
    *    SOLID: Any triangle or rectangle object found to be front-facing
    *    is rendered as a solid object. This setting is required when
    *    rendering rectangle (RECTLIST) objects.
    */

   blorp_emit(batch, GENX(3DSTATE_SF), sf) {
#if GFX_VER >= 12
      sf.DerefBlockSize = urb_deref_block_size;
#endif
   }

   blorp_emit(batch, GENX(3DSTATE_RASTER), raster) {
      raster.CullMode = CULLMODE_NONE;
   }

   blorp_emit(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.VertexURBEntryReadOffset = 1;
      if (prog_data) {
         sbe.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
         sbe.VertexURBEntryReadLength = brw_blorp_get_urb_length(prog_data);
         sbe.ConstantInterpolationEnable = prog_data->flat_inputs;
      } else {
         sbe.NumberofSFOutputAttributes = 0;
         sbe.VertexURBEntryReadLength = 1;
      }
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;

      for (unsigned i = 0; i < 32; i++)
         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
   }
}

static void
blorp_emit_ps_config(struct blorp_batch *batch,
                     const struct blorp_params *params)
{
   const struct brw_wm_prog_data *prog_data = params->wm_prog_data;

   /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
    * nonzero to prevent the GPU from hanging. While the documentation
    * doesn't mention this explicitly, it notes that the valid range for
    * the field is [1,39] = [2,40] threads, which excludes zero.
    *
    * To be safe (and to minimize extraneous code) we go ahead and fully
    * configure the WM state whether or not there is a WM program.
    */

   const struct intel_device_info *devinfo =
      batch->blorp->compiler->brw->devinfo;

   blorp_emit(batch, GENX(3DSTATE_WM), wm);

   blorp_emit(batch, GENX(3DSTATE_PS), ps) {
      if (params->src.enabled) {
         ps.SamplerCount = 1; /* Up to 4 samplers */
         ps.BindingTableEntryCount = 2;
      } else {
         ps.BindingTableEntryCount = 1;
      }

      /* SAMPLER_STATE prefetching is broken on Gfx11 - Wa_1606682166 */
      if (GFX_VER == 11)
         ps.SamplerCount = 0;

      /* 3DSTATE_PS expects the number of threads per PSD, which is always
       * 64 for pre-Gfx11 and 128 for Gfx11+. On Gfx11+, a programmed value
       * of k implies 2(k+1) threads. It implicitly scales for different GT
       * levels (which have some # of PSDs).
       */
      ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;

      switch (params->fast_clear_op) {
      case ISL_AUX_OP_NONE:
         break;
#if GFX_VER < 20
#if GFX_VER >= 10
      case ISL_AUX_OP_AMBIGUATE:
         ps.RenderTargetFastClearEnable = true;
         ps.RenderTargetResolveType = FAST_CLEAR_0;
         break;
#endif /* GFX_VER >= 10 */
      case ISL_AUX_OP_PARTIAL_RESOLVE:
         ps.RenderTargetResolveType = RESOLVE_PARTIAL;
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         /* WA 1406738321:
          * In-place full resolve of a 3D/Volume surface is not supported.
          * In order to fully resolve a 3D/volume surface, a copy operation
          * must be performed to a new destination (declared as
          * uncompressed) using the compressed 3D surface as a source.
          */
#if GFX_VERx10 == 120
         assert(params->src.surf.dim != ISL_SURF_DIM_3D);
#endif
         ps.RenderTargetResolveType = RESOLVE_FULL;
         break;
#endif /* GFX_VER < 20 */
      case ISL_AUX_OP_FAST_CLEAR:
         /* WA 1406738321:
          * 3D/Volumetric surfaces do not support the Fast Clear operation.
          */
#if GFX_VERx10 == 120
         assert(params->dst.surf.dim != ISL_SURF_DIM_3D);
#endif
         ps.RenderTargetFastClearEnable = true;
         break;
      default:
         unreachable("Invalid fast clear op");
      }

      /* The RENDER_SURFACE_STATE page for TGL says:
       *
       *    For an 8 bpp surface with NUM_MULTISAMPLES = 1, Surface Width
       *    not multiple of 64 pixels and more than 1 mip level in the
       *    view, Fast Clear is not supported when AUX_CCS_E is set in
       *    this field.
       *
       * The granularity of a fast-clear or ambiguate operation is likely
       * one CCS element. For an 8 bpp primary surface, this maps to
       * 32px x 4 rows. Due to the surface layout parameters, if LOD0's
       * width isn't a multiple of 64px, LOD1 and LOD2+ will share CCS
       * elements. Assert that these operations aren't occurring on these
       * LODs.
       *
       * We don't explicitly check for TGL+ because the restriction is
       * technically applicable to all hardware. Platforms prior to TGL
       * don't support CCS on 8 bpp surfaces, so these unaligned fast clear
       * operations shouldn't be occurring prior to TGL either.
       */
      if (isl_format_get_layout(params->dst.surf.format)->bpb == 8 &&
          params->dst.surf.logical_level0_px.width % 64 != 0 &&
          params->dst.surf.levels >= 3 &&
          params->dst.view.base_level >= 1) {
         assert(params->num_samples == 1);
         assert(!ps.RenderTargetFastClearEnable);
      }

      if (prog_data) {
         intel_set_ps_dispatch_state(&ps, devinfo, prog_data,
                                     params->num_samples,
                                     0 /* msaa_flags */);

         ps.DispatchGRFStartRegisterForConstantSetupData0 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
         ps.DispatchGRFStartRegisterForConstantSetupData1 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
#if GFX_VER < 20
         ps.DispatchGRFStartRegisterForConstantSetupData2 =
            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
#endif

         ps.KernelStartPointer0 = params->wm_prog_kernel +
            brw_wm_prog_data_prog_offset(prog_data, ps, 0);
         ps.KernelStartPointer1 = params->wm_prog_kernel +
            brw_wm_prog_data_prog_offset(prog_data, ps, 1);
#if GFX_VER < 20
         ps.KernelStartPointer2 = params->wm_prog_kernel +
            brw_wm_prog_data_prog_offset(prog_data, ps, 2);
#endif
      }
   }

   blorp_emit(batch, GENX(3DSTATE_PS_EXTRA), psx) {
      if (params->src.enabled)
         psx.PixelShaderKillsPixel = true;

      if (prog_data) {
         psx.PixelShaderValid = true;
         psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
         psx.PixelShaderComputesStencil = prog_data->computed_stencil;
         psx.PixelShaderIsPerSample = prog_data->persample_dispatch;

#if INTEL_WA_18038825448_GFX_VER
         psx.EnablePSDependencyOnCPsizeChange =
            batch->flags & BLORP_BATCH_FORCE_CPS_DEPENDENCY;
#endif

#if GFX_VER < 20
         psx.AttributeEnable = prog_data->num_varying_inputs > 0;
#else
         /* Bspec 57340 (r59562):
          *
          *    For MSAA fast clear, it (clear shader) must be in per-pixel
          *    dispatch mode.
          *
          * Bspec 56424 (r58933):
          *
          *    Bit 6 of Bit Group 0: Pixel Shader Is Per Sample
          *    If this bit is DISABLED, the dispatch rate is determined by
          *    the value of Pixel Shader Is Per Coarse Pixel.
          *
          *    Bit 4 of Bit Group 0: Pixel Shader Is Per Coarse Pixel
          *    If Pixel Shader Is Per Sample is DISABLED and this bit is
          *    DISABLED, the pixel shader is dispatched at the per pixel
          *    shading rate.
          *
          * The below assertion ensures the MSAA clear shader is in
          * per-pixel dispatch mode.
          */
         if (params->fast_clear_op == ISL_AUX_OP_FAST_CLEAR &&
             params->num_samples > 1) {
            assert(!psx.PixelShaderIsPerSample &&
                   !psx.PixelShaderIsPerCoarsePixel);
         }
#endif
      }
   }
}

static void
blorp_emit_blend_state(struct blorp_batch *batch,
                       const struct blorp_params *params)
{
   uint32_t offset;
   if (!batch->blorp->config.use_cached_dynamic_states) {
      struct GENX(BLEND_STATE) blend = { };

      const unsigned size = 96;
      uint32_t *state = blorp_alloc_dynamic_state(batch, size, 64, &offset);
      if (state == NULL)
         return;
      uint32_t *pos = state;

      GENX(BLEND_STATE_pack)(NULL, pos, &blend);
      pos += GENX(BLEND_STATE_length);

      for (unsigned i = 0; i < params->num_draw_buffers; ++i) {
         struct GENX(BLEND_STATE_ENTRY) entry = {
            .PreBlendColorClampEnable = true,
            .PostBlendColorClampEnable = true,
            .ColorClampRange = COLORCLAMP_RTFORMAT,

            .WriteDisableRed = params->color_write_disable & 1,
            .WriteDisableGreen = params->color_write_disable & 2,
            .WriteDisableBlue = params->color_write_disable & 4,
            .WriteDisableAlpha = params->color_write_disable & 8,
         };
         GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
         pos += GENX(BLEND_STATE_ENTRY_length);
      }

      blorp_flush_range(batch, state, size);
   } else {
      /* We only cached this case. */
      assert(params->color_write_disable == 0);
      offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_BLEND);
   }

   blorp_emit(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), sp) {
      sp.BlendStatePointer = offset;
      sp.BlendStatePointerValid = true;
   }

   blorp_emit(batch, GENX(3DSTATE_PS_BLEND), ps_blend) {
      ps_blend.HasWriteableRT = true;
   }
}

static void
blorp_emit_color_calc_state(struct blorp_batch *batch,
                            UNUSED const struct blorp_params *params)
{
   uint32_t offset;

   if (batch->blorp->config.use_cached_dynamic_states)
      offset = blorp_get_dynamic_state(batch, BLORP_DYNAMIC_STATE_COLOR_CALC);
   else
      blorp_emit_dynamic(batch, GENX(COLOR_CALC_STATE), cc, 64, &offset) {}

   blorp_emit(batch, GENX(3DSTATE_CC_STATE_POINTERS), sp) {
      sp.ColorCalcStatePointer = offset;
      sp.ColorCalcStatePointerValid = true;
   }
}

static void
blorp_emit_depth_stencil_state(struct blorp_batch *batch,
                               const struct blorp_params *params)
{
   blorp_emit(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
      if (params->depth.enabled) {
         ds.DepthBufferWriteEnable = true;

         switch (params->hiz_op) {
         /* See the following sections of the Sandy Bridge PRM, Volume 2,
          * Part 1:
          *   - 7.5.3.1 Depth Buffer Clear
          *   - 7.5.3.2 Depth Buffer Resolve
          *   - 7.5.3.3 Hierarchical Depth Buffer Resolve
          */
         case ISL_AUX_OP_FULL_RESOLVE:
            ds.DepthTestEnable = true;
            ds.DepthTestFunction = COMPAREFUNCTION_NEVER;
            break;

         case ISL_AUX_OP_NONE:
         case ISL_AUX_OP_FAST_CLEAR:
         case ISL_AUX_OP_AMBIGUATE:
            ds.DepthTestEnable = false;
            break;
         case ISL_AUX_OP_PARTIAL_RESOLVE:
            unreachable("Invalid HIZ op");
         }
      }

      if (params->stencil.enabled) {
         ds.StencilBufferWriteEnable = true;
         ds.StencilTestEnable = true;
         ds.DoubleSidedStencilEnable = false;

         ds.StencilTestFunction = COMPAREFUNCTION_ALWAYS;
         ds.StencilPassDepthPassOp = STENCILOP_REPLACE;

         ds.StencilWriteMask = params->stencil_mask;
         ds.StencilReferenceValue = params->stencil_ref;
      }
   }

#if GFX_VER >= 12
   blorp_emit(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) {
      db.DepthBoundsTestEnable = false;
      db.DepthBoundsTestMinValue = 0.0;
      db.DepthBoundsTestMaxValue = 1.0;
   }
#endif
}

static void
blorp_emit_3dstate_multisample(struct blorp_batch *batch,
                               const struct blorp_params *params)
{
   blorp_emit(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
      ms.NumberofMultisamples = __builtin_ffs(params->num_samples) - 1;
      ms.PixelLocation = CENTER;
   }
}

static void
blorp_emit_pipeline(struct blorp_batch *batch,
                    const struct blorp_params *params)
{
   enum intel_urb_deref_block_size urb_deref_block_size;
   emit_urb_config(batch, params, &urb_deref_block_size);

   if (params->wm_prog_data) {
      blorp_emit_blend_state(batch, params);
   }
   blorp_emit_color_calc_state(batch, params);
   blorp_emit_depth_stencil_state(batch, params);

   UNUSED uint32_t mocs = isl_mocs(batch->blorp->isl_dev, 0, false);

#if GFX_VER >= 12
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
      /* Update empty push constants for all stages (bitmask = 11111b) */
      pc.ShaderUpdateEnable = 0x1f;
      pc.MOCS = mocs;
   }
#else
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_VS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_HS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_DS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_GS), xs) { xs.MOCS = mocs; }
   blorp_emit(batch, GENX(3DSTATE_CONSTANT_PS), xs) { xs.MOCS = mocs; }
#endif

   if (params->src.enabled)
      blorp_emit_sampler_state_ps(batch);

   blorp_emit_3dstate_multisample(batch, params);

   blorp_emit(batch, GENX(3DSTATE_SAMPLE_MASK), mask) {
      mask.SampleMask = (1 << params->num_samples) - 1;
   }

   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
    *
    *    [DevSNB] A pipeline flush must be programmed prior to a
    *    3DSTATE_VS command that causes the VS Function Enable to
    *    toggle. Pipeline flush can be executed by sending a PIPE_CONTROL
    *    command with CS stall bit set and a post sync operation.
    *
    * We've already done one at the start of the BLORP operation.
    */
   blorp_emit_vs_config(batch, params);
   blorp_emit(batch, GENX(3DSTATE_HS), hs);
   blorp_emit(batch, GENX(3DSTATE_TE), te);
   blorp_emit(batch, GENX(3DSTATE_DS), DS);
   blorp_emit(batch, GENX(3DSTATE_STREAMOUT), so);
   blorp_emit(batch, GENX(3DSTATE_GS), gs);

   blorp_emit(batch, GENX(3DSTATE_CLIP), clip) {
      clip.PerspectiveDivideDisable = true;
   }

   blorp_emit_sf_config(batch, params, urb_deref_block_size);
   blorp_emit_ps_config(batch, params);

   blorp_emit_cc_viewport(batch);

#if GFX_VER >= 12
   /* Disable Primitive Replication. */
   blorp_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif

   if (batch->blorp->config.use_mesh_shading) {
#if GFX_VERx10 >= 125
      blorp_emit(batch, GENX(3DSTATE_MESH_CONTROL), zero);
      blorp_emit(batch, GENX(3DSTATE_TASK_CONTROL), zero);
#endif
   }
}

/******** This is the end of the pipeline setup code ********/

static void
blorp_emit_memcpy(struct blorp_batch *batch,
                  struct blorp_address dst,
                  struct blorp_address src,
                  uint32_t size)
{
   assert(size % 4 == 0);

   for (unsigned dw = 0; dw < size; dw += 4) {
      blorp_emit(batch, GENX(MI_COPY_MEM_MEM), cp) {
         cp.DestinationMemoryAddress = dst;
         cp.SourceMemoryAddress = src;
      }
      dst.offset += 4;
      src.offset += 4;
   }
}
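
/* For illustration: MI_COPY_MEM_MEM moves a single dword, so copying a
 * 16-byte indirect clear color (as done for dst_clear_color_as_input
 * above) emits four of these commands, with both addresses advanced by 4
 * bytes each iteration.
 */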

static void
blorp_emit_surface_state(struct blorp_batch *batch,
                         const struct blorp_surface_info *surface,
                         UNUSED enum isl_aux_op aux_op,
                         void *state, uint32_t state_offset,
                         uint8_t color_write_disable,
                         bool is_render_target)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   struct isl_surf surf = surface->surf;

   if (surf.dim == ISL_SURF_DIM_1D &&
       surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D) {
      assert(surf.logical_level0_px.height == 1);
      surf.dim = ISL_SURF_DIM_2D;
   }

   if (isl_aux_usage_has_hiz(surface->aux_usage)) {
      /* BLORP doesn't render with depth so we can't use HiZ */
      assert(!is_render_target);
      /* We can't reinterpret HiZ */
      assert(surface->surf.format == surface->view.format);
   }

   enum isl_aux_usage aux_usage = surface->aux_usage;

   /* On gfx12, implicit CCS has no aux buffer */
   bool use_aux_address = (aux_usage != ISL_AUX_USAGE_NONE) &&
                          (surface->aux_addr.buffer != NULL);

   const bool use_clear_address =
      GFX_VER >= 10 && (surface->clear_color_addr.buffer != NULL);

   /* On gfx12 (and optionally on gfx11), hardware will read and write to
    * the clear color address, converting the raw clear color channels to a
    * pixel during a fast-clear. To avoid the restrictions associated with
    * the hardware feature, we instead write a software-converted pixel
    * ourselves. If we're performing a fast-clear, provide a substitute
    * address to avoid a collision with hardware. Outside of gfx11 and
    * gfx12, indirect clear color BOs are not used during fast-clears.
    */
   const struct blorp_address op_clear_addr =
      aux_op == ISL_AUX_OP_FAST_CLEAR ? blorp_get_workaround_address(batch) :
                                        surface->clear_color_addr;

   isl_surf_fill_state(batch->blorp->isl_dev, state,
                       .surf = &surf, .view = &surface->view,
                       .aux_surf = &surface->aux_surf, .aux_usage = aux_usage,
                       .address =
                          blorp_get_surface_address(batch, surface->addr),
                       .aux_address = !use_aux_address ? 0 :
                          blorp_get_surface_address(batch, surface->aux_addr),
                       .clear_address = !use_clear_address ? 0 :
                          blorp_get_surface_address(batch, op_clear_addr),
                       .mocs = surface->addr.mocs,
                       .clear_color = surface->clear_color,
                       .use_clear_address = use_clear_address);

   blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset,
                       surface->addr, 0);

   if (use_aux_address) {
      /* On gfx7 and prior, the bottom 12 bits of the MCS base address are
       * used to store other information. This should be ok, however,
       * because surface buffer addresses are always 4K page aligned.
       */
      assert((surface->aux_addr.offset & 0xfff) == 0);
      uint32_t *aux_addr = state + isl_dev->ss.aux_addr_offset;
      blorp_surface_reloc(batch, state_offset + isl_dev->ss.aux_addr_offset,
                          surface->aux_addr, *aux_addr);
   }

   if (aux_usage != ISL_AUX_USAGE_NONE && surface->clear_color_addr.buffer) {
#if GFX_VER >= 10
      assert((surface->clear_color_addr.offset & 0x3f) == 0);
      uint32_t *clear_addr = state + isl_dev->ss.clear_color_state_offset;
      blorp_surface_reloc(batch, state_offset +
                          isl_dev->ss.clear_color_state_offset,
                          op_clear_addr, *clear_addr);
#else
      /* Fast clears just whack the AUX surface and don't actually use the
       * clear color for anything. We can avoid the MI memcpy in that case.
       */
      if (aux_op != ISL_AUX_OP_FAST_CLEAR) {
         struct blorp_address dst_addr = blorp_get_surface_base_address(batch);
         dst_addr.offset += state_offset + isl_dev->ss.clear_value_offset;
         blorp_emit_memcpy(batch, dst_addr, surface->clear_color_addr,
                           isl_dev->ss.clear_value_size);
      }
#endif
   }

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}

static void
blorp_emit_null_surface_state(struct blorp_batch *batch,
                              const struct blorp_surface_info *surface,
                              uint32_t *state)
{
   struct GENX(RENDER_SURFACE_STATE) ss = {
      .SurfaceType = SURFTYPE_NULL,
      .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM,
      .Width = surface->surf.logical_level0_px.width - 1,
      .Height = surface->surf.logical_level0_px.height - 1,
      .MIPCountLOD = surface->view.base_level,
      .MinimumArrayElement = surface->view.base_array_layer,
      .Depth = surface->view.array_len - 1,
      .RenderTargetViewExtent = surface->view.array_len - 1,
      .NumberofMultisamples = ffs(surface->surf.samples) - 1,
      .MOCS = isl_mocs(batch->blorp->isl_dev, 0, false),

      .SurfaceArray = surface->surf.dim != ISL_SURF_DIM_3D,

#if GFX_VERx10 >= 125
      .TileMode = TILE4,
#else
      .TileMode = YMAJOR,
#endif
   };

   GENX(RENDER_SURFACE_STATE_pack)(NULL, state, &ss);

   blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4);
}

static uint32_t
blorp_setup_binding_table(struct blorp_batch *batch,
                          const struct blorp_params *params)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   uint32_t surface_offsets[2], bind_offset = 0;
   void *surface_maps[2];

   if (params->use_pre_baked_binding_table) {
      bind_offset = params->pre_baked_binding_table_offset;
   } else {
      unsigned num_surfaces = 1 + params->src.enabled;
      if (!blorp_alloc_binding_table(batch, num_surfaces,
                                     isl_dev->ss.size, isl_dev->ss.align,
                                     &bind_offset, surface_offsets,
                                     surface_maps))
         return 0;

      if (params->dst.enabled) {
         blorp_emit_surface_state(batch, &params->dst,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_RENDERBUFFER_BT_INDEX],
                                  surface_offsets[BLORP_RENDERBUFFER_BT_INDEX],
                                  params->color_write_disable, true);
      } else {
         assert(params->depth.enabled || params->stencil.enabled);
         const struct blorp_surface_info *surface =
            params->depth.enabled ? &params->depth : &params->stencil;
         blorp_emit_null_surface_state(batch, surface,
                                       surface_maps[BLORP_RENDERBUFFER_BT_INDEX]);
      }

      if (params->src.enabled) {
         blorp_emit_surface_state(batch, &params->src,
                                  params->fast_clear_op,
                                  surface_maps[BLORP_TEXTURE_BT_INDEX],
                                  surface_offsets[BLORP_TEXTURE_BT_INDEX],
                                  0, false);
      }
   }

   return bind_offset;
}

static void
blorp_emit_btp(struct blorp_batch *batch, uint32_t bind_offset)
{
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_HS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_DS), bt);
   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_GS), bt);

   blorp_emit(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), bt) {
      bt.PointertoPSBindingTable =
         blorp_binding_table_offset_to_pointer(batch, bind_offset);
   }
}

static void
blorp_emit_depth_stencil_config(struct blorp_batch *batch,
                                const struct blorp_params *params)
{
   const struct isl_device *isl_dev = batch->blorp->isl_dev;
   const struct intel_device_info *devinfo =
      batch->blorp->compiler->brw->devinfo;

   uint32_t *dw = blorp_emit_dwords(batch, isl_dev->ds.size / 4);
   if (dw == NULL)
      return;

   struct isl_depth_stencil_hiz_emit_info info = { };

   if (params->depth.enabled) {
      info.view = &params->depth.view;
      info.mocs = params->depth.addr.mocs;
   } else if (params->stencil.enabled) {
      info.view = &params->stencil.view;
      info.mocs = params->stencil.addr.mocs;
   } else {
      info.mocs = isl_mocs(isl_dev, 0, false);
   }

   if (params->depth.enabled) {
      info.depth_surf = &params->depth.surf;

      info.depth_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.depth_offset / 4,
                          params->depth.addr, 0);

      info.hiz_usage = params->depth.aux_usage;
      if (isl_aux_usage_has_hiz(info.hiz_usage)) {
         info.hiz_surf = &params->depth.aux_surf;

         struct blorp_address hiz_address = params->depth.aux_addr;

         info.hiz_address =
            blorp_emit_reloc(batch, dw + isl_dev->ds.hiz_offset / 4,
                             hiz_address, 0);

         info.depth_clear_value = params->depth.clear_color.f32[0];
      }
   }

   if (params->stencil.enabled) {
      info.stencil_surf = &params->stencil.surf;

      info.stencil_aux_usage = params->stencil.aux_usage;
      struct blorp_address stencil_address = params->stencil.addr;

      info.stencil_address =
         blorp_emit_reloc(batch, dw + isl_dev->ds.stencil_offset / 4,
                          stencil_address, 0);
   }

   isl_emit_depth_stencil_hiz_s(isl_dev, dw, &info);

   if (intel_needs_workaround(devinfo, 1408224581) ||
       intel_needs_workaround(devinfo, 14014097488) ||
       intel_needs_workaround(devinfo, 14016712196)) {
      /* Wa_1408224581
       *
       *    Workaround: Gfx12LP Astep only An additional pipe control with
       *    post-sync = store dword operation would be required. (w/a is to
       *    have an additional pipe control after the stencil state whenever
       *    the surface state bits of this state is changing).
       *
       * This also seems sufficient to handle Wa_14014097488 and
       * Wa_14016712196.
       */
      blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
         pc.PostSyncOperation = WriteImmediateData;
         pc.Address = blorp_get_workaround_address(batch);
      }
   }
}

/* Emits the Optimized HiZ sequence specified in the BDW+ PRMs. The
 * depth/stencil buffer extents are ignored to handle APIs which perform
 * clearing operations without such information.
 */
static void
blorp_emit_gfx8_hiz_op(struct blorp_batch *batch,
                       const struct blorp_params *params)
{
   /* We should be performing an operation on a depth or stencil buffer. */
   assert(params->depth.enabled || params->stencil.enabled);

   blorp_measure_start(batch, params);

   /* The stencil buffer should only be enabled if a fast clear operation
    * is requested.
    */
   if (params->stencil.enabled)
      assert(params->hiz_op == ISL_AUX_OP_FAST_CLEAR);

   /* From the BDW PRM Volume 2, 3DSTATE_WM_HZ_OP:
    *
    *    3DSTATE_MULTISAMPLE packet must be used prior to this packet to
    *    change the Number of Multisamples. This packet must not be used
    *    to change Number of Multisamples in a rendering sequence.
    *
    * Since HIZ may be the first thing in a batch buffer, play safe and
    * always emit 3DSTATE_MULTISAMPLE.
    */
   blorp_emit_3dstate_multisample(batch, params);

   /* From the BDW PRM Volume 7, Depth Buffer Clear:
    *
    *    The clear value must be between the min and max depth values
    *    (inclusive) defined in the CC_VIEWPORT. If the depth buffer
    *    format is D32_FLOAT, then +/-DENORM values are also allowed.
    *
    * Set the bounds to match our hardware limits.
    */
   if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR)
      blorp_emit_cc_viewport(batch);

   /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
    * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread
    * dispatch even when WM_HZ_OP is active. However, WM thread dispatch is
    * normally disabled for HiZ ops and it appears that force-enabling it
    * can lead to GPU hangs on at least Skylake. Since we don't know the
    * current state of the 3DSTATE_WM packet, just emit a dummy one prior
    * to 3DSTATE_WM_HZ_OP.
    */
   blorp_emit(batch, GENX(3DSTATE_WM), wm);

   /* If we can't alter the depth stencil config and multiple layers are
    * involved, the HiZ op will fail. This is because the op requires that
    * a new config is emitted for each additional layer.
    */
   if (batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL) {
      assert(params->num_layers <= 1);
   } else {
      blorp_emit_depth_stencil_config(batch, params);
   }

   /* TODO - If we ever start using
    * 3DSTATE_WM_HZ_OP::StencilBufferResolveEnable, we need to implement the
    * required steps and flushes documented in Wa_1605967699.
    */
   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp) {
      switch (params->hiz_op) {
      case ISL_AUX_OP_FAST_CLEAR:
         hzp.StencilBufferClearEnable = params->stencil.enabled;
         hzp.DepthBufferClearEnable = params->depth.enabled;
         hzp.StencilClearValue = params->stencil_ref;
         hzp.FullSurfaceDepthandStencilClear = params->full_surface_hiz_op;
#if GFX_VER >= 20
         hzp.DepthClearValue = params->depth.clear_color.f32[0];

         /* From the Xe2 Bspec 56437 (r61349):
          *
          *    The Depth Clear value cannot be a NAN (Not-A-Number) if the
          *    depth format is Float32.
          *
          * We're not required to support NaN in APIs, so flush to zero.
          */
         if (util_is_nan(hzp.DepthClearValue))
            hzp.DepthClearValue = 0;
#endif
         break;
      case ISL_AUX_OP_FULL_RESOLVE:
         assert(params->full_surface_hiz_op);
         hzp.DepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_AMBIGUATE:
         assert(params->full_surface_hiz_op);
         hzp.HierarchicalDepthBufferResolveEnable = true;
         break;
      case ISL_AUX_OP_PARTIAL_RESOLVE:
      case ISL_AUX_OP_NONE:
         unreachable("Invalid HIZ op");
      }

      hzp.NumberofMultisamples = ffs(params->num_samples) - 1;
      hzp.SampleMask = 0xFFFF;

      /* Due to a hardware issue, this bit MBZ */
      assert(hzp.ScissorRectangleEnable == false);

      /* Contrary to the HW docs both fields are inclusive */
      hzp.ClearRectangleXMin = params->x0;
      hzp.ClearRectangleYMin = params->y0;

      /* Contrary to the HW docs both fields are exclusive */
      hzp.ClearRectangleXMax = params->x1;
      hzp.ClearRectangleYMax = params->y1;
   }

   /* PIPE_CONTROL w/ all bits clear except for “Post-Sync Operation” must
    * be set to “Write Immediate Data” enabled.
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = blorp_get_workaround_address(batch);
   }

   blorp_emit(batch, GENX(3DSTATE_WM_HZ_OP), hzp);

   blorp_measure_end(batch, params);
}

static bool
blorp_uses_bti_rt_writes(const struct blorp_batch *batch,
                         const struct blorp_params *params)
{
   if (batch->flags & (BLORP_BATCH_USE_BLITTER | BLORP_BATCH_USE_COMPUTE))
      return false;

   /* HIZ clears use WM_HZ ops rather than a clear shader using RT writes. */
   return params->hiz_op == ISL_AUX_OP_NONE;
}

static void
blorp_exec_3d(struct blorp_batch *batch, const struct blorp_params *params)
{
   if (params->hiz_op != ISL_AUX_OP_NONE) {
      blorp_emit_gfx8_hiz_op(batch, params);
      return;
   }

   blorp_emit_vertex_buffers(batch, params);
   blorp_emit_vertex_elements(batch, params);

   blorp_emit_pipeline(batch, params);

   blorp_emit_btp(batch, blorp_setup_binding_table(batch, params));

   if (!(batch->flags & BLORP_BATCH_NO_EMIT_DEPTH_STENCIL))
      blorp_emit_depth_stencil_config(batch, params);

   const UNUSED bool use_tbimr = false;
   blorp_emit_pre_draw(batch, params);
   blorp_emit(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = SEQUENTIAL;
      prim.PrimitiveTopologyType = _3DPRIM_RECTLIST;
      prim.PredicateEnable = batch->flags & BLORP_BATCH_PREDICATE_ENABLE;
#if GFX_VERx10 >= 125
      prim.TBIMREnable = use_tbimr;
#endif
      prim.VertexCountPerInstance = 3;
      prim.InstanceCount = params->num_layers;
   }
   blorp_emit_post_draw(batch, params);
}

static void
blorp_get_compute_push_const(struct blorp_batch *batch,
                             const struct blorp_params *params,
                             uint32_t threads,
                             uint32_t *state_offset,
                             unsigned *state_size)
{
   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
   const unsigned push_const_size =
      ALIGN(brw_cs_push_const_total_size(cs_prog_data, threads), 64);
   assert(cs_prog_data->push.cross_thread.size +
          cs_prog_data->push.per_thread.size == sizeof(params->wm_inputs));

   if (push_const_size == 0) {
      *state_offset = 0;
      *state_size = 0;
      return;
   }

   uint32_t push_const_offset;
   uint32_t *push_const =
      GFX_VERx10 >= 125 ?
      blorp_alloc_general_state(batch, push_const_size, 64,
                                &push_const_offset) :
      blorp_alloc_dynamic_state(batch, push_const_size, 64,
                                &push_const_offset);
   if (push_const == NULL) {
      *state_offset = 0;
      *state_size = 0;
      return;
   }
   memset(push_const, 0x0, push_const_size);

   void *dst = push_const;
   const void *src = (char *)&params->wm_inputs;

   if (cs_prog_data->push.cross_thread.size > 0) {
      memcpy(dst, src, cs_prog_data->push.cross_thread.size);
      dst += cs_prog_data->push.cross_thread.size;
      src += cs_prog_data->push.cross_thread.size;
   }

   assert(GFX_VERx10 < 125 || cs_prog_data->push.per_thread.size == 0);
#if GFX_VERx10 < 125
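   /* Each hardware thread gets its own copy of the per-thread data.  All but
    * the last dword of each copy come from wm_inputs; the final dword is
    * overwritten with that thread's subgroup ID.
    */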
   if (cs_prog_data->push.per_thread.size > 0) {
      for (unsigned t = 0; t < threads; t++) {
         memcpy(dst, src, (cs_prog_data->push.per_thread.dwords - 1) * 4);

         uint32_t *subgroup_id = dst + cs_prog_data->push.per_thread.size - 4;
         *subgroup_id = t;

         dst += cs_prog_data->push.per_thread.size;
      }
   }
#endif

   *state_offset = push_const_offset;
   *state_size = push_const_size;
}

static void
blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params)
{
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   blorp_measure_start(batch, params);

   const struct intel_device_info *devinfo = batch->blorp->compiler->brw->devinfo;
   const struct brw_cs_prog_data *cs_prog_data = params->cs_prog_data;
   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);

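   /* Map the destination rectangle onto workgroups: group_[xy]0 is the first
    * workgroup containing x0/y0 (rounded down) and group_[xy]1 is one past
    * the last workgroup containing x1-1/y1-1 (rounded up).  The dispatch may
    * therefore cover pixels outside [x0, x1) x [y0, y1), which the shader is
    * expected to discard.
    */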
   uint32_t group_x0 = params->x0 / cs_prog_data->local_size[0];
   uint32_t group_y0 = params->y0 / cs_prog_data->local_size[1];
   uint32_t group_z0 = params->dst.z_offset;
   uint32_t group_x1 = DIV_ROUND_UP(params->x1, cs_prog_data->local_size[0]);
   uint32_t group_y1 = DIV_ROUND_UP(params->y1, cs_prog_data->local_size[1]);
   assert(params->num_layers >= 1);
   uint32_t group_z1 = params->dst.z_offset + params->num_layers;
   assert(cs_prog_data->local_size[2] == 1);

#if GFX_VERx10 >= 125
   assert(cs_prog_data->push.per_thread.regs == 0);
   blorp_emit(batch, GENX(COMPUTE_WALKER), cw) {
      cw.SIMDSize = dispatch.simd_size / 16;
      cw.MessageSIMD = dispatch.simd_size / 16;
      cw.LocalXMaximum = cs_prog_data->local_size[0] - 1;
      cw.LocalYMaximum = cs_prog_data->local_size[1] - 1;
      cw.LocalZMaximum = cs_prog_data->local_size[2] - 1;
      cw.ThreadGroupIDStartingX = group_x0;
      cw.ThreadGroupIDStartingY = group_y0;
      cw.ThreadGroupIDStartingZ = group_z0;
      cw.ThreadGroupIDXDimension = group_x1;
      cw.ThreadGroupIDYDimension = group_y1;
      cw.ThreadGroupIDZDimension = group_z1;
      cw.ExecutionMask = 0xffffffff;
      cw.PostSync.MOCS = isl_mocs(batch->blorp->isl_dev, 0, false);

      uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);

      uint32_t samplers_offset =
         params->src.enabled ? blorp_emit_sampler_state(batch) : 0;

      uint32_t push_const_offset;
      unsigned push_const_size;
      blorp_get_compute_push_const(batch, params, dispatch.threads,
                                   &push_const_offset, &push_const_size);
      cw.IndirectDataStartAddress = push_const_offset;
      cw.IndirectDataLength = push_const_size;

#if GFX_VERx10 >= 125
      cw.GenerateLocalID = cs_prog_data->generate_local_id != 0;
      cw.EmitLocal = cs_prog_data->generate_local_id;
      cw.WalkOrder = cs_prog_data->walk_order;
      cw.TileLayout = cs_prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
                      TileY32bpe : Linear;
#endif

      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = params->cs_prog_kernel,
         .SamplerStatePointer = samplers_offset,
         .SamplerCount = params->src.enabled ? 1 : 0,
         .BindingTableEntryCount = params->src.enabled ? 2 : 1,
         .BindingTablePointer = surfaces_offset,
         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
         .SharedLocalMemorySize =
            intel_compute_slm_encode_size(GFX_VER, prog_data->total_shared),
         .PreferredSLMAllocationSize =
            intel_compute_preferred_slm_calc_encode_size(devinfo,
                                                         prog_data->total_shared,
                                                         dispatch.group_size,
                                                         dispatch.simd_size),
         .NumberOfBarriers = cs_prog_data->uses_barrier,
      };
   }

#else

   /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
    *
    *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
    *     the only bits that are changed are scoreboard related: Scoreboard
    *     Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
    *     these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
    *
    * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
    * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
    */
   blorp_emit(batch, GENX(PIPE_CONTROL), pc) {
      pc.CommandStreamerStallEnable = true;
      pc.StallAtPixelScoreboard = true;
   }

   blorp_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
      assert(prog_data->total_scratch == 0);
      vfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total - 1;
      vfe.NumberofURBEntries = 2;
#if GFX_VER < 11
      vfe.ResetGatewayTimer =
         Resettingrelativetimerandlatchingtheglobaltimestamp;
#endif
      vfe.URBEntryAllocationSize = 2;

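      /* CURBE allocation is counted in 256-bit (one GRF) units, the same
       * units as push.per_thread.regs and push.cross_thread.regs, rounded
       * up to an even number of registers.
       */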
      const uint32_t vfe_curbe_allocation =
         ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
               cs_prog_data->push.cross_thread.regs, 2);
      vfe.CURBEAllocationSize = vfe_curbe_allocation;
   }

   uint32_t push_const_offset;
   unsigned push_const_size;
   blorp_get_compute_push_const(batch, params, dispatch.threads,
                                &push_const_offset, &push_const_size);

   blorp_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
      curbe.CURBETotalDataLength = push_const_size;
      curbe.CURBEDataStartAddress = push_const_offset;
   }

   uint32_t surfaces_offset = blorp_setup_binding_table(batch, params);

   uint32_t samplers_offset =
      params->src.enabled ? blorp_emit_sampler_state(batch) : 0;

   struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
      .KernelStartPointer = params->cs_prog_kernel,
      .SamplerStatePointer = samplers_offset,
      .SamplerCount = params->src.enabled ? 1 : 0,
      .BindingTableEntryCount = params->src.enabled ? 2 : 1,
      .BindingTablePointer = surfaces_offset,
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER,
                                                             prog_data->total_shared),
      .BarrierEnable = cs_prog_data->uses_barrier,
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
   };

   uint32_t idd_offset;
   uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
   void *state = blorp_alloc_dynamic_state(batch, size, 64, &idd_offset);
   if (state == NULL)
      return;
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, state, &idd);

   blorp_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
      mid.InterfaceDescriptorTotalLength = size;
      mid.InterfaceDescriptorDataStartAddress = idd_offset;
   }

   blorp_emit(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.SIMDSize = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
      ggw.ThreadGroupIDStartingX = group_x0;
      ggw.ThreadGroupIDStartingY = group_y0;
      ggw.ThreadGroupIDStartingResumeZ = group_z0;
      ggw.ThreadGroupIDXDimension = group_x1;
      ggw.ThreadGroupIDYDimension = group_y1;
      ggw.ThreadGroupIDZDimension = group_z1;
      ggw.RightExecutionMask = dispatch.right_mask;
      ggw.BottomExecutionMask = 0xffffffff;
   }

#endif

   blorp_measure_end(batch, params);
}

/* -----------------------------------------------------------------------
 * -- BLORP on blitter
 * -----------------------------------------------------------------------
 */

#include "isl/isl_genX_helpers.h"

#if GFX_VER >= 12
static uint32_t
xy_bcb_tiling(const struct isl_surf *surf)
{
   switch (surf->tiling) {
   case ISL_TILING_LINEAR:
      return XY_TILE_LINEAR;
#if GFX_VERx10 >= 125
   case ISL_TILING_X:
      return XY_TILE_X;
   case ISL_TILING_4:
      return XY_TILE_4;
   case ISL_TILING_64:
   case ISL_TILING_64_XE2:
      return XY_TILE_64;
#else
   case ISL_TILING_Y0:
      return XY_TILE_Y;
#endif
   default:
      unreachable("Invalid tiling for XY_BLOCK_COPY_BLT");
   }
}

static uint32_t
xy_color_depth(const struct isl_format_layout *fmtl)
{
   switch (fmtl->bpb) {
   case 128: return XY_BPP_128_BIT;
   case  96: return XY_BPP_96_BIT;
   case  64: return XY_BPP_64_BIT;
   case  32: return XY_BPP_32_BIT;
   case  16: return XY_BPP_16_BIT;
   case   8: return XY_BPP_8_BIT;
   default:
      unreachable("Invalid bpp");
   }
}
#endif

#if GFX_VERx10 >= 125
static uint32_t
xy_bcb_surf_dim(const struct isl_surf *surf)
{
   switch (surf->dim) {
   case ISL_SURF_DIM_1D:
      return XY_SURFTYPE_1D;
   case ISL_SURF_DIM_2D:
      return XY_SURFTYPE_2D;
   case ISL_SURF_DIM_3D:
      return XY_SURFTYPE_3D;
   default:
      unreachable("Invalid dimensionality for XY_BLOCK_COPY_BLT");
   }
}

static uint32_t
xy_bcb_surf_depth(const struct isl_surf *surf)
{
   return surf->dim == ISL_SURF_DIM_3D ? surf->logical_level0_px.depth
                                       : surf->logical_level0_px.array_len;
}

#if GFX_VER < 20
static uint32_t
xy_aux_mode(const struct blorp_surface_info *info)
{
   switch (info->aux_usage) {
   case ISL_AUX_USAGE_CCS_E:
   case ISL_AUX_USAGE_FCV_CCS_E:
   case ISL_AUX_USAGE_STC_CCS:
      return XY_CCS_E;
   case ISL_AUX_USAGE_NONE:
      return XY_NONE;
   default:
      unreachable("Unsupported aux mode");
   }
}
#endif // GFX_VER < 20
#endif // GFX_VERx10 >= 125

UNUSED static void
blorp_xy_block_copy_blt(struct blorp_batch *batch,
                        const struct blorp_params *params)
{
#if GFX_VER < 12
   unreachable("Blitter is only supported on Gfx12+");
#else
   UNUSED const struct isl_device *isl_dev = batch->blorp->isl_dev;

   assert(batch->flags & BLORP_BATCH_USE_BLITTER);
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   assert(params->num_layers == 1);
   assert(params->dst.view.levels == 1);
   assert(params->src.view.levels == 1);

#if GFX_VERx10 < 125
   assert(params->dst.view.base_array_layer == 0);
   assert(params->dst.z_offset == 0);
#endif

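   /* The blitter cannot scale, so blorp's coord_transform here is a pure
    * integer translation; the source rectangle is recovered by undoing the
    * destination offsets, and the asserts below check the extents match.
    */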
   unsigned dst_x0 = params->x0;
   unsigned dst_x1 = params->x1;
   unsigned src_x0 =
      dst_x0 - params->wm_inputs.coord_transform[0].offset;
   ASSERTED unsigned src_x1 =
      dst_x1 - params->wm_inputs.coord_transform[0].offset;
   unsigned dst_y0 = params->y0;
   unsigned dst_y1 = params->y1;
   unsigned src_y0 =
      dst_y0 - params->wm_inputs.coord_transform[1].offset;
   ASSERTED unsigned src_y1 =
      dst_y1 - params->wm_inputs.coord_transform[1].offset;

   assert(src_x1 - src_x0 == dst_x1 - dst_x0);
   assert(src_y1 - src_y0 == dst_y1 - dst_y0);

   const struct isl_surf *src_surf = &params->src.surf;
   const struct isl_surf *dst_surf = &params->dst.surf;

   const struct isl_format_layout *fmtl =
      isl_format_get_layout(params->dst.view.format);

   if (fmtl->bpb == 96) {
      assert(src_surf->tiling == ISL_TILING_LINEAR &&
             dst_surf->tiling == ISL_TILING_LINEAR);
   }

   assert(src_surf->samples == 1);
   assert(dst_surf->samples == 1);

   unsigned dst_pitch_unit = dst_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;
   unsigned src_pitch_unit = src_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;
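   /* The blitter pitch fields are programmed in bytes for linear surfaces
    * and in 4-byte units for tiled ones, hence the per-surface pitch units
    * above.
    */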

#if GFX_VERx10 >= 125
   struct isl_extent3d src_align = isl_get_image_alignment(src_surf);
   struct isl_extent3d dst_align = isl_get_image_alignment(dst_surf);
#endif

   blorp_emit(batch, GENX(XY_BLOCK_COPY_BLT), blt) {
      blt.ColorDepth = xy_color_depth(fmtl);

      blt.DestinationPitch = (dst_surf->row_pitch_B / dst_pitch_unit) - 1;
      blt.DestinationMOCS = params->dst.addr.mocs;
      blt.DestinationTiling = xy_bcb_tiling(dst_surf);
      blt.DestinationX1 = dst_x0;
      blt.DestinationY1 = dst_y0;
      blt.DestinationX2 = dst_x1;
      blt.DestinationY2 = dst_y1;
      blt.DestinationBaseAddress = params->dst.addr;
      blt.DestinationXOffset = params->dst.tile_x_sa;
      blt.DestinationYOffset = params->dst.tile_y_sa;

#if GFX_VERx10 >= 125
      blt.DestinationSurfaceType = xy_bcb_surf_dim(dst_surf);
      blt.DestinationSurfaceWidth = dst_surf->logical_level0_px.w - 1;
      blt.DestinationSurfaceHeight = dst_surf->logical_level0_px.h - 1;
      blt.DestinationSurfaceDepth = xy_bcb_surf_depth(dst_surf) - 1;
      blt.DestinationArrayIndex =
         params->dst.view.base_array_layer + params->dst.z_offset;
      blt.DestinationSurfaceQPitch = isl_get_qpitch(dst_surf) >> 2;
      blt.DestinationLOD = params->dst.view.base_level;
      blt.DestinationMipTailStartLOD = dst_surf->miptail_start_level;
      blt.DestinationHorizontalAlign = isl_encode_halign(dst_align.width);
      blt.DestinationVerticalAlign = isl_encode_valign(dst_align.height);
#if GFX_VER < 20
      /* XY_BLOCK_COPY_BLT only supports AUX_CCS. */
      blt.DestinationDepthStencilResource =
         params->dst.aux_usage == ISL_AUX_USAGE_STC_CCS;
#endif
      blt.DestinationTargetMemory =
         params->dst.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->dst.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VER < 20
         blt.DestinationAuxiliarySurfaceMode = xy_aux_mode(&params->dst);
         blt.DestinationCompressionEnable = true;
#endif
         blt.DestinationCompressionFormat =
            isl_get_render_compression_format(dst_surf->format);
         blt.DestinationClearValueEnable = !!params->dst.clear_color_addr.buffer;
         blt.DestinationClearAddress = params->dst.clear_color_addr;
      }
#endif

      blt.SourceX1 = src_x0;
      blt.SourceY1 = src_y0;
      blt.SourcePitch = (src_surf->row_pitch_B / src_pitch_unit) - 1;
      blt.SourceMOCS = params->src.addr.mocs;
      blt.SourceTiling = xy_bcb_tiling(src_surf);
      blt.SourceBaseAddress = params->src.addr;
      blt.SourceXOffset = params->src.tile_x_sa;
      blt.SourceYOffset = params->src.tile_y_sa;

#if GFX_VERx10 >= 125
      blt.SourceSurfaceType = xy_bcb_surf_dim(src_surf);
      blt.SourceSurfaceWidth = src_surf->logical_level0_px.w - 1;
      blt.SourceSurfaceHeight = src_surf->logical_level0_px.h - 1;
      blt.SourceSurfaceDepth = xy_bcb_surf_depth(src_surf) - 1;
      blt.SourceArrayIndex =
         params->src.view.base_array_layer + params->src.z_offset;
      blt.SourceSurfaceQPitch = isl_get_qpitch(src_surf) >> 2;
      blt.SourceLOD = params->src.view.base_level;
      blt.SourceMipTailStartLOD = src_surf->miptail_start_level;
      blt.SourceHorizontalAlign = isl_encode_halign(src_align.width);
      blt.SourceVerticalAlign = isl_encode_valign(src_align.height);
#if GFX_VER < 20
      /* XY_BLOCK_COPY_BLT only supports AUX_CCS. */
      blt.SourceDepthStencilResource =
         params->src.aux_usage == ISL_AUX_USAGE_STC_CCS;
#endif
      blt.SourceTargetMemory =
         params->src.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->src.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VER < 20
         blt.SourceAuxiliarySurfaceMode = xy_aux_mode(&params->src);
         blt.SourceCompressionEnable = true;
#endif
         blt.SourceCompressionFormat =
            isl_get_render_compression_format(src_surf->format);
         blt.SourceClearValueEnable = !!params->src.clear_color_addr.buffer;
         blt.SourceClearAddress = params->src.clear_color_addr;
      }
#endif
   }
#endif
}

UNUSED static void
blorp_xy_fast_color_blit(struct blorp_batch *batch,
                         const struct blorp_params *params)
{
#if GFX_VER < 12
   unreachable("Blitter is only supported on Gfx12+");
#else
   UNUSED const struct isl_device *isl_dev = batch->blorp->isl_dev;
   const struct isl_surf *dst_surf = &params->dst.surf;
   const struct isl_format_layout *fmtl =
      isl_format_get_layout(params->dst.view.format);

   assert(batch->flags & BLORP_BATCH_USE_BLITTER);
   assert(!(batch->flags & BLORP_BATCH_PREDICATE_ENABLE));
   assert(params->hiz_op == ISL_AUX_OP_NONE);

   assert(params->num_layers == 1);
   assert(params->dst.view.levels == 1);
   assert(dst_surf->samples == 1);
   assert(fmtl->bpb != 96 || dst_surf->tiling == ISL_TILING_LINEAR);

#if GFX_VERx10 < 125
   assert(params->dst.view.base_array_layer == 0);
   assert(params->dst.z_offset == 0);
#endif

   unsigned dst_pitch_unit = dst_surf->tiling == ISL_TILING_LINEAR ? 1 : 4;

#if GFX_VERx10 >= 125
   struct isl_extent3d dst_align = isl_get_image_alignment(dst_surf);
#endif

#if INTEL_NEEDS_WA_16021021469
   assert(fmtl->bpb != 96);
#endif

   blorp_emit(batch, GENX(XY_FAST_COLOR_BLT), blt) {
      blt.ColorDepth = xy_color_depth(fmtl);

      blt.DestinationPitch = (dst_surf->row_pitch_B / dst_pitch_unit) - 1;
      blt.DestinationTiling = xy_bcb_tiling(dst_surf);
      blt.DestinationX1 = params->x0;
      blt.DestinationY1 = params->y0;
      blt.DestinationX2 = params->x1;
      blt.DestinationY2 = params->y1;
      blt.DestinationBaseAddress = params->dst.addr;
      blt.DestinationXOffset = params->dst.tile_x_sa;
      blt.DestinationYOffset = params->dst.tile_y_sa;

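      /* Pack the clear color into the destination format; the packed value
       * is what the blitter programs as the fill color.
       */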
      isl_color_value_pack((union isl_color_value *)
                           params->wm_inputs.clear_color,
                           params->dst.view.format, blt.FillColor);

#if GFX_VERx10 >= 125
      blt.DestinationSurfaceType = xy_bcb_surf_dim(dst_surf);
      blt.DestinationSurfaceWidth = dst_surf->logical_level0_px.w - 1;
      blt.DestinationSurfaceHeight = dst_surf->logical_level0_px.h - 1;
      blt.DestinationSurfaceDepth = xy_bcb_surf_depth(dst_surf) - 1;
      blt.DestinationArrayIndex =
         params->dst.view.base_array_layer + params->dst.z_offset;
      blt.DestinationSurfaceQPitch = isl_get_qpitch(dst_surf) >> 2;
      blt.DestinationLOD = params->dst.view.base_level;
      blt.DestinationMipTailStartLOD = dst_surf->miptail_start_level;
      blt.DestinationHorizontalAlign = isl_encode_halign(dst_align.width);
      blt.DestinationVerticalAlign = isl_encode_valign(dst_align.height);
      /* XY_FAST_COLOR_BLT only supports AUX_CCS. */
      blt.DestinationDepthStencilResource =
         params->dst.aux_usage == ISL_AUX_USAGE_STC_CCS;
      blt.DestinationTargetMemory =
         params->dst.addr.local_hint ? XY_MEM_LOCAL : XY_MEM_SYSTEM;

      if (params->dst.aux_usage != ISL_AUX_USAGE_NONE) {
#if GFX_VERx10 == 125
         blt.DestinationAuxiliarySurfaceMode = xy_aux_mode(&params->dst);
         blt.DestinationCompressionEnable = true;
         blt.DestinationClearValueEnable = !!params->dst.clear_color_addr.buffer;
         blt.DestinationClearAddress = params->dst.clear_color_addr;
#endif
         blt.DestinationCompressionFormat =
            isl_get_render_compression_format(dst_surf->format);
      }

      blt.DestinationMOCS = params->dst.addr.mocs;
#endif
   }
#endif
}

static void
blorp_exec_blitter(struct blorp_batch *batch,
                   const struct blorp_params *params)
{
   blorp_measure_start(batch, params);

   if (params->src.enabled)
      blorp_xy_block_copy_blt(batch, params);
   else
      blorp_xy_fast_color_blit(batch, params);

   blorp_measure_end(batch, params);
}

/**
 * \brief Execute a blit or render pass operation.
 *
 * To execute the operation, this function manually constructs and emits a
 * batch to draw a rectangle primitive. The batchbuffer is flushed before
 * constructing and after emitting the batch.
 *
 * This function alters no GL state.
 */
static void
blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
{
   if (batch->flags & BLORP_BATCH_USE_BLITTER) {
      blorp_exec_blitter(batch, params);
   } else if (batch->flags & BLORP_BATCH_USE_COMPUTE) {
      blorp_exec_compute(batch, params);
   } else {
      blorp_exec_3d(batch, params);
   }
}

static void
blorp_init_dynamic_states(struct blorp_context *context)
{
   {
      struct GENX(BLEND_STATE) blend = { };

      uint32_t dws[GENX(BLEND_STATE_length) * 4 +
                   GENX(BLEND_STATE_ENTRY_length) * 4 * 8 /* MAX_RTS */];
      uint32_t *pos = dws;

      GENX(BLEND_STATE_pack)(NULL, pos, &blend);
      pos += GENX(BLEND_STATE_length);

      for (unsigned i = 0; i < 8; ++i) {
         struct GENX(BLEND_STATE_ENTRY) entry = {
            .PreBlendColorClampEnable = true,
            .PostBlendColorClampEnable = true,
            .ColorClampRange = COLORCLAMP_RTFORMAT,
         };
         GENX(BLEND_STATE_ENTRY_pack)(NULL, pos, &entry);
         pos += GENX(BLEND_STATE_ENTRY_length);
      }

      context->upload_dynamic_state(context, dws, sizeof(dws), 64,
                                    BLORP_DYNAMIC_STATE_BLEND);
   }

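   /* When the driver asks for an unrestricted depth range (e.g. for
    * VK_EXT_depth_range_unrestricted), the viewport must not clamp depth
    * values, so open the range up to the full float range.
    */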
   blorp_context_upload_dynamic(context, GENX(CC_VIEWPORT), vp, 32,
                                BLORP_DYNAMIC_STATE_CC_VIEWPORT) {
      vp.MinimumDepth = context->config.use_unrestricted_depth_range ?
                        -FLT_MAX : 0.0;
      vp.MaximumDepth = context->config.use_unrestricted_depth_range ?
                        FLT_MAX : 1.0;
   }

   blorp_context_upload_dynamic(context, GENX(COLOR_CALC_STATE), cc, 64,
                                BLORP_DYNAMIC_STATE_COLOR_CALC) {
      /* Nothing */
   }

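   /* Blorp samples with unnormalized texel coordinates and CLAMP addressing,
    * so a single static sampler with LINEAR min/mag filtering covers every
    * blit.
    */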
   blorp_context_upload_dynamic(context, GENX(SAMPLER_STATE), sampler, 32,
                                BLORP_DYNAMIC_STATE_SAMPLER) {
      sampler.MipModeFilter = MIPFILTER_NONE;
      sampler.MagModeFilter = MAPFILTER_LINEAR;
      sampler.MinModeFilter = MAPFILTER_LINEAR;
      sampler.MinLOD = 0;
      sampler.MaxLOD = 0;
      sampler.TCXAddressControlMode = TCM_CLAMP;
      sampler.TCYAddressControlMode = TCM_CLAMP;
      sampler.TCZAddressControlMode = TCM_CLAMP;
      sampler.MaximumAnisotropy = RATIO21;
      sampler.RAddressMinFilterRoundingEnable = true;
      sampler.RAddressMagFilterRoundingEnable = true;
      sampler.VAddressMinFilterRoundingEnable = true;
      sampler.VAddressMagFilterRoundingEnable = true;
      sampler.UAddressMinFilterRoundingEnable = true;
      sampler.UAddressMagFilterRoundingEnable = true;
      sampler.NonnormalizedCoordinateEnable = true;
   }
}

#endif /* BLORP_GENX_EXEC_BRW_H */