1 /*
2 * Copyright 2010 Jerome Glisse <[email protected]>
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "nir_serialize.h"
7 #include "pipe/p_defines.h"
8 #include "r600_asm.h"
9 #include "r600_isa.h"
10 #include "r600_sq.h"
11 #include "r600_formats.h"
12 #include "r600_opcodes.h"
13 #include "r600_sfn.h"
14 #include "r600_shader.h"
15 #include "r600_dump.h"
16 #include "r600d.h"
17 #include "sfn/sfn_nir.h"
18
19 #include "pipe/p_shader_tokens.h"
20 #include "tgsi/tgsi_parse.h"
21 #include "tgsi/tgsi_scan.h"
22 #include "tgsi/tgsi_dump.h"
23 #include "tgsi/tgsi_from_mesa.h"
24 #include "nir/tgsi_to_nir.h"
25 #include "nir/nir_to_tgsi_info.h"
26 #include "compiler/nir/nir.h"
27 #include "util/macros.h"
28 #include "util/u_bitcast.h"
29 #include "util/u_dump.h"
30 #include "util/u_endian.h"
31 #include "util/u_memory.h"
32 #include "util/u_math.h"
33 #include <assert.h>
34 #include <stdio.h>
35 #include <errno.h>
36
37 /* CAYMAN notes
 38 Why CAYMAN needs per-slot loops for many instructions is explained here.
39
40 -These 8xx t-slot only ops are implemented in all vector slots.
41 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
42 These 8xx t-slot only opcodes become vector ops, with all four
43 slots expecting the arguments on sources a and b. Result is
44 broadcast to all channels.
45 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
46 These 8xx t-slot only opcodes become vector ops in the z, y, and
47 x slots.
48 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
49 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
50 SQRT_IEEE/_64
51 SIN/COS
52 The w slot may have an independent co-issued operation, or if the
53 result is required to be in the w slot, the opcode above may be
54 issued in the w slot as well.
 55     The compiler must issue the source argument to slots z, y, and x.
56 */
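/* A hedged sketch (not driver code) of how such a formerly t-slot-only op is
 * emitted on CAYMAN, mirroring the instance-divisor MULHI_UINT loop in
 * r600_create_vertex_fetch_shader() below; src0, src1, dst_gpr, dst_chan and
 * bc are placeholders for values chosen by the compiler:
 *
 *   for (int chan = 0; chan < 4; chan++) {
 *       struct r600_bytecode_alu alu;
 *       memset(&alu, 0, sizeof(alu));
 *       alu.op = ALU_OP2_MULHI_UINT;
 *       alu.src[0] = src0;                   // identical sources in every slot
 *       alu.src[1] = src1;
 *       alu.dst.sel = dst_gpr;
 *       alu.dst.chan = chan;
 *       alu.dst.write = (chan == dst_chan);  // only one slot keeps the result
 *       alu.last = (chan == 3);
 *       r600_bytecode_add_alu(bc, &alu);
 *   }
 */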
57
58 /* Contents of r0 on entry to various shaders
59
60 VS - .x = VertexID
61 .y = RelVertexID (??)
62 .w = InstanceID
63
64 GS - r0.xyw, r1.xyz = per-vertex offsets
65 r0.z = PrimitiveID
66
67 TCS - .x = PatchID
68 .y = RelPatchID (??)
69 .z = InvocationID
70 .w = tess factor base.
71
72 TES - .x = TessCoord.x
73 - .y = TessCoord.y
74 - .z = RelPatchID (??)
75 - .w = PrimitiveID
76
77 PS - face_gpr.z = SampleMask
78 face_gpr.w = SampleID
79 */
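/* Illustrative sketch only: a stage consumes these system values by sourcing
 * the relevant channel of its input GPR, e.g. a vertex shader copying
 * InstanceID out of r0.w (dst_gpr, dst_chan and bc are placeholders):
 *
 *   struct r600_bytecode_alu alu;
 *   memset(&alu, 0, sizeof(alu));
 *   alu.op = ALU_OP1_MOV;
 *   alu.src[0].sel = 0;      // r0
 *   alu.src[0].chan = 3;     // .w = InstanceID
 *   alu.dst.sel = dst_gpr;
 *   alu.dst.chan = dst_chan;
 *   alu.dst.write = 1;
 *   alu.last = 1;
 *   r600_bytecode_add_alu(bc, &alu);
 */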
80
81 static void r600_dump_streamout(struct pipe_stream_output_info *so)
82 {
83 unsigned i;
84
85 fprintf(stderr, "STREAMOUT\n");
86 for (i = 0; i < so->num_outputs; i++) {
87 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
88 so->output[i].start_component;
89 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
90 i,
91 so->output[i].stream,
92 so->output[i].output_buffer,
93 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
94 so->output[i].register_index,
95 mask & 1 ? "x" : "",
96 mask & 2 ? "y" : "",
97 mask & 4 ? "z" : "",
98 mask & 8 ? "w" : "",
99 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
100 }
101 }
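
/* Example of the resulting dump for a single vec4 captured to buffer 0 of
 * stream 0 (values are illustrative only):
 *
 *   STREAMOUT
 *    0: MEM_STREAM0_BUF0[0..3] <- OUT[2].xyzw
 */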
102
103 static int store_shader(struct pipe_context *ctx,
104 struct r600_pipe_shader *shader)
105 {
106 struct r600_context *rctx = (struct r600_context *)ctx;
107 uint32_t *ptr, i;
108
109 if (shader->bo == NULL) {
110 shader->bo = (struct r600_resource*)
111 pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
112 if (shader->bo == NULL) {
113 return -ENOMEM;
114 }
115 ptr = r600_buffer_map_sync_with_rings(
116 &rctx->b, shader->bo,
117 PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
118 if (UTIL_ARCH_BIG_ENDIAN) {
119 for (i = 0; i < shader->shader.bc.ndw; ++i) {
120 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
121 }
122 } else {
123 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
124 }
125 rctx->b.ws->buffer_unmap(rctx->b.ws, shader->bo->buf);
126 }
127
128 return 0;
129 }
130
131 extern const struct nir_shader_compiler_options r600_nir_options;
132 static int nshader = 0;
133 int r600_pipe_shader_create(struct pipe_context *ctx,
134 struct r600_pipe_shader *shader,
135 union r600_shader_key key)
136 {
137 struct r600_context *rctx = (struct r600_context *)ctx;
138 struct r600_pipe_shader_selector *sel = shader->selector;
139 int r;
140 const nir_shader_compiler_options *nir_options =
141 (const nir_shader_compiler_options *)
142 ctx->screen->get_compiler_options(ctx->screen,
143 PIPE_SHADER_IR_NIR,
144 shader->shader.processor_type);
145 if (!sel->nir && !(sel->ir_type == PIPE_SHADER_IR_TGSI)) {
146 assert(sel->nir_blob);
147 struct blob_reader blob_reader;
148 blob_reader_init(&blob_reader, sel->nir_blob, sel->nir_blob_size);
149 sel->nir = nir_deserialize(NULL, nir_options, &blob_reader);
150 }
151
152 int processor = sel->ir_type == PIPE_SHADER_IR_TGSI ?
153 tgsi_get_processor_type(sel->tokens):
154 pipe_shader_type_from_mesa(sel->nir->info.stage);
155
156 bool dump = r600_can_dump_shader(&rctx->screen->b, processor);
157
158 unsigned export_shader;
159
160 shader->shader.bc.isa = rctx->isa;
161
162 {
163 glsl_type_singleton_init_or_ref();
164 if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
165 if (sel->nir)
166 ralloc_free(sel->nir);
167 if (sel->nir_blob) {
168 free(sel->nir_blob);
169 sel->nir_blob = NULL;
170 }
171 sel->nir = tgsi_to_nir(sel->tokens, ctx->screen, true);
172 /* Lower int64 ops because we have some r600 built-in shaders that use it */
173 if (nir_options->lower_int64_options) {
174 NIR_PASS_V(sel->nir, nir_lower_alu_to_scalar, r600_lower_to_scalar_instr_filter, NULL);
175 NIR_PASS_V(sel->nir, nir_lower_int64);
176 }
177 NIR_PASS_V(sel->nir, nir_lower_flrp, ~0, false);
178 }
179 nir_tgsi_scan_shader(sel->nir, &sel->info, true);
180
181 r = r600_shader_from_nir(rctx, shader, &key);
182
183 glsl_type_singleton_decref();
184
185 if (r) {
186 fprintf(stderr, "--Failed shader--------------------------------------------------\n");
187
188 if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
189 fprintf(stderr, "--TGSI--------------------------------------------------------\n");
190 tgsi_dump(sel->tokens, 0);
191 }
192
193 fprintf(stderr, "--NIR --------------------------------------------------------\n");
194 nir_print_shader(sel->nir, stderr);
195
 196 		R600_ERR("translation from NIR failed!\n");
197 goto error;
198 }
199 }
200
201 if (dump) {
202 if (sel->ir_type == PIPE_SHADER_IR_TGSI) {
203 fprintf(stderr, "--TGSI--------------------------------------------------------\n");
204 tgsi_dump(sel->tokens, 0);
205 }
206
207 if (sel->so.num_outputs) {
208 r600_dump_streamout(&sel->so);
209 }
210 }
211
212 /* Check if the bytecode has already been built. */
213 if (!shader->shader.bc.bytecode) {
214 r = r600_bytecode_build(&shader->shader.bc);
215 if (r) {
 216 			R600_ERR("building bytecode failed!\n");
217 goto error;
218 }
219 }
220
221 if (dump) {
222 fprintf(stderr, "--------------------------------------------------------------\n");
223 r600_bytecode_disasm(&shader->shader.bc);
224 fprintf(stderr, "______________________________________________________________\n");
225
226 print_shader_info(stderr, nshader++, &shader->shader);
227 print_pipe_info(stderr, &sel->info);
228 }
229
230 if (shader->gs_copy_shader) {
231 if (dump) {
232 // dump copy shader
233 r600_bytecode_disasm(&shader->gs_copy_shader->shader.bc);
234 }
235
236 if ((r = store_shader(ctx, shader->gs_copy_shader)))
237 goto error;
238 }
239
240 /* Store the shader in a buffer. */
241 if ((r = store_shader(ctx, shader)))
242 goto error;
243
244 /* Build state. */
245 switch (shader->shader.processor_type) {
246 case PIPE_SHADER_TESS_CTRL:
247 evergreen_update_hs_state(ctx, shader);
248 break;
249 case PIPE_SHADER_TESS_EVAL:
250 if (key.tes.as_es)
251 evergreen_update_es_state(ctx, shader);
252 else
253 evergreen_update_vs_state(ctx, shader);
254 break;
255 case PIPE_SHADER_GEOMETRY:
256 if (rctx->b.gfx_level >= EVERGREEN) {
257 evergreen_update_gs_state(ctx, shader);
258 evergreen_update_vs_state(ctx, shader->gs_copy_shader);
259 } else {
260 r600_update_gs_state(ctx, shader);
261 r600_update_vs_state(ctx, shader->gs_copy_shader);
262 }
263 break;
264 case PIPE_SHADER_VERTEX:
265 export_shader = key.vs.as_es;
266 if (rctx->b.gfx_level >= EVERGREEN) {
267 if (key.vs.as_ls)
268 evergreen_update_ls_state(ctx, shader);
269 else if (key.vs.as_es)
270 evergreen_update_es_state(ctx, shader);
271 else
272 evergreen_update_vs_state(ctx, shader);
273 } else {
274 if (export_shader)
275 r600_update_es_state(ctx, shader);
276 else
277 r600_update_vs_state(ctx, shader);
278 }
279 break;
280 case PIPE_SHADER_FRAGMENT:
281 if (rctx->b.gfx_level >= EVERGREEN) {
282 evergreen_update_ps_state(ctx, shader);
283 } else {
284 r600_update_ps_state(ctx, shader);
285 }
286 break;
287 case PIPE_SHADER_COMPUTE:
288 evergreen_update_ls_state(ctx, shader);
289 break;
290 default:
291 r = -EINVAL;
292 goto error;
293 }
294
295 util_debug_message(&rctx->b.debug, SHADER_INFO, "%s shader: %d dw, %d gprs, %d alu_groups, %d loops, %d cf, %d stack",
296 _mesa_shader_stage_to_abbrev(tgsi_processor_to_shader_stage(processor)),
297 shader->shader.bc.ndw,
298 shader->shader.bc.ngpr,
299 shader->shader.bc.nalu_groups,
300 shader->shader.num_loops,
301 shader->shader.bc.ncf,
302 shader->shader.bc.nstack);
303
304 if (!sel->nir_blob && sel->nir && sel->ir_type != PIPE_SHADER_IR_TGSI) {
305 struct blob blob;
306 blob_init(&blob);
307 nir_serialize(&blob, sel->nir, false);
308 sel->nir_blob = malloc(blob.size);
309 memcpy(sel->nir_blob, blob.data, blob.size);
310 sel->nir_blob_size = blob.size;
311 blob_finish(&blob);
312 }
313 ralloc_free(sel->nir);
314 sel->nir = NULL;
315
316 return 0;
317
318 error:
319 r600_pipe_shader_destroy(ctx, shader);
320 return r;
321 }
322
323 void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
324 {
325 r600_resource_reference(&shader->bo, NULL);
326 if (list_is_linked(&shader->shader.bc.cf))
327 r600_bytecode_clear(&shader->shader.bc);
328 r600_release_command_buffer(&shader->command_buffer);
329
330 if (shader->shader.arrays)
331 free(shader->shader.arrays);
332 }
333
334 struct r600_shader_ctx {
335 unsigned type;
336 unsigned temp_reg;
337 struct r600_bytecode *bc;
338 struct r600_shader *shader;
339 uint32_t max_driver_temp_used;
340 unsigned enabled_stream_buffers_mask;
341 };
342
343 void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
344 unsigned count,
345 const struct pipe_vertex_element *elements)
346 {
347 struct r600_context *rctx = (struct r600_context *)ctx;
348 struct r600_bytecode bc;
349 struct r600_bytecode_vtx vtx;
350 const struct util_format_description *desc;
351 unsigned fetch_resource_start = rctx->b.gfx_level >= EVERGREEN ? 0 : 160;
352 unsigned format, num_format, format_comp, endian;
353 uint32_t *bytecode;
354 int i, j, r, fs_size;
355 uint32_t buffer_mask = 0;
356 struct r600_fetch_shader *shader;
357 unsigned strides[PIPE_MAX_ATTRIBS];
358
359 assert(count < 32);
360
361 memset(&bc, 0, sizeof(bc));
362 r600_bytecode_init(&bc, rctx->b.gfx_level, rctx->b.family,
363 rctx->screen->has_compressed_msaa_texturing);
364
365 bc.isa = rctx->isa;
366
367 for (i = 0; i < count; i++) {
368 if (elements[i].instance_divisor > 1) {
369 if (rctx->b.gfx_level == CAYMAN) {
370 for (j = 0; j < 4; j++) {
371 struct r600_bytecode_alu alu;
372 memset(&alu, 0, sizeof(alu));
373 alu.op = ALU_OP2_MULHI_UINT;
374 alu.src[0].sel = 0;
375 alu.src[0].chan = 3;
376 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
377 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
378 alu.dst.sel = i + 1;
379 alu.dst.chan = j;
380 alu.dst.write = j == 3;
381 alu.last = j == 3;
382 if ((r = r600_bytecode_add_alu(&bc, &alu))) {
383 r600_bytecode_clear(&bc);
384 return NULL;
385 }
386 }
387 } else {
388 struct r600_bytecode_alu alu;
389 memset(&alu, 0, sizeof(alu));
390 alu.op = ALU_OP2_MULHI_UINT;
391 alu.src[0].sel = 0;
392 alu.src[0].chan = 3;
393 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
394 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
395 alu.dst.sel = i + 1;
396 alu.dst.chan = 3;
397 alu.dst.write = 1;
398 alu.last = 1;
399 if ((r = r600_bytecode_add_alu(&bc, &alu))) {
400 r600_bytecode_clear(&bc);
401 return NULL;
402 }
403 }
404 }
405 strides[elements[i].vertex_buffer_index] = elements[i].src_stride;
406 buffer_mask |= BITFIELD_BIT(elements[i].vertex_buffer_index);
407 }
408
409 for (i = 0; i < count; i++) {
410 r600_vertex_data_type(elements[i].src_format,
411 &format, &num_format, &format_comp, &endian);
412
413 desc = util_format_description(elements[i].src_format);
414
415 if (elements[i].src_offset > 65535) {
416 r600_bytecode_clear(&bc);
417 R600_ERR("too big src_offset: %u\n", elements[i].src_offset);
418 return NULL;
419 }
420
421 memset(&vtx, 0, sizeof(vtx));
422 vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start;
423 vtx.fetch_type = elements[i].instance_divisor ? SQ_VTX_FETCH_INSTANCE_DATA : SQ_VTX_FETCH_VERTEX_DATA;
424 vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
425 vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
426 vtx.mega_fetch_count = 0x1F;
427 vtx.dst_gpr = i + 1;
428 vtx.dst_sel_x = desc->swizzle[0];
429 vtx.dst_sel_y = desc->swizzle[1];
430 vtx.dst_sel_z = desc->swizzle[2];
431 vtx.dst_sel_w = desc->swizzle[3];
432 vtx.data_format = format;
433 vtx.num_format_all = num_format;
434 vtx.format_comp_all = format_comp;
435 vtx.offset = elements[i].src_offset;
436 vtx.endian = endian;
437
438 if ((r = r600_bytecode_add_vtx(&bc, &vtx))) {
439 r600_bytecode_clear(&bc);
440 return NULL;
441 }
442 }
443
444 r600_bytecode_add_cfinst(&bc, CF_OP_RET);
445
446 if ((r = r600_bytecode_build(&bc))) {
447 r600_bytecode_clear(&bc);
448 return NULL;
449 }
450
451 if (rctx->screen->b.debug_flags & DBG_FS) {
452 fprintf(stderr, "--------------------------------------------------------------\n");
453 fprintf(stderr, "Vertex elements state:\n");
454 for (i = 0; i < count; i++) {
455 fprintf(stderr, " ");
456 util_dump_vertex_element(stderr, elements+i);
457 fprintf(stderr, "\n");
458 }
459
460 r600_bytecode_disasm(&bc);
461 }
462
463 fs_size = bc.ndw*4;
464
465 /* Allocate the CSO. */
466 shader = CALLOC_STRUCT(r600_fetch_shader);
467 if (!shader) {
468 r600_bytecode_clear(&bc);
469 return NULL;
470 }
471 memcpy(shader->strides, strides, sizeof(strides));
472 shader->buffer_mask = buffer_mask;
473
474 u_suballocator_alloc(&rctx->allocator_fetch_shader, fs_size, 256,
475 &shader->offset,
476 (struct pipe_resource**)&shader->buffer);
477 if (!shader->buffer) {
478 r600_bytecode_clear(&bc);
479 FREE(shader);
480 return NULL;
481 }
482
483 bytecode = r600_buffer_map_sync_with_rings
484 (&rctx->b, shader->buffer,
485 PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
486 bytecode += shader->offset / 4;
487
488 if (UTIL_ARCH_BIG_ENDIAN) {
489 for (i = 0; i < fs_size / 4; ++i) {
490 bytecode[i] = util_cpu_to_le32(bc.bytecode[i]);
491 }
492 } else {
493 memcpy(bytecode, bc.bytecode, fs_size);
494 }
495 rctx->b.ws->buffer_unmap(rctx->b.ws, shader->buffer->buf);
496
497 r600_bytecode_clear(&bc);
498 return shader;
499 }
500
501 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
502 {
503 if (interpolate == TGSI_INTERPOLATE_COLOR ||
504 interpolate == TGSI_INTERPOLATE_LINEAR ||
505 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
506 {
507 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
508 int loc;
509
510 switch(location) {
511 case TGSI_INTERPOLATE_LOC_CENTER:
512 loc = 1;
513 break;
514 case TGSI_INTERPOLATE_LOC_CENTROID:
515 loc = 2;
516 break;
517 case TGSI_INTERPOLATE_LOC_SAMPLE:
518 default:
519 loc = 0; break;
520 }
521
522 return is_linear * 3 + loc;
523 }
524
525 return -1;
526 }
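
/* Mapping produced by the code above (is_linear * 3 + loc):
 *
 *   0: perspective/color, sample     3: linear, sample
 *   1: perspective/color, center     4: linear, center
 *   2: perspective/color, centroid   5: linear, centroid
 */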
527
528 int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
529 {
530 switch (semantic_name) {
531 case TGSI_SEMANTIC_POSITION:
532 return 0;
533 case TGSI_SEMANTIC_PSIZE:
534 return 1;
535 case TGSI_SEMANTIC_CLIPDIST:
536 assert(index <= 1);
537 return 2 + index;
538 case TGSI_SEMANTIC_TEXCOORD:
539 return 4 + index;
540 case TGSI_SEMANTIC_COLOR:
541 return 12 + index;
542 case TGSI_SEMANTIC_BCOLOR:
543 return 14 + index;
544 case TGSI_SEMANTIC_CLIPVERTEX:
545 return 16;
546 case TGSI_SEMANTIC_GENERIC:
547 if (index <= 63-17)
548 return 17 + index;
549 else
550 /* same explanation as in the default statement,
551 * the only user hitting this is st/nine.
552 */
553 return 0;
554
555 /* patch indices are completely separate and thus start from 0 */
556 case TGSI_SEMANTIC_TESSOUTER:
557 return 0;
558 case TGSI_SEMANTIC_TESSINNER:
559 return 1;
560 case TGSI_SEMANTIC_PATCH:
561 return 2 + index;
562
563 default:
564 /* Don't fail here. The result of this function is only used
565 * for LS, TCS, TES, and GS, where legacy GL semantics can't
566 * occur, but this function is called for all vertex shaders
567 * before it's known whether LS will be compiled or not.
568 */
569 return 0;
570 }
571 }
572
573 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
574 int stream, unsigned *stream_item_size UNUSED)
575 {
576 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
577 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
578 int j, r;
579 unsigned i;
580
581 /* Sanity checking. */
582 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
583 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
584 r = -EINVAL;
585 goto out_err;
586 }
587 for (i = 0; i < so->num_outputs; i++) {
588 if (so->output[i].output_buffer >= 4) {
589 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
590 so->output[i].output_buffer);
591 r = -EINVAL;
592 goto out_err;
593 }
594 }
595
596 if (so->num_outputs && ctx->bc->cf_last->op != CF_OP_ALU &&
597 ctx->bc->cf_last->op != CF_OP_ALU_PUSH_BEFORE)
598 ctx->bc->force_add_cf = 1;
599 /* Initialize locations where the outputs are stored. */
600 for (i = 0; i < so->num_outputs; i++) {
601
602 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
603 start_comp[i] = so->output[i].start_component;
604 /* Lower outputs with dst_offset < start_component.
605 *
606 * We can only output 4D vectors with a write mask, e.g. we can
607 * only output the W component at offset 3, etc. If we want
608 * to store Y, Z, or W at buffer offset 0, we need to use MOV
609 * to move it to X and output X. */
610 if (so->output[i].dst_offset < so->output[i].start_component) {
611 unsigned tmp = ctx->temp_reg + ctx->max_driver_temp_used++;
612
613 for (j = 0; j < so->output[i].num_components; j++) {
614 struct r600_bytecode_alu alu;
615 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
616 alu.op = ALU_OP1_MOV;
617 alu.src[0].sel = so_gpr[i];
618 alu.src[0].chan = so->output[i].start_component + j;
619
620 alu.dst.sel = tmp;
621 alu.dst.chan = j;
622 alu.dst.write = 1;
623 if (j == so->output[i].num_components - 1)
624 alu.last = 1;
625 r = r600_bytecode_add_alu(ctx->bc, &alu);
626 if (r)
627 return r;
628 }
629 start_comp[i] = 0;
630 so_gpr[i] = tmp;
631 }
632 }
633
634 /* Write outputs to buffers. */
635 for (i = 0; i < so->num_outputs; i++) {
636 struct r600_bytecode_output output;
637
638 if (stream != -1 && stream != so->output[i].stream)
639 continue;
640
641 memset(&output, 0, sizeof(struct r600_bytecode_output));
642 output.gpr = so_gpr[i];
643 output.elem_size = so->output[i].num_components - 1;
644 if (output.elem_size == 2)
 645 			output.elem_size = 3; // elem_size 2 (3 dwords) is not supported, so write 4 dwords with junk at the end
646 output.array_base = so->output[i].dst_offset - start_comp[i];
647 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
648 output.burst_count = 1;
649 /* array_size is an upper limit for the burst_count
650 * with MEM_STREAM instructions */
651 output.array_size = 0xFFF;
652 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
653
654 if (ctx->bc->gfx_level >= EVERGREEN) {
655 switch (so->output[i].output_buffer) {
656 case 0:
657 output.op = CF_OP_MEM_STREAM0_BUF0;
658 break;
659 case 1:
660 output.op = CF_OP_MEM_STREAM0_BUF1;
661 break;
662 case 2:
663 output.op = CF_OP_MEM_STREAM0_BUF2;
664 break;
665 case 3:
666 output.op = CF_OP_MEM_STREAM0_BUF3;
667 break;
668 }
669 output.op += so->output[i].stream * 4;
670 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
671 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
672 } else {
673 switch (so->output[i].output_buffer) {
674 case 0:
675 output.op = CF_OP_MEM_STREAM0;
676 break;
677 case 1:
678 output.op = CF_OP_MEM_STREAM1;
679 break;
680 case 2:
681 output.op = CF_OP_MEM_STREAM2;
682 break;
683 case 3:
684 output.op = CF_OP_MEM_STREAM3;
685 break;
686 }
687 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
688 }
689 r = r600_bytecode_add_output(ctx->bc, &output);
690 if (r)
691 goto out_err;
692 }
693 return 0;
694 out_err:
695 return r;
696 }
697
698 int generate_gs_copy_shader(struct r600_context *rctx,
699 struct r600_pipe_shader *gs,
700 struct pipe_stream_output_info *so)
701 {
702 struct r600_shader_ctx ctx = {};
703 struct r600_shader *gs_shader = &gs->shader;
704 struct r600_pipe_shader *cshader;
705 unsigned ocnt = gs_shader->noutput;
706 struct r600_bytecode_alu alu;
707 struct r600_bytecode_vtx vtx;
708 struct r600_bytecode_output output;
709 struct r600_bytecode_cf *cf_jump, *cf_pop,
710 *last_exp_pos = NULL, *last_exp_param = NULL;
711 int next_clip_pos = 61, next_param = 0;
712 unsigned i, j;
713 int ring;
714 bool only_ring_0 = true;
715 cshader = calloc(1, sizeof(struct r600_pipe_shader));
716 if (!cshader)
717 return 0;
718
719 memcpy(cshader->shader.output, gs_shader->output, ocnt *
720 sizeof(struct r600_shader_io));
721
722 cshader->shader.noutput = ocnt;
723
724 ctx.shader = &cshader->shader;
725 ctx.bc = &ctx.shader->bc;
726 ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;
727
728 r600_bytecode_init(ctx.bc, rctx->b.gfx_level, rctx->b.family,
729 rctx->screen->has_compressed_msaa_texturing);
730
731 ctx.bc->isa = rctx->isa;
732
733 cf_jump = NULL;
734 memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
735
736 /* R0.x = R0.x & 0x3fffffff */
737 memset(&alu, 0, sizeof(alu));
738 alu.op = ALU_OP2_AND_INT;
739 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
740 alu.src[1].value = 0x3fffffff;
741 alu.dst.write = 1;
742 r600_bytecode_add_alu(ctx.bc, &alu);
743
744 /* R0.y = R0.x >> 30 */
745 memset(&alu, 0, sizeof(alu));
746 alu.op = ALU_OP2_LSHR_INT;
747 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
748 alu.src[1].value = 0x1e;
749 alu.dst.chan = 1;
750 alu.dst.write = 1;
751 alu.last = 1;
752 r600_bytecode_add_alu(ctx.bc, &alu);
753
754 /* fetch vertex data from GSVS ring */
755 for (i = 0; i < ocnt; ++i) {
756 struct r600_shader_io *out = &ctx.shader->output[i];
757
758 out->gpr = i + 1;
759 out->ring_offset = i * 16;
760
761 memset(&vtx, 0, sizeof(vtx));
762 vtx.op = FETCH_OP_VFETCH;
763 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
764 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
765 vtx.mega_fetch_count = 16;
766 vtx.offset = out->ring_offset;
767 vtx.dst_gpr = out->gpr;
768 vtx.src_gpr = 0;
769 vtx.dst_sel_x = 0;
770 vtx.dst_sel_y = 1;
771 vtx.dst_sel_z = 2;
772 vtx.dst_sel_w = 3;
773 if (rctx->b.gfx_level >= EVERGREEN) {
774 vtx.use_const_fields = 1;
775 } else {
776 vtx.data_format = FMT_32_32_32_32_FLOAT;
777 }
778
779 r600_bytecode_add_vtx(ctx.bc, &vtx);
780 }
781 ctx.temp_reg = i + 1;
782 for (ring = 3; ring >= 0; --ring) {
783 bool enabled = false;
784 for (i = 0; i < so->num_outputs; i++) {
785 if (so->output[i].stream == ring) {
786 enabled = true;
787 if (ring > 0)
788 only_ring_0 = false;
789 break;
790 }
791 }
792 if (ring != 0 && !enabled) {
793 cshader->shader.ring_item_sizes[ring] = 0;
794 continue;
795 }
796
797 if (cf_jump) {
798 // Patch up jump label
799 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
800 cf_pop = ctx.bc->cf_last;
801
802 cf_jump->cf_addr = cf_pop->id + 2;
803 cf_jump->pop_count = 1;
804 cf_pop->cf_addr = cf_pop->id + 2;
805 cf_pop->pop_count = 1;
806 }
807
808 /* PRED_SETE_INT __, R0.y, ring */
809 memset(&alu, 0, sizeof(alu));
810 alu.op = ALU_OP2_PRED_SETE_INT;
811 alu.src[0].chan = 1;
812 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
813 alu.src[1].value = ring;
814 alu.execute_mask = 1;
815 alu.update_pred = 1;
816 alu.last = 1;
817 ctx.bc->force_add_cf = 1;
818 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
819
820 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
821 cf_jump = ctx.bc->cf_last;
822
823 if (enabled)
824 emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
825 cshader->shader.ring_item_sizes[ring] = ocnt * 16;
826 }
827
828 /* bc adds nops - copy it */
829 if (ctx.bc->gfx_level == R600) {
830 ctx.bc->force_add_cf = 1;
831 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
832 alu.op = ALU_OP0_NOP;
833 alu.last = 1;
834 r600_bytecode_add_alu(ctx.bc, &alu);
835
836 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
837 }
838
839 /* export vertex data */
840 /* XXX factor out common code with r600_shader_from_tgsi ? */
841 for (i = 0; i < ocnt; ++i) {
842 struct r600_shader_io *out = &ctx.shader->output[i];
843 /* The actual parameter export indices will be calculated here, ignore the copied ones. */
844 out->export_param = -1;
845 bool instream0 = true;
846 if (out->varying_slot == VARYING_SLOT_CLIP_VERTEX)
847 continue;
848
849 for (j = 0; j < so->num_outputs; j++) {
850 if (so->output[j].register_index == i) {
851 if (so->output[j].stream == 0)
852 break;
853 if (so->output[j].stream > 0)
854 instream0 = false;
855 }
856 }
857 if (!instream0)
858 continue;
859 memset(&output, 0, sizeof(output));
860 output.gpr = out->gpr;
861 output.elem_size = 3;
862 output.swizzle_x = 0;
863 output.swizzle_y = 1;
864 output.swizzle_z = 2;
865 output.swizzle_w = 3;
866 output.burst_count = 1;
867 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
868 output.op = CF_OP_EXPORT;
869 switch (out->varying_slot) {
870 case VARYING_SLOT_POS:
871 output.array_base = 60;
872 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
873 break;
874
875 case VARYING_SLOT_PSIZ:
876 output.array_base = 61;
877 if (next_clip_pos == 61)
878 next_clip_pos = 62;
879 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
880 output.swizzle_y = 7;
881 output.swizzle_z = 7;
882 output.swizzle_w = 7;
883 ctx.shader->vs_out_misc_write = 1;
884 ctx.shader->vs_out_point_size = 1;
885 break;
886 case VARYING_SLOT_LAYER:
887 if (out->spi_sid) {
888 /* duplicate it as PARAM to pass to the pixel shader */
889 output.array_base = next_param++;
890 out->export_param = output.array_base;
891 r600_bytecode_add_output(ctx.bc, &output);
892 last_exp_param = ctx.bc->cf_last;
893 }
894 output.array_base = 61;
895 if (next_clip_pos == 61)
896 next_clip_pos = 62;
897 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
898 output.swizzle_x = 7;
899 output.swizzle_y = 7;
900 output.swizzle_z = 0;
901 output.swizzle_w = 7;
902 ctx.shader->vs_out_misc_write = 1;
903 ctx.shader->vs_out_layer = 1;
904 break;
905 case VARYING_SLOT_VIEWPORT:
906 if (out->spi_sid) {
907 /* duplicate it as PARAM to pass to the pixel shader */
908 output.array_base = next_param++;
909 out->export_param = output.array_base;
910 r600_bytecode_add_output(ctx.bc, &output);
911 last_exp_param = ctx.bc->cf_last;
912 }
913 output.array_base = 61;
914 if (next_clip_pos == 61)
915 next_clip_pos = 62;
916 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
917 ctx.shader->vs_out_misc_write = 1;
918 ctx.shader->vs_out_viewport = 1;
919 output.swizzle_x = 7;
920 output.swizzle_y = 7;
921 output.swizzle_z = 7;
922 output.swizzle_w = 0;
923 break;
924 case VARYING_SLOT_CLIP_DIST0:
925 case VARYING_SLOT_CLIP_DIST1:
926 /* spi_sid is 0 for clipdistance outputs that were generated
927 * for clipvertex - we don't need to pass them to PS */
928 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
929 ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
930 ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
931 if (out->spi_sid) {
932 /* duplicate it as PARAM to pass to the pixel shader */
933 output.array_base = next_param++;
934 out->export_param = output.array_base;
935 r600_bytecode_add_output(ctx.bc, &output);
936 last_exp_param = ctx.bc->cf_last;
937 }
938 output.array_base = next_clip_pos++;
939 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
940 break;
941 case VARYING_SLOT_FOGC:
942 output.swizzle_y = 4; /* 0 */
943 output.swizzle_z = 4; /* 0 */
944 output.swizzle_w = 5; /* 1 */
945 break;
946 default:
947 break;
948 }
949 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM) {
950 output.array_base = next_param++;
951 out->export_param = output.array_base;
952 }
953 r600_bytecode_add_output(ctx.bc, &output);
954 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
955 last_exp_param = ctx.bc->cf_last;
956 else
957 last_exp_pos = ctx.bc->cf_last;
958 }
959
960 if (!last_exp_pos) {
961 memset(&output, 0, sizeof(output));
962 output.gpr = 0;
963 output.elem_size = 3;
964 output.swizzle_x = 7;
965 output.swizzle_y = 7;
966 output.swizzle_z = 7;
967 output.swizzle_w = 7;
968 output.burst_count = 1;
969 output.type = 2;
970 output.op = CF_OP_EXPORT;
971 output.array_base = 60;
972 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
973 r600_bytecode_add_output(ctx.bc, &output);
974 last_exp_pos = ctx.bc->cf_last;
975 }
976
977 if (!last_exp_param) {
978 memset(&output, 0, sizeof(output));
979 output.gpr = 0;
980 output.elem_size = 3;
981 output.swizzle_x = 7;
982 output.swizzle_y = 7;
983 output.swizzle_z = 7;
984 output.swizzle_w = 7;
985 output.burst_count = 1;
986 output.type = 2;
987 output.op = CF_OP_EXPORT;
988 output.array_base = next_param++;
989 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
990 r600_bytecode_add_output(ctx.bc, &output);
991 last_exp_param = ctx.bc->cf_last;
992 }
993
994 last_exp_pos->op = CF_OP_EXPORT_DONE;
995 last_exp_param->op = CF_OP_EXPORT_DONE;
996
997 assert(next_param > 0);
998 cshader->shader.highest_export_param = next_param - 1;
999
1000 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
1001 cf_pop = ctx.bc->cf_last;
1002
1003 cf_jump->cf_addr = cf_pop->id + 2;
1004 cf_jump->pop_count = 1;
1005 cf_pop->cf_addr = cf_pop->id + 2;
1006 cf_pop->pop_count = 1;
1007
1008 if (ctx.bc->gfx_level == CAYMAN)
1009 cm_bytecode_add_cf_end(ctx.bc);
1010 else {
1011 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
1012 ctx.bc->cf_last->end_of_program = 1;
1013 }
1014
1015 gs->gs_copy_shader = cshader;
1016 cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
1017
1018 ctx.bc->nstack = 1;
1019
1020 return r600_bytecode_build(ctx.bc);
1021 }
1022
1023