/*
 * Copyright 2011 Adam Rak <[email protected]>
 * Authors:
 *      Adam Rak <[email protected]>
 * SPDX-License-Identifier: MIT
 */

#ifdef HAVE_OPENCL
#include <gelf.h>
#include <libelf.h>
#endif
#include <stdio.h>
#include <errno.h>
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_context.h"
#include "util/u_blitter.h"
#include "util/list.h"
#include "util/u_transfer.h"
#include "util/u_surface.h"
#include "util/u_pack_color.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_framebuffer.h"
#include "pipebuffer/pb_buffer.h"
#include "evergreend.h"
#include "r600_shader.h"
#include "r600_pipe.h"
#include "r600_formats.h"
#include "evergreen_compute.h"
#include "evergreen_compute_internal.h"
#include "compute_memory_pool.h"
#include <inttypes.h>

/**
RAT0 is for global binding writes
VTX1 is for global binding reads

For writing images: RAT1...
For reading images: TEX2...
  TEX2-RAT1 are paired

TEX2... consumes the same fetch resources that VTX2... would consume

CONST0 and VTX0 are for parameters
  CONST0 binds the smaller input parameter buffer and is used for constant
  indexing; it is also constant cached
  VTX0 is for indirect/non-constant indexing, or if the input is bigger than
  the constant cache can handle

RATs are limited to 12, so we can only bind at most 11 textures for writing,
because we reserve RAT0 for global bindings. With byte addressing enabled
we should reserve another one too => 10 image bindings for writing max.

From NVIDIA OpenCL:
CL_DEVICE_MAX_READ_IMAGE_ARGS:  128
CL_DEVICE_MAX_WRITE_IMAGE_ARGS: 8

So 10 for writing is enough. 176 is the max for reading according to the docs.

Writable images should be listed first (< 10), so their id corresponds to RAT(id+1).
Writable images also consume TEX slots, and VTX slots because of linear indexing.

*/

#ifdef HAVE_OPENCL
static void radeon_shader_binary_init(struct r600_shader_binary *b)
{
	memset(b, 0, sizeof(*b));
}

static void radeon_shader_binary_clean(struct r600_shader_binary *b)
{
	if (!b)
		return;
	FREE(b->code);
	FREE(b->config);
	FREE(b->rodata);
	FREE(b->global_symbol_offsets);
	FREE(b->relocs);
	FREE(b->disasm_string);
}
#endif

struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
						     unsigned size)
{
	struct pipe_resource *buffer = NULL;
	assert(size);

	buffer = pipe_buffer_create((struct pipe_screen*) screen,
				    0, PIPE_USAGE_IMMUTABLE, size);

	return (struct r600_resource *)buffer;
}

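/* Bind 'bo' as the RAT (random access target) with the given id: create an
 * R32_UINT surface for it and install it as color buffer 'id' in the
 * framebuffer state. */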
static void evergreen_set_rat(struct r600_pipe_compute *pipe,
			      unsigned id,
			      struct r600_resource *bo,
			      int start,
			      int size)
{
	struct pipe_surface rat_templ;
	struct r600_surface *surf = NULL;
	struct r600_context *rctx = NULL;

	assert(id < 12);
	assert((size & 3) == 0);
	assert((start & 0xFF) == 0);

	rctx = pipe->ctx;

	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);

	/* Create the RAT surface */
	memset(&rat_templ, 0, sizeof(rat_templ));
	rat_templ.format = PIPE_FORMAT_R32_UINT;
	rat_templ.u.tex.level = 0;
	rat_templ.u.tex.first_layer = 0;
	rat_templ.u.tex.last_layer = 0;

	/* Add the RAT to the list of color buffers. Drop the old buffer first. */
	pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
		(struct pipe_context *)pipe->ctx,
		(struct pipe_resource *)bo, &rat_templ);

	/* Update the number of color buffers */
	pipe->ctx->framebuffer.state.nr_cbufs =
		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);

	/* Update the cb_target_mask
	 * XXX: I think this is a potential spot for bugs once we start doing
	 * GL interop. cb_target_mask may be modified in the 3D sections
	 * of this driver. */
	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));

	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
	evergreen_init_color_surface_rat(rctx, surf);
}

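/* Bind a buffer to compute-shader vertex buffer slot 'vb_index' and mark the
 * CS vertex buffer state dirty so it gets re-emitted. */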
static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
					   unsigned vb_index,
					   unsigned offset,
					   struct pipe_resource *buffer)
{
	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
	vb->buffer_offset = offset;
	vb->buffer.resource = buffer;
	vb->is_user_buffer = false;

	/* The vertex instructions in the compute shaders use the texture cache,
	 * so we need to invalidate it. */
	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
	state->enabled_mask |= 1 << vb_index;
	state->dirty_mask |= 1 << vb_index;
	r600_mark_atom_dirty(rctx, &state->atom);
}

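/* Bind a range of 'buffer' as compute constant buffer 'cb_index'. */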
static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
					     unsigned cb_index,
					     unsigned offset,
					     unsigned size,
					     struct pipe_resource *buffer)
{
	struct pipe_constant_buffer cb;
	cb.buffer_size = size;
	cb.buffer_offset = offset;
	cb.buffer = buffer;
	cb.user_buffer = NULL;

	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, false, &cb);
}

/* We need to define these R600 registers here, because we can't include
 * evergreend.h and r600d.h.
 */
#define R_028868_SQ_PGM_RESOURCES_VS             0x028868
#define R_028850_SQ_PGM_RESOURCES_PS             0x028850

#ifdef HAVE_OPENCL
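/* Collect the offsets of all defined global symbols from the ELF symbol
 * table into binary->global_symbol_offsets, keeping the list sorted in
 * ascending order. */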
static void parse_symbol_table(Elf_Data *symbol_table_data,
			       const GElf_Shdr *symbol_table_header,
			       struct r600_shader_binary *binary)
{
	GElf_Sym symbol;
	unsigned i = 0;
	unsigned symbol_count =
		symbol_table_header->sh_size / symbol_table_header->sh_entsize;

	/* We are over-allocating this list, because symbol_count gives the
	 * total number of symbols, and we will only be filling the list
	 * with offsets of global symbols. The memory savings from
	 * allocating the correct size of this list will be small, and
	 * I don't think it is worth the cost of pre-computing the number
	 * of global symbols.
	 */
	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));

	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
		unsigned i;
		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
		    symbol.st_shndx == 0 /* Undefined symbol */) {
			continue;
		}

		binary->global_symbol_offsets[binary->global_symbol_count] =
			symbol.st_value;

		/* Sort the list using bubble sort. This list will usually
		 * be small. */
		for (i = binary->global_symbol_count; i > 0; --i) {
			uint64_t lhs = binary->global_symbol_offsets[i - 1];
			uint64_t rhs = binary->global_symbol_offsets[i];
			if (lhs < rhs) {
				break;
			}
			binary->global_symbol_offsets[i] = lhs;
			binary->global_symbol_offsets[i - 1] = rhs;
		}
		++binary->global_symbol_count;
	}
}

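/* Read the .rel.text relocations and record each relocation's offset and the
 * name of the symbol it refers to in binary->relocs. */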
static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
			 unsigned symbol_sh_link,
			 struct r600_shader_binary *binary)
{
	unsigned i;

	if (!relocs || !symbols || !binary->reloc_count) {
		return;
	}
	binary->relocs = CALLOC(binary->reloc_count,
				sizeof(struct r600_shader_reloc));
	for (i = 0; i < binary->reloc_count; i++) {
		GElf_Sym symbol;
		GElf_Rel rel;
		char *symbol_name;
		struct r600_shader_reloc *reloc = &binary->relocs[i];

		gelf_getrel(relocs, i, &rel);
		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);

		reloc->offset = rel.r_offset;
		strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
		reloc->name[sizeof(reloc->name)-1] = 0;
	}
}

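/* Parse the ELF binary produced by the compiler and extract its code,
 * config, rodata, disassembly, symbol and relocation data into 'binary'. */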
static void r600_elf_read(const char *elf_data, unsigned elf_size,
			  struct r600_shader_binary *binary)
{
	char *elf_buffer;
	Elf *elf;
	Elf_Scn *section = NULL;
	Elf_Data *symbols = NULL, *relocs = NULL;
	size_t section_str_index;
	unsigned symbol_sh_link = 0;

	/* One of the libelf implementations
	 * (http://www.mr511.de/software/english.htm) requires calling
	 * elf_version() before elf_memory().
	 */
	elf_version(EV_CURRENT);
	elf_buffer = MALLOC(elf_size);
	memcpy(elf_buffer, elf_data, elf_size);

	elf = elf_memory(elf_buffer, elf_size);

	elf_getshdrstrndx(elf, &section_str_index);

	while ((section = elf_nextscn(elf, section))) {
		const char *name;
		Elf_Data *section_data = NULL;
		GElf_Shdr section_header;
		if (gelf_getshdr(section, &section_header) != &section_header) {
			fprintf(stderr, "Failed to read ELF section header\n");
			return;
		}
		name = elf_strptr(elf, section_str_index, section_header.sh_name);
		if (!strcmp(name, ".text")) {
			section_data = elf_getdata(section, section_data);
			binary->code_size = section_data->d_size;
			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
			memcpy(binary->code, section_data->d_buf, binary->code_size);
		} else if (!strcmp(name, ".AMDGPU.config")) {
			section_data = elf_getdata(section, section_data);
			binary->config_size = section_data->d_size;
			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
			memcpy(binary->config, section_data->d_buf, binary->config_size);
		} else if (!strcmp(name, ".AMDGPU.disasm")) {
			/* Always read disassembly if it's available. */
			section_data = elf_getdata(section, section_data);
			binary->disasm_string = strndup(section_data->d_buf,
							section_data->d_size);
		} else if (!strncmp(name, ".rodata", 7)) {
			section_data = elf_getdata(section, section_data);
			binary->rodata_size = section_data->d_size;
			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
		} else if (!strncmp(name, ".symtab", 7)) {
			symbols = elf_getdata(section, section_data);
			symbol_sh_link = section_header.sh_link;
			parse_symbol_table(symbols, &section_header, binary);
		} else if (!strcmp(name, ".rel.text")) {
			relocs = elf_getdata(section, section_data);
			binary->reloc_count = section_header.sh_size /
					      section_header.sh_entsize;
		}
	}

	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);

	if (elf) {
		elf_end(elf);
	}
	FREE(elf_buffer);

	/* Cache the config size per symbol */
	if (binary->global_symbol_count) {
		binary->config_size_per_symbol =
			binary->config_size / binary->global_symbol_count;
	} else {
		binary->global_symbol_count = 1;
		binary->config_size_per_symbol = binary->config_size;
	}
}

static const unsigned char *r600_shader_binary_config_start(
	const struct r600_shader_binary *binary,
	uint64_t symbol_offset)
{
	unsigned i;
	for (i = 0; i < binary->global_symbol_count; ++i) {
		if (binary->global_symbol_offsets[i] == symbol_offset) {
			unsigned offset = i * binary->config_size_per_symbol;
			return binary->config + offset;
		}
	}
	return binary->config;
}

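/* Walk the (register, value) pairs in the config section of the kernel at
 * 'symbol_offset' and derive its GPR, stack and LDS requirements. */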
static void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
					   struct r600_bytecode *bc,
					   uint64_t symbol_offset,
					   bool *use_kill)
{
	unsigned i;
	const unsigned char *config =
		r600_shader_binary_config_start(binary, symbol_offset);

	for (i = 0; i < binary->config_size_per_symbol; i += 8) {
		unsigned reg =
			util_le32_to_cpu(*(uint32_t*)(config + i));
		unsigned value =
			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
		switch (reg) {
		/* R600 / R700 */
		case R_028850_SQ_PGM_RESOURCES_PS:
		case R_028868_SQ_PGM_RESOURCES_VS:
		/* Evergreen / Northern Islands */
		case R_028844_SQ_PGM_RESOURCES_PS:
		case R_028860_SQ_PGM_RESOURCES_VS:
		case R_0288D4_SQ_PGM_RESOURCES_LS:
			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
			break;
		case R_02880C_DB_SHADER_CONTROL:
			*use_kill = G_02880C_KILL_ENABLE(value);
			break;
		case R_0288E8_SQ_LDS_ALLOC:
			bc->nlds_dw = value;
			break;
		}
	}
}

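/* Copy the kernel bytecode into 'bc' and fill in its resource requirements
 * from the config section. */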
static unsigned r600_create_shader(struct r600_bytecode *bc,
				   const struct r600_shader_binary *binary,
				   bool *use_kill)
{
	assert(binary->code_size % 4 == 0);
	bc->bytecode = CALLOC(1, binary->code_size);
	memcpy(bc->bytecode, binary->code, binary->code_size);
	bc->ndw = binary->code_size / 4;

	r600_shader_binary_read_config(binary, bc, 0, use_kill);
	return 0;
}

#endif

static void r600_destroy_shader(struct r600_bytecode *bc)
{
	FREE(bc->bytecode);
}

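/* Create a compute state object. TGSI/NIR shaders go through the common
 * shader selector path; OpenCL binaries are parsed from ELF and their code
 * is uploaded to VRAM. */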
static void *evergreen_create_compute_state(struct pipe_context *ctx,
					    const struct pipe_compute_state *cso)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
#ifdef HAVE_OPENCL
	const struct pipe_binary_program_header *header;
	void *p;
	bool use_kill;
#endif

	shader->ctx = rctx;
	shader->local_size = cso->static_shared_mem;
	shader->input_size = cso->req_input_mem;

	shader->ir_type = cso->ir_type;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE);

		/* Precompile the shader with the expected shader key, to reduce jank at
		 * draw time. Also produces output for shader-db.
		 */
		bool dirty;
		r600_shader_select(ctx, shader->sel, &dirty, true);

		return shader;
	}
#ifdef HAVE_OPENCL
	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
	header = cso->prog;
	radeon_shader_binary_init(&shader->binary);
	r600_elf_read(header->blob, header->num_bytes, &shader->binary);
	r600_create_shader(&shader->bc, &shader->binary, &use_kill);

	/* Upload code + ROdata */
	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
							 shader->bc.ndw * 4);
	p = r600_buffer_map_sync_with_rings(
		&rctx->b, shader->code_bo,
		PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
	//TODO: use util_memcpy_cpu_to_le32 ?
	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
	rctx->b.ws->buffer_unmap(rctx->b.ws, shader->code_bo->buf);
#endif

	return shader;
}

static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = state;

	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");

	if (!shader)
		return;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		r600_delete_shader_selector(ctx, shader->sel);
	} else {
#ifdef HAVE_OPENCL
		radeon_shader_binary_clean(&shader->binary);
		pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
		pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
#endif
		r600_destroy_shader(&shader->bc);
	}
	FREE(shader);
}

static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");

	if (!state) {
		rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
		return;
	}

	if (cstate->ir_type == PIPE_SHADER_IR_TGSI ||
	    cstate->ir_type == PIPE_SHADER_IR_NIR) {
		bool compute_dirty;
		if (r600_shader_select(ctx, cstate->sel, &compute_dirty, false))
			R600_ERR("Failed to select compute shader\n");
	}

	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
}

/* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
 * explicit kernel parameters, there are implicit parameters that need to be
 * stored in the vertex buffer as well. Here is how these parameters are
 * organized in the buffer:
 *
 * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
 * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
 * DWORDS 6-8: Number of work items within each work group in each dimension
 *             (x,y,z)
 * DWORDS 9+ : Kernel parameters
 */
static void evergreen_compute_upload_input(struct pipe_context *ctx,
					   const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	unsigned i;
	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
	 * parameters.
	 */
	unsigned input_size;
	uint32_t *num_work_groups_start;
	uint32_t *global_size_start;
	uint32_t *local_size_start;
	uint32_t *kernel_parameters_start;
	struct pipe_box box;
	struct pipe_transfer *transfer = NULL;

	if (!shader)
		return;
	if (shader->input_size == 0) {
		return;
	}
	input_size = shader->input_size + 36;
	if (!shader->kernel_param) {
		/* Add space for the grid dimensions */
		shader->kernel_param = (struct r600_resource *)
			pipe_buffer_create(ctx->screen, 0,
					   PIPE_USAGE_IMMUTABLE, input_size);
	}

	u_box_1d(0, input_size, &box);
	num_work_groups_start = ctx->buffer_map(ctx,
			(struct pipe_resource*)shader->kernel_param,
			0, PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE,
			&box, &transfer);
	global_size_start = num_work_groups_start + (3 * (sizeof(uint) / 4));
	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);

	/* Copy the work group size */
	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));

	/* Copy the global size */
	for (i = 0; i < 3; i++) {
		global_size_start[i] = info->grid[i] * info->block[i];
	}

	/* Copy the local dimensions */
	memcpy(local_size_start, info->block, 3 * sizeof(uint));

	/* Copy the kernel inputs */
	memcpy(kernel_parameters_start, info->input, shader->input_size);

	for (i = 0; i < (input_size / 4); i++) {
		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
			((unsigned*)num_work_groups_start)[i]);
	}

	ctx->buffer_unmap(ctx, transfer);

	/* ID=0 and ID=3 are reserved for the parameters.
	 * LLVM will preferably use ID=0, but it does not work for dynamic
	 * indices. */
	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
			(struct pipe_resource*)shader->kernel_param);
	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
			(struct pipe_resource*)shader->kernel_param);
}

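/* Program the thread group dimensions, LDS allocation and wave count, then
 * emit the dispatch packet (using CPU-read grid values when the dispatch is
 * indirect). */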
static void evergreen_emit_dispatch(struct r600_context *rctx,
				    const struct pipe_grid_info *info,
				    uint32_t indirect_grid[3])
{
	int i;
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
	unsigned num_waves;
	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
	unsigned wave_divisor = (16 * num_pipes);
	int group_size = 1;
	unsigned lds_size = (shader->local_size + info->variable_shared_mem) / 4;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    shader->ir_type != PIPE_SHADER_IR_NIR)
		lds_size += shader->bc.nlds_dw;

	/* Calculate group_size */
	for (i = 0; i < 3; i++) {
		group_size *= info->block[i];
	}

	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
	num_waves = (info->block[0] * info->block[1] * info->block[2] +
		     wave_divisor - 1) / wave_divisor;

	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
				  "%u wavefronts per thread block, "
				  "allocating %u dwords lds.\n",
				  num_pipes, num_waves, lds_size);

	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);

	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */

	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
			      group_size);

	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */

	if (rctx->b.gfx_level < CAYMAN) {
		assert(lds_size <= 8192);
	} else {
		/* Cayman appears to have a slightly smaller limit, see the
		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
		assert(lds_size <= 8160);
	}

	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
				       lds_size | (num_waves << 14));

	if (info->indirect) {
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, indirect_grid[0]);
		radeon_emit(cs, indirect_grid[1]);
		radeon_emit(cs, indirect_grid[2]);
		radeon_emit(cs, 1);
	} else {
		/* Dispatch packet */
		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
		radeon_emit(cs, info->grid[0]);
		radeon_emit(cs, info->grid[1]);
		radeon_emit(cs, info->grid[2]);
		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
		radeon_emit(cs, 1);
	}

	if (rctx->is_debug)
		eg_trace_emit(rctx);
}

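/* Emit the framebuffer color buffer state used as RATs by the OpenCL binary
 * path and set the compute CB target mask. */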
static void compute_setup_cbs(struct r600_context *rctx)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	unsigned i;

	/* Emit colorbuffers. */
	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
						       (struct r600_resource*)cb->base.texture,
						       RADEON_USAGE_READWRITE |
						       RADEON_PRIO_SHADER_RW_BUFFER);

		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, cb->cb_color_dim);	/* R_028C78_CB_COLOR0_DIM */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
		radeon_emit(cs, reloc);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
		radeon_emit(cs, reloc);
	}
	for (; i < 8 ; i++)
		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
	for (; i < 12; i++)
		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));

	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
				       rctx->compute_cb_target_mask);
}

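/* Emit a full compute dispatch: flush the DMA ring, switch the command
 * stream to compute mode, emit shader, buffer, sampler and image state,
 * emit the dispatch packet and finally invalidate the caches. */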
static void compute_emit_cs(struct r600_context *rctx,
			    const struct pipe_grid_info *info)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	bool compute_dirty = false;
	struct r600_pipe_shader *current;
	struct r600_shader_atomic combined_atomics[8];
	uint8_t atomic_used_mask;
	uint32_t indirect_grid[3] = { 0, 0, 0 };

	/* Make sure the gfx ring is the only ring active. */
	if (radeon_emitted(&rctx->b.dma.cs, 0)) {
		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
	}

	r600_update_compressed_resource_state(rctx, true);

	if (!rctx->cmd_buf_is_compute) {
		rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
		rctx->cmd_buf_is_compute = true;
	}

	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
		if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty, false)) {
			R600_ERR("Failed to select compute shader\n");
			return;
		}

		current = rctx->cs_shader_state.shader->sel->current;
		if (compute_dirty) {
			rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
			r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
			r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
		}

		bool need_buf_const = current->shader.uses_tex_buffers ||
			current->shader.has_txq_cube_array_z_comp;

		if (info->indirect) {
			struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
			unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_MAP_READ);
			unsigned offset = info->indirect_offset / 4;
			indirect_grid[0] = data[offset];
			indirect_grid[1] = data[offset + 1];
			indirect_grid[2] = data[offset + 2];
		}
		for (int i = 0; i < 3; i++) {
			rctx->cs_block_grid_sizes[i] = info->block[i];
			rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
		}
		rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
		rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;

		evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
		r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));

		if (need_buf_const) {
			eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
		}
		r600_update_driver_const_buffers(rctx, true);

		evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
		if (atomic_used_mask) {
			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		}
	} else
		r600_need_cs_space(rctx, 0, true, 0);

	/* Initialize all the compute-related registers.
	 *
	 * See evergreen_init_atom_start_compute_cs() in this file for the list
	 * of registers initialized by the start_compute_cs_cmd atom.
	 */
	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);

	/* emit config state */
	if (rctx->b.gfx_level == EVERGREEN) {
		if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
		    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
			radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
			radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
			radeon_emit(cs, 0);
			radeon_emit(cs, 0);
			radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
		} else
			r600_emit_atom(rctx, &rctx->config_state.atom);
	}

	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
	r600_flush_emit(rctx);

	if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) {

		compute_setup_cbs(rctx);

		/* Emit vertex buffer state */
		rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
		r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
	} else {
		uint32_t rat_mask;

		rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
		radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
					       rat_mask);
	}

	r600_emit_atom(rctx, &rctx->b.render_cond_atom);

	/* Emit constant buffer state */
	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);

	/* Emit sampler state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);

	/* Emit sampler view (texture resource) state */
	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);

	/* Emit images state */
	r600_emit_atom(rctx, &rctx->compute_images.atom);

	/* Emit buffers state */
	r600_emit_atom(rctx, &rctx->compute_buffers.atom);

	/* Emit shader state */
	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);

	/* Emit dispatch state and dispatch packet */
	evergreen_emit_dispatch(rctx, info, indirect_grid);

	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
	 */
	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
			 R600_CONTEXT_INV_VERTEX_CACHE |
			 R600_CONTEXT_INV_TEX_CACHE;
	r600_flush_emit(rctx);
	rctx->b.flags = 0;

	if (rctx->b.gfx_level >= CAYMAN) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
		/* DEALLOC_STATE prevents the GPU from hanging when a
		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
		 */
		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
		radeon_emit(cs, 0);
	}
	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR)
		evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);

#if 0
	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
	for (i = 0; i < cs->cdw; i++) {
		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
	}
#endif
}

/**
 * Emit function for r600_cs_shader_state atom
 */
void evergreen_emit_cs_shader(struct r600_context *rctx,
			      struct r600_atom *atom)
{
	struct r600_cs_shader_state *state =
		(struct r600_cs_shader_state*)atom;
	struct r600_pipe_compute *shader = state->shader;
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	uint64_t va;
	struct r600_resource *code_bo;
	unsigned ngpr, nstack;

	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
	    shader->ir_type == PIPE_SHADER_IR_NIR) {
		code_bo = shader->sel->current->bo;
		va = shader->sel->current->bo->gpu_address;
		ngpr = shader->sel->current->shader.bc.ngpr;
		nstack = shader->sel->current->shader.bc.nstack;
	} else {
		code_bo = shader->code_bo;
		va = shader->code_bo->gpu_address + state->pc;
		ngpr = shader->bc.ngpr;
		nstack = shader->bc.nstack;
	}

	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
			S_0288D4_NUM_GPRS(ngpr) |
			S_0288D4_DX10_CLAMP(1) |
			S_0288D4_STACK_SIZE(nstack));
	radeon_emit(cs, 0);       /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */

	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
					      code_bo, RADEON_USAGE_READ |
					      RADEON_PRIO_SHADER_BINARY));
}

static void evergreen_launch_grid(struct pipe_context *ctx,
				  const struct pipe_grid_info *info)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
#ifdef HAVE_OPENCL
	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
	bool use_kill;

	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
	    shader->ir_type != PIPE_SHADER_IR_NIR) {
		rctx->cs_shader_state.pc = info->pc;
		/* Get the config information for this kernel. */
		r600_shader_binary_read_config(&shader->binary, &shader->bc,
					       info->pc, &use_kill);
	} else {
		use_kill = false;
		rctx->cs_shader_state.pc = 0;
	}
#endif

	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);

	evergreen_compute_upload_input(ctx, info);
	compute_emit_cs(rctx, info);
}

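/* Bind compute resources (pipe_surfaces): writable surfaces additionally
 * become RATs starting at RAT1, and every surface is bound as a vertex
 * buffer starting at slot 4 for reading. */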
static void evergreen_set_compute_resources(struct pipe_context *ctx,
					    unsigned start, unsigned count,
					    struct pipe_surface **surfaces)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_surface **resources = (struct r600_surface **)surfaces;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
		    start, count);

	for (unsigned i = 0; i < count; i++) {
		/* The first four vertex buffers are reserved for parameters and
		 * global buffers. */
		unsigned vtx_id = 4 + i;
		if (resources[i]) {
			struct r600_resource_global *buffer =
				(struct r600_resource_global*)
				resources[i]->base.texture;
			if (resources[i]->base.writable) {
				assert(i+1 < 12);

				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
				(struct r600_resource *)resources[i]->base.texture,
				buffer->chunk->start_in_dw*4,
				resources[i]->base.texture->width0);
			}

			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
					buffer->chunk->start_in_dw * 4,
					resources[i]->base.texture);
		}
	}
}

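/* Bind OpenCL global buffers: promote them into the compute memory pool,
 * patch the handles with each buffer's offset inside the pool, and bind the
 * pool itself for writing (RAT0) and reading (VTX1). */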
static void evergreen_set_global_binding(struct pipe_context *ctx,
					 unsigned first, unsigned n,
					 struct pipe_resource **resources,
					 uint32_t **handles)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global **buffers =
		(struct r600_resource_global **)resources;
	unsigned i;

	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
		    first, n);

	if (!resources) {
		/* XXX: Unset */
		return;
	}

	/* We mark these items for promotion to the pool if they
	 * aren't already there */
	for (i = first; i < first + n; i++) {
		struct compute_memory_item *item = buffers[i]->chunk;

		if (!is_item_in_pool(item))
			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
	}

	if (compute_memory_finalize_pending(pool, ctx) == -1) {
		/* XXX: Unset */
		return;
	}

	for (i = first; i < first + n; i++)
	{
		uint32_t buffer_offset;
		uint32_t handle;
		assert(resources[i]->target == PIPE_BUFFER);
		assert(resources[i]->bind & PIPE_BIND_GLOBAL);

		buffer_offset = util_le32_to_cpu(*(handles[i]));
		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;

		*(handles[i]) = util_cpu_to_le32(handle);
	}

	/* globals for writing */
	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
	/* globals for reading */
	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
				(struct pipe_resource*)pool->bo);

	/* constants for reading, LLVM puts them in the text segment */
	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
}

/**
 * This function initializes all the compute specific registers that need to
 * be initialized for each compute command stream. Registers that are common
 * to both compute and 3D will be initialized at the beginning of each compute
 * command stream by the start_cs_cmd atom. However, since the SET_CONTEXT_REG
 * packet requires that the shader type bit be set, we must initialize all
 * context registers needed for compute in this function. The registers
 * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
 * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
 * on the GPU family.
 */
void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
{
	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
	int num_threads;
	int num_stack_entries;

	/* since all required registers are initialized in the
	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
	 */
	r600_init_command_buffer(cb, 256);
	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;

	/* We're setting config registers here. */
	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

	switch (rctx->b.family) {
	case CHIP_CEDAR:
	default:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_REDWOOD:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_JUNIPER:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_CYPRESS:
	case CHIP_HEMLOCK:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_PALM:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_SUMO2:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_BARTS:
		num_threads = 128;
		num_stack_entries = 512;
		break;
	case CHIP_TURKS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	case CHIP_CAICOS:
		num_threads = 128;
		num_stack_entries = 256;
		break;
	}

	/* The primitive type always needs to be POINTLIST for compute. */
	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
			      V_008958_DI_PT_POINTLIST);

	if (rctx->b.gfx_level < CAYMAN) {

		/* These registers control which simds can be used by each stage.
		 * The default for these registers is 0xffffffff, which means
		 * all simds are available for each stage. It's possible we may
		 * want to play around with these in the future, but for now
		 * the default value is fine.
		 *
		 * R_008E20_SQ_STATIC_THREAD_MGMT1
		 * R_008E24_SQ_STATIC_THREAD_MGMT2
		 * R_008E28_SQ_STATIC_THREAD_MGMT3
		 */

		/* XXX: We may need to adjust the thread and stack resource
		 * values for 3D/compute interop */

		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);

		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
		 * Set the number of threads used by the PS/VS/GS/ES stage to
		 * 0.
		 */
		r600_store_value(cb, 0);

		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
		 * Set the number of threads used by the CS (aka LS) stage to
		 * the maximum number of threads and set the number of threads
		 * for the HS stage to 0. */
		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));

		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
		 * Set the Control Flow stack entries to 0 for PS/VS stages */
		r600_store_value(cb, 0);

		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
		 * Set the Control Flow stack entries to 0 for GS/ES stages */
		r600_store_value(cb, 0);

		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
		 * Set the Control Flow stack entries to 0 for the HS stage, and
		 * set it to the maximum value for the CS (aka LS) stage. */
		r600_store_value(cb,
			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
	}
	/* Give the compute shader all the available LDS space.
	 * NOTE: This only sets the maximum number of dwords that a compute
	 * shader can allocate. When a shader is executed, we still need to
	 * allocate the appropriate amount of LDS dwords using the
	 * CM_R_0288E8_SQ_LDS_ALLOC register.
	 */
	if (rctx->b.gfx_level < CAYMAN) {
		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
				      S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
	} else {
		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
				       S_0286FC_NUM_PS_LDS(0) |
				       S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
	}

	/* Context Registers */

	if (rctx->b.gfx_level < CAYMAN) {
		/* workaround for hw issues with dyn gpr - must set all limits
		 * to 240 instead of 0, 0x1e == 240 / 8
		 */
		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
				       S_028838_PS_GPRS(0x1e) |
				       S_028838_VS_GPRS(0x1e) |
				       S_028838_GS_GPRS(0x1e) |
				       S_028838_ES_GPRS(0x1e) |
				       S_028838_HS_GPRS(0x1e) |
				       S_028838_LS_GPRS(0x1e));
	}

	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
			       S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));

	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);

	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
			       S_0286E8_TID_IN_GROUP_ENA(1) |
			       S_0286E8_TGID_ENA(1) |
			       S_0286E8_DISABLE_INDEX_PACK(1));

	/* The LOOP_CONST registers are an optimization for loops that allows
	 * you to store the initial counter, increment value, and maximum
	 * counter value in a register so that hardware can calculate the
	 * correct number of iterations for the loop, so that you don't need
	 * to have the loop counter in your shader code. We don't currently use
	 * this optimization, so we must keep track of the counter in the
	 * shader and use a break instruction to exit loops. However, the
	 * hardware will still use this register to determine when to exit a
	 * loop, so we need to initialize the counter to 0, set the increment
	 * value to 1 and the maximum counter value to 4095 (0xfff), which
	 * is the maximum value allowed. This gives us a maximum of 4096
	 * iterations for our loops, but hopefully our break instruction will
	 * execute some time before the 4096th iteration.
	 */
	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
}

static void evergreen_get_compute_state_info(struct pipe_context *ctx, void *state,
					     struct pipe_compute_state_object_info *info)
{
	struct r600_context *rctx = (struct r600_context*)ctx;
	struct r600_pipe_compute *shader = state;

	/* This is somewhat copied from RadeonSI, but in truth it is not much
	 * more than an educated guess. */
	uint8_t wave_size = r600_wavefront_size(rctx->b.screen->family);
	info->private_memory = shader->sel->current->scratch_space_needed;
	info->preferred_simd_size = wave_size;
	info->simd_sizes = wave_size;
	info->max_threads = 128;
}

void evergreen_init_compute_state_functions(struct r600_context *rctx)
{
	rctx->b.b.create_compute_state = evergreen_create_compute_state;
	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
//	rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
	rctx->b.b.set_global_binding = evergreen_set_global_binding;
	rctx->b.b.launch_grid = evergreen_launch_grid;
	rctx->b.b.get_compute_state_info = evergreen_get_compute_state_info;
}

void *r600_compute_global_transfer_map(struct pipe_context *ctx,
				       struct pipe_resource *resource,
				       unsigned level,
				       unsigned usage,
				       const struct pipe_box *box,
				       struct pipe_transfer **ptransfer)
{
	struct r600_context *rctx = (struct r600_context*)ctx;
	struct compute_memory_pool *pool = rctx->screen->global_pool;
	struct r600_resource_global* buffer =
		(struct r600_resource_global*)resource;

	struct compute_memory_item *item = buffer->chunk;
	struct pipe_resource *dst = NULL;
	unsigned offset = box->x;

	if (usage & PIPE_MAP_READ)
		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;

	if (usage & PIPE_MAP_WRITE)
		buffer->chunk->status |= ITEM_MAPPED_FOR_WRITING;

	if (is_item_in_pool(item)) {
		compute_memory_demote_item(pool, item, ctx);
	}
	else {
		if (item->real_buffer == NULL) {
			item->real_buffer =
				r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
		}
	}

	dst = (struct pipe_resource*)item->real_buffer;

	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
			"width = %u, height = %u, depth = %u)\n", level, usage,
			box->x, box->y, box->z, box->width, box->height,
			box->depth);
	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
		"%u (box.x)\n", item->id, box->x);

	assert(resource->target == PIPE_BUFFER);
	assert(resource->bind & PIPE_BIND_GLOBAL);
	assert(box->x >= 0);
	assert(box->y == 0);
	assert(box->z == 0);

	if (buffer->base.b.is_user_ptr)
		return NULL;

	///TODO: do it better, mapping is not possible if the pool is too big
	return pipe_buffer_map_range(ctx, dst,
			offset, box->width, usage & ~PIPE_MAP_READ, ptransfer);
}

void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
					struct pipe_transfer *transfer)
{
	/* struct r600_resource_global are not real resources, they just map
	 * to an offset within the compute memory pool. The function
	 * r600_compute_global_transfer_map() maps the memory pool
	 * resource rather than the struct r600_resource_global passed to
	 * it as an argument and then initializes ptransfer->resource with
	 * the memory pool resource (via pipe_buffer_map_range).
	 * When transfer_unmap is called it uses the memory pool's
	 * vtable, which calls r600_buffer_transfer_unmap() rather than
	 * this function.
	 */
	assert (!"This function should not be called");
}

void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
					struct pipe_resource *res)
{
	struct r600_resource_global* buffer = NULL;
	struct r600_screen* rscreen = NULL;

	assert(res->target == PIPE_BUFFER);
	assert(res->bind & PIPE_BIND_GLOBAL);

	buffer = (struct r600_resource_global*)res;
	rscreen = (struct r600_screen*)screen;

	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
	buffer->chunk = NULL;

	if (buffer->base.b.is_user_ptr)
		r600_buffer_destroy(screen, res);
	else
		free(res);
}

struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
							 const struct pipe_resource *templ)
{
	struct r600_resource_global* result = NULL;
	struct r600_screen* rscreen = NULL;
	int size_in_dw = 0;

	assert(templ->target == PIPE_BUFFER);
	assert(templ->bind & PIPE_BIND_GLOBAL);
	assert(templ->array_size == 1 || templ->array_size == 0);
	assert(templ->depth0 == 1 || templ->depth0 == 0);
	assert(templ->height0 == 1 || templ->height0 == 0);

	result = (struct r600_resource_global*)
		CALLOC(sizeof(struct r600_resource_global), 1);
	rscreen = (struct r600_screen*)screen;

	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
			templ->array_size);

	result->base.b.b = *templ;
	result->base.b.b.screen = screen;
	result->base.compute_global_bo = true;
	pipe_reference_init(&result->base.b.b.reference, 1);

	size_in_dw = (templ->width0+3) / 4;

	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);

	if (result->chunk == NULL)
	{
		free(result);
		return NULL;
	}

	return &result->base.b.b;
}