1 /*
2  * Copyright 2011 Adam Rak <[email protected]>
3  * Authors:
4  *      Adam Rak <[email protected]>
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #ifdef HAVE_OPENCL
9 #include <gelf.h>
10 #include <libelf.h>
11 #endif
12 #include <stdio.h>
13 #include <errno.h>
14 #include "pipe/p_defines.h"
15 #include "pipe/p_state.h"
16 #include "pipe/p_context.h"
17 #include "util/u_blitter.h"
18 #include "util/list.h"
19 #include "util/u_transfer.h"
20 #include "util/u_surface.h"
21 #include "util/u_pack_color.h"
22 #include "util/u_memory.h"
23 #include "util/u_inlines.h"
24 #include "util/u_framebuffer.h"
25 #include "pipebuffer/pb_buffer.h"
26 #include "evergreend.h"
27 #include "r600_shader.h"
28 #include "r600_pipe.h"
29 #include "r600_formats.h"
30 #include "evergreen_compute.h"
31 #include "evergreen_compute_internal.h"
32 #include "compute_memory_pool.h"
33 #include <inttypes.h>
34 
35 /**
36 RAT0 is used for global binding writes.
37 VTX1 is used for global binding reads.
38 
39 For writing images, RAT1... is used.
40 For reading images, TEX2... is used.
41   TEX2 and RAT1 are paired.
42 
43 TEX2... consumes the same fetch resources that VTX2... would consume.
44 
45 CONST0 and VTX0 are used for parameters:
46   CONST0 binds the smaller input parameter buffer and handles constant
47   indexing; it is also cached in the constant cache.
48   VTX0 is used for indirect/non-constant indexing, or when the input is
49   bigger than the constant cache can handle.
50 
51 RATs are limited to 12, so we can bind at most 11 textures for writing,
52 because we reserve RAT0 for global bindings. With byte addressing enabled,
53 we should reserve another one too => at most 10 image bindings for writing.
54 
55 From NVIDIA OpenCL:
56   CL_DEVICE_MAX_READ_IMAGE_ARGS:        128
57   CL_DEVICE_MAX_WRITE_IMAGE_ARGS:       8
58 
59 So 10 for writing is enough. 176 is the maximum for reading according to the docs.
60 
61 Writable images should be listed first (< 10), so their id corresponds to RAT(id+1).
62 Writable images also consume TEX slots, and VTX slots too, because of linear indexing.
63 
64 */
65 
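/* Illustrative sketch (not driver code, kept under #if 0): the binding
 * convention described above, as also used by evergreen_set_global_binding()
 * and evergreen_set_compute_resources() below.  RAT0/VTX1 address the global
 * pool, VTX0/CONST0 and VTX3 carry the kernel parameters, VTX2 holds the
 * code/rodata constants, so plain compute resources start at VTX slot 4 and
 * writable image i maps to RAT(i + 1).  The helper names are hypothetical. */
#if 0
static inline unsigned rat_id_for_writable_image(unsigned image_id)
{
	/* RAT0 is reserved for the global buffer, so image i uses RAT(i + 1). */
	return image_id + 1;
}

static inline unsigned vtx_id_for_compute_resource(unsigned resource_id)
{
	/* VTX slots 0-3 are reserved for parameters and global buffers. */
	return 4 + resource_id;
}
#endif
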
66 #ifdef HAVE_OPENCL
67 static void radeon_shader_binary_init(struct r600_shader_binary *b)
68 {
69 	memset(b, 0, sizeof(*b));
70 }
71 
72 static void radeon_shader_binary_clean(struct r600_shader_binary *b)
73 {
74 	if (!b)
75 		return;
76 	FREE(b->code);
77 	FREE(b->config);
78 	FREE(b->rodata);
79 	FREE(b->global_symbol_offsets);
80 	FREE(b->relocs);
81 	FREE(b->disasm_string);
82 }
83 #endif
84 
85 struct r600_resource *r600_compute_buffer_alloc_vram(struct r600_screen *screen,
86 						     unsigned size)
87 {
88 	struct pipe_resource *buffer = NULL;
89 	assert(size);
90 
91 	buffer = pipe_buffer_create((struct pipe_screen*) screen,
92 				    0, PIPE_USAGE_IMMUTABLE, size);
93 
94 	return (struct r600_resource *)buffer;
95 }
96 
97 
98 static void evergreen_set_rat(struct r600_pipe_compute *pipe,
99 			      unsigned id,
100 			      struct r600_resource *bo,
101 			      int start,
102 			      int size)
103 {
104 	struct pipe_surface rat_templ;
105 	struct r600_surface *surf = NULL;
106 	struct r600_context *rctx = NULL;
107 
108 	assert(id < 12);
109 	assert((size & 3) == 0);
110 	assert((start & 0xFF) == 0);
111 
112 	rctx = pipe->ctx;
113 
114 	COMPUTE_DBG(rctx->screen, "bind rat: %i \n", id);
115 
116 	/* Create the RAT surface */
117 	memset(&rat_templ, 0, sizeof(rat_templ));
118 	rat_templ.format = PIPE_FORMAT_R32_UINT;
119 	rat_templ.u.tex.level = 0;
120 	rat_templ.u.tex.first_layer = 0;
121 	rat_templ.u.tex.last_layer = 0;
122 
123 	/* Add the RAT to the list of color buffers. Drop the old buffer first. */
124 	pipe_surface_reference(&pipe->ctx->framebuffer.state.cbufs[id], NULL);
125 	pipe->ctx->framebuffer.state.cbufs[id] = pipe->ctx->b.b.create_surface(
126 		(struct pipe_context *)pipe->ctx,
127 		(struct pipe_resource *)bo, &rat_templ);
128 
129 	/* Update the number of color buffers */
130 	pipe->ctx->framebuffer.state.nr_cbufs =
131 		MAX2(id + 1, pipe->ctx->framebuffer.state.nr_cbufs);
132 
133 	/* Update the cb_target_mask
134 	 * XXX: I think this is a potential spot for bugs once we start doing
135 	 * GL interop.  cb_target_mask may be modified in the 3D sections
136 	 * of this driver. */
137 	pipe->ctx->compute_cb_target_mask |= (0xf << (id * 4));
138 
139 	surf = (struct r600_surface*)pipe->ctx->framebuffer.state.cbufs[id];
140 	evergreen_init_color_surface_rat(rctx, surf);
141 }
142 
143 static void evergreen_cs_set_vertex_buffer(struct r600_context *rctx,
144 					   unsigned vb_index,
145 					   unsigned offset,
146 					   struct pipe_resource *buffer)
147 {
148 	struct r600_vertexbuf_state *state = &rctx->cs_vertex_buffer_state;
149 	struct pipe_vertex_buffer *vb = &state->vb[vb_index];
150 	vb->buffer_offset = offset;
151 	vb->buffer.resource = buffer;
152 	vb->is_user_buffer = false;
153 
154 	/* The vertex instructions in the compute shaders use the texture cache,
155 	 * so we need to invalidate it. */
156 	rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
157 	state->enabled_mask |= 1 << vb_index;
158 	state->dirty_mask |= 1 << vb_index;
159 	r600_mark_atom_dirty(rctx, &state->atom);
160 }
161 
162 static void evergreen_cs_set_constant_buffer(struct r600_context *rctx,
163 					     unsigned cb_index,
164 					     unsigned offset,
165 					     unsigned size,
166 					     struct pipe_resource *buffer)
167 {
168 	struct pipe_constant_buffer cb;
169 	cb.buffer_size = size;
170 	cb.buffer_offset = offset;
171 	cb.buffer = buffer;
172 	cb.user_buffer = NULL;
173 
174 	rctx->b.b.set_constant_buffer(&rctx->b.b, PIPE_SHADER_COMPUTE, cb_index, false, &cb);
175 }
176 
177 /* We need to define these R600 registers here, because we can't include
178  * both evergreend.h and r600d.h at the same time.
179  */
180 #define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
181 #define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
182 
183 #ifdef HAVE_OPENCL
184 static void parse_symbol_table(Elf_Data *symbol_table_data,
185 				const GElf_Shdr *symbol_table_header,
186 				struct r600_shader_binary *binary)
187 {
188 	GElf_Sym symbol;
189 	unsigned i = 0;
190 	unsigned symbol_count =
191 		symbol_table_header->sh_size / symbol_table_header->sh_entsize;
192 
193 	/* We are over allocating this list, because symbol_count gives the
194 	 * total number of symbols, and we will only be filling the list
195 	 * with offsets of global symbols.  The memory savings from
196 	 * allocating the correct size of this list will be small, and
197 	 * I don't think it is worth the cost of pre-computing the number
198 	 * of global symbols.
199 	 */
200 	binary->global_symbol_offsets = CALLOC(symbol_count, sizeof(uint64_t));
201 
202 	while (gelf_getsym(symbol_table_data, i++, &symbol)) {
203 		unsigned i;
204 		if (GELF_ST_BIND(symbol.st_info) != STB_GLOBAL ||
205 		    symbol.st_shndx == 0 /* Undefined symbol */) {
206 			continue;
207 		}
208 
209 		binary->global_symbol_offsets[binary->global_symbol_count] =
210 					symbol.st_value;
211 
212 		/* Sort the list using bubble sort.  This list will usually
213 		 * be small. */
214 		for (i = binary->global_symbol_count; i > 0; --i) {
215 			uint64_t lhs = binary->global_symbol_offsets[i - 1];
216 			uint64_t rhs = binary->global_symbol_offsets[i];
217 			if (lhs < rhs) {
218 				break;
219 			}
220 			binary->global_symbol_offsets[i] = lhs;
221 			binary->global_symbol_offsets[i - 1] = rhs;
222 		}
223 		++binary->global_symbol_count;
224 	}
225 }
226 
227 
228 static void parse_relocs(Elf *elf, Elf_Data *relocs, Elf_Data *symbols,
229 			unsigned symbol_sh_link,
230 			struct r600_shader_binary *binary)
231 {
232 	unsigned i;
233 
234 	if (!relocs || !symbols || !binary->reloc_count) {
235 		return;
236 	}
237 	binary->relocs = CALLOC(binary->reloc_count,
238 			sizeof(struct r600_shader_reloc));
239 	for (i = 0; i < binary->reloc_count; i++) {
240 		GElf_Sym symbol;
241 		GElf_Rel rel;
242 		char *symbol_name;
243 		struct r600_shader_reloc *reloc = &binary->relocs[i];
244 
245 		gelf_getrel(relocs, i, &rel);
246 		gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &symbol);
247 		symbol_name = elf_strptr(elf, symbol_sh_link, symbol.st_name);
248 
249 		reloc->offset = rel.r_offset;
250 		strncpy(reloc->name, symbol_name, sizeof(reloc->name)-1);
251 		reloc->name[sizeof(reloc->name)-1] = 0;
252 	}
253 }
254 
255 static void r600_elf_read(const char *elf_data, unsigned elf_size,
256 		 struct r600_shader_binary *binary)
257 {
258 	char *elf_buffer;
259 	Elf *elf;
260 	Elf_Scn *section = NULL;
261 	Elf_Data *symbols = NULL, *relocs = NULL;
262 	size_t section_str_index;
263 	unsigned symbol_sh_link = 0;
264 
265 	/* One of the libelf implementations
266 	 * (http://www.mr511.de/software/english.htm) requires calling
267 	 * elf_version() before elf_memory().
268 	 */
269 	elf_version(EV_CURRENT);
270 	elf_buffer = MALLOC(elf_size);
271 	memcpy(elf_buffer, elf_data, elf_size);
272 
273 	elf = elf_memory(elf_buffer, elf_size);
274 
275 	elf_getshdrstrndx(elf, &section_str_index);
276 
277 	while ((section = elf_nextscn(elf, section))) {
278 		const char *name;
279 		Elf_Data *section_data = NULL;
280 		GElf_Shdr section_header;
281 		if (gelf_getshdr(section, &section_header) != &section_header) {
282 			fprintf(stderr, "Failed to read ELF section header\n");
283 			return;
284 		}
285 		name = elf_strptr(elf, section_str_index, section_header.sh_name);
286 		if (!strcmp(name, ".text")) {
287 			section_data = elf_getdata(section, section_data);
288 			binary->code_size = section_data->d_size;
289 			binary->code = MALLOC(binary->code_size * sizeof(unsigned char));
290 			memcpy(binary->code, section_data->d_buf, binary->code_size);
291 		} else if (!strcmp(name, ".AMDGPU.config")) {
292 			section_data = elf_getdata(section, section_data);
293 			binary->config_size = section_data->d_size;
294 			binary->config = MALLOC(binary->config_size * sizeof(unsigned char));
295 			memcpy(binary->config, section_data->d_buf, binary->config_size);
296 		} else if (!strcmp(name, ".AMDGPU.disasm")) {
297 			/* Always read disassembly if it's available. */
298 			section_data = elf_getdata(section, section_data);
299 			binary->disasm_string = strndup(section_data->d_buf,
300 							section_data->d_size);
301 		} else if (!strncmp(name, ".rodata", 7)) {
302 			section_data = elf_getdata(section, section_data);
303 			binary->rodata_size = section_data->d_size;
304 			binary->rodata = MALLOC(binary->rodata_size * sizeof(unsigned char));
305 			memcpy(binary->rodata, section_data->d_buf, binary->rodata_size);
306 		} else if (!strncmp(name, ".symtab", 7)) {
307 			symbols = elf_getdata(section, section_data);
308 			symbol_sh_link = section_header.sh_link;
309 			parse_symbol_table(symbols, &section_header, binary);
310 		} else if (!strcmp(name, ".rel.text")) {
311 			relocs = elf_getdata(section, section_data);
312 			binary->reloc_count = section_header.sh_size /
313 					section_header.sh_entsize;
314 		}
315 	}
316 
317 	parse_relocs(elf, relocs, symbols, symbol_sh_link, binary);
318 
319 	if (elf){
320 		elf_end(elf);
321 	}
322 	FREE(elf_buffer);
323 
324 	/* Cache the config size per symbol */
325 	if (binary->global_symbol_count) {
326 		binary->config_size_per_symbol =
327 			binary->config_size / binary->global_symbol_count;
328 	} else {
329 		binary->global_symbol_count = 1;
330 		binary->config_size_per_symbol = binary->config_size;
331 	}
332 }
333 
334 static const unsigned char *r600_shader_binary_config_start(
335 	const struct r600_shader_binary *binary,
336 	uint64_t symbol_offset)
337 {
338 	unsigned i;
339 	for (i = 0; i < binary->global_symbol_count; ++i) {
340 		if (binary->global_symbol_offsets[i] == symbol_offset) {
341 			unsigned offset = i * binary->config_size_per_symbol;
342 			return binary->config + offset;
343 		}
344 	}
345 	return binary->config;
346 }
347 
348 static void r600_shader_binary_read_config(const struct r600_shader_binary *binary,
349 					   struct r600_bytecode *bc,
350 					   uint64_t symbol_offset,
351 					   bool *use_kill)
352 {
353        unsigned i;
354        const unsigned char *config =
355                r600_shader_binary_config_start(binary, symbol_offset);
356 
357        for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
358                unsigned reg =
359                        util_le32_to_cpu(*(uint32_t*)(config + i));
360                unsigned value =
361                        util_le32_to_cpu(*(uint32_t*)(config + i + 4));
362                switch (reg) {
363                /* R600 / R700 */
364                case R_028850_SQ_PGM_RESOURCES_PS:
365                case R_028868_SQ_PGM_RESOURCES_VS:
366                /* Evergreen / Northern Islands */
367                case R_028844_SQ_PGM_RESOURCES_PS:
368                case R_028860_SQ_PGM_RESOURCES_VS:
369                case R_0288D4_SQ_PGM_RESOURCES_LS:
370                        bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
371                        bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
372                        break;
373                case R_02880C_DB_SHADER_CONTROL:
374                        *use_kill = G_02880C_KILL_ENABLE(value);
375                        break;
376                case R_0288E8_SQ_LDS_ALLOC:
377                        bc->nlds_dw = value;
378                        break;
379                }
380        }
381 }
382 
383 static unsigned r600_create_shader(struct r600_bytecode *bc,
384 				   const struct r600_shader_binary *binary,
385 				   bool *use_kill)
386 
387 {
388 	assert(binary->code_size % 4 == 0);
389 	bc->bytecode = CALLOC(1, binary->code_size);
390 	memcpy(bc->bytecode, binary->code, binary->code_size);
391 	bc->ndw = binary->code_size / 4;
392 
393 	r600_shader_binary_read_config(binary, bc, 0, use_kill);
394 	return 0;
395 }
396 
397 #endif
398 
399 static void r600_destroy_shader(struct r600_bytecode *bc)
400 {
401 	FREE(bc->bytecode);
402 }
403 
404 static void *evergreen_create_compute_state(struct pipe_context *ctx,
405 					    const struct pipe_compute_state *cso)
406 {
407 	struct r600_context *rctx = (struct r600_context *)ctx;
408 	struct r600_pipe_compute *shader = CALLOC_STRUCT(r600_pipe_compute);
409 #ifdef HAVE_OPENCL
410 	const struct pipe_binary_program_header *header;
411 	void *p;
412 	bool use_kill;
413 #endif
414 
415 	shader->ctx = rctx;
416 	shader->local_size = cso->static_shared_mem;
417 	shader->input_size = cso->req_input_mem;
418 
419 	shader->ir_type = cso->ir_type;
420 
421 	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
422 	    shader->ir_type == PIPE_SHADER_IR_NIR) {
423 		shader->sel = r600_create_shader_state_tokens(ctx, cso->prog, cso->ir_type, PIPE_SHADER_COMPUTE);
424 
425 		/* Precompile the shader with the expected shader key, to reduce jank at
426 		 * draw time. Also produces output for shader-db.
427 		 */
428 		bool dirty;
429 		r600_shader_select(ctx, shader->sel, &dirty, true);
430 
431 		return shader;
432 	}
433 #ifdef HAVE_OPENCL
434 	COMPUTE_DBG(rctx->screen, "*** evergreen_create_compute_state\n");
435 	header = cso->prog;
436 	radeon_shader_binary_init(&shader->binary);
437 	r600_elf_read(header->blob, header->num_bytes, &shader->binary);
438 	r600_create_shader(&shader->bc, &shader->binary, &use_kill);
439 
440 	/* Upload code + ROdata */
441 	shader->code_bo = r600_compute_buffer_alloc_vram(rctx->screen,
442 							shader->bc.ndw * 4);
443 	p = r600_buffer_map_sync_with_rings(
444 		&rctx->b, shader->code_bo,
445 		PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY);
446 	//TODO: use util_memcpy_cpu_to_le32 ?
447 	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
448 	rctx->b.ws->buffer_unmap(rctx->b.ws, shader->code_bo->buf);
449 #endif
450 
451 	return shader;
452 }
453 
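/* Hedged usage sketch (not driver code, kept under #if 0): roughly how a
 * state tracker would hand a compute kernel to the hooks installed in
 * evergreen_init_compute_state_functions().  The wrapper name and arguments
 * are hypothetical; the pipe_compute_state fields are the ones read by
 * evergreen_create_compute_state() above. */
#if 0
static void *example_create_nir_kernel(struct pipe_context *pipe,
					const void *nir_prog,
					unsigned shared_mem_bytes,
					unsigned kernel_input_bytes)
{
	struct pipe_compute_state cso = {
		.ir_type = PIPE_SHADER_IR_NIR,
		.prog = nir_prog,
		.static_shared_mem = shared_mem_bytes,
		.req_input_mem = kernel_input_bytes,
	};
	/* The returned opaque pointer is later passed to bind_compute_state(). */
	return pipe->create_compute_state(pipe, &cso);
}
#endif
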
454 static void evergreen_delete_compute_state(struct pipe_context *ctx, void *state)
455 {
456 	struct r600_context *rctx = (struct r600_context *)ctx;
457 	struct r600_pipe_compute *shader = state;
458 
459 	COMPUTE_DBG(rctx->screen, "*** evergreen_delete_compute_state\n");
460 
461 	if (!shader)
462 		return;
463 
464 	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
465 	    shader->ir_type == PIPE_SHADER_IR_NIR) {
466 		r600_delete_shader_selector(ctx, shader->sel);
467 	} else {
468 #ifdef HAVE_OPENCL
469 		radeon_shader_binary_clean(&shader->binary);
470 		pipe_resource_reference((struct pipe_resource**)&shader->code_bo, NULL);
471 		pipe_resource_reference((struct pipe_resource**)&shader->kernel_param, NULL);
472 #endif
473 		r600_destroy_shader(&shader->bc);
474 	}
475 	FREE(shader);
476 }
477 
478 static void evergreen_bind_compute_state(struct pipe_context *ctx, void *state)
479 {
480 	struct r600_context *rctx = (struct r600_context *)ctx;
481 	struct r600_pipe_compute *cstate = (struct r600_pipe_compute *)state;
482 	COMPUTE_DBG(rctx->screen, "*** evergreen_bind_compute_state\n");
483 
484 	if (!state) {
485 		rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
486 		return;
487 	}
488 
489 	if (cstate->ir_type == PIPE_SHADER_IR_TGSI ||
490 	    cstate->ir_type == PIPE_SHADER_IR_NIR) {
491 		bool compute_dirty;
492 		if (r600_shader_select(ctx, cstate->sel, &compute_dirty, false))
493 			R600_ERR("Failed to select compute shader\n");
494 	}
495 
496 	rctx->cs_shader_state.shader = (struct r600_pipe_compute *)state;
497 }
498 
499 /* The kernel parameters are stored in a vtx buffer (ID=0). Besides the
500  * explicit kernel parameters, there are implicit parameters that need to be
501  * stored in the vertex buffer as well.  Here is how these parameters are organized in
502  * the buffer:
503  *
504  * DWORDS 0-2: Number of work groups in each dimension (x,y,z)
505  * DWORDS 3-5: Number of global work items in each dimension (x,y,z)
506  * DWORDS 6-8: Number of work items within each work group in each dimension
507  *             (x,y,z)
508  * DWORDS 9+ : Kernel parameters
509  */
510 static void evergreen_compute_upload_input(struct pipe_context *ctx,
511 					   const struct pipe_grid_info *info)
512 {
513 	struct r600_context *rctx = (struct r600_context *)ctx;
514 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
515 	unsigned i;
516 	/* We need to reserve 9 dwords (36 bytes) for implicit kernel
517 	 * parameters.
518 	 */
519 	unsigned input_size;
520 	uint32_t *num_work_groups_start;
521 	uint32_t *global_size_start;
522 	uint32_t *local_size_start;
523 	uint32_t *kernel_parameters_start;
524 	struct pipe_box box;
525 	struct pipe_transfer *transfer = NULL;
526 
527 	if (!shader)
528 		return;
529 	if (shader->input_size == 0) {
530 		return;
531 	}
532 	input_size = shader->input_size + 36;
533 	if (!shader->kernel_param) {
534 		/* Add space for the grid dimensions */
535 		shader->kernel_param = (struct r600_resource *)
536 			pipe_buffer_create(ctx->screen, 0,
537 					PIPE_USAGE_IMMUTABLE, input_size);
538 	}
539 
540 	u_box_1d(0, input_size, &box);
541 	num_work_groups_start = ctx->buffer_map(ctx,
542 			(struct pipe_resource*)shader->kernel_param,
543 			0, PIPE_MAP_WRITE | PIPE_MAP_DISCARD_RANGE,
544 			&box, &transfer);
545 	global_size_start = num_work_groups_start + (3 * (sizeof(uint) /4));
546 	local_size_start = global_size_start + (3 * (sizeof(uint)) / 4);
547 	kernel_parameters_start = local_size_start + (3 * (sizeof(uint)) / 4);
548 
549 	/* Copy the work group size */
550 	memcpy(num_work_groups_start, info->grid, 3 * sizeof(uint));
551 
552 	/* Copy the global size */
553 	for (i = 0; i < 3; i++) {
554 		global_size_start[i] = info->grid[i] * info->block[i];
555 	}
556 
557 	/* Copy the local dimensions */
558 	memcpy(local_size_start, info->block, 3 * sizeof(uint));
559 
560 	/* Copy the kernel inputs */
561 	memcpy(kernel_parameters_start, info->input, shader->input_size);
562 
563 	for (i = 0; i < (input_size / 4); i++) {
564 		COMPUTE_DBG(rctx->screen, "input %i : %u\n", i,
565 			((unsigned*)num_work_groups_start)[i]);
566 	}
567 
568 	ctx->buffer_unmap(ctx, transfer);
569 
570 	/* ID=0 and ID=3 are reserved for the parameters.
571 	 * LLVM prefers to use ID=0, but it does not work for dynamic
572 	 * indices. */
573 	evergreen_cs_set_vertex_buffer(rctx, 3, 0,
574 			(struct pipe_resource*)shader->kernel_param);
575 	evergreen_cs_set_constant_buffer(rctx, 0, 0, input_size,
576 			(struct pipe_resource*)shader->kernel_param);
577 }
578 
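/* A minimal sketch (not driver code, kept under #if 0) of the layout that
 * evergreen_compute_upload_input() writes into the kernel parameter buffer,
 * matching the DWORD map in the comment above the function.  The struct name
 * is hypothetical. */
#if 0
struct example_cs_input_layout {
	uint32_t num_work_groups[3]; /* DWORDS 0-2: info->grid          */
	uint32_t global_size[3];     /* DWORDS 3-5: grid[i] * block[i]  */
	uint32_t local_size[3];      /* DWORDS 6-8: info->block         */
	/* DWORDS 9+: shader->input_size bytes of kernel arguments follow,
	 * which is why input_size = shader->input_size + 36 above. */
};
#endif
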
579 static void evergreen_emit_dispatch(struct r600_context *rctx,
580 				    const struct pipe_grid_info *info,
581 				    uint32_t indirect_grid[3])
582 {
583 	int i;
584 	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
585 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
586 	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
587 	unsigned num_waves;
588 	unsigned num_pipes = rctx->screen->b.info.r600_max_quad_pipes;
589 	unsigned wave_divisor = (16 * num_pipes);
590 	int group_size = 1;
591 	unsigned lds_size = (shader->local_size + info->variable_shared_mem) / 4;
592 
593 	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
594 	    shader->ir_type != PIPE_SHADER_IR_NIR)
595 		lds_size += shader->bc.nlds_dw;
596 
597 	/* Calculate group_size */
598 	for (i = 0; i < 3; i++) {
599 		group_size *= info->block[i];
600 	}
601 
602 	/* num_waves = ceil((tg_size.x * tg_size.y * tg_size.z) / (16 * num_pipes)) */
603 	num_waves = (info->block[0] * info->block[1] * info->block[2] +
604 			wave_divisor - 1) / wave_divisor;
605 
606 	COMPUTE_DBG(rctx->screen, "Using %u pipes, "
607 				"%u wavefronts per thread block, "
608 				"allocating %u dwords lds.\n",
609 				num_pipes, num_waves, lds_size);
610 
611 	radeon_set_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size);
612 
613 	radeon_set_config_reg_seq(cs, R_00899C_VGT_COMPUTE_START_X, 3);
614 	radeon_emit(cs, 0); /* R_00899C_VGT_COMPUTE_START_X */
615 	radeon_emit(cs, 0); /* R_0089A0_VGT_COMPUTE_START_Y */
616 	radeon_emit(cs, 0); /* R_0089A4_VGT_COMPUTE_START_Z */
617 
618 	radeon_set_config_reg(cs, R_0089AC_VGT_COMPUTE_THREAD_GROUP_SIZE,
619 								group_size);
620 
621 	radeon_compute_set_context_reg_seq(cs, R_0286EC_SPI_COMPUTE_NUM_THREAD_X, 3);
622 	radeon_emit(cs, info->block[0]); /* R_0286EC_SPI_COMPUTE_NUM_THREAD_X */
623 	radeon_emit(cs, info->block[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */
624 	radeon_emit(cs, info->block[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */
625 
626 	if (rctx->b.gfx_level < CAYMAN) {
627 		assert(lds_size <= 8192);
628 	} else {
629 		/* Cayman appears to have a slightly smaller limit, see the
630 		 * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */
631 		assert(lds_size <= 8160);
632 	}
633 
634 	radeon_compute_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC,
635 					lds_size | (num_waves << 14));
636 
637 	if (info->indirect) {
638 		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
639 		radeon_emit(cs, indirect_grid[0]);
640 		radeon_emit(cs, indirect_grid[1]);
641 		radeon_emit(cs, indirect_grid[2]);
642 		radeon_emit(cs, 1);
643 	} else {
644 		/* Dispatch packet */
645 		radeon_emit(cs, PKT3C(PKT3_DISPATCH_DIRECT, 3, render_cond_bit));
646 		radeon_emit(cs, info->grid[0]);
647 		radeon_emit(cs, info->grid[1]);
648 		radeon_emit(cs, info->grid[2]);
649 		/* VGT_DISPATCH_INITIATOR = COMPUTE_SHADER_EN */
650 		radeon_emit(cs, 1);
651 	}
652 
653 	if (rctx->is_debug)
654 		eg_trace_emit(rctx);
655 }
656 
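/* Worked example (not driver code, kept under #if 0) of the sizing done in
 * evergreen_emit_dispatch() above; the block size and pipe count here are
 * hypothetical. */
#if 0
static unsigned example_num_waves(void)
{
	const unsigned block[3] = { 16, 16, 1 }; /* hypothetical thread block */
	const unsigned num_pipes = 2;            /* hypothetical r600_max_quad_pipes */
	const unsigned wave_divisor = 16 * num_pipes;
	/* ceil(16 * 16 * 1 / 32) = 8 wavefronts per thread group. */
	return (block[0] * block[1] * block[2] + wave_divisor - 1) / wave_divisor;
}
#endif
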
657 static void compute_setup_cbs(struct r600_context *rctx)
658 {
659 	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
660 	unsigned i;
661 
662 	/* Emit colorbuffers. */
663 	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
664 	for (i = 0; i < 8 && i < rctx->framebuffer.state.nr_cbufs; i++) {
665 		struct r600_surface *cb = (struct r600_surface*)rctx->framebuffer.state.cbufs[i];
666 		unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
667 						       (struct r600_resource*)cb->base.texture,
668 						       RADEON_USAGE_READWRITE |
669 						       RADEON_PRIO_SHADER_RW_BUFFER);
670 
671 		radeon_compute_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 7);
672 		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
673 		radeon_emit(cs, cb->cb_color_pitch);	/* R_028C64_CB_COLOR0_PITCH */
674 		radeon_emit(cs, cb->cb_color_slice);	/* R_028C68_CB_COLOR0_SLICE */
675 		radeon_emit(cs, cb->cb_color_view);	/* R_028C6C_CB_COLOR0_VIEW */
676 		radeon_emit(cs, cb->cb_color_info);	/* R_028C70_CB_COLOR0_INFO */
677 		radeon_emit(cs, cb->cb_color_attrib);	/* R_028C74_CB_COLOR0_ATTRIB */
678 		radeon_emit(cs, cb->cb_color_dim);		/* R_028C78_CB_COLOR0_DIM */
679 
680 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C60_CB_COLOR0_BASE */
681 		radeon_emit(cs, reloc);
682 
683 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* R_028C74_CB_COLOR0_ATTRIB */
684 		radeon_emit(cs, reloc);
685 	}
686 	for (; i < 8 ; i++)
687 		radeon_compute_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
688 					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
689 	for (; i < 12; i++)
690 		radeon_compute_set_context_reg(cs, R_028E50_CB_COLOR8_INFO + (i - 8) * 0x1C,
691 					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
692 
693 	/* Set CB_TARGET_MASK  XXX: Use cb_misc_state */
694 	radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
695 				       rctx->compute_cb_target_mask);
696 }
697 
698 static void compute_emit_cs(struct r600_context *rctx,
699 			    const struct pipe_grid_info *info)
700 {
701 	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
702 	bool compute_dirty = false;
703 	struct r600_pipe_shader *current;
704 	struct r600_shader_atomic combined_atomics[8];
705 	uint8_t atomic_used_mask;
706 	uint32_t indirect_grid[3] = { 0, 0, 0 };
707 
708 	/* make sure that the gfx ring is the only active ring */
709 	if (radeon_emitted(&rctx->b.dma.cs, 0)) {
710 		rctx->b.dma.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
711 	}
712 
713 	r600_update_compressed_resource_state(rctx, true);
714 
715 	if (!rctx->cmd_buf_is_compute) {
716 		rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
717 		rctx->cmd_buf_is_compute = true;
718 	}
719 
720 	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI||
721 	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
722 		if (r600_shader_select(&rctx->b.b, rctx->cs_shader_state.shader->sel, &compute_dirty, false)) {
723 			R600_ERR("Failed to select compute shader\n");
724 			return;
725 		}
726 
727 		current = rctx->cs_shader_state.shader->sel->current;
728 		if (compute_dirty) {
729 			rctx->cs_shader_state.atom.num_dw = current->command_buffer.num_dw;
730 			r600_context_add_resource_size(&rctx->b.b, (struct pipe_resource *)current->bo);
731 			r600_set_atom_dirty(rctx, &rctx->cs_shader_state.atom, true);
732 		}
733 
734 		bool need_buf_const = current->shader.uses_tex_buffers ||
735 			current->shader.has_txq_cube_array_z_comp;
736 
737 		if (info->indirect) {
738 			struct r600_resource *indirect_resource = (struct r600_resource *)info->indirect;
739 			unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource, PIPE_MAP_READ);
740 			unsigned offset = info->indirect_offset / 4;
741 			indirect_grid[0] = data[offset];
742 			indirect_grid[1] = data[offset + 1];
743 			indirect_grid[2] = data[offset + 2];
744 		}
745 		for (int i = 0; i < 3; i++) {
746 			rctx->cs_block_grid_sizes[i] = info->block[i];
747 			rctx->cs_block_grid_sizes[i + 4] = info->indirect ? indirect_grid[i] : info->grid[i];
748 		}
749 		rctx->cs_block_grid_sizes[3] = rctx->cs_block_grid_sizes[7] = 0;
750 		rctx->driver_consts[PIPE_SHADER_COMPUTE].cs_block_grid_size_dirty = true;
751 
752 		evergreen_emit_atomic_buffer_setup_count(rctx, current, combined_atomics, &atomic_used_mask);
753 		r600_need_cs_space(rctx, 0, true, util_bitcount(atomic_used_mask));
754 
755 		if (need_buf_const) {
756 			eg_setup_buffer_constants(rctx, PIPE_SHADER_COMPUTE);
757 		}
758 		r600_update_driver_const_buffers(rctx, true);
759 
760 		evergreen_emit_atomic_buffer_setup(rctx, true, combined_atomics, atomic_used_mask);
761 		if (atomic_used_mask) {
762 			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
763 			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
764 		}
765 	} else
766 		r600_need_cs_space(rctx, 0, true, 0);
767 
768 	/* Initialize all the compute-related registers.
769 	 *
770 	 * See evergreen_init_atom_start_compute_cs() in this file for the list
771 	 * of registers initialized by the start_compute_cs_cmd atom.
772 	 */
773 	r600_emit_command_buffer(cs, &rctx->start_compute_cs_cmd);
774 
775 	/* emit config state */
776 	if (rctx->b.gfx_level == EVERGREEN) {
777 		if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI||
778 		    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR) {
779 			radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
780 			radeon_emit(cs, S_008C04_NUM_CLAUSE_TEMP_GPRS(rctx->r6xx_num_clause_temp_gprs));
781 			radeon_emit(cs, 0);
782 			radeon_emit(cs, 0);
783 			radeon_set_config_reg(cs, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << 8));
784 		} else
785 			r600_emit_atom(rctx, &rctx->config_state.atom);
786 	}
787 
788 	rctx->b.flags |= R600_CONTEXT_WAIT_3D_IDLE | R600_CONTEXT_FLUSH_AND_INV;
789 	r600_flush_emit(rctx);
790 
791 	if (rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_TGSI &&
792 	    rctx->cs_shader_state.shader->ir_type != PIPE_SHADER_IR_NIR) {
793 
794 		compute_setup_cbs(rctx);
795 
796 		/* Emit vertex buffer state */
797 		rctx->cs_vertex_buffer_state.atom.num_dw = 12 * util_bitcount(rctx->cs_vertex_buffer_state.dirty_mask);
798 		r600_emit_atom(rctx, &rctx->cs_vertex_buffer_state.atom);
799 	} else {
800 		uint32_t rat_mask;
801 
802 		rat_mask = evergreen_construct_rat_mask(rctx, &rctx->cb_misc_state, 0);
803 		radeon_compute_set_context_reg(cs, R_028238_CB_TARGET_MASK,
804 					       rat_mask);
805 	}
806 
807 	r600_emit_atom(rctx, &rctx->b.render_cond_atom);
808 
809 	/* Emit constant buffer state */
810 	r600_emit_atom(rctx, &rctx->constbuf_state[PIPE_SHADER_COMPUTE].atom);
811 
812 	/* Emit sampler state */
813 	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].states.atom);
814 
815 	/* Emit sampler view (texture resource) state */
816 	r600_emit_atom(rctx, &rctx->samplers[PIPE_SHADER_COMPUTE].views.atom);
817 
818 	/* Emit images state */
819 	r600_emit_atom(rctx, &rctx->compute_images.atom);
820 
821 	/* Emit buffers state */
822 	r600_emit_atom(rctx, &rctx->compute_buffers.atom);
823 
824 	/* Emit shader state */
825 	r600_emit_atom(rctx, &rctx->cs_shader_state.atom);
826 
827 	/* Emit dispatch state and dispatch packet */
828 	evergreen_emit_dispatch(rctx, info, indirect_grid);
829 
830 	/* XXX evergreen_flush_emit() hardcodes the CP_COHER_SIZE to 0xffffffff
831 	 */
832 	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
833 		      R600_CONTEXT_INV_VERTEX_CACHE |
834 	              R600_CONTEXT_INV_TEX_CACHE;
835 	r600_flush_emit(rctx);
836 	rctx->b.flags = 0;
837 
838 	if (rctx->b.gfx_level >= CAYMAN) {
839 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
840 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
841 		/* DEALLOC_STATE prevents the GPU from hanging when a
842 		 * SURFACE_SYNC packet is emitted some time after a DISPATCH_DIRECT
843 		 * with any of the CB*_DEST_BASE_ENA or DB_DEST_BASE_ENA bits set.
844 		 */
845 		radeon_emit(cs, PKT3C(PKT3_DEALLOC_STATE, 0, 0));
846 		radeon_emit(cs, 0);
847 	}
848 	if (rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_TGSI ||
849 	    rctx->cs_shader_state.shader->ir_type == PIPE_SHADER_IR_NIR)
850 		evergreen_emit_atomic_buffer_save(rctx, true, combined_atomics, &atomic_used_mask);
851 
852 #if 0
853 	COMPUTE_DBG(rctx->screen, "cdw: %i\n", cs->cdw);
854 	for (i = 0; i < cs->cdw; i++) {
855 		COMPUTE_DBG(rctx->screen, "%4i : 0x%08X\n", i, cs->buf[i]);
856 	}
857 #endif
858 
859 }
860 
861 
862 /**
863  * Emit function for r600_cs_shader_state atom
864  */
865 void evergreen_emit_cs_shader(struct r600_context *rctx,
866 			      struct r600_atom *atom)
867 {
868 	struct r600_cs_shader_state *state =
869 					(struct r600_cs_shader_state*)atom;
870 	struct r600_pipe_compute *shader = state->shader;
871 	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
872 	uint64_t va;
873 	struct r600_resource *code_bo;
874 	unsigned ngpr, nstack;
875 
876 	if (shader->ir_type == PIPE_SHADER_IR_TGSI ||
877 	    shader->ir_type == PIPE_SHADER_IR_NIR) {
878 		code_bo = shader->sel->current->bo;
879 		va = shader->sel->current->bo->gpu_address;
880 		ngpr = shader->sel->current->shader.bc.ngpr;
881 		nstack = shader->sel->current->shader.bc.nstack;
882 	} else {
883 		code_bo = shader->code_bo;
884 		va = shader->code_bo->gpu_address + state->pc;
885 		ngpr = shader->bc.ngpr;
886 		nstack = shader->bc.nstack;
887 	}
888 
889 	radeon_compute_set_context_reg_seq(cs, R_0288D0_SQ_PGM_START_LS, 3);
890 	radeon_emit(cs, va >> 8); /* R_0288D0_SQ_PGM_START_LS */
891 	radeon_emit(cs,           /* R_0288D4_SQ_PGM_RESOURCES_LS */
892 			S_0288D4_NUM_GPRS(ngpr) |
893 			S_0288D4_DX10_CLAMP(1) |
894 			S_0288D4_STACK_SIZE(nstack));
895 	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
896 
897 	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
898 	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
899 					      code_bo, RADEON_USAGE_READ |
900 					      RADEON_PRIO_SHADER_BINARY));
901 }
902 
903 static void evergreen_launch_grid(struct pipe_context *ctx,
904 				  const struct pipe_grid_info *info)
905 {
906 	struct r600_context *rctx = (struct r600_context *)ctx;
907 #ifdef HAVE_OPENCL
908 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
909 	bool use_kill;
910 
911 	if (shader->ir_type != PIPE_SHADER_IR_TGSI &&
912 	    shader->ir_type != PIPE_SHADER_IR_NIR) {
913 		rctx->cs_shader_state.pc = info->pc;
914 		/* Get the config information for this kernel. */
915 		r600_shader_binary_read_config(&shader->binary, &shader->bc,
916 					       info->pc, &use_kill);
917 	} else {
918 		use_kill = false;
919 		rctx->cs_shader_state.pc = 0;
920 	}
921 #endif
922 
923 	COMPUTE_DBG(rctx->screen, "*** evergreen_launch_grid: pc = %u\n", info->pc);
924 
925 
926 	evergreen_compute_upload_input(ctx, info);
927 	compute_emit_cs(rctx, info);
928 }
929 
930 static void evergreen_set_compute_resources(struct pipe_context *ctx,
931 					    unsigned start, unsigned count,
932 					    struct pipe_surface **surfaces)
933 {
934 	struct r600_context *rctx = (struct r600_context *)ctx;
935 	struct r600_surface **resources = (struct r600_surface **)surfaces;
936 
937 	COMPUTE_DBG(rctx->screen, "*** evergreen_set_compute_resources: start = %u count = %u\n",
938 			start, count);
939 
940 	for (unsigned i = 0; i < count; i++) {
941 		/* The first four vertex buffers are reserved for parameters and
942 		 * global buffers. */
943 		unsigned vtx_id = 4 + i;
944 		if (resources[i]) {
945 			struct r600_resource_global *buffer =
946 				(struct r600_resource_global*)
947 				resources[i]->base.texture;
948 			if (resources[i]->base.writable) {
949 				assert(i+1 < 12);
950 
951 				evergreen_set_rat(rctx->cs_shader_state.shader, i+1,
952 				(struct r600_resource *)resources[i]->base.texture,
953 				buffer->chunk->start_in_dw*4,
954 				resources[i]->base.texture->width0);
955 			}
956 
957 			evergreen_cs_set_vertex_buffer(rctx, vtx_id,
958 					buffer->chunk->start_in_dw * 4,
959 					resources[i]->base.texture);
960 		}
961 	}
962 }
963 
964 static void evergreen_set_global_binding(struct pipe_context *ctx,
965 					 unsigned first, unsigned n,
966 					 struct pipe_resource **resources,
967 					 uint32_t **handles)
968 {
969 	struct r600_context *rctx = (struct r600_context *)ctx;
970 	struct compute_memory_pool *pool = rctx->screen->global_pool;
971 	struct r600_resource_global **buffers =
972 		(struct r600_resource_global **)resources;
973 	unsigned i;
974 
975 	COMPUTE_DBG(rctx->screen, "*** evergreen_set_global_binding first = %u n = %u\n",
976 			first, n);
977 
978 	if (!resources) {
979 		/* XXX: Unset */
980 		return;
981 	}
982 
983 	/* We mark these items for promotion to the pool if they
984 	 * aren't already there */
985 	for (i = first; i < first + n; i++) {
986 		struct compute_memory_item *item = buffers[i]->chunk;
987 
988 		if (!is_item_in_pool(item))
989 			buffers[i]->chunk->status |= ITEM_FOR_PROMOTING;
990 	}
991 
992 	if (compute_memory_finalize_pending(pool, ctx) == -1) {
993 		/* XXX: Unset */
994 		return;
995 	}
996 
997 	for (i = first; i < first + n; i++)
998 	{
999 		uint32_t buffer_offset;
1000 		uint32_t handle;
1001 		assert(resources[i]->target == PIPE_BUFFER);
1002 		assert(resources[i]->bind & PIPE_BIND_GLOBAL);
1003 
1004 		buffer_offset = util_le32_to_cpu(*(handles[i]));
1005 		handle = buffer_offset + buffers[i]->chunk->start_in_dw * 4;
1006 
1007 		*(handles[i]) = util_cpu_to_le32(handle);
1008 	}
1009 
1010 	/* globals for writing */
1011 	evergreen_set_rat(rctx->cs_shader_state.shader, 0, pool->bo, 0, pool->size_in_dw * 4);
1012 	/* globals for reading */
1013 	evergreen_cs_set_vertex_buffer(rctx, 1, 0,
1014 				(struct pipe_resource*)pool->bo);
1015 
1016 	/* constants for reading, LLVM puts them in text segment */
1017 	evergreen_cs_set_vertex_buffer(rctx, 2, 0,
1018 				(struct pipe_resource*)rctx->cs_shader_state.shader->code_bo);
1019 }
1020 
1021 /**
1022  * This function initializes all the compute specific registers that need to
1023  * be initialized for each compute command stream.  Registers that are common
1024  * to both compute and 3D will be initialized at the beginning of each compute
1025  * command stream by the start_cs_cmd atom.  However, since the SET_CONTEXT_REG
1026  * packet requires that the shader type bit be set, we must initialize all
1027  * context registers needed for compute in this function.  The registers
1028  * initialized by the start_cs_cmd atom can be found in evergreen_state.c in the
1029  * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs depending
1030  * on the GPU family.
1031  */
1032 void evergreen_init_atom_start_compute_cs(struct r600_context *rctx)
1033 {
1034 	struct r600_command_buffer *cb = &rctx->start_compute_cs_cmd;
1035 	int num_threads;
1036 	int num_stack_entries;
1037 
1038 	/* since all required registers are initialized in the
1039 	 * start_compute_cs_cmd atom, we can EMIT_EARLY here.
1040 	 */
1041 	r600_init_command_buffer(cb, 256);
1042 	cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE;
1043 
1044 	/* We're setting config registers here. */
1045 	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
1046 	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
1047 
1048 	switch (rctx->b.family) {
1049 	case CHIP_CEDAR:
1050 	default:
1051 		num_threads = 128;
1052 		num_stack_entries = 256;
1053 		break;
1054 	case CHIP_REDWOOD:
1055 		num_threads = 128;
1056 		num_stack_entries = 256;
1057 		break;
1058 	case CHIP_JUNIPER:
1059 		num_threads = 128;
1060 		num_stack_entries = 512;
1061 		break;
1062 	case CHIP_CYPRESS:
1063 	case CHIP_HEMLOCK:
1064 		num_threads = 128;
1065 		num_stack_entries = 512;
1066 		break;
1067 	case CHIP_PALM:
1068 		num_threads = 128;
1069 		num_stack_entries = 256;
1070 		break;
1071 	case CHIP_SUMO:
1072 		num_threads = 128;
1073 		num_stack_entries = 256;
1074 		break;
1075 	case CHIP_SUMO2:
1076 		num_threads = 128;
1077 		num_stack_entries = 512;
1078 		break;
1079 	case CHIP_BARTS:
1080 		num_threads = 128;
1081 		num_stack_entries = 512;
1082 		break;
1083 	case CHIP_TURKS:
1084 		num_threads = 128;
1085 		num_stack_entries = 256;
1086 		break;
1087 	case CHIP_CAICOS:
1088 		num_threads = 128;
1089 		num_stack_entries = 256;
1090 		break;
1091 	}
1092 
1093 	/* The primitive type always needs to be POINTLIST for compute. */
1094 	r600_store_config_reg(cb, R_008958_VGT_PRIMITIVE_TYPE,
1095 						V_008958_DI_PT_POINTLIST);
1096 
1097 	if (rctx->b.gfx_level < CAYMAN) {
1098 
1099 		/* These registers control which simds can be used by each stage.
1100 		 * The default for these registers is 0xffffffff, which means
1101 		 * all simds are available for each stage.  It's possible we may
1102 		 * want to play around with these in the future, but for now
1103 		 * the default value is fine.
1104 		 *
1105 		 * R_008E20_SQ_STATIC_THREAD_MGMT1
1106 		 * R_008E24_SQ_STATIC_THREAD_MGMT2
1107 		 * R_008E28_SQ_STATIC_THREAD_MGMT3
1108 		 */
1109 
1110 		/* XXX: We may need to adjust the thread and stack resource
1111 		 * values for 3D/compute interop */
1112 
1113 		r600_store_config_reg_seq(cb, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5);
1114 
1115 		/* R_008C18_SQ_THREAD_RESOURCE_MGMT_1
1116 		 * Set the number of threads used by the PS/VS/GS/ES stage to
1117 		 * 0.
1118 		 */
1119 		r600_store_value(cb, 0);
1120 
1121 		/* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2
1122 		 * Set the number of threads used by the CS (aka LS) stage to
1123 		 * the maximum number of threads and set the number of threads
1124 		 * for the HS stage to 0. */
1125 		r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads));
1126 
1127 		/* R_008C20_SQ_STACK_RESOURCE_MGMT_1
1128 		 * Set the Control Flow stack entries to 0 for PS/VS stages */
1129 		r600_store_value(cb, 0);
1130 
1131 		/* R_008C24_SQ_STACK_RESOURCE_MGMT_2
1132 		 * Set the Control Flow stack entries to 0 for GS/ES stages */
1133 		r600_store_value(cb, 0);
1134 
1135 		/* R_008C28_SQ_STACK_RESOURCE_MGMT_3
1136 		 * Set the Control Flow stack entries to 0 for the HS stage, and
1137 		 * set it to the maximum value for the CS (aka LS) stage. */
1138 		r600_store_value(cb,
1139 			S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries));
1140 	}
1141 	/* Give the compute shader all the available LDS space.
1142 	 * NOTE: This only sets the maximum number of dwords that a compute
1143 	 * shader can allocate.  When a shader is executed, we still need to
1144 	 * allocate the appropriate amount of LDS dwords using the
1145 	 * CM_R_0288E8_SQ_LDS_ALLOC register.
1146 	 */
1147 	if (rctx->b.gfx_level < CAYMAN) {
1148 		r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT,
1149 			S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192));
1150 	} else {
1151 		r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT,
1152 			S_0286FC_NUM_PS_LDS(0) |
1153 			S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */
1154 	}
1155 
1156 	/* Context Registers */
1157 
1158 	if (rctx->b.gfx_level < CAYMAN) {
1159 		/* workaround for hw issues with dyn gpr - must set all limits
1160 		 * to 240 instead of 0, 0x1e == 240 / 8
1161 		 */
1162 		r600_store_context_reg(cb, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1,
1163 				S_028838_PS_GPRS(0x1e) |
1164 				S_028838_VS_GPRS(0x1e) |
1165 				S_028838_GS_GPRS(0x1e) |
1166 				S_028838_ES_GPRS(0x1e) |
1167 				S_028838_HS_GPRS(0x1e) |
1168 				S_028838_LS_GPRS(0x1e));
1169 	}
1170 
1171 	/* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */
1172 	r600_store_context_reg(cb, R_028A40_VGT_GS_MODE,
1173 		S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1));
1174 
1175 	r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/);
1176 
1177 	r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL,
1178 			       S_0286E8_TID_IN_GROUP_ENA(1) |
1179 			       S_0286E8_TGID_ENA(1) |
1180 			       S_0286E8_DISABLE_INDEX_PACK(1));
1181 
1182 	/* The LOOP_CONST registers are an optimization for loops that allows
1183 	 * you to store the initial counter, increment value, and maximum
1184 	 * counter value in a register so that hardware can calculate the
1185 	 * correct number of iterations for the loop, so that you don't need
1186 	 * to have the loop counter in your shader code.  We don't currently use
1187 	 * this optimization, so we must keep track of the counter in the
1188 	 * shader and use a break instruction to exit loops.  However, the
1189 	 * hardware will still use this register to determine when to exit a
1190 	 * loop, so we need to initialize the counter to 0, set the increment
1191 	 * value to 1 and the maximum counter value to 4095 (0xfff), which
1192 	 * is the maximum value allowed.  This gives us a maximum of 4096
1193 	 * iterations for our loops, but hopefully our break instruction will
1194 	 * execute some time before the 4096th iteration.
1195 	 */
1196 	eg_store_loop_const(cb, R_03A200_SQ_LOOP_CONST_0 + (160 * 4), 0x1000FFF);
1197 }
1198 
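/* Illustrative decode (not driver code, kept under #if 0) of the 0x1000FFF
 * loop constant stored above, assuming the usual R600-family field split of
 * increment / initial value / trip count; treat the exact bit positions as
 * an assumption rather than documentation. */
#if 0
static const uint32_t example_loop_const =
	(1u << 24) |  /* increment the counter by 1 per iteration (assumed field) */
	(0u << 12) |  /* start the counter at 0 (assumed field) */
	0xFFFu;       /* give up after 4095 iterations (assumed field) */
/* example_loop_const == 0x1000FFF, the value passed to eg_store_loop_const(). */
#endif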
1199 
1200 static void evergreen_get_compute_state_info(struct pipe_context *ctx, void *state,
1201                                              struct pipe_compute_state_object_info *info)
1202 {
1203 	struct r600_context *rctx = (struct r600_context*)ctx;
1204 	struct r600_pipe_compute *shader = state;
1205 
1206 	/* This is loosely copied from RadeonSI, but in truth it is not more
1207 	 * than an educated guess. */
1208 	uint8_t wave_size = r600_wavefront_size(rctx->b.screen->family);
1209 	info->private_memory = shader->sel->current->scratch_space_needed;
1210 	info->preferred_simd_size = wave_size;
1211 	info->simd_sizes = wave_size;
1212 	info->max_threads = 128;
1213 }
1214 
1215 void evergreen_init_compute_state_functions(struct r600_context *rctx)
1216 {
1217 	rctx->b.b.create_compute_state = evergreen_create_compute_state;
1218 	rctx->b.b.delete_compute_state = evergreen_delete_compute_state;
1219 	rctx->b.b.bind_compute_state = evergreen_bind_compute_state;
1220 //	 rctx->context.create_sampler_view = evergreen_compute_create_sampler_view;
1221 	rctx->b.b.set_compute_resources = evergreen_set_compute_resources;
1222 	rctx->b.b.set_global_binding = evergreen_set_global_binding;
1223 	rctx->b.b.launch_grid = evergreen_launch_grid;
1224 	rctx->b.b.get_compute_state_info = evergreen_get_compute_state_info;
1225 }
1226 
1227 void *r600_compute_global_transfer_map(struct pipe_context *ctx,
1228 				      struct pipe_resource *resource,
1229 				      unsigned level,
1230 				      unsigned usage,
1231 				      const struct pipe_box *box,
1232 				      struct pipe_transfer **ptransfer)
1233 {
1234 	struct r600_context *rctx = (struct r600_context*)ctx;
1235 	struct compute_memory_pool *pool = rctx->screen->global_pool;
1236 	struct r600_resource_global* buffer =
1237 		(struct r600_resource_global*)resource;
1238 
1239 	struct compute_memory_item *item = buffer->chunk;
1240 	struct pipe_resource *dst = NULL;
1241 	unsigned offset = box->x;
1242 
1243 	if (usage & PIPE_MAP_READ)
1244 		buffer->chunk->status |= ITEM_MAPPED_FOR_READING;
1245 
1246 	if (usage & PIPE_MAP_WRITE)
1247 		buffer->chunk->status |= ITEM_MAPPED_FOR_WRITING;
1248 
1249 	if (is_item_in_pool(item)) {
1250 		compute_memory_demote_item(pool, item, ctx);
1251 	}
1252 	else {
1253 		if (item->real_buffer == NULL) {
1254 			item->real_buffer =
1255 					r600_compute_buffer_alloc_vram(pool->screen, item->size_in_dw * 4);
1256 		}
1257 	}
1258 
1259 	dst = (struct pipe_resource*)item->real_buffer;
1260 
1261 	COMPUTE_DBG(rctx->screen, "* r600_compute_global_transfer_map()\n"
1262 			"level = %u, usage = %u, box(x = %u, y = %u, z = %u "
1263 			"width = %u, height = %u, depth = %u)\n", level, usage,
1264 			box->x, box->y, box->z, box->width, box->height,
1265 			box->depth);
1266 	COMPUTE_DBG(rctx->screen, "Buffer id = %"PRIi64" offset = "
1267 		"%u (box.x)\n", item->id, box->x);
1268 
1269 
1270 	assert(resource->target == PIPE_BUFFER);
1271 	assert(resource->bind & PIPE_BIND_GLOBAL);
1272 	assert(box->x >= 0);
1273 	assert(box->y == 0);
1274 	assert(box->z == 0);
1275 
1276 	if (buffer->base.b.is_user_ptr)
1277 		return NULL;
1278 
1279 	///TODO: do it better, mapping is not possible if the pool is too big
1280 	return pipe_buffer_map_range(ctx, dst,
1281 			offset, box->width, usage & ~PIPE_MAP_READ, ptransfer);
1282 }
1283 
1284 void r600_compute_global_transfer_unmap(struct pipe_context *ctx,
1285 					struct pipe_transfer *transfer)
1286 {
1287 	/* struct r600_resource_global objects are not real resources, they just
1288 	 * map to an offset within the compute memory pool.  The function
1289 	 * r600_compute_global_transfer_map() maps the memory pool
1290 	 * resource rather than the struct r600_resource_global passed to
1291 	 * it as an argument and then initializes ptransfer->resource with
1292 	 * the memory pool resource (via pipe_buffer_map_range).
1293 	 * When transfer_unmap is called it uses the memory pool's
1294 	 * vtable, which calls r600_buffer_transfer_unmap() rather than
1295 	 * this function.
1296 	 */
1297 	assert (!"This function should not be called");
1298 }
1299 
1300 void r600_compute_global_buffer_destroy(struct pipe_screen *screen,
1301 					struct pipe_resource *res)
1302 {
1303 	struct r600_resource_global* buffer = NULL;
1304 	struct r600_screen* rscreen = NULL;
1305 
1306 	assert(res->target == PIPE_BUFFER);
1307 	assert(res->bind & PIPE_BIND_GLOBAL);
1308 
1309 	buffer = (struct r600_resource_global*)res;
1310 	rscreen = (struct r600_screen*)screen;
1311 
1312 	compute_memory_free(rscreen->global_pool, buffer->chunk->id);
1313 	buffer->chunk = NULL;
1314 
1315 	if (buffer->base.b.is_user_ptr)
1316 		r600_buffer_destroy(screen, res);
1317 	else
1318 		free(res);
1319 }
1320 
1321 struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen *screen,
1322 							const struct pipe_resource *templ)
1323 {
1324 	struct r600_resource_global* result = NULL;
1325 	struct r600_screen* rscreen = NULL;
1326 	int size_in_dw = 0;
1327 
1328 	assert(templ->target == PIPE_BUFFER);
1329 	assert(templ->bind & PIPE_BIND_GLOBAL);
1330 	assert(templ->array_size == 1 || templ->array_size == 0);
1331 	assert(templ->depth0 == 1 || templ->depth0 == 0);
1332 	assert(templ->height0 == 1 || templ->height0 == 0);
1333 
1334 	result = (struct r600_resource_global*)
1335 	CALLOC(sizeof(struct r600_resource_global), 1);
1336 	rscreen = (struct r600_screen*)screen;
1337 
1338 	COMPUTE_DBG(rscreen, "*** r600_compute_global_buffer_create\n");
1339 	COMPUTE_DBG(rscreen, "width = %u array_size = %u\n", templ->width0,
1340 			templ->array_size);
1341 
1342 	result->base.b.b = *templ;
1343 	result->base.b.b.screen = screen;
1344 	result->base.compute_global_bo = true;
1345 	pipe_reference_init(&result->base.b.b.reference, 1);
1346 
1347 	size_in_dw = (templ->width0+3) / 4;
1348 
1349 	result->chunk = compute_memory_alloc(rscreen->global_pool, size_in_dw);
1350 
1351 	if (result->chunk == NULL)
1352 	{
1353 		free(result);
1354 		return NULL;
1355 	}
1356 
1357 	return &result->base.b.b;
1358 }
1359