xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/r600/r600_asm.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2010 Jerome Glisse <[email protected]>
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "r600_asm.h"
7 #include "r600_sq.h"
8 #include "r600_opcodes.h"
9 #include "r600_formats.h"
10 #include "r600d.h"
11 #include "r600d_common.h"
12 
13 #include <errno.h>
14 #include <string.h>
15 #include "compiler/shader_enums.h"
16 #include "util/u_memory.h"
17 #include "util/u_math.h"
18 
19 #define NUM_OF_CYCLES 3
20 #define NUM_OF_COMPONENTS 4
21 
22 static inline bool alu_writes(struct r600_bytecode_alu *alu)
23 {
24 	return alu->dst.write || alu->is_op3;
25 }
26 
27 static inline unsigned int r600_bytecode_get_num_operands(const struct r600_bytecode_alu *alu)
28 {
29 	return r600_isa_alu(alu->op)->src_count;
30 }
31 
32 static struct r600_bytecode_cf *r600_bytecode_cf(void)
33 {
34 	struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf);
35 
36 	if (!cf)
37 		return NULL;
38 	list_inithead(&cf->list);
39 	list_inithead(&cf->alu);
40 	list_inithead(&cf->vtx);
41 	list_inithead(&cf->tex);
42 	list_inithead(&cf->gds);
43 	return cf;
44 }
45 
46 static struct r600_bytecode_alu *r600_bytecode_alu(void)
47 {
48 	struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu);
49 
50 	if (!alu)
51 		return NULL;
52 	list_inithead(&alu->list);
53 	return alu;
54 }
55 
56 static struct r600_bytecode_vtx *r600_bytecode_vtx(void)
57 {
58 	struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx);
59 
60 	if (!vtx)
61 		return NULL;
62 	list_inithead(&vtx->list);
63 	return vtx;
64 }
65 
66 static struct r600_bytecode_tex *r600_bytecode_tex(void)
67 {
68 	struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex);
69 
70 	if (!tex)
71 		return NULL;
72 	list_inithead(&tex->list);
73 	return tex;
74 }
75 
76 static struct r600_bytecode_gds *r600_bytecode_gds(void)
77 {
78 	struct r600_bytecode_gds *gds = CALLOC_STRUCT(r600_bytecode_gds);
79 
80 	if (gds == NULL)
81 		return NULL;
82 	list_inithead(&gds->list);
83 	return gds;
84 }
85 
86 static unsigned stack_entry_size(enum radeon_family chip) {
87 	/* Wavefront size:
88 	 *   64: R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/
89 	 *       Aruba/Sumo/Sumo2/redwood/juniper
90 	 *   32: R630/R730/R710/Palm/Cedar
91 	 *   16: R610/Rs780
92 	 *
93 	 * Stack row size:
94 	 * 	Wavefront Size                        16  32  48  64
95 	 * 	Columns per Row (R6xx/R7xx/R8xx only)  8   8   4   4
96 	 * 	Columns per Row (R9xx+)                8   4   4   4 */
97 
98 	switch (chip) {
99 	/* FIXME: are some chips missing here? */
100 	/* wavefront size 16 */
101 	case CHIP_RV610:
102 	case CHIP_RS780:
103 	case CHIP_RV620:
104 	case CHIP_RS880:
105 	/* wavefront size 32 */
106 	case CHIP_RV630:
107 	case CHIP_RV635:
108 	case CHIP_RV730:
109 	case CHIP_RV710:
110 	case CHIP_PALM:
111 	case CHIP_CEDAR:
112 		return 8;
113 
114 	/* wavefront size 64 */
115 	default:
116 		return 4;
117 	}
118 }
119 
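/* A minimal usage sketch of this builder API (illustrative only): it assumes
 * the caller owns the struct r600_bytecode and attaches an r600_isa to
 * bc->isa elsewhere, as the shader translator does, and it shows only the
 * call order used throughout this file, not a complete shader:
 *
 *     struct r600_bytecode bc = {0};
 *     struct r600_bytecode_alu alu = {0};
 *
 *     r600_bytecode_init(&bc, EVERGREEN, CHIP_CEDAR, false);
 *     alu.op = ALU_OP1_MOV;
 *     alu.dst.sel = 0;                    // GPR0
 *     alu.dst.chan = 0;                   // .x
 *     alu.dst.write = 1;
 *     alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
 *     alu.src[0].value = 0x3f800000;      // 1.0f
 *     alu.last = 1;                       // closes the instruction group
 *     r600_bytecode_add_alu(&bc, &alu);   // opens a CF_OP_ALU clause as needed
 */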
120 void r600_bytecode_init(struct r600_bytecode *bc,
121 			enum amd_gfx_level gfx_level,
122 			enum radeon_family family,
123 			bool has_compressed_msaa_texturing)
124 {
125 	static unsigned next_shader_id = 0;
126 
127 	bc->debug_id = ++next_shader_id;
128 
129 	if ((gfx_level == R600) &&
130 	    (family != CHIP_RV670 && family != CHIP_RS780 && family != CHIP_RS880)) {
131 		bc->ar_handling = AR_HANDLE_RV6XX;
132 
133 		/* Insert a nop after a relative temp write so that a read in
134 		 * the following instruction group gets the right value.  The
135 		 * r600 and EG ISA specs both say that read-after-rel-write of a
136 		 * register in the next instr group is illegal, but apparently
137 		 * that's not true on all chips (see commit
138 		 * c96b9834032952492efbd2d1f5511fe225704918).
139 		 */
140 		bc->r6xx_nop_after_rel_dst = 1;
141 	} else if (family == CHIP_RV770) {
142 		bc->ar_handling = AR_HANDLE_NORMAL;
143 		bc->r6xx_nop_after_rel_dst = 1;
144 	} else {
145 		bc->ar_handling = AR_HANDLE_NORMAL;
146 		bc->r6xx_nop_after_rel_dst = 0;
147 	}
148 
149 	list_inithead(&bc->cf);
150 	bc->gfx_level = gfx_level;
151 	bc->family = family;
152 	bc->has_compressed_msaa_texturing = has_compressed_msaa_texturing;
153 	bc->stack.entry_size = stack_entry_size(family);
154 }
155 
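/* A shader is assembled as a list of CF (control-flow) instructions; each CF
 * entry takes two dwords in the final binary, so cf->id advances by two (or
 * by four when the previous clause was an Evergreen ALU_EXTENDED clause,
 * which carries an extra pair of kcache dwords).  Starting a new CF also
 * resets the force_add_cf and ar_loaded state for the new clause. */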
156 int r600_bytecode_add_cf(struct r600_bytecode *bc)
157 {
158 	struct r600_bytecode_cf *cf = r600_bytecode_cf();
159 
160 	if (!cf)
161 		return -ENOMEM;
162 	list_addtail(&cf->list, &bc->cf);
163 	if (bc->cf_last) {
164 		cf->id = bc->cf_last->id + 2;
165 		if (bc->cf_last->eg_alu_extended) {
166 			/* take into account extended alu size */
167 			cf->id += 2;
168 			bc->ndw += 2;
169 		}
170 	}
171 	bc->cf_last = cf;
172 	bc->ncf++;
173 	bc->ndw += 2;
174 	bc->force_add_cf = 0;
175 	bc->ar_loaded = 0;
176 	return 0;
177 }
178 
179 int r600_bytecode_add_output(struct r600_bytecode *bc,
180 		const struct r600_bytecode_output *output)
181 {
182 	int r;
183 
184 	if (output->gpr >= bc->ngpr)
185 		bc->ngpr = output->gpr + 1;
186 
187 	if (bc->cf_last && (bc->cf_last->op == output->op ||
188 		(bc->cf_last->op == CF_OP_EXPORT &&
189 		output->op == CF_OP_EXPORT_DONE)) &&
190 		output->type == bc->cf_last->output.type &&
191 		output->elem_size == bc->cf_last->output.elem_size &&
192 		output->swizzle_x == bc->cf_last->output.swizzle_x &&
193 		output->swizzle_y == bc->cf_last->output.swizzle_y &&
194 		output->swizzle_z == bc->cf_last->output.swizzle_z &&
195 		output->swizzle_w == bc->cf_last->output.swizzle_w &&
196 		output->comp_mask == bc->cf_last->output.comp_mask &&
197 		(output->burst_count + bc->cf_last->output.burst_count) <= 16) {
198 
199 		if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
200 			(output->array_base + output->burst_count) == bc->cf_last->output.array_base) {
201 
202 			bc->cf_last->op = bc->cf_last->output.op = output->op;
203 			bc->cf_last->output.gpr = output->gpr;
204 			bc->cf_last->output.array_base = output->array_base;
205 			bc->cf_last->output.burst_count += output->burst_count;
206 			return 0;
207 
208 		} else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
209 			output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {
210 
211 			bc->cf_last->op = bc->cf_last->output.op = output->op;
212 			bc->cf_last->output.burst_count += output->burst_count;
213 			return 0;
214 		}
215 	}
216 
217 	r = r600_bytecode_add_cf(bc);
218 	if (r)
219 		return r;
220 	bc->cf_last->op = output->op;
221 	memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output));
222 	bc->cf_last->barrier = 1;
223 	return 0;
224 }
225 
226 int r600_bytecode_add_pending_output(struct r600_bytecode *bc,
227 		const struct r600_bytecode_output *output)
228 {
229 	assert(bc->n_pending_outputs + 1 < ARRAY_SIZE(bc->pending_outputs));
230 	bc->pending_outputs[bc->n_pending_outputs++] = *output;
231 
232 	return 0;
233 }
234 
235 void
236 r600_bytecode_add_ack(struct r600_bytecode *bc)
237 {
238 	bc->need_wait_ack = true;
239 }
240 
241 int
242 r600_bytecode_wait_acks(struct r600_bytecode *bc)
243 {
244 	/* Store acks are an R700+ feature. */
245 	if (bc->gfx_level < R700)
246 		return 0;
247 
248 	if (!bc->need_wait_ack)
249 		return 0;
250 
251 	int ret = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK);
252 	if (ret != 0)
253 		return ret;
254 
255 	struct r600_bytecode_cf *cf = bc->cf_last;
256 	cf->barrier = 1;
257 	/* Request a wait if the number of outstanding acks is > 0 */
258 	cf->cf_addr = 0;
259 
260 	return 0;
261 }
262 
263 uint32_t
264 r600_bytecode_write_export_ack_type(struct r600_bytecode *bc, bool indirect)
265 {
266 	if (bc->gfx_level >= R700) {
267 		if (indirect)
268 			return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK_EG;
269 		else
270 			return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_ACK_EG;
271 	} else {
272 		if (indirect)
273 			return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
274 		else
275 			return V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
276 	}
277 }
278 
279 /* alu instructions that can only exist once per group */
280 static int is_alu_once_inst(struct r600_bytecode_alu *alu)
281 {
282 	return r600_isa_alu(alu->op)->flags & (AF_KILL | AF_PRED) || alu->is_lds_idx_op || alu->op == ALU_OP0_GROUP_BARRIER;
283 }
284 
285 static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
286 {
287 	return (r600_isa_alu(alu->op)->flags & AF_REPL) &&
288 			(r600_isa_alu_slots(bc->isa->hw_class, alu->op) == AF_4V);
289 }
290 
291 static int is_alu_mova_inst(struct r600_bytecode_alu *alu)
292 {
293 	return r600_isa_alu(alu->op)->flags & AF_MOVA;
294 }
295 
296 static int alu_uses_rel(struct r600_bytecode_alu *alu)
297 {
298 	unsigned num_src = r600_bytecode_get_num_operands(alu);
299 	unsigned src;
300 
301 	if (alu->dst.rel) {
302 		return 1;
303 	}
304 
305 	for (src = 0; src < num_src; ++src) {
306 		if (alu->src[src].rel) {
307 			return 1;
308 		}
309 	}
310 	return 0;
311 }
312 
313 static int is_lds_read(int sel)
314 {
315   return sel == EG_V_SQ_ALU_SRC_LDS_OQ_A_POP || sel == EG_V_SQ_ALU_SRC_LDS_OQ_B_POP;
316 }
317 
318 static int alu_uses_lds(struct r600_bytecode_alu *alu)
319 {
320 	unsigned num_src = r600_bytecode_get_num_operands(alu);
321 	unsigned src;
322 
323 	for (src = 0; src < num_src; ++src) {
324 		if (is_lds_read(alu->src[src].sel)) {
325 			return 1;
326 		}
327 	}
328 	return 0;
329 }
330 
331 static int is_alu_64bit_inst(struct r600_bytecode_alu *alu)
332 {
333 	const struct alu_op_info *op = r600_isa_alu(alu->op);
334 	return (op->flags & AF_64);
335 }
336 
337 static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
338 {
339 	unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
340 	return !(slots & AF_S);
341 }
342 
343 static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
344 {
345 	unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
346 	return !(slots & AF_V);
347 }
348 
349 /* alu instructions that can execute on any unit */
350 static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu)
351 {
352 	unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op);
353 	return slots == AF_VS;
354 }
355 
356 static int is_nop_inst(struct r600_bytecode_alu *alu)
357 {
358 	return alu->op == ALU_OP0_NOP;
359 }
360 
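/* An ALU instruction group has up to five issue slots: one vector slot per
 * destination channel (x, y, z, w) plus the shared transcendental ("trans")
 * slot on R600/R700/Evergreen.  Cayman has no separate trans slot, so
 * max_slots is 4 there.  assign_alu_units() walks one group (up to the
 * instruction flagged ->last) and records which instruction lands in which
 * slot, mirroring the hardware's ALU_INST_PREFER_VECTOR placement. */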
361 static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first,
362 			    struct r600_bytecode_alu *assignment[5])
363 {
364 	struct r600_bytecode_alu *alu;
365 	unsigned i, chan, trans;
366 	int max_slots = bc->gfx_level == CAYMAN ? 4 : 5;
367 
368 	for (i = 0; i < max_slots; i++)
369 		assignment[i] = NULL;
370 
371 	for (alu = alu_first; alu; alu = list_entry(alu->list.next, struct r600_bytecode_alu, list)) {
372 		chan = alu->dst.chan;
373 		if (max_slots == 4)
374 			trans = 0;
375 		else if (is_alu_trans_unit_inst(bc, alu))
376 			trans = 1;
377 		else if (is_alu_vec_unit_inst(bc, alu))
378 			trans = 0;
379 		else if (assignment[chan])
380 			trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */
381 		else
382 			trans = 0;
383 
384 		if (trans) {
385 			if (assignment[4]) {
386 				assert(0); /* ALU.Trans has already been allocated. */
387 				return -1;
388 			}
389 			assignment[4] = alu;
390 		} else {
391                         if (assignment[chan]) {
392 			 	assert(0); /* ALU.chan has already been allocated. */
393 				return -1;
394 			}
395 			assignment[chan] = alu;
396 		}
397 
398 		if (alu->last)
399 			break;
400 	}
401 	return 0;
402 }
403 
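/* Operand fetching for a group is spread over three hardware cycles: in each
 * cycle every channel's GPR read port can load from only one register, and
 * the constant file has four read ports on R600 (two paired ports on R700+,
 * hence the chan/2 folding in reserve_cfile()).  The bank_swizzle of an
 * instruction decides in which cycle each of its sources is fetched; the
 * tables below give the cycle used by src0/src1/src2 for every swizzle. */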
404 struct alu_bank_swizzle {
405 	int	hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
406 	int	hw_cfile_addr[4];
407 	int	hw_cfile_elem[4];
408 };
409 
410 static const unsigned cycle_for_bank_swizzle_vec[][3] = {
411 	[SQ_ALU_VEC_012] = { 0, 1, 2 },
412 	[SQ_ALU_VEC_021] = { 0, 2, 1 },
413 	[SQ_ALU_VEC_120] = { 1, 2, 0 },
414 	[SQ_ALU_VEC_102] = { 1, 0, 2 },
415 	[SQ_ALU_VEC_201] = { 2, 0, 1 },
416 	[SQ_ALU_VEC_210] = { 2, 1, 0 }
417 };
418 
419 static const unsigned cycle_for_bank_swizzle_scl[][3] = {
420 	[SQ_ALU_SCL_210] = { 2, 1, 0 },
421 	[SQ_ALU_SCL_122] = { 1, 2, 2 },
422 	[SQ_ALU_SCL_212] = { 2, 1, 2 },
423 	[SQ_ALU_SCL_221] = { 2, 2, 1 }
424 };
425 
426 static void init_bank_swizzle(struct alu_bank_swizzle *bs)
427 {
428 	int i, cycle, component;
429 	/* set up gpr use */
430 	for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
431 		for (component = 0; component < NUM_OF_COMPONENTS; component++)
432 			 bs->hw_gpr[cycle][component] = -1;
433 	for (i = 0; i < 4; i++)
434 		bs->hw_cfile_addr[i] = -1;
435 	for (i = 0; i < 4; i++)
436 		bs->hw_cfile_elem[i] = -1;
437 }
438 
439 static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
440 {
441 	if (bs->hw_gpr[cycle][chan] == -1)
442 		bs->hw_gpr[cycle][chan] = sel;
443 	else if (bs->hw_gpr[cycle][chan] != (int)sel) {
444 		/* Another scalar operation has already used the GPR read port for the channel. */
445 		return -1;
446 	}
447 	return 0;
448 }
449 
450 static int reserve_cfile(const struct r600_bytecode *bc,
451 			 struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
452 {
453 	int res, num_res = 4;
454 	if (bc->gfx_level >= R700) {
455 		num_res = 2;
456 		chan /= 2;
457 	}
458 	for (res = 0; res < num_res; ++res) {
459 		if (bs->hw_cfile_addr[res] == -1) {
460 			bs->hw_cfile_addr[res] = sel;
461 			bs->hw_cfile_elem[res] = chan;
462 			return 0;
463 		} else if (bs->hw_cfile_addr[res] == sel &&
464 			bs->hw_cfile_elem[res] == chan)
465 			return 0; /* Read for this scalar element already reserved, nothing to do here. */
466 	}
467 	/* All cfile read ports are used, cannot reference vector element. */
468 	return -1;
469 }
470 
471 static int is_gpr(unsigned sel)
472 {
473 	return (sel <= 127);
474 }
475 
476 /* CB constants start at 512, and get translated to a kcache index when ALU
477  * clauses are constructed. Note that we handle kcache constants the same way
478  * as (the now gone) cfile constants, is that really required? */
479 static int is_kcache(unsigned sel)
480 {
481    return (sel > 511 && sel < 4607) || /* Kcache before translation. */
482          (sel > 127 && sel < 192) || /* Kcache 0 & 1 after translation. */
483          (sel > 256  && sel < 320);  /* Kcache 2 & 3 after translation (EG). */
484 }
485 
486 static int is_const(int sel)
487 {
488    return is_kcache(sel) ||
489 		(sel >= V_SQ_ALU_SRC_0 &&
490 		sel <= V_SQ_ALU_SRC_LITERAL);
491 }
492 
493 static int check_vector(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu,
494 			struct alu_bank_swizzle *bs, int bank_swizzle)
495 {
496 	int r, src, num_src, sel, elem, cycle;
497 
498 	num_src = r600_bytecode_get_num_operands(alu);
499 	for (src = 0; src < num_src; src++) {
500 		sel = alu->src[src].sel;
501 		elem = alu->src[src].chan;
502 		if (is_gpr(sel)) {
503 			cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
504 			if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
505 				/* Nothing to do; special-case optimization,
506 				 * second source uses first source's reservation. */
507 				continue;
508 			else {
509 				r = reserve_gpr(bs, sel, elem, cycle);
510 				if (r)
511 					return r;
512 			}
513       } else if (is_kcache(sel)) {
514 			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
515 			if (r)
516 				return r;
517 		}
518 		/* No restrictions on PV, PS, literal or special constants. */
519 	}
520 	return 0;
521 }
522 
523 static int check_scalar(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu,
524 			struct alu_bank_swizzle *bs, int bank_swizzle)
525 {
526 	int r, src, num_src, const_count, sel, elem, cycle;
527 
528 	num_src = r600_bytecode_get_num_operands(alu);
529 	for (const_count = 0, src = 0; src < num_src; ++src) {
530 		sel = alu->src[src].sel;
531 		elem = alu->src[src].chan;
532 		if (is_const(sel)) { /* Any constant, including literal and inline constants. */
533 			if (const_count >= 2)
534 				/* More than two references to a constant in
535 				 * transcendental operation. */
536 				return -1;
537 			else
538 				const_count++;
539 		}
540       if (is_kcache(sel)) {
541 			r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem);
542 			if (r)
543 				return r;
544 		}
545 	}
546 	for (src = 0; src < num_src; ++src) {
547 		sel = alu->src[src].sel;
548 		elem = alu->src[src].chan;
549 		if (is_gpr(sel)) {
550 			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
551 			if (cycle < const_count)
552 				/* Cycle for GPR load conflicts with
553 				 * constant load in transcendental operation. */
554 				return -1;
555 			r = reserve_gpr(bs, sel, elem, cycle);
556 			if (r)
557 				return r;
558 		}
559 		/* PV PS restrictions */
560 		if (const_count && (sel == 254 || sel == 255)) {
561 			cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
562 			if (cycle < const_count)
563 				return -1;
564 		}
565 	}
566 	return 0;
567 }
568 
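/* Pick a bank swizzle for every instruction of the current group.  Swizzles
 * forced by the caller are kept; for the rest the loop below simply
 * enumerates vector swizzles (plus the scalar swizzle for the trans slot)
 * until a combination satisfies check_vector()/check_scalar(), giving up
 * after max_slots * 1000 attempts or once all combinations are exhausted. */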
569 static int check_and_set_bank_swizzle(const struct r600_bytecode *bc,
570 				      struct r600_bytecode_alu *slots[5])
571 {
572 	struct alu_bank_swizzle bs;
573 	int bank_swizzle[5];
574 	int i, r = 0, forced = 1;
575 	bool scalar_only = bc->gfx_level == CAYMAN ? false : true;
576 	int max_slots = bc->gfx_level == CAYMAN ? 4 : 5;
577 	int max_checks = max_slots * 1000;
578 
579 	for (i = 0; i < max_slots; i++) {
580 		if (slots[i]) {
581 			if (slots[i]->bank_swizzle_force) {
582 				slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
583 			} else {
584 				forced = 0;
585 			}
586 		}
587 
588 		if (i < 4 && slots[i])
589 			scalar_only = false;
590 	}
591 	if (forced)
592 		return 0;
593 
594 	/* Just check every possible combination of bank swizzle.
595 	 * Not very efficient, but works on the first try in most of the cases. */
596 	for (i = 0; i < 4; i++)
597 		if (!slots[i] || !slots[i]->bank_swizzle_force || slots[i]->is_lds_idx_op)
598 			bank_swizzle[i] = SQ_ALU_VEC_012;
599 		else
600 			bank_swizzle[i] = slots[i]->bank_swizzle;
601 
602 	bank_swizzle[4] = SQ_ALU_SCL_210;
603 
604 	while(bank_swizzle[4] <= SQ_ALU_SCL_221 && max_checks--) {
605 		init_bank_swizzle(&bs);
606 		if (scalar_only == false) {
607 			for (i = 0; i < 4; i++) {
608 				if (slots[i]) {
609 					r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
610 					if (r)
611 						break;
612 				}
613 			}
614 		} else
615 			r = 0;
616 
617 		if (!r && max_slots == 5 && slots[4]) {
618 			r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
619 		}
620 		if (!r) {
621 			for (i = 0; i < max_slots; i++) {
622 				if (slots[i])
623 					slots[i]->bank_swizzle = bank_swizzle[i];
624 			}
625 			return 0;
626 		}
627 
628 		if (scalar_only) {
629 			bank_swizzle[4]++;
630 		} else {
631 			for (i = 0; i < max_slots; i++) {
632 				if (!slots[i] || (!slots[i]->bank_swizzle_force && !slots[i]->is_lds_idx_op)) {
633 					bank_swizzle[i]++;
634 					if (bank_swizzle[i] <= SQ_ALU_VEC_210)
635 						break;
636 					else if (i < max_slots - 1)
637 						bank_swizzle[i] = SQ_ALU_VEC_012;
638 					else
639 						return -1;
640 				}
641 			}
642 		}
643 	}
644 
645 	/* Couldn't find a working swizzle. */
646 	return -1;
647 }
648 
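/* Results of the previous instruction group are still available in the PV
 * (previous vector, per channel) and PS (previous scalar) forwarding
 * registers.  Rewriting matching GPR reads to PV/PS frees GPR read ports,
 * which gives the bank-swizzle search more room.  64-bit instructions and
 * relative destinations are skipped since their results are not forwarded
 * this way, and the predicate select must match between the groups. */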
649 static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
650 				  struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev)
651 {
652 	struct r600_bytecode_alu *prev[5];
653 	int gpr[5], chan[5];
654 	int i, j, r, src, num_src;
655 	int max_slots = bc->gfx_level == CAYMAN ? 4 : 5;
656 
657 	r = assign_alu_units(bc, alu_prev, prev);
658 	if (r)
659 		return r;
660 
661 	for (i = 0; i < max_slots; ++i) {
662 		if (prev[i] && alu_writes(prev[i]) && !prev[i]->dst.rel) {
663 
664 			if (is_alu_64bit_inst(prev[i])) {
665 				gpr[i] = -1;
666 				continue;
667 			}
668 
669 			gpr[i] = prev[i]->dst.sel;
670 			/* cube writes more than PV.X */
671 			if (is_alu_reduction_inst(bc, prev[i]))
672 				chan[i] = 0;
673 			else
674 				chan[i] = prev[i]->dst.chan;
675 		} else
676 			gpr[i] = -1;
677 	}
678 
679 	for (i = 0; i < max_slots; ++i) {
680 		struct r600_bytecode_alu *alu = slots[i];
681 		if (!alu)
682 			continue;
683 
684 		if (is_alu_64bit_inst(alu))
685 			continue;
686 		num_src = r600_bytecode_get_num_operands(alu);
687 		for (src = 0; src < num_src; ++src) {
688 			if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
689 				continue;
690 
691 			if (bc->gfx_level < CAYMAN) {
692 				if (alu->src[src].sel == gpr[4] &&
693 				    alu->src[src].chan == chan[4] &&
694 				    alu_prev->pred_sel == alu->pred_sel) {
695 					alu->src[src].sel = V_SQ_ALU_SRC_PS;
696 					alu->src[src].chan = 0;
697 					continue;
698 				}
699 			}
700 
701 			for (j = 0; j < 4; ++j) {
702 				if (alu->src[src].sel == gpr[j] &&
703 					alu->src[src].chan == j &&
704 				      alu_prev->pred_sel == alu->pred_sel) {
705 					alu->src[src].sel = V_SQ_ALU_SRC_PV;
706 					alu->src[src].chan = chan[j];
707 					break;
708 				}
709 			}
710 		}
711 	}
712 
713 	return 0;
714 }
715 
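/* A few frequently used values have dedicated inline-constant source selects
 * (integer 0, 1, -1 and float 1.0, 0.5).  Using them instead of
 * V_SQ_ALU_SRC_LITERAL avoids emitting literal dwords and helps stay within
 * the four distinct literals an instruction group may reference. */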
716 void r600_bytecode_special_constants(uint32_t value, unsigned *sel)
717 {
718 	switch(value) {
719 	case 0:
720 		*sel = V_SQ_ALU_SRC_0;
721 		break;
722 	case 1:
723 		*sel = V_SQ_ALU_SRC_1_INT;
724 		break;
725 	case -1:
726 		*sel = V_SQ_ALU_SRC_M_1_INT;
727 		break;
728 	case 0x3F800000: /* 1.0f */
729 		*sel = V_SQ_ALU_SRC_1;
730 		break;
731 	case 0x3F000000: /* 0.5f */
732 		*sel = V_SQ_ALU_SRC_0_5;
733 		break;
734 	default:
735 		*sel = V_SQ_ALU_SRC_LITERAL;
736 		break;
737 	}
738 }
739 
740 /* compute how many literals are needed */
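/* Literals are emitted as extra dwords appended after the instruction group
 * (padded to an even count, see the align(nliteral, 2) call sites).  A group
 * may reference at most four distinct literal values, and
 * r600_bytecode_alu_adjust_literals() later rewrites each literal source so
 * that its .chan field indexes into that per-group literal block. */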
741 static int r600_bytecode_alu_nliterals(struct r600_bytecode_alu *alu,
742 				 uint32_t literal[4], unsigned *nliteral)
743 {
744 	unsigned num_src = r600_bytecode_get_num_operands(alu);
745 	unsigned i, j;
746 
747 	for (i = 0; i < num_src; ++i) {
748 		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
749 			uint32_t value = alu->src[i].value;
750 			unsigned found = 0;
751 			for (j = 0; j < *nliteral; ++j) {
752 				if (literal[j] == value) {
753 					found = 1;
754 					break;
755 				}
756 			}
757 			if (!found) {
758 				if (*nliteral >= 4)
759 					return -EINVAL;
760 				literal[(*nliteral)++] = value;
761 			}
762 		}
763 	}
764 	return 0;
765 }
766 
767 static void r600_bytecode_alu_adjust_literals(struct r600_bytecode_alu *alu,
768 					      uint32_t literal[4], unsigned nliteral)
769 {
770 	unsigned num_src = r600_bytecode_get_num_operands(alu);
771 	unsigned i, j;
772 
773 	for (i = 0; i < num_src; ++i) {
774 		if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
775 			uint32_t value = alu->src[i].value;
776 			for (j = 0; j < nliteral; ++j) {
777 				if (literal[j] == value) {
778 					alu->src[i].chan = j;
779 					break;
780 				}
781 			}
782 		}
783 	}
784 }
785 
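/* Try to fold the current group (slots) into the previous one (alu_prev).
 * A return value of 0 covers both outcomes; the caller detects an actual
 * merge by cf_last->prev_bs_head changing.  The merge is simply abandoned
 * whenever it would break a hardware rule: predicated or once-per-group
 * instructions, INTERP mixing, more than four literals, MOVA vs. relative
 * addressing hazards, LDS reads, write conflicts between the groups, or a
 * bank swizzle that no longer works for the combined group. */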
786 static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5],
787 			     struct r600_bytecode_alu *alu_prev)
788 {
789 	struct r600_bytecode_alu *prev[5];
790 	struct r600_bytecode_alu *result[5] = { NULL };
791 
792         uint8_t interp_xz = 0;
793 
794 	uint32_t literal[4], prev_literal[4];
795 	unsigned nliteral = 0, prev_nliteral = 0;
796 
797 	int i, j, r, src, num_src;
798 	int num_once_inst = 0;
799 	int have_mova = 0, have_rel = 0;
800 	int max_slots = bc->gfx_level == CAYMAN ? 4 : 5;
801 
802    bool has_dot = false;
803 
804 	r = assign_alu_units(bc, alu_prev, prev);
805 	if (r)
806 		return r;
807 
808 	for (i = 0; i < max_slots; ++i) {
809 		if (prev[i]) {
810 		      if (prev[i]->pred_sel)
811 			      return 0;
812 		      if (is_alu_once_inst(prev[i]))
813 			      return 0;
814 				has_dot |= prev[i]->op == ALU_OP2_DOT || prev[i]->op == ALU_OP2_DOT_IEEE;
815 
816 
817                       if (prev[i]->op == ALU_OP1_INTERP_LOAD_P0)
818                          interp_xz |= 3;
819                       if (prev[i]->op == ALU_OP2_INTERP_X)
820                          interp_xz |= 1;
821                       if (prev[i]->op == ALU_OP2_INTERP_Z)
822                          interp_xz |= 2;
823 		}
824 		if (slots[i]) {
825 			if (slots[i]->pred_sel)
826 				return 0;
827 			if (is_alu_once_inst(slots[i]))
828 				return 0;
829          has_dot |= slots[i]->op == ALU_OP2_DOT || slots[i]->op == ALU_OP2_DOT_IEEE;
830 
831                         if (slots[i]->op == ALU_OP1_INTERP_LOAD_P0)
832                            interp_xz |= 3;
833                         if (slots[i]->op == ALU_OP2_INTERP_X)
834                            interp_xz |= 1;
835                         if (slots[i]->op == ALU_OP2_INTERP_Z)
836                            interp_xz |= 2;
837 		}
838                 if (interp_xz == 3)
839                    return 0;
840 	}
841 
842 	for (i = 0; i < max_slots; ++i) {
843 		struct r600_bytecode_alu *alu;
844 
845 		if (num_once_inst > 0)
846 		   return 0;
847 
848 		/* check number of literals */
849 		if (prev[i]) {
850 			if (r600_bytecode_alu_nliterals(prev[i], literal, &nliteral))
851 				return 0;
852 			if (r600_bytecode_alu_nliterals(prev[i], prev_literal, &prev_nliteral))
853 				return 0;
854 			if (is_alu_mova_inst(prev[i])) {
855 				if (have_rel)
856 					return 0;
857 				have_mova = 1;
858 			}
859 
860 			if (alu_uses_rel(prev[i])) {
861 				if (have_mova) {
862 					return 0;
863 				}
864 				have_rel = 1;
865 			}
866 			if (alu_uses_lds(prev[i]))
867 				return 0;
868 
869 			num_once_inst += is_alu_once_inst(prev[i]);
870 		}
871 		if (slots[i] && r600_bytecode_alu_nliterals(slots[i], literal, &nliteral))
872 			return 0;
873 
874 		/* Let's check used slots. */
875 		if (prev[i] && !slots[i]) {
876 			result[i] = prev[i];
877 			continue;
878 		} else if (prev[i] && slots[i]) {
879 			if (max_slots == 5 && !has_dot && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
880 				/* Trans unit is still free try to use it. */
881 				if (is_alu_any_unit_inst(bc, slots[i]) && !alu_uses_lds(slots[i])) {
882 					result[i] = prev[i];
883 					result[4] = slots[i];
884 				} else if (is_alu_any_unit_inst(bc, prev[i])) {
885 					if (slots[i]->dst.sel == prev[i]->dst.sel &&
886 					    alu_writes(slots[i]) &&
887 					    alu_writes(prev[i]))
888 						return 0;
889 
890 					result[i] = slots[i];
891 					result[4] = prev[i];
892 				} else
893 					return 0;
894 			} else
895 				return 0;
896 		} else if(!slots[i]) {
897 			continue;
898 		} else {
899 			if (max_slots == 5 && slots[i] && prev[4] &&
900 					slots[i]->dst.sel == prev[4]->dst.sel &&
901 					slots[i]->dst.chan == prev[4]->dst.chan &&
902 					alu_writes(slots[i]) &&
903 					alu_writes(prev[4]))
904 				return 0;
905 
906 			result[i] = slots[i];
907 		}
908 
909 		alu = slots[i];
910 		num_once_inst += is_alu_once_inst(alu);
911 
912 		/* don't reschedule NOPs */
913 		if (is_nop_inst(alu))
914 			return 0;
915 
916 		if (is_alu_mova_inst(alu)) {
917 			if (have_rel) {
918 				return 0;
919 			}
920 			have_mova = 1;
921 		}
922 
923 		if (alu_uses_rel(alu)) {
924 			if (have_mova) {
925 				return 0;
926 			}
927 			have_rel = 1;
928 		}
929 
930 		if (alu->op == ALU_OP0_SET_CF_IDX0 ||
931 			alu->op == ALU_OP0_SET_CF_IDX1)
932 			return 0; /* data hazard with MOVA */
933 
934 		/* Let's check source gprs */
935 		num_src = r600_bytecode_get_num_operands(alu);
936 		for (src = 0; src < num_src; ++src) {
937 
938 			/* Constants don't matter. */
939 			if (!is_gpr(alu->src[src].sel))
940 				continue;
941 
942 			for (j = 0; j < max_slots; ++j) {
943 				if (!prev[j] || !alu_writes(prev[j]))
944 					continue;
945 
946 				/* If it's relative then we can't determine which gpr is really used. */
947 				if (prev[j]->dst.chan == alu->src[src].chan &&
948 					(prev[j]->dst.sel == alu->src[src].sel ||
949 					prev[j]->dst.rel || alu->src[src].rel))
950 					return 0;
951 			}
952 		}
953 	}
954 
955 	/* more than one PRED_ or KILL_ ? */
956 	if (num_once_inst > 1)
957 		return 0;
958 
959 	/* check if the result can still be swizzled */
960 	r = check_and_set_bank_swizzle(bc, result);
961 	if (r)
962 		return 0;
963 
964 	/* looks like everything worked out right, apply the changes */
965 
966 	/* undo adding previous literals */
967 	bc->cf_last->ndw -= align(prev_nliteral, 2);
968 
969 	/* sort instructions */
970 	for (i = 0; i < max_slots; ++i) {
971 		slots[i] = result[i];
972 		if (result[i]) {
973 			list_del(&result[i]->list);
974 			result[i]->last = 0;
975 			list_addtail(&result[i]->list, &bc->cf_last->alu);
976 		}
977 	}
978 
979 	/* determine new last instruction */
980 	list_entry(bc->cf_last->alu.prev, struct r600_bytecode_alu, list)->last = 1;
981 
982 	/* determine new first instruction */
983 	for (i = 0; i < max_slots; ++i) {
984 		if (result[i]) {
985 			bc->cf_last->curr_bs_head = result[i];
986 			break;
987 		}
988 	}
989 
990 	bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
991 	bc->cf_last->prev2_bs_head = NULL;
992 
993 	return 0;
994 }
995 
996 /* we'll keep kcache sets sorted by bank & addr */
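/* An ALU clause can lock a few 16-constant "lines" of the constant buffers
 * into the kcache: two lockable sets on R600/R700, four on Evergreen+.  Each
 * set locks either one line (V_SQ_CF_KCACHE_LOCK_1) or two adjacent lines
 * (V_SQ_CF_KCACHE_LOCK_2).  The allocator below tries to extend an existing
 * set before claiming a free one; if nothing fits, the caller starts a new
 * ALU clause and retries. */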
997 static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc,
998 		struct r600_bytecode_kcache *kcache,
999 		unsigned bank, unsigned line, unsigned index_mode)
1000 {
1001 	int i, kcache_banks = bc->gfx_level >= EVERGREEN ? 4 : 2;
1002 
1003 	for (i = 0; i < kcache_banks; i++) {
1004 		if (kcache[i].mode) {
1005 			int d;
1006 
1007 			if (kcache[i].bank < bank)
1008 				continue;
1009 
1010 			if ((kcache[i].bank == bank && kcache[i].addr > line+1) ||
1011 					kcache[i].bank > bank) {
1012 				/* try to insert new line */
1013 				if (kcache[kcache_banks-1].mode) {
1014 					/* all sets are in use */
1015 					return -ENOMEM;
1016 				}
1017 
1018 				memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache));
1019 				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
1020 				kcache[i].bank = bank;
1021 				kcache[i].addr = line;
1022 				kcache[i].index_mode = index_mode;
1023 				return 0;
1024 			}
1025 
1026 			d = line - kcache[i].addr;
1027 
1028 			if (d == -1) {
1029 				kcache[i].addr--;
1030 				if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) {
1031 					/* we are prepending the line to the current set,
1032 					 * discarding the existing second line,
1033 					 * so we'll have to insert line+2 after it */
1034 					line += 2;
1035 					continue;
1036 				} else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) {
1037 					kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
1038 					return 0;
1039 				} else {
1040 					/* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
1041 					return -ENOMEM;
1042 				}
1043 			} else if (d == 1) {
1044 				kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2;
1045 				return 0;
1046 			} else if (d == 0)
1047 				return 0;
1048 		} else { /* free kcache set - use it */
1049 			kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1;
1050 			kcache[i].bank = bank;
1051 			kcache[i].addr = line;
1052 			kcache[i].index_mode = index_mode;
1053 			return 0;
1054 		}
1055 	}
1056 	return -ENOMEM;
1057 }
1058 
1059 static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc,
1060 		struct r600_bytecode_kcache *kcache,
1061 		struct r600_bytecode_alu *alu)
1062 {
1063 	int i, r;
1064 
1065 	for (i = 0; i < 3; i++) {
1066 		unsigned bank, line, sel = alu->src[i].sel, index_mode;
1067 
1068 		if (sel < 512)
1069 			continue;
1070 
1071 		bank = alu->src[i].kc_bank;
1072 		assert(bank < R600_MAX_ALU_CONST_BUFFERS);
1073 		line = (sel-512)>>4;
1074 		index_mode = alu->src[i].kc_rel;
1075 
1076 		if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line, index_mode)))
1077 			return r;
1078 	}
1079 	return 0;
1080 }
1081 
1082 static int r600_bytecode_assign_kcache_banks(
1083 		struct r600_bytecode_alu *alu,
1084 		struct r600_bytecode_kcache * kcache)
1085 {
1086 	int i, j;
1087 
1088 	/* Alter the src operands to refer to the kcache. */
1089 	for (i = 0; i < 3; ++i) {
1090 		static const unsigned int base[] = {128, 160, 256, 288};
1091 		unsigned int line, sel = alu->src[i].sel, found = 0;
1092 
1093 		if (sel < 512)
1094 			continue;
1095 
1096 		sel -= 512;
1097 		line = sel>>4;
1098 
1099 		for (j = 0; j < 4 && !found; ++j) {
1100 			switch (kcache[j].mode) {
1101 			case V_SQ_CF_KCACHE_NOP:
1102 			case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX:
1103 				R600_ASM_ERR("unexpected kcache line mode\n");
1104 				return -ENOMEM;
1105 			default:
1106 				if (kcache[j].bank == alu->src[i].kc_bank &&
1107 						kcache[j].addr <= line &&
1108 						line < kcache[j].addr + kcache[j].mode) {
1109 					alu->src[i].sel = sel - (kcache[j].addr<<4);
1110 					alu->src[i].sel += base[j];
1111 					found=1;
1112 			    }
1113 			}
1114 		}
1115 	}
1116 	return 0;
1117 }
1118 
1119 static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc,
1120 		struct r600_bytecode_alu *alu,
1121 		unsigned type)
1122 {
1123 	struct r600_bytecode_kcache kcache_sets[4];
1124 	struct r600_bytecode_kcache *kcache = kcache_sets;
1125 	int r;
1126 
1127 	memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache));
1128 
1129 	if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
1130 		/* can't alloc, need to start new clause */
1131 
1132 		/* Make sure the CF ends with an "last" instruction when
1133 		 * we split an ALU group because of a new CF */
1134 		if (!list_is_empty(&bc->cf_last->alu))  {
1135 			struct r600_bytecode_alu *last_submitted =
1136 				list_last_entry(&bc->cf_last->alu, struct r600_bytecode_alu, list);
1137 				last_submitted->last = 1;
1138 		}
1139 
1140 		if ((r = r600_bytecode_add_cf(bc))) {
1141 			return r;
1142 		}
1143 		bc->cf_last->op = type;
1144 
1145 		/* retry with the new clause */
1146 		kcache = bc->cf_last->kcache;
1147 		if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) {
1148 			/* can't alloc again- should never happen */
1149 			return r;
1150 		}
1151 	} else {
1152 		/* update kcache sets */
1153 		memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache));
1154 	}
1155 
1156 	/* if we actually used more than 2 kcache sets, or have relative indexing - use ALU_EXTENDED on eg+ */
1157 	if (kcache[2].mode != V_SQ_CF_KCACHE_NOP ||
1158 		kcache[0].index_mode || kcache[1].index_mode || kcache[2].index_mode || kcache[3].index_mode) {
1159 		if (bc->gfx_level < EVERGREEN)
1160 			return -ENOMEM;
1161 		bc->cf_last->eg_alu_extended = 1;
1162 	}
1163 
1164 	return 0;
1165 }
1166 
1167 static int insert_nop_r6xx(struct r600_bytecode *bc, int max_slots)
1168 {
1169 	struct r600_bytecode_alu alu;
1170 	int r, i;
1171 
1172 	for (i = 0; i < max_slots; i++) {
1173 		memset(&alu, 0, sizeof(alu));
1174 		alu.op = ALU_OP0_NOP;
1175 		alu.src[0].chan = i & 3;
1176 		alu.dst.chan = i & 3;
1177 		alu.last = (i == max_slots - 1);
1178 		r = r600_bytecode_add_alu(bc, &alu);
1179 		if (r)
1180 			return r;
1181 	}
1182 	return 0;
1183 }
1184 
1185 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */
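/* Relative (indexed) GPR addressing goes through the AR register, which is
 * loaded with a MOVA instruction from the GPR channel recorded in
 * bc->ar_reg/bc->ar_chan; bc->ar_loaded tracks whether that already happened
 * in the current clause.  The R6xx variant below uses MOVA_GPR_INT (padding
 * with a NOP group first when loading for a source operand), while
 * r600_load_ar() further down uses MOVA_INT and flags the clause with
 * r6xx_uses_waterfall. */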
1186 static int load_ar_r6xx(struct r600_bytecode *bc, bool for_src)
1187 {
1188 	struct r600_bytecode_alu alu;
1189 	int r;
1190 
1191 	if (bc->ar_loaded)
1192 		return 0;
1193 
1194 	/* hack to avoid making MOVA the last instruction in the clause */
1195 	if (bc->cf_last == NULL || (bc->cf_last->ndw>>1) >= 110)
1196 		bc->force_add_cf = 1;
1197    else if (for_src) {
1198       insert_nop_r6xx(bc, 4);
1199       bc->nalu_groups++;
1200    }
1201 
1202 	memset(&alu, 0, sizeof(alu));
1203 	alu.op = ALU_OP1_MOVA_GPR_INT;
1204 	alu.src[0].sel = bc->ar_reg;
1205 	alu.src[0].chan = bc->ar_chan;
1206 	alu.last = 1;
1207 	alu.index_mode = INDEX_MODE_LOOP;
1208 	r = r600_bytecode_add_alu(bc, &alu);
1209 	if (r)
1210 		return r;
1211 
1212 	/* no requirement to set uses waterfall on MOVA_GPR_INT */
1213 	bc->ar_loaded = 1;
1214 	return 0;
1215 }
1216 
1217 /* load AR register from gpr (bc->ar_reg) with MOVA_INT */
1218 int r600_load_ar(struct r600_bytecode *bc, bool for_src)
1219 {
1220 	struct r600_bytecode_alu alu;
1221 	int r;
1222 
1223 	if (bc->ar_handling)
1224 		return load_ar_r6xx(bc, for_src);
1225 
1226 	if (bc->ar_loaded)
1227 		return 0;
1228 
1229 	/* hack to avoid making MOVA the last instruction in the clause */
1230 	if (bc->cf_last == NULL || (bc->cf_last->ndw>>1) >= 110)
1231 		bc->force_add_cf = 1;
1232 
1233 	memset(&alu, 0, sizeof(alu));
1234 	alu.op = ALU_OP1_MOVA_INT;
1235 	alu.src[0].sel = bc->ar_reg;
1236 	alu.src[0].chan = bc->ar_chan;
1237 	alu.last = 1;
1238 	r = r600_bytecode_add_alu(bc, &alu);
1239 	if (r)
1240 		return r;
1241 
1242 	bc->cf_last->r6xx_uses_waterfall = 1;
1243 	bc->ar_loaded = 1;
1244 	return 0;
1245 }
1246 
1247 int r600_bytecode_add_alu_type(struct r600_bytecode *bc,
1248 		const struct r600_bytecode_alu *alu, unsigned type)
1249 {
1250 	struct r600_bytecode_alu *nalu = r600_bytecode_alu();
1251 	struct r600_bytecode_alu *lalu;
1252 	int i, r;
1253 
1254 	if (!nalu)
1255 		return -ENOMEM;
1256 	memcpy(nalu, alu, sizeof(struct r600_bytecode_alu));
1257 
1258 	if (alu->is_op3) {
1259 		/* will fail later since alu does not support it. */
1260 		assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs);
1261 	}
1262 
1263 	if (bc->cf_last != NULL && bc->cf_last->op != type) {
1264 		/* check if we could add it anyway */
1265 		if ((bc->cf_last->op == CF_OP_ALU && type == CF_OP_ALU_PUSH_BEFORE) ||
1266 		 	(bc->cf_last->op == CF_OP_ALU_PUSH_BEFORE && type == CF_OP_ALU)) {
1267 		 	LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
1268 		 		if (lalu->execute_mask) {
1269                                         assert(bc->force_add_cf || !"no force cf");
1270 					bc->force_add_cf = 1;
1271 					break;
1272 				}
1273 		 		type = CF_OP_ALU_PUSH_BEFORE;
1274 			}
1275 		} else  {
1276                    assert(bc->force_add_cf ||!"no force cf");
1277 			bc->force_add_cf = 1;
1278                 }
1279 	}
1280 
1281 	/* cf can contain only alu or only vtx or only tex */
1282 	if (bc->cf_last == NULL || bc->force_add_cf) {
1283                if (bc->cf_last && bc->cf_last->curr_bs_head)
1284                   bc->cf_last->curr_bs_head->last = 1;
1285 		r = r600_bytecode_add_cf(bc);
1286 		if (r) {
1287 			free(nalu);
1288 			return r;
1289 		}
1290 	}
1291 	bc->cf_last->op = type;
1292 
1293 	if (bc->gfx_level >= EVERGREEN) {
1294 		for (i = 0; i < 3; i++)
1295 			if (nalu->src[i].kc_bank &&  nalu->src[i].kc_rel)
1296 				assert(bc->index_loaded[nalu->src[i].kc_rel - 1]);
1297 	}
1298 
1299 	/* Check AR usage and load it if required */
1300 	for (i = 0; i < 3; i++)
1301 		if (nalu->src[i].rel && !bc->ar_loaded)
1302 			r600_load_ar(bc, true);
1303 
1304 	if (nalu->dst.rel && !bc->ar_loaded)
1305 		r600_load_ar(bc, false);
1306 
1307 	/* Setup the kcache for this ALU instruction. This will start a new
1308 	 * ALU clause if needed. */
1309 	if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) {
1310 		free(nalu);
1311 		return r;
1312 	}
1313 
1314 	if (!bc->cf_last->curr_bs_head) {
1315 		bc->cf_last->curr_bs_head = nalu;
1316 	}
1317 	/* number of gpr == the last gpr used in any alu */
1318 	for (i = 0; i < 3; i++) {
1319 		if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 123) {
1320 			bc->ngpr = nalu->src[i].sel + 1;
1321 		}
1322 		if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
1323 			r600_bytecode_special_constants(nalu->src[i].value,
1324 				&nalu->src[i].sel);
1325 	}
1326 	if (nalu->dst.write && nalu->dst.sel >= bc->ngpr && nalu->dst.sel < 123) {
1327 		bc->ngpr = nalu->dst.sel + 1;
1328 	}
1329 	list_addtail(&nalu->list, &bc->cf_last->alu);
1330 	/* each alu uses 2 dwords */
1331 	bc->cf_last->ndw += 2;
1332 	bc->ndw += 2;
1333 
1334 	/* process cur ALU instructions for bank swizzle */
1335 	if (nalu->last) {
1336 		uint32_t literal[4];
1337 		unsigned nliteral;
1338 		struct r600_bytecode_alu *slots[5];
1339 		int max_slots = bc->gfx_level == CAYMAN ? 4 : 5;
1340 		r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
1341 		if (r)
1342 			return r;
1343 
1344 		if (bc->cf_last->prev_bs_head) {
1345          struct r600_bytecode_alu *cur_prev_head = bc->cf_last->prev_bs_head;
1346 			r = merge_inst_groups(bc, slots, cur_prev_head);
1347 			if (r)
1348 				return r;
1349          if (cur_prev_head != bc->cf_last->prev_bs_head)
1350             bc->nalu_groups--;
1351 		}
1352 
1353 		if (bc->cf_last->prev_bs_head) {
1354 			r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
1355 			if (r)
1356 				return r;
1357 		}
1358 
1359 		r = check_and_set_bank_swizzle(bc, slots);
1360 		if (r)
1361 			return r;
1362 
1363 		for (i = 0, nliteral = 0; i < max_slots; i++) {
1364 			if (slots[i]) {
1365 				r = r600_bytecode_alu_nliterals(slots[i], literal, &nliteral);
1366 				if (r)
1367 					return r;
1368 			}
1369 		}
1370 		bc->cf_last->ndw += align(nliteral, 2);
1371 
1372 		bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
1373 		bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
1374 		bc->cf_last->curr_bs_head = NULL;
1375 
1376 		bc->nalu_groups++;
1377 
1378 		if (bc->r6xx_nop_after_rel_dst) {
1379 			for (int i = 0; i < max_slots; ++i) {
1380 				if (slots[i] && slots[i]->dst.rel) {
1381 					insert_nop_r6xx(bc, max_slots);
1382 					bc->nalu_groups++;
1383 					break;
1384 				}
1385 			}
1386 		}
1387 	}
1388 
1389 	/* Might need to insert spill write ops after current clause */
1390 	if (nalu->last && bc->n_pending_outputs) {
1391 		while (bc->n_pending_outputs) {
1392 			r = r600_bytecode_add_output(bc, &bc->pending_outputs[--bc->n_pending_outputs]);
1393 			if (r)
1394 				return r;
1395 		}
1396 	}
1397 
1398 	return 0;
1399 }
1400 
1401 int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu)
1402 {
1403 	return r600_bytecode_add_alu_type(bc, alu, CF_OP_ALU);
1404 }
1405 
1406 static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc)
1407 {
1408 	switch (bc->gfx_level) {
1409 	case R600:
1410 		return 8;
1411 
1412 	case R700:
1413 	case EVERGREEN:
1414 	case CAYMAN:
1415 		return 16;
1416 
1417 	default:
1418 		R600_ASM_ERR("Unknown gfx level %d.\n", bc->gfx_level);
1419 		return 8;
1420 	}
1421 }
1422 
1423 static inline bool last_inst_was_not_vtx_fetch(struct r600_bytecode *bc, bool use_tc)
1424 {
1425 	return !((r600_isa_cf(bc->cf_last->op)->flags & CF_FETCH) &&
1426 		 bc->cf_last->op != CF_OP_GDS &&
1427 		 (bc->gfx_level == CAYMAN || use_tc ||
1428 		  bc->cf_last->op != CF_OP_TEX));
1429 }
1430 
1431 static int r600_bytecode_add_vtx_internal(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx,
1432 					  bool use_tc)
1433 {
1434 	struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx();
1435 	int r;
1436 
1437 	if (!nvtx)
1438 		return -ENOMEM;
1439 	memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx));
1440 
1441 	if (bc->gfx_level >= EVERGREEN) {
1442 		assert(!vtx->buffer_index_mode ||
1443 		       bc->index_loaded[vtx->buffer_index_mode - 1]);
1444 	}
1445 
1446 
1447 	/* cf can contain only alu or only vtx or only tex */
1448 	if (bc->cf_last == NULL ||
1449 	    last_inst_was_not_vtx_fetch(bc, use_tc) ||
1450 	    bc->force_add_cf) {
1451 		r = r600_bytecode_add_cf(bc);
1452 		if (r) {
1453 			free(nvtx);
1454 			return r;
1455 		}
1456 		switch (bc->gfx_level) {
1457 		case R600:
1458 		case R700:
1459 			bc->cf_last->op = CF_OP_VTX;
1460 			break;
1461 		case EVERGREEN:
1462 			if (use_tc)
1463 				bc->cf_last->op = CF_OP_TEX;
1464 			else
1465 				bc->cf_last->op = CF_OP_VTX;
1466 			break;
1467 		case CAYMAN:
1468 			bc->cf_last->op = CF_OP_TEX;
1469 			break;
1470 		default:
1471 			R600_ASM_ERR("Unknown gfx level %d.\n", bc->gfx_level);
1472 			free(nvtx);
1473 			return -EINVAL;
1474 		}
1475 	}
1476 	list_addtail(&nvtx->list, &bc->cf_last->vtx);
1477 	/* each fetch uses 4 dwords */
1478 	bc->cf_last->ndw += 4;
1479 	bc->ndw += 4;
1480 	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1481 		bc->force_add_cf = 1;
1482 
1483 	bc->ngpr = MAX2(bc->ngpr, vtx->src_gpr + 1);
1484 	bc->ngpr = MAX2(bc->ngpr, vtx->dst_gpr + 1);
1485 
1486 	return 0;
1487 }
1488 
1489 int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
1490 {
1491 	return r600_bytecode_add_vtx_internal(bc, vtx, false);
1492 }
1493 
1494 int r600_bytecode_add_vtx_tc(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx)
1495 {
1496 	return r600_bytecode_add_vtx_internal(bc, vtx, true);
1497 }
1498 
1499 int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex)
1500 {
1501 	struct r600_bytecode_tex *ntex = r600_bytecode_tex();
1502 	int r;
1503 
1504 	if (!ntex)
1505 		return -ENOMEM;
1506 	memcpy(ntex, tex, sizeof(struct r600_bytecode_tex));
1507 
1508 	if (bc->gfx_level >= EVERGREEN) {
1509 		assert(!tex->sampler_index_mode ||
1510 		       bc->index_loaded[tex->sampler_index_mode - 1]);
1511 		assert(!tex->resource_index_mode ||
1512                        bc->index_loaded[tex->resource_index_mode - 1]);
1513 	}
1514 
1515 	/* we can't fetch data and use it as texture lookup address in the same TEX clause */
1516 	if (bc->cf_last != NULL &&
1517 		bc->cf_last->op == CF_OP_TEX) {
1518 		struct r600_bytecode_tex *ttex;
1519                 uint8_t use_mask = ((1 << ntex->src_sel_x) |
1520                                     (1 << ntex->src_sel_y) |
1521                                     (1 << ntex->src_sel_z) |
1522                                     (1 << ntex->src_sel_w)) & 0xf;
1523 
1524 		LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
1525 			if (ttex->dst_gpr == ntex->src_gpr) {
1526                            uint8_t write_mask = (ttex->dst_sel_x < 6 ? 1 : 0) |
1527                                                 (ttex->dst_sel_y < 6 ? 2 : 0) |
1528                                                 (ttex->dst_sel_z < 6 ? 4 : 0) |
1529                                                 (ttex->dst_sel_w < 6 ? 8 : 0);
1530                            if (use_mask & write_mask) {
1531                               bc->force_add_cf = 1;
1532                               break;
1533                            }
1534 			}
1535 		}
1536 		/* vtx instrs get inserted after tex, so make sure we aren't moving the tex
1537 		 * before (say) the instr fetching the texcoord.
1538 		 */
1539 		if (!list_is_empty(&bc->cf_last->vtx))
1540 			bc->force_add_cf = 1;
1541 
1542 		/* slight hack to make gradients always go into same cf */
1543 		if (ntex->op == FETCH_OP_SET_GRADIENTS_H)
1544 			bc->force_add_cf = 1;
1545 	}
1546 
1547 	/* cf can contain only alu or only vtx or only tex */
1548 	if (bc->cf_last == NULL ||
1549 		bc->cf_last->op != CF_OP_TEX ||
1550 	        bc->force_add_cf) {
1551 		r = r600_bytecode_add_cf(bc);
1552 		if (r) {
1553 			free(ntex);
1554 			return r;
1555 		}
1556 		bc->cf_last->op = CF_OP_TEX;
1557 	}
1558 	if (ntex->src_gpr >= bc->ngpr) {
1559 		bc->ngpr = ntex->src_gpr + 1;
1560 	}
1561 	if (ntex->dst_gpr >= bc->ngpr) {
1562 		bc->ngpr = ntex->dst_gpr + 1;
1563 	}
1564 	list_addtail(&ntex->list, &bc->cf_last->tex);
1565 	/* each texture fetch uses 4 dwords */
1566 	bc->cf_last->ndw += 4;
1567 	bc->ndw += 4;
1568 	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1569 		bc->force_add_cf = 1;
1570 	return 0;
1571 }
1572 
1573 int r600_bytecode_add_gds(struct r600_bytecode *bc, const struct r600_bytecode_gds *gds)
1574 {
1575 	struct r600_bytecode_gds *ngds = r600_bytecode_gds();
1576 	int r;
1577 
1578 	if (ngds == NULL)
1579 		return -ENOMEM;
1580 	memcpy(ngds, gds, sizeof(struct r600_bytecode_gds));
1581 
1582 	if (bc->gfx_level >= EVERGREEN) {
1583 		assert(!gds->uav_index_mode ||
1584 		       bc->index_loaded[gds->uav_index_mode - 1]);
1585 	}
1586 
1587 	if (bc->cf_last == NULL ||
1588 	    bc->cf_last->op != CF_OP_GDS ||
1589 	    bc->force_add_cf) {
1590 		r = r600_bytecode_add_cf(bc);
1591 		if (r) {
1592 			free(ngds);
1593 			return r;
1594 		}
1595 		bc->cf_last->op = CF_OP_GDS;
1596 	}
1597 
1598 	list_addtail(&ngds->list, &bc->cf_last->gds);
1599 	bc->cf_last->ndw += 4; /* each GDS uses 4 dwords */
1600 	if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc))
1601 		bc->force_add_cf = 1;
1602 	return 0;
1603 }
1604 
1605 int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op)
1606 {
1607 	int r;
1608 
1609 	/* Emit WAIT_ACK before control flow to ensure pending writes are always acked. */
1610 	if (op != CF_OP_WAIT_ACK && op != CF_OP_MEM_SCRATCH)
1611 		r600_bytecode_wait_acks(bc);
1612 
1613 	r = r600_bytecode_add_cf(bc);
1614 	if (r)
1615 		return r;
1616 
1617 	bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
1618 	bc->cf_last->op = op;
1619 	return 0;
1620 }
1621 
1622 int cm_bytecode_add_cf_end(struct r600_bytecode *bc)
1623 {
1624 	return r600_bytecode_add_cfinst(bc, CF_OP_CF_END);
1625 }
1626 
1627 /* common to all 3 families */
1628 static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id)
1629 {
1630 	if (r600_isa_fetch(vtx->op)->flags & FF_MEM)
1631 		return r700_bytecode_fetch_mem_build(bc, vtx, id);
1632 	bc->bytecode[id] = S_SQ_VTX_WORD0_VTX_INST(r600_isa_fetch_opcode(bc->isa->hw_class, vtx->op)) |
1633 			S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
1634 			S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
1635 			S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
1636 			S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
1637 	if (bc->gfx_level < CAYMAN)
1638 		bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
1639 	id++;
1640 	bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
1641 				S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
1642 				S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
1643 				S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
1644 				S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
1645 				S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
1646 				S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
1647 				S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
1648 				S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
1649 				S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
1650 	bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
1651 				S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
1652 	if (bc->gfx_level >= EVERGREEN)
1653 		bc->bytecode[id] |= ((vtx->buffer_index_mode & 0x3) << 21); // S_SQ_VTX_WORD2_BIM(vtx->buffer_index_mode);
1654 	if (bc->gfx_level < CAYMAN)
1655 		bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
1656 	id++;
1657 	bc->bytecode[id++] = 0;
1658 	return 0;
1659 }
1660 
1661 /* common to all 3 families */
1662 static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id)
1663 {
1664 	bc->bytecode[id] = S_SQ_TEX_WORD0_TEX_INST(
1665 					r600_isa_fetch_opcode(bc->isa->hw_class, tex->op)) |
1666 			    EG_S_SQ_TEX_WORD0_INST_MOD(tex->inst_mod) |
1667 				S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
1668 				S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
1669 				S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
1670 	if (bc->gfx_level >= EVERGREEN)
1671 		bc->bytecode[id] |= ((tex->sampler_index_mode & 0x3) << 27) | // S_SQ_TEX_WORD0_SIM(tex->sampler_index_mode);
1672 				((tex->resource_index_mode & 0x3) << 25); // S_SQ_TEX_WORD0_RIM(tex->resource_index_mode)
1673 	id++;
1674 	bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
1675 				S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
1676 				S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
1677 				S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
1678 				S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
1679 				S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
1680 				S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
1681 				S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
1682 				S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
1683 				S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
1684 				S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
1685 	bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
1686 				S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
1687 				S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
1688 				S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
1689 				S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
1690 				S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
1691 				S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
1692 				S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
1693 	bc->bytecode[id++] = 0;
1694 	return 0;
1695 }
1696 
1697 /* r600 only; the r700 variant lives in r700_asm.c, evergreen/cayman in eg_asm.c */
1698 static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id)
1699 {
1700 	unsigned opcode = r600_isa_alu_opcode(bc->isa->hw_class, alu->op);
1701 
1702 	/* don't replace the gpr with PV or PS for the destination register */
1703 	bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
1704 				S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
1705 				S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
1706 				S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
1707 				S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
1708 				S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
1709 				S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
1710 				S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
1711 				S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) |
1712 				S_SQ_ALU_WORD0_PRED_SEL(alu->pred_sel) |
1713 				S_SQ_ALU_WORD0_LAST(alu->last);
1714 
1715 	if (alu->is_op3) {
1716 		assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs);
1717 		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1718 					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1719 					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1720 					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1721 					S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
1722 					S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
1723 					S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
1724 					S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
1725 					S_SQ_ALU_WORD1_OP3_ALU_INST(opcode) |
1726 					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
1727 	} else {
1728 		bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1729 					S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1730 					S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1731 					S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1732 					S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
1733 					S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
1734 					S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
1735 					S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
1736 					S_SQ_ALU_WORD1_OP2_ALU_INST(opcode) |
1737 					S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
1738 					S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->execute_mask) |
1739 					S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->update_pred);
1740 	}
1741 	return 0;
1742 }
1743 
1744 static void r600_bytecode_cf_vtx_build(uint32_t *bytecode, const struct r600_bytecode_cf *cf)
1745 {
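	/* cf->addr is counted in dwords; the hardware ADDR field is encoded in 64-bit units, hence the >> 1 */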
1746 	*bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
1747 	*bytecode++ = S_SQ_CF_WORD1_CF_INST(r600_isa_cf_opcode(ISA_CC_R600, cf->op)) |
1748 			S_SQ_CF_WORD1_BARRIER(1) |
1749 			S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1)|
1750 			S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program);
1751 }
1752 
1753 /* common for r600/r700 - eg in eg_asm.c */
1754 static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
1755 {
1756 	unsigned id = cf->id;
1757 	const struct cf_op_info *cfop = r600_isa_cf(cf->op);
1758 	unsigned opcode = r600_isa_cf_opcode(bc->isa->hw_class, cf->op);
1759 
1760 
1761 	if (cf->op == CF_NATIVE) {
1762 		bc->bytecode[id++] = cf->isa[0];
1763 		bc->bytecode[id++] = cf->isa[1];
1764 	} else if (cfop->flags & CF_ALU) {
1765 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
1766 			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
1767 			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
1768 			S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
1769 
1770 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(opcode) |
1771 			S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
1772 			S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
1773 			S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
1774 			S_SQ_CF_ALU_WORD1_BARRIER(1) |
1775 			S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->gfx_level == R600 ? cf->r6xx_uses_waterfall : 0) |
1776 			S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
1777 	} else if (cfop->flags & CF_FETCH) {
1778 		if (bc->gfx_level == R700)
1779 			r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
1780 		else
1781 			r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf);
1782 	} else if (cfop->flags & CF_EXP) {
1783 		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1784 			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1785 			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1786 			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) |
1787 			S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr);
1788 		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1789 			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
1790 			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
1791 			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
1792 			S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
1793 			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) |
1794 			S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) |
1795 			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program);
1796 	} else if (cfop->flags & CF_MEM) {
1797 		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1798 			S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1799 			S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1800 			S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) |
1801 			S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr);
1802 		bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1803 			S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) |
1804 			S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) |
1805 			S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program) |
1806 			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) |
1807 			S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask);
1808 	} else {
1809 		bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
1810 		bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(opcode) |
1811 					S_SQ_CF_WORD1_BARRIER(1) |
1812 					S_SQ_CF_WORD1_COND(cf->cond) |
1813 					S_SQ_CF_WORD1_POP_COUNT(cf->pop_count) |
1814 					S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program);
1815 	}
1816 	return 0;
1817 }
1818 
1819 int r600_bytecode_build(struct r600_bytecode *bc)
1820 {
1821 	struct r600_bytecode_cf *cf;
1822 	struct r600_bytecode_alu *alu;
1823 	struct r600_bytecode_vtx *vtx;
1824 	struct r600_bytecode_tex *tex;
1825 	struct r600_bytecode_gds *gds;
1826 	uint32_t literal[4];
1827 	unsigned nliteral;
1828 	unsigned addr;
1829 	int i, r;
1830 
1831 	if (!bc->nstack) { // if non-zero, the stack size was already provided by llvm
1832 		if (bc->stack.max_entries)
1833 			bc->nstack = bc->stack.max_entries;
1834 		else if (bc->type == PIPE_SHADER_VERTEX ||
1835 			 bc->type == PIPE_SHADER_TESS_EVAL ||
1836 			 bc->type == PIPE_SHADER_TESS_CTRL)
1837 			bc->nstack = 1;
1838 	}
1839 
1840 	/* first pass: compute the addr of each CF block */
1841 	/* clause addresses start right after all the CF instructions */
1842 	addr = bc->cf_last->id + 2;
1843 	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1844 		if (r600_isa_cf(cf->op)->flags & CF_FETCH) {
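			/* round up so the fetch clause starts on a 4-dword boundary (each fetch instruction is 128 bits wide) */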
1845 			addr += 3;
1846 			addr &= 0xFFFFFFFCUL;
1847 		}
1848 		cf->addr = addr;
1849 		addr += cf->ndw;
1850 		bc->ndw = cf->addr + cf->ndw;
1851 	}
1852 	free(bc->bytecode);
1853 	bc->bytecode = calloc(4, bc->ndw);
1854 	if (bc->bytecode == NULL)
1855 		return -ENOMEM;
1856 	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1857 		const struct cf_op_info *cfop = r600_isa_cf(cf->op);
1858 		addr = cf->addr;
1859 		if (bc->gfx_level >= EVERGREEN)
1860 			r = eg_bytecode_cf_build(bc, cf);
1861 		else
1862 			r = r600_bytecode_cf_build(bc, cf);
1863 		if (r)
1864 			return r;
1865 		if (cfop->flags & CF_ALU) {
1866 			nliteral = 0;
1867 			memset(literal, 0, sizeof(literal));
1868 			LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
1869 				r = r600_bytecode_alu_nliterals(alu, literal, &nliteral);
1870 				if (r)
1871 					return r;
1872 				r600_bytecode_alu_adjust_literals(alu, literal, nliteral);
1873 				r600_bytecode_assign_kcache_banks(alu, cf->kcache);
1874 
1875 				switch(bc->gfx_level) {
1876 				case R600:
1877 					r = r600_bytecode_alu_build(bc, alu, addr);
1878 					break;
1879 				case R700:
1880 					r = r700_bytecode_alu_build(bc, alu, addr);
1881 					break;
1882 				case EVERGREEN:
1883 				case CAYMAN:
1884 					r = eg_bytecode_alu_build(bc, alu, addr);
1885 					break;
1886 				default:
1887 					R600_ASM_ERR("unknown gfx level %d.\n", bc->gfx_level);
1888 					return -EINVAL;
1889 				}
1890 				if (r)
1891 					return r;
1892 				addr += 2;
1893 				if (alu->last) {
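					/* literal constants follow the last instruction of an ALU group, padded to an even dword count */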
1894 					for (i = 0; i < align(nliteral, 2); ++i) {
1895 						bc->bytecode[addr++] = literal[i];
1896 					}
1897 					nliteral = 0;
1898 					memset(literal, 0, sizeof(literal));
1899 				}
1900 			}
1901 		} else if (cf->op == CF_OP_VTX) {
1902 			LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1903 				r = r600_bytecode_vtx_build(bc, vtx, addr);
1904 				if (r)
1905 					return r;
1906 				addr += 4;
1907 			}
1908 		} else if (cf->op == CF_OP_GDS) {
1909 			assert(bc->gfx_level >= EVERGREEN);
1910 			LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) {
1911 				r = eg_bytecode_gds_build(bc, gds, addr);
1912 				if (r)
1913 					return r;
1914 				addr += 4;
1915 			}
1916 		} else if (cf->op == CF_OP_TEX) {
1917 			LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1918 				assert(bc->gfx_level >= EVERGREEN);
1919 				r = r600_bytecode_vtx_build(bc, vtx, addr);
1920 				if (r)
1921 					return r;
1922 				addr += 4;
1923 			}
1924 			LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
1925 				r = r600_bytecode_tex_build(bc, tex, addr);
1926 				if (r)
1927 					return r;
1928 				addr += 4;
1929 			}
1930 		}
1931 	}
1932 	return 0;
1933 }
1934 
1935 void r600_bytecode_clear(struct r600_bytecode *bc)
1936 {
1937 	struct r600_bytecode_cf *cf = NULL, *next_cf;
1938 
1939 	free(bc->bytecode);
1940 	bc->bytecode = NULL;
1941 
1942 	LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
1943 		struct r600_bytecode_alu *alu = NULL, *next_alu;
1944 		struct r600_bytecode_tex *tex = NULL, *next_tex;
1945 		struct r600_bytecode_vtx *vtx = NULL, *next_vtx;
1946 		struct r600_bytecode_gds *gds = NULL, *next_gds;
1947 
1948 		LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
1949 			free(alu);
1950 		}
1951 
1952 		list_inithead(&cf->alu);
1953 
1954 		LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
1955 			free(tex);
1956 		}
1957 
1958 		list_inithead(&cf->tex);
1959 
1960 		LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
1961 			free(vtx);
1962 		}
1963 
1964 		list_inithead(&cf->vtx);
1965 
1966 		LIST_FOR_EACH_ENTRY_SAFE(gds, next_gds, &cf->gds, list) {
1967 			free(gds);
1968 		}
1969 
1970 		list_inithead(&cf->gds);
1971 
1972 		free(cf);
1973 	}
1974 
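	/* after the safe-iteration loop above, cf points at the list head itself, so this effectively re-initializes bc->cf */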
1975 	list_inithead(&cf->list);
1976 }
1977 
1978 static int print_swizzle(unsigned swz)
1979 {
1980 	const char * swzchars = "xyzw01?_";
1981 	assert(swz<8 && swz != 6);
1982 	return fprintf(stderr, "%c", swzchars[swz]);
1983 }
1984 
1985 static int print_sel(unsigned sel, unsigned rel, unsigned index_mode,
1986 		unsigned need_brackets)
1987 {
1988 	int o = 0;
1989 	if (rel && index_mode >= 5 && sel < 128)
1990 		o += fprintf(stderr, "G");
1991 	if (rel || need_brackets) {
1992 		o += fprintf(stderr, "[");
1993 	}
1994 	o += fprintf(stderr, "%d", sel);
1995 	if (rel) {
1996 		if (index_mode == 0 || index_mode == 6)
1997 			o += fprintf(stderr, "+AR");
1998 		else if (index_mode == 4)
1999 			o += fprintf(stderr, "+AL");
2000 	}
2001 	if (rel || need_brackets) {
2002 		o += fprintf(stderr, "]");
2003 	}
2004 	return o;
2005 }
2006 
2007 static int print_dst(struct r600_bytecode_alu *alu)
2008 {
2009 	int o = 0;
2010 	unsigned sel = alu->dst.sel;
2011 	char reg_char = 'R';
2012 	if (sel >= 128 - 4) { /* clause temporary gpr */
2013 		sel -= 128 - 4;
2014 		reg_char = 'T';
2015 	}
2016 
2017 	if (alu_writes(alu)) {
2018 		o += fprintf(stderr, "%c", reg_char);
2019 		o += print_sel(sel, alu->dst.rel, alu->index_mode, 0);
2020 	} else {
2021 		o += fprintf(stderr, "__");
2022 	}
2023 	o += fprintf(stderr, ".");
2024 	o += print_swizzle(alu->dst.chan);
2025 	return o;
2026 }
2027 
2028 static int print_src(struct r600_bytecode_alu *alu, unsigned idx)
2029 {
2030 	int o = 0;
2031 	struct r600_bytecode_alu_src *src = &alu->src[idx];
2032 	unsigned sel = src->sel, need_sel = 1, need_chan = 1, need_brackets = 0;
2033 
2034 	if (src->neg)
2035 		o += fprintf(stderr,"-");
2036 	if (src->abs)
2037 		o += fprintf(stderr,"|");
2038 
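	/* decode the source select: GPRs, clause temporaries, the four kcache sets, inline parameters,
	 * constant-buffer reads, or the special/inline values handled in the switch below */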
2039 	if (sel < 128 - 4) {
2040 		o += fprintf(stderr, "R");
2041 	} else if (sel < 128) {
2042 		o += fprintf(stderr, "T");
2043 		sel -= 128 - 4;
2044 	} else if (sel < 160) {
2045 		o += fprintf(stderr, "KC0");
2046 		need_brackets = 1;
2047 		sel -= 128;
2048 	} else if (sel < 192) {
2049 		o += fprintf(stderr, "KC1");
2050 		need_brackets = 1;
2051 		sel -= 160;
2052 	} else if (sel >= 512) {
2053 		o += fprintf(stderr, "C%d", src->kc_bank);
2054 		need_brackets = 1;
2055 		sel -= 512;
2056 	} else if (sel >= 448) {
2057 		o += fprintf(stderr, "Param");
2058 		sel -= 448;
2059 		need_chan = 0;
2060 	} else if (sel >= 288) {
2061 		o += fprintf(stderr, "KC3");
2062 		need_brackets = 1;
2063 		sel -= 288;
2064 	} else if (sel >= 256) {
2065 		o += fprintf(stderr, "KC2");
2066 		need_brackets = 1;
2067 		sel -= 256;
2068 	} else {
2069 		need_sel = 0;
2070 		need_chan = 0;
2071 		switch (sel) {
2072 		case EG_V_SQ_ALU_SRC_LDS_DIRECT_A:
2073 			o += fprintf(stderr, "LDS_A[0x%08X]", src->value);
2074 			break;
2075 		case EG_V_SQ_ALU_SRC_LDS_DIRECT_B:
2076 			o += fprintf(stderr, "LDS_B[0x%08X]", src->value);
2077 			break;
2078 		case EG_V_SQ_ALU_SRC_LDS_OQ_A:
2079 			o += fprintf(stderr, "LDS_OQ_A");
2080 			need_chan = 1;
2081 			break;
2082 		case EG_V_SQ_ALU_SRC_LDS_OQ_B:
2083 			o += fprintf(stderr, "LDS_OQ_B");
2084 			need_chan = 1;
2085 			break;
2086 		case EG_V_SQ_ALU_SRC_LDS_OQ_A_POP:
2087 			o += fprintf(stderr, "LDS_OQ_A_POP");
2088 			need_chan = 1;
2089 			break;
2090 		case EG_V_SQ_ALU_SRC_LDS_OQ_B_POP:
2091 			o += fprintf(stderr, "LDS_OQ_B_POP");
2092 			need_chan = 1;
2093 			break;
2094 		case EG_V_SQ_ALU_SRC_TIME_LO:
2095 			o += fprintf(stderr, "TIME_LO");
2096 			break;
2097 		case EG_V_SQ_ALU_SRC_TIME_HI:
2098 			o += fprintf(stderr, "TIME_HI");
2099 			break;
2100 		case EG_V_SQ_ALU_SRC_SE_ID:
2101 			o += fprintf(stderr, "SE_ID");
2102 			break;
2103 		case EG_V_SQ_ALU_SRC_SIMD_ID:
2104 			o += fprintf(stderr, "SIMD_ID");
2105 			break;
2106 		case EG_V_SQ_ALU_SRC_HW_WAVE_ID:
2107 			o += fprintf(stderr, "HW_WAVE_ID");
2108 			break;
2109 		case V_SQ_ALU_SRC_PS:
2110 			o += fprintf(stderr, "PS");
2111 			break;
2112 		case V_SQ_ALU_SRC_PV:
2113 			o += fprintf(stderr, "PV");
2114 			need_chan = 1;
2115 			break;
2116 		case V_SQ_ALU_SRC_LITERAL:
2117 			{
2118 				const uint32_t value_uint32 = src->value;
2119 				float value_float;
2120 				memcpy(&value_float, &value_uint32, sizeof(float));
2121 				o += fprintf(stderr, "[0x%08X %f]", value_uint32, value_float);
2122 			}
2123 			break;
2124 		case V_SQ_ALU_SRC_0_5:
2125 			o += fprintf(stderr, "0.5");
2126 			break;
2127 		case V_SQ_ALU_SRC_M_1_INT:
2128 			o += fprintf(stderr, "-1");
2129 			break;
2130 		case V_SQ_ALU_SRC_1_INT:
2131 			o += fprintf(stderr, "1");
2132 			break;
2133 		case V_SQ_ALU_SRC_1:
2134 			o += fprintf(stderr, "1.0");
2135 			break;
2136 		case V_SQ_ALU_SRC_0:
2137 			o += fprintf(stderr, "0");
2138 			break;
2139 		default:
2140 			o += fprintf(stderr, "??IMM_%d", sel);
2141 			break;
2142 		}
2143 	}
2144 
2145 	if (need_sel)
2146 		o += print_sel(sel, src->rel, alu->index_mode, need_brackets);
2147 
2148 	if (need_chan) {
2149 		o += fprintf(stderr, ".");
2150 		o += print_swizzle(src->chan);
2151 	}
2152 
2153 	if (src->abs)
2154 		o += fprintf(stderr,"|");
2155 
2156 	return o;
2157 }
2158 
2159 static int print_indent(int p, int c)
2160 {
2161 	int o = 0;
2162 	while (p++ < c)
2163 		o += fprintf(stderr, " ");
2164 	return o;
2165 }
2166 
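/* RAT (random access target) instruction names, indexed by cf->rat.inst in the CF_RAT disassembly below */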
2167 const char *rat_instr_name[] = {
2168    "NOP",
2169    "STORE_TYPED",
2170    "STORE_RAW",
2171    "STORE_RAW_FDENORM",
2172    "CMP_XCHG_INT",
2173    "CMP_XCHG_FLT",
2174    "CMP_XCHG_FDENORM",
2175    "ADD",
2176    "SUB",
2177    "RSUB",
2178    "MIN_INT",
2179    "MIN_UINT",
2180    "MAX_INT",
2181    "MAX_UINT",
2182    "AND",
2183    "OR",
2184    "XOR",
2185    "MSKOR",
2186    "INC_UINT",
2187    "DEC_UINT",
2188    "RESERVED20",
2189    "RESERVED21",
2190    "RESERVED22",
2191    "RESERVED23",
2192    "RESERVED24",
2193    "RESERVED25",
2194    "RESERVED26",
2195    "RESERVED27",
2196    "RESERVED28",
2197    "RESERVED29",
2198    "RESERVED30",
2199    "RESERVED31",
2200    "NOP_RTN",
2201    "RESERVED33",
2202    "XCHG_RTN",
2203    "XCHG_FDENORM_RTN",
2204    "CMPXCHG_INT_RTN",
2205    "CMPXCHG_FLT_RTN",
2206    "CMPXCHG_FDENORM_RTN",
2207    "ADD_RTN",
2208    "SUB_RTN",
2209    "RSUB_RTN",
2210    "MIN_INT_RTN",
2211    "MIN_UINT_RTN",
2212    "MAX_INT_RTN",
2213    "MAX_UINT_RTN",
2214    "AND_RTN",
2215    "OR_RTN",
2216    "XOR_RTN",
2217    "MSKOR_RTN",
2218    "INC_UINT_RTN",
2219    "DEC_UINT_RTN",
2220 };
2221 
2222 
2223 void r600_bytecode_disasm(struct r600_bytecode *bc)
2224 {
2225 	const char *index_mode[] = {"CF_INDEX_NONE", "CF_INDEX_0", "CF_INDEX_1"};
2226 	static int index = 0;
2227 	struct r600_bytecode_cf *cf = NULL;
2228 	struct r600_bytecode_alu *alu = NULL;
2229 	struct r600_bytecode_vtx *vtx = NULL;
2230 	struct r600_bytecode_tex *tex = NULL;
2231 	struct r600_bytecode_gds *gds = NULL;
2232 
2233 	unsigned id, ngr = 0, last;
2234 	uint32_t literal[4];
2235 	unsigned nliteral;
2236 	char chip = '6';
2237 
2238 	switch (bc->gfx_level) {
2239 	case R700:
2240 		chip = '7';
2241 		break;
2242 	case EVERGREEN:
2243 		chip = 'E';
2244 		break;
2245 	case CAYMAN:
2246 		chip = 'C';
2247 		break;
2248 	case R600:
2249 	default:
2250 		chip = '6';
2251 		break;
2252 	}
2253 	fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n",
2254 	        bc->ndw, bc->ngpr, bc->nstack);
2255 	fprintf(stderr, "shader %d -- %c\n", index++, chip);
2256 
2257 	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
2258 		id = cf->id;
2259 		if (cf->op == CF_NATIVE) {
2260 			fprintf(stderr, "%04d %08X %08X CF_NATIVE\n", id, bc->bytecode[id],
2261 					bc->bytecode[id + 1]);
2262 		} else {
2263 			const struct cf_op_info *cfop = r600_isa_cf(cf->op);
2264 			if (cfop->flags & CF_ALU) {
2265 				if (cf->eg_alu_extended) {
2266 					fprintf(stderr, "%04d %08X %08X  %s\n", id, bc->bytecode[id],
2267 							bc->bytecode[id + 1], "ALU_EXT");
2268 					id += 2;
2269 				}
2270 				fprintf(stderr, "%04d %08X %08X  %s ", id, bc->bytecode[id],
2271 						bc->bytecode[id + 1], cfop->name);
2272 				fprintf(stderr, "%d @%d ", cf->ndw / 2, cf->addr);
2273 				for (int i = 0; i < 4; ++i) {
2274 					if (cf->kcache[i].mode) {
2275 						int c_start = (cf->kcache[i].addr << 4);
2276 						int c_end = c_start + (cf->kcache[i].mode << 4);
2277 						fprintf(stderr, "KC%d[CB%d:%d-%d%s%s] ",
2278 						        i, cf->kcache[i].bank, c_start, c_end,
2279 						        cf->kcache[i].index_mode ? " " : "",
2280 						        cf->kcache[i].index_mode ? index_mode[cf->kcache[i].index_mode] : "");
2281 					}
2282 				}
2283 				fprintf(stderr, "\n");
2284 			} else if (cfop->flags & CF_FETCH) {
2285 				fprintf(stderr, "%04d %08X %08X  %s ", id, bc->bytecode[id],
2286 						bc->bytecode[id + 1], cfop->name);
2287 				fprintf(stderr, "%d @%d ", cf->ndw / 4, cf->addr);
2288 				if (cf->vpm)
2289 					fprintf(stderr, "VPM ");
2290 				if (cf->end_of_program)
2291 					fprintf(stderr, "EOP ");
2292 				fprintf(stderr, "\n");
2293 
2294 			} else if (cfop->flags & CF_EXP) {
2295 				int o = 0;
2296 				const char *exp_type[] = {"PIXEL", "POS  ", "PARAM"};
2297 				o += fprintf(stderr, "%04d %08X %08X  %s ", id, bc->bytecode[id],
2298 						bc->bytecode[id + 1], cfop->name);
2299 				o += print_indent(o, 43);
2300 				o += fprintf(stderr, "%s ", exp_type[cf->output.type]);
2301 				if (cf->output.burst_count > 1) {
2302 					o += fprintf(stderr, "%d-%d ", cf->output.array_base,
2303 							cf->output.array_base + cf->output.burst_count - 1);
2304 
2305 					o += print_indent(o, 55);
2306 					o += fprintf(stderr, "R%d-%d.", cf->output.gpr,
2307 							cf->output.gpr + cf->output.burst_count - 1);
2308 				} else {
2309 					o += fprintf(stderr, "%d ", cf->output.array_base);
2310 					o += print_indent(o, 55);
2311 					o += fprintf(stderr, "R%d.", cf->output.gpr);
2312 				}
2313 
2314 				o += print_swizzle(cf->output.swizzle_x);
2315 				o += print_swizzle(cf->output.swizzle_y);
2316 				o += print_swizzle(cf->output.swizzle_z);
2317 				o += print_swizzle(cf->output.swizzle_w);
2318 
2319 				print_indent(o, 67);
2320 
2321 				fprintf(stderr, " ES:%X ", cf->output.elem_size);
2322 				if (cf->mark)
2323 					fprintf(stderr, "MARK ");
2324 				if (!cf->barrier)
2325 					fprintf(stderr, "NO_BARRIER ");
2326 				if (cf->end_of_program)
2327 					fprintf(stderr, "EOP ");
2328 				fprintf(stderr, "\n");
2329 			} else if (r600_isa_cf(cf->op)->flags & CF_MEM) {
2330 				int o = 0;
2331 				const char *exp_type_r600[] = {"WRITE", "WRITE_IND", "READ",
2332 				                               "READ_IND"};
2333 				const char *exp_type_r700[] = {"WRITE", "WRITE_IND", "WRITE_ACK",
2334 				                               "WRITE_IND_ACK"};
2335 
2336 				const char **exp_type = bc->gfx_level >= R700 ?
2337 					exp_type_r700 : exp_type_r600;
2338 
2339 				o += fprintf(stderr, "%04d %08X %08X  %s ", id,
2340 						bc->bytecode[id], bc->bytecode[id + 1], cfop->name);
2341 				o += print_indent(o, 43);
2342 				o += fprintf(stderr, "%s ", exp_type[cf->output.type]);
2343 
2344 				if (r600_isa_cf(cf->op)->flags & CF_RAT) {
2345 					o += fprintf(stderr, "RAT%d", cf->rat.id);
2346 					if (cf->rat.index_mode) {
2347 						o += fprintf(stderr, "[IDX%d]", cf->rat.index_mode - 1);
2348 					}
2349 					assert(ARRAY_SIZE(rat_instr_name) > cf->rat.inst);
2350 					o += fprintf(stderr, " %s ", rat_instr_name[cf->rat.inst]);
2351 				}
2352 
2353 				if (cf->output.burst_count > 1) {
2354 					o += fprintf(stderr, "%d-%d ", cf->output.array_base,
2355 							cf->output.array_base + cf->output.burst_count - 1);
2356 					o += print_indent(o, 55);
2357 					o += fprintf(stderr, "R%d-%d.", cf->output.gpr,
2358 							cf->output.gpr + cf->output.burst_count - 1);
2359 				} else {
2360 					o += fprintf(stderr, "%d ", cf->output.array_base);
2361 					o += print_indent(o, 55);
2362 					o += fprintf(stderr, "R%d.", cf->output.gpr);
2363 				}
2364 				for (int i = 0; i < 4; ++i) {
2365 					if (cf->output.comp_mask & (1 << i))
2366 						o += print_swizzle(i);
2367 					else
2368 						o += print_swizzle(7);
2369 				}
2370 
2371 				if (cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND ||
2372 				    cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND)
2373 					o += fprintf(stderr, " R%d.xyz", cf->output.index_gpr);
2374 
2375 				o += print_indent(o, 67);
2376 
2377 				fprintf(stderr, " ES:%i ", cf->output.elem_size);
2378 				if (cf->output.array_size != 0xFFF)
2379 					fprintf(stderr, "AS:%i ", cf->output.array_size);
2380 				if (cf->mark)
2381 					fprintf(stderr, "MARK ");
2382 				if (!cf->barrier)
2383 					fprintf(stderr, "NO_BARRIER ");
2384 				if (cf->end_of_program)
2385 					fprintf(stderr, "EOP ");
2386 
2387 				if (cf->output.mark)
2388 					fprintf(stderr, "MARK ");
2389 
2390 				fprintf(stderr, "\n");
2391 			} else {
2392 				fprintf(stderr, "%04d %08X %08X  %s ", id, bc->bytecode[id],
2393 						bc->bytecode[id + 1], cfop->name);
2394 				fprintf(stderr, "@%d ", cf->cf_addr);
2395 				if (cf->cond)
2396 					fprintf(stderr, "CND:%X ", cf->cond);
2397 				if (cf->pop_count)
2398 					fprintf(stderr, "POP:%X ", cf->pop_count);
2399 				if (cf->count && (cfop->flags & CF_EMIT))
2400 					fprintf(stderr, "STREAM%d ", cf->count);
2401 				if (cf->vpm)
2402 					fprintf(stderr, "VPM ");
2403 				if (cf->end_of_program)
2404 					fprintf(stderr, "EOP ");
2405 				fprintf(stderr, "\n");
2406 			}
2407 		}
2408 
2409 		id = cf->addr;
2410 		nliteral = 0;
2411 		last = 1;
2412 		int chan_mask = 0;
2413 		LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
2414 			const char chan[] = "xyzwt";
2415 			const char *omod_str[] = {"","*2","*4","/2"};
2416 			const struct alu_op_info *aop = r600_isa_alu(alu->op);
2417 			int o = 0;
2418 
2419 			r600_bytecode_alu_nliterals(alu, literal, &nliteral);
2420 			o += fprintf(stderr, " %04d %08X %08X  ", id, bc->bytecode[id], bc->bytecode[id+1]);
2421 			if (last)
2422 				o += fprintf(stderr, "%4d ", ++ngr);
2423 			else
2424 				o += fprintf(stderr, "     ");
2425 
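			/* mark the trans slot: the op is scalar-only (and not on cayman) or its channel is already taken in this group */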
2426 			if ((chan_mask & (1 << alu->dst.chan)) ||
2427 				((aop->slots[bc->isa->hw_class] == AF_S) && !(bc->isa->hw_class == ISA_CC_CAYMAN)))
2428 				o += fprintf(stderr, "t:");
2429 			else
2430 				o += fprintf(stderr, "%c:", chan[alu->dst.chan]);
2431 			chan_mask |= 1 << alu->dst.chan;
2432 
2433 			o += fprintf(stderr, "%c%c %c ", alu->execute_mask ? 'M':' ',
2434 					alu->update_pred ? 'P':' ',
2435 					alu->pred_sel ? alu->pred_sel==2 ? '0':'1':' ');
2436 
2437 			o += fprintf(stderr, "%s%s%s ", aop->name,
2438 					omod_str[alu->omod], alu->dst.clamp ? "_sat":"");
2439 
2440 			o += print_indent(o,60);
2441 			if (bc->isa->hw_class == ISA_CC_CAYMAN && alu->op == ALU_OP1_MOVA_INT) {
2442 				switch (alu->dst.sel) {
2443 				case 0: fprintf(stderr, "AR"); break;
2444 				case 2: fprintf(stderr, "CF_IDX0"); break;
2445 				case 3: fprintf(stderr, "CF_IDX1"); break;
2446 				}
2447 			} else {
2448 				o += print_dst(alu);
2449 			}
2450 			for (int i = 0; i < aop->src_count; ++i) {
2451 				o += fprintf(stderr, i == 0 ? ",  ": ", ");
2452 				o += print_src(alu, i);
2453 			}
2454 
2455 			if (alu->bank_swizzle) {
2456 				o += print_indent(o,75);
2457 				o += fprintf(stderr, "  BS:%d", alu->bank_swizzle);
2458 			}
2459 
2460 			fprintf(stderr, "\n");
2461 			id += 2;
2462 
2463 			if (alu->last) {
2464 				for (unsigned i = 0; i < nliteral; i++, id++) {
2465 					float *f = (float*)(bc->bytecode + id);
2466 					o = fprintf(stderr, " %04d %08X", id, bc->bytecode[id]);
2467 					print_indent(o, 60);
2468 					fprintf(stderr, " %f (%d)\n", *f, *(bc->bytecode + id));
2469 				}
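				/* skip the pad dword when an odd number of literals was emitted */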
2470 				id += nliteral & 1;
2471 				nliteral = 0;
2472 				chan_mask = 0;
2473 			}
2474 			last = alu->last;
2475 		}
2476 
2477 		LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
2478 			int o = 0;
2479 			o += fprintf(stderr, " %04d %08X %08X %08X   ", id, bc->bytecode[id],
2480 					bc->bytecode[id + 1], bc->bytecode[id + 2]);
2481 
2482 			o += fprintf(stderr, "%s ", r600_isa_fetch(tex->op)->name);
2483 
2484 			o += print_indent(o, 50);
2485 
2486 			o += fprintf(stderr, "R%d.", tex->dst_gpr);
2487 			o += print_swizzle(tex->dst_sel_x);
2488 			o += print_swizzle(tex->dst_sel_y);
2489 			o += print_swizzle(tex->dst_sel_z);
2490 			o += print_swizzle(tex->dst_sel_w);
2491 
2492 			o += fprintf(stderr, ", R%d.", tex->src_gpr);
2493 			o += print_swizzle(tex->src_sel_x);
2494 			o += print_swizzle(tex->src_sel_y);
2495 			o += print_swizzle(tex->src_sel_z);
2496 			o += print_swizzle(tex->src_sel_w);
2497 
2498 			o += fprintf(stderr, ",  RID:%d ", tex->resource_id);
2499 			if (tex->resource_index_mode)
2500 				fprintf(stderr, "RQ_%s", index_mode[tex->resource_index_mode]);
2501 
2502 			o += fprintf(stderr, ", SID:%d  ", tex->sampler_id);
2503 
2504 			if (tex->sampler_index_mode)
2505 				fprintf(stderr, "SQ_%s ", index_mode[tex->sampler_index_mode]);
2506 
2507 
2508 
2509 			if (tex->lod_bias)
2510 				fprintf(stderr, "LB:%d ", tex->lod_bias);
2511 
2512 			fprintf(stderr, "CT:%c%c%c%c ",
2513 					tex->coord_type_x ? 'N' : 'U',
2514 					tex->coord_type_y ? 'N' : 'U',
2515 					tex->coord_type_z ? 'N' : 'U',
2516 					tex->coord_type_w ? 'N' : 'U');
2517 
2518 			if (tex->offset_x)
2519 				fprintf(stderr, "OX:%d ", tex->offset_x);
2520 			if (tex->offset_y)
2521 				fprintf(stderr, "OY:%d ", tex->offset_y);
2522 			if (tex->offset_z)
2523 				fprintf(stderr, "OZ:%d ", tex->offset_z);
2524 
2525 			id += 4;
2526 			fprintf(stderr, "\n");
2527 		}
2528 
2529 		LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
2530 			int o = 0;
2531 			const char * fetch_type[] = {"VERTEX", "INSTANCE", ""};
2532 			o += fprintf(stderr, " %04d %08X %08X %08X   ", id, bc->bytecode[id],
2533 					bc->bytecode[id + 1], bc->bytecode[id + 2]);
2534 
2535 			o += fprintf(stderr, "%s ", r600_isa_fetch(vtx->op)->name);
2536 
2537 			o += print_indent(o, 50);
2538 
2539 			o += fprintf(stderr, "R%d.", vtx->dst_gpr);
2540 			o += print_swizzle(vtx->dst_sel_x);
2541 			o += print_swizzle(vtx->dst_sel_y);
2542 			o += print_swizzle(vtx->dst_sel_z);
2543 			o += print_swizzle(vtx->dst_sel_w);
2544 
2545 			o += fprintf(stderr, ", R%d.", vtx->src_gpr);
2546 			o += print_swizzle(vtx->src_sel_x);
2547 			if (r600_isa_fetch(vtx->op)->flags & FF_MEM)
2548 				o += print_swizzle(vtx->src_sel_y);
2549 
2550 			if (vtx->offset)
2551 				fprintf(stderr, " +%db", vtx->offset);
2552 
2553 			o += print_indent(o, 55);
2554 
2555 			fprintf(stderr, ",  RID:%d ", vtx->buffer_id);
2556 
2557 			fprintf(stderr, "%s ", fetch_type[vtx->fetch_type]);
2558 
2559 			if (bc->gfx_level < CAYMAN && vtx->mega_fetch_count)
2560 				fprintf(stderr, "MFC:%d ", vtx->mega_fetch_count);
2561 
2562 			if (bc->gfx_level >= EVERGREEN && vtx->buffer_index_mode)
2563 				fprintf(stderr, "SQ_%s ", index_mode[vtx->buffer_index_mode]);
2564 
2565 			if (r600_isa_fetch(vtx->op)->flags & FF_MEM) {
2566 				if (vtx->uncached)
2567 					fprintf(stderr, "UNCACHED ");
2568 				if (vtx->indexed)
2569 					fprintf(stderr, "INDEXED:%d ", vtx->indexed);
2570 
2571 				fprintf(stderr, "ELEM_SIZE:%d ", vtx->elem_size);
2572 				if (vtx->burst_count)
2573 					fprintf(stderr, "BURST_COUNT:%d ", vtx->burst_count);
2574 				fprintf(stderr, "ARRAY_BASE:%d ", vtx->array_base);
2575 				fprintf(stderr, "ARRAY_SIZE:%d ", vtx->array_size);
2576 			}
2577 
2578 			fprintf(stderr, "UCF:%d ", vtx->use_const_fields);
2579 			fprintf(stderr, "FMT(DTA:%d ", vtx->data_format);
2580 			fprintf(stderr, "NUM:%d ", vtx->num_format_all);
2581 			fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
2582 			fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);
2583 
2584 			id += 4;
2585 		}
2586 
2587 		LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) {
2588 			UNUSED int o = 0;
2589 			o += fprintf(stderr, " %04d %08X %08X %08X   ", id, bc->bytecode[id],
2590 					bc->bytecode[id + 1], bc->bytecode[id + 2]);
2591 
2592 			o += fprintf(stderr, "%s ", r600_isa_fetch(gds->op)->name);
2593 
2594 			if (gds->op != FETCH_OP_TF_WRITE) {
2595 				o += fprintf(stderr, "R%d.", gds->dst_gpr);
2596 				o += print_swizzle(gds->dst_sel_x);
2597 				o += print_swizzle(gds->dst_sel_y);
2598 				o += print_swizzle(gds->dst_sel_z);
2599 				o += print_swizzle(gds->dst_sel_w);
2600 			}
2601 
2602 			o += fprintf(stderr, ", R%d.", gds->src_gpr);
2603 			o += print_swizzle(gds->src_sel_x);
2604 			o += print_swizzle(gds->src_sel_y);
2605 			o += print_swizzle(gds->src_sel_z);
2606 
2607 			if (gds->op != FETCH_OP_TF_WRITE) {
2608 				o += fprintf(stderr, ", R%d.", gds->src_gpr2);
2609 			}
2610 			if (gds->alloc_consume) {
2611 				o += fprintf(stderr, " UAV: %d", gds->uav_id);
2612 				if (gds->uav_index_mode)
2613 					o += fprintf(stderr, "[%s]", index_mode[gds->uav_index_mode]);
2614 			}
2615 			fprintf(stderr, "\n");
2616 			id += 4;
2617 		}
2618 	}
2619 
2620 	fprintf(stderr, "--------------------------------------\n");
2621 }
2622 
2623 void r600_vertex_data_type(enum pipe_format pformat,
2624 				  unsigned *format,
2625 				  unsigned *num_format, unsigned *format_comp, unsigned *endian)
2626 {
2627 	const struct util_format_description *desc;
2628 	unsigned i;
2629 
2630 	*format = 0;
2631 	*num_format = 0;
2632 	*format_comp = 0;
2633 	*endian = ENDIAN_NONE;
2634 
2635 	if (pformat == PIPE_FORMAT_R11G11B10_FLOAT) {
2636 		*format = FMT_10_11_11_FLOAT;
2637 		*endian = r600_endian_swap(32);
2638 		return;
2639 	}
2640 
2641 	if (pformat == PIPE_FORMAT_B5G6R5_UNORM) {
2642 		*format = FMT_5_6_5;
2643 		*endian = r600_endian_swap(16);
2644 		return;
2645 	}
2646 
2647 	if (pformat == PIPE_FORMAT_B5G5R5A1_UNORM) {
2648 		*format = FMT_1_5_5_5;
2649 		*endian = r600_endian_swap(16);
2650 		return;
2651 	}
2652 
2653 	if (pformat == PIPE_FORMAT_A1B5G5R5_UNORM) {
2654 		*format = FMT_5_5_5_1;
2655 		return;
2656 	}
2657 
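	/* for the remaining plain formats, derive FMT_* from the first non-void channel's type and size plus the channel count */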
2658 	desc = util_format_description(pformat);
2659 	if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
2660 		goto out_unknown;
2661 	}
2662 
2663 	i = util_format_get_first_non_void_channel(pformat);
2664 
2665 	*endian = r600_endian_swap(desc->channel[i].size);
2666 
2667 	switch (desc->channel[i].type) {
2668 	/* Half-floats, floats, ints */
2669 	case UTIL_FORMAT_TYPE_FLOAT:
2670 		switch (desc->channel[i].size) {
2671 		case 16:
2672 			switch (desc->nr_channels) {
2673 			case 1:
2674 				*format = FMT_16_FLOAT;
2675 				break;
2676 			case 2:
2677 				*format = FMT_16_16_FLOAT;
2678 				break;
2679 			case 3:
2680 			case 4:
2681 				*format = FMT_16_16_16_16_FLOAT;
2682 				break;
2683 			}
2684 			break;
2685 		case 32:
2686 			switch (desc->nr_channels) {
2687 			case 1:
2688 				*format = FMT_32_FLOAT;
2689 				break;
2690 			case 2:
2691 				*format = FMT_32_32_FLOAT;
2692 				break;
2693 			case 3:
2694 				*format = FMT_32_32_32_FLOAT;
2695 				break;
2696 			case 4:
2697 				*format = FMT_32_32_32_32_FLOAT;
2698 				break;
2699 			}
2700 			break;
2701 		default:
2702 			goto out_unknown;
2703 		}
2704 		break;
2705 		/* Unsigned ints */
2706 	case UTIL_FORMAT_TYPE_UNSIGNED:
2707 		/* Signed ints */
2708 	case UTIL_FORMAT_TYPE_SIGNED:
2709 		switch (desc->channel[i].size) {
2710 		case 4:
2711 			switch (desc->nr_channels) {
2712 			case 2:
2713 				*format = FMT_4_4;
2714 				break;
2715 			case 4:
2716 				*format = FMT_4_4_4_4;
2717 				break;
2718 			}
2719 			break;
2720 		case 8:
2721 			switch (desc->nr_channels) {
2722 			case 1:
2723 				*format = FMT_8;
2724 				break;
2725 			case 2:
2726 				*format = FMT_8_8;
2727 				break;
2728 			case 3:
2729 			case 4:
2730 				*format = FMT_8_8_8_8;
2731 				break;
2732 			}
2733 			break;
2734 		case 10:
2735 			if (desc->nr_channels != 4)
2736 				goto out_unknown;
2737 
2738 			*format = FMT_2_10_10_10;
2739 			break;
2740 		case 16:
2741 			switch (desc->nr_channels) {
2742 			case 1:
2743 				*format = FMT_16;
2744 				break;
2745 			case 2:
2746 				*format = FMT_16_16;
2747 				break;
2748 			case 3:
2749 			case 4:
2750 				*format = FMT_16_16_16_16;
2751 				break;
2752 			}
2753 			break;
2754 		case 32:
2755 			switch (desc->nr_channels) {
2756 			case 1:
2757 				*format = FMT_32;
2758 				break;
2759 			case 2:
2760 				*format = FMT_32_32;
2761 				break;
2762 			case 3:
2763 				*format = FMT_32_32_32;
2764 				break;
2765 			case 4:
2766 				*format = FMT_32_32_32_32;
2767 				break;
2768 			}
2769 			break;
2770 		default:
2771 			goto out_unknown;
2772 		}
2773 		break;
2774 	default:
2775 		goto out_unknown;
2776 	}
2777 
2778 	if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2779 		*format_comp = 1;
2780 	}
2781 
2782 	*num_format = 0;
2783 	if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
2784 	    desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2785 		if (!desc->channel[i].normalized) {
2786 			if (desc->channel[i].pure_integer)
2787 				*num_format = 1;
2788 			else
2789 				*num_format = 2;
2790 		}
2791 	}
2792 	return;
2793 out_unknown:
2794 	R600_ASM_ERR("unsupported vertex format %s\n", util_format_name(pformat));
2795 }
2796 
2797 void r600_bytecode_alu_read(struct r600_bytecode *bc,
2798 		struct r600_bytecode_alu *alu, uint32_t word0, uint32_t word1)
2799 {
2800 	/* WORD0 */
2801 	alu->src[0].sel = G_SQ_ALU_WORD0_SRC0_SEL(word0);
2802 	alu->src[0].rel = G_SQ_ALU_WORD0_SRC0_REL(word0);
2803 	alu->src[0].chan = G_SQ_ALU_WORD0_SRC0_CHAN(word0);
2804 	alu->src[0].neg = G_SQ_ALU_WORD0_SRC0_NEG(word0);
2805 	alu->src[1].sel = G_SQ_ALU_WORD0_SRC1_SEL(word0);
2806 	alu->src[1].rel = G_SQ_ALU_WORD0_SRC1_REL(word0);
2807 	alu->src[1].chan = G_SQ_ALU_WORD0_SRC1_CHAN(word0);
2808 	alu->src[1].neg = G_SQ_ALU_WORD0_SRC1_NEG(word0);
2809 	alu->index_mode = G_SQ_ALU_WORD0_INDEX_MODE(word0);
2810 	alu->pred_sel = G_SQ_ALU_WORD0_PRED_SEL(word0);
2811 	alu->last = G_SQ_ALU_WORD0_LAST(word0);
2812 
2813 	/* WORD1 */
2814 	alu->bank_swizzle = G_SQ_ALU_WORD1_BANK_SWIZZLE(word1);
2815 	if (alu->bank_swizzle)
2816 		alu->bank_swizzle_force = alu->bank_swizzle;
2817 	alu->dst.sel = G_SQ_ALU_WORD1_DST_GPR(word1);
2818 	alu->dst.rel = G_SQ_ALU_WORD1_DST_REL(word1);
2819 	alu->dst.chan = G_SQ_ALU_WORD1_DST_CHAN(word1);
2820 	alu->dst.clamp = G_SQ_ALU_WORD1_CLAMP(word1);
2821 	if (G_SQ_ALU_WORD1_ENCODING(word1)) /*ALU_DWORD1_OP3*/
2822 	{
2823 		alu->is_op3 = 1;
2824 		alu->src[2].sel = G_SQ_ALU_WORD1_OP3_SRC2_SEL(word1);
2825 		alu->src[2].rel = G_SQ_ALU_WORD1_OP3_SRC2_REL(word1);
2826 		alu->src[2].chan = G_SQ_ALU_WORD1_OP3_SRC2_CHAN(word1);
2827 		alu->src[2].neg = G_SQ_ALU_WORD1_OP3_SRC2_NEG(word1);
2828 		alu->op = r600_isa_alu_by_opcode(bc->isa,
2829 				G_SQ_ALU_WORD1_OP3_ALU_INST(word1), /* is_op3 = */ 1);
2830 
2831 	}
2832 	else /*ALU_DWORD1_OP2*/
2833 	{
2834 		alu->src[0].abs = G_SQ_ALU_WORD1_OP2_SRC0_ABS(word1);
2835 		alu->src[1].abs = G_SQ_ALU_WORD1_OP2_SRC1_ABS(word1);
2836 		alu->op = r600_isa_alu_by_opcode(bc->isa,
2837 				G_SQ_ALU_WORD1_OP2_ALU_INST(word1), /* is_op3 = */ 0);
2838 		alu->omod = G_SQ_ALU_WORD1_OP2_OMOD(word1);
2839 		alu->dst.write = G_SQ_ALU_WORD1_OP2_WRITE_MASK(word1);
2840 		alu->update_pred = G_SQ_ALU_WORD1_OP2_UPDATE_PRED(word1);
2841 		alu->execute_mask =
2842 			G_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(word1);
2843 	}
2844 }
2845 
2846 #if 0
2847 void r600_bytecode_export_read(struct r600_bytecode *bc,
2848 		struct r600_bytecode_output *output, uint32_t word0, uint32_t word1)
2849 {
2850 	output->array_base = G_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(word0);
2851 	output->type = G_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(word0);
2852 	output->gpr = G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(word0);
2853 	output->elem_size = G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(word0);
2854 
2855 	output->swizzle_x = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(word1);
2856 	output->swizzle_y = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(word1);
2857 	output->swizzle_z = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(word1);
2858 	output->swizzle_w = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(word1);
2859 	output->burst_count = G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(word1);
2860 	output->end_of_program = G_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(word1);
2861 	output->op = r600_isa_cf_by_opcode(bc->isa,
2862 			G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(word1), 0);
2863 	output->barrier = G_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(word1);
2864 	output->array_size = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(word1);
2865 	output->comp_mask = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(word1);
2866 }
2867 #endif
2868