xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/r300/compiler/r3xx_vertprog.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2009 Nicolai Hähnle <[email protected]>
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "radeon_compiler.h"
7 
8 #include <stdbool.h>
9 #include <stdio.h>
10 
11 #include "r300_reg.h"
12 
13 #include "radeon_compiler_util.h"
14 #include "radeon_dataflow.h"
15 #include "radeon_program.h"
16 #include "radeon_program_alu.h"
17 #include "radeon_swizzle.h"
18 #include "radeon_remove_constants.h"
19 #include "radeon_regalloc.h"
20 #include "radeon_list.h"
21 
22 #include "util/compiler.h"
23 
/*
 * Build a source operand that reuses the register of vpi->SrcReg[src_slot]
 * (its index, register class and relative-addressing bit) but selects the
 * same swizzle value `const_swz` on all four channels, with no negation.
 * This is how a constant ZERO or ONE source is obtained from an
 * already-setup and valid source.
 *
 * The macro expands in place: it relies on `vp` and `vpi` being in scope at
 * the call site and evaluates its arguments more than once.
 */
#define __CONST(src_slot, const_swz) \
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[src_slot]), \
			 t_swizzle(const_swz), \
			 t_swizzle(const_swz), \
			 t_swizzle(const_swz), \
			 t_swizzle(const_swz), \
			 t_src_class(vpi->SrcReg[src_slot].File), \
			 RC_MASK_NONE) | \
	 (vpi->SrcReg[src_slot].RelAddr << 4))
36 
37 
t_dst_mask(unsigned int mask)38 static unsigned long t_dst_mask(unsigned int mask)
39 {
40 	/* RC_MASK_* is equivalent to VSF_FLAG_* */
41 	return mask & RC_MASK_XYZW;
42 }
43 
/**
 * Map a compiler register file to the hardware destination register class.
 *
 * Output and address registers have dedicated classes. Every other file is
 * emitted as a temporary; a diagnostic is printed to stderr when the file is
 * not actually RC_FILE_TEMPORARY.
 */
static unsigned long t_dst_class(rc_register_file file)
{
	if (file == RC_FILE_OUTPUT)
		return PVS_DST_REG_OUT;
	if (file == RC_FILE_ADDRESS)
		return PVS_DST_REG_A0;

	/* Unexpected files are reported but still treated as temporaries. */
	if (file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __func__, file);

	return PVS_DST_REG_TEMPORARY;
}
58 
t_dst_index(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)59 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
60 				 struct rc_dst_register *dst)
61 {
62 	if (dst->File == RC_FILE_OUTPUT)
63 		return vp->outputs[dst->Index];
64 
65 	return dst->Index;
66 }
67 
/**
 * Map a compiler register file to the hardware source register class.
 *
 * Inputs and constants have dedicated classes. RC_FILE_NONE and
 * RC_FILE_TEMPORARY both map to the temporary class; any other file is
 * reported on stderr and then treated as a temporary as well.
 */
static unsigned long t_src_class(rc_register_file file)
{
	if (file == RC_FILE_INPUT)
		return PVS_SRC_REG_INPUT;
	if (file == RC_FILE_CONSTANT)
		return PVS_SRC_REG_CONSTANT;

	/* Unexpected files are reported but still treated as temporaries. */
	if (file != RC_FILE_NONE && file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __func__, file);

	return PVS_SRC_REG_TEMPORARY;
}
83 
t_src_conflict(struct rc_src_register a,struct rc_src_register b)84 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
85 {
86 	unsigned long aclass = t_src_class(a.File);
87 	unsigned long bclass = t_src_class(b.File);
88 
89 	if (aclass != bclass)
90 		return 0;
91 	if (aclass == PVS_SRC_REG_TEMPORARY)
92 		return 0;
93 
94 	if (a.RelAddr || b.RelAddr)
95 		return 1;
96 	if (a.Index != b.Index)
97 		return 1;
98 
99 	return 0;
100 }
101 
/**
 * Translate a swizzle selector to its hardware encoding.
 *
 * The Mesa RC_SWIZZLE_* values are all identical to VSF_IN_COMPONENT_*, so
 * the value passes through unchanged (only widened to unsigned long).
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	const unsigned long hw_swizzle = swizzle;

	return hw_swizzle;
}
107 
t_src_index(struct r300_vertex_program_code * vp,struct rc_src_register * src)108 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
109 				 struct rc_src_register *src)
110 {
111 	if (src->File == RC_FILE_INPUT) {
112 		assert(vp->inputs[src->Index] != -1);
113 		return vp->inputs[src->Index];
114 	} else {
115 		if (src->Index < 0) {
116 			fprintf(stderr,
117 				"negative offsets for indirect addressing do not work.\n");
118 			return 0;
119 		}
120 		return src->Index;
121 	}
122 }
123 
124 /* these two functions should probably be merged... */
125 
t_src(struct r300_vertex_program_code * vp,struct rc_src_register * src)126 static unsigned long t_src(struct r300_vertex_program_code *vp,
127 			   struct rc_src_register *src)
128 {
129 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
130 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
131 	 */
132 	return PVS_SRC_OPERAND(t_src_index(vp, src),
133 			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
134 			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
135 			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
136 			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
137 			       t_src_class(src->File),
138 			       src->Negate) |
139 	       (src->RelAddr << 4) | (src->Abs << 3);
140 }
141 
t_src_scalar(struct r300_vertex_program_code * vp,struct rc_src_register * src)142 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
143 				  struct rc_src_register *src)
144 {
145 	/* src->Negate uses the RC_MASK_ flags from program_instruction.h,
146 	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
147 	 */
148 	unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
149 
150 	return PVS_SRC_OPERAND(t_src_index(vp, src),
151 			       t_swizzle(swz),
152 			       t_swizzle(swz),
153 			       t_swizzle(swz),
154 			       t_swizzle(swz),
155 			       t_src_class(src->File),
156 			       src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
157 	       (src->RelAddr << 4) | (src->Abs << 3);
158 }
159 
valid_dst(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)160 static int valid_dst(struct r300_vertex_program_code *vp,
161 			   struct rc_dst_register *dst)
162 {
163 	if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
164 		return 0;
165 	} else if (dst->File == RC_FILE_ADDRESS) {
166 		assert(dst->Index == 0);
167 	}
168 
169 	return 1;
170 }
171 
ei_vector1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)172 static void ei_vector1(struct r300_vertex_program_code *vp,
173 				unsigned int hw_opcode,
174 				struct rc_sub_instruction *vpi,
175 				unsigned int * inst)
176 {
177 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
178 				     0,
179 				     0,
180 				     t_dst_index(vp, &vpi->DstReg),
181 				     t_dst_mask(vpi->DstReg.WriteMask),
182 				     t_dst_class(vpi->DstReg.File),
183                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
184 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
185 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
186 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
187 }
188 
ei_vector2(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)189 static void ei_vector2(struct r300_vertex_program_code *vp,
190 				unsigned int hw_opcode,
191 				struct rc_sub_instruction *vpi,
192 				unsigned int * inst)
193 {
194 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
195 				     0,
196 				     0,
197 				     t_dst_index(vp, &vpi->DstReg),
198 				     t_dst_mask(vpi->DstReg.WriteMask),
199 				     t_dst_class(vpi->DstReg.File),
200                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
201 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
202 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
203 	inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
204 }
205 
ei_math1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)206 static void ei_math1(struct r300_vertex_program_code *vp,
207 				unsigned int hw_opcode,
208 				struct rc_sub_instruction *vpi,
209 				unsigned int * inst)
210 {
211 	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
212 				     1,
213 				     0,
214 				     t_dst_index(vp, &vpi->DstReg),
215 				     t_dst_mask(vpi->DstReg.WriteMask),
216 				     t_dst_class(vpi->DstReg.File),
217                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
218 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
219 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
220 	inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
221 }
222 
ei_cmp(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)223 static void ei_cmp(struct r300_vertex_program_code *vp,
224 				struct rc_sub_instruction *vpi,
225 				unsigned int * inst)
226 {
227 	inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE,
228 				     0,
229 				     0,
230 				     t_dst_index(vp, &vpi->DstReg),
231 				     t_dst_mask(vpi->DstReg.WriteMask),
232 				     t_dst_class(vpi->DstReg.File),
233                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
234 
235 	/* Arguments with constant swizzles still count as a unique
236 	 * temporary, so we should make sure these arguments share a
237 	 * register index with one of the other arguments. */
238 	for (unsigned i = 0; i < 3; i++) {
239 		unsigned j = (i + 1) % 3;
240 		if (vpi->SrcReg[i].File == RC_FILE_NONE &&
241 			(vpi->SrcReg[j].File == RC_FILE_NONE ||
242 			 vpi->SrcReg[j].File == RC_FILE_TEMPORARY)) {
243 			vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
244 			break;
245 		}
246 	}
247 
248 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
249 	inst[2] = t_src(vp, &vpi->SrcReg[2]);
250 	inst[3] = t_src(vp, &vpi->SrcReg[1]);
251 }
252 
ei_lit(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)253 static void ei_lit(struct r300_vertex_program_code *vp,
254 				      struct rc_sub_instruction *vpi,
255 				      unsigned int * inst)
256 {
257 	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
258 
259 	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
260 				     1,
261 				     0,
262 				     t_dst_index(vp, &vpi->DstReg),
263 				     t_dst_mask(vpi->DstReg.WriteMask),
264 				     t_dst_class(vpi->DstReg.File),
265                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
266 	/* NOTE: Users swizzling might not work. */
267 	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
268 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
269 				  PVS_SRC_SELECT_FORCE_0,	// Z
270 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
271 				  t_src_class(vpi->SrcReg[0].File),
272 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
273 	    (vpi->SrcReg[0].RelAddr << 4);
274 	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
275 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
276 				  PVS_SRC_SELECT_FORCE_0,	// Z
277 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
278 				  t_src_class(vpi->SrcReg[0].File),
279 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
280 	    (vpi->SrcReg[0].RelAddr << 4);
281 	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
282 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
283 				  PVS_SRC_SELECT_FORCE_0,	// Z
284 				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
285 				  t_src_class(vpi->SrcReg[0].File),
286 				  vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
287 	    (vpi->SrcReg[0].RelAddr << 4);
288 }
289 
ei_mad(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)290 static void ei_mad(struct r300_vertex_program_code *vp,
291 				      struct rc_sub_instruction *vpi,
292 				      unsigned int * inst)
293 {
294 	unsigned int i;
295 	/* Remarks about hardware limitations of MAD
296 	 * (please preserve this comment, as this information is _NOT_
297 	 * in the documentation provided by AMD).
298 	 *
299 	 * As described in the documentation, MAD with three unique temporary
300 	 * source registers requires the use of the macro version.
301 	 *
302 	 * However (and this is not mentioned in the documentation), apparently
303 	 * the macro version is _NOT_ a full superset of the normal version.
304 	 * In particular, the macro version does not always work when relative
305 	 * addressing is used in the source operands.
306 	 *
307 	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
308 	 * assembly shader path when using medium quality animations
309 	 * (i.e. animations with matrix blending instead of quaternion blending).
310 	 *
311 	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
312 	 * test for this issue - for some reason, it is possible to have vertex
313 	 * programs whose prefix is *exactly* the same as the prefix of the
314 	 * offending program in Sauerbraten up to the offending instruction
315 	 * without causing any trouble.
316 	 *
317 	 * Bottom line: Only use the macro version only when really necessary;
318 	 * according to AMD docs, this should improve performance by one clock
319 	 * as a nice side bonus.
320 	 */
321 	if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
322 	    vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
323 	    vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
324 	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
325 	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
326 	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
327 		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
328 				0,
329 				1,
330 				t_dst_index(vp, &vpi->DstReg),
331 				t_dst_mask(vpi->DstReg.WriteMask),
332 				t_dst_class(vpi->DstReg.File),
333                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
334 	} else {
335 		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
336 				0,
337 				0,
338 				t_dst_index(vp, &vpi->DstReg),
339 				t_dst_mask(vpi->DstReg.WriteMask),
340 				t_dst_class(vpi->DstReg.File),
341                                 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
342 
343 		/* Arguments with constant swizzles still count as a unique
344 		 * temporary, so we should make sure these arguments share a
345 		 * register index with one of the other arguments. */
346 		for (i = 0; i < 3; i++) {
347 			unsigned int j;
348 			if (vpi->SrcReg[i].File != RC_FILE_NONE)
349 				continue;
350 
351 			for (j = 0; j < 3; j++) {
352 				if (i != j) {
353 					vpi->SrcReg[i].Index =
354 						vpi->SrcReg[j].Index;
355 					break;
356 				}
357 			}
358 		}
359 	}
360 	inst[1] = t_src(vp, &vpi->SrcReg[0]);
361 	inst[2] = t_src(vp, &vpi->SrcReg[1]);
362 	inst[3] = t_src(vp, &vpi->SrcReg[2]);
363 }
364 
ei_pow(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)365 static void ei_pow(struct r300_vertex_program_code *vp,
366 				      struct rc_sub_instruction *vpi,
367 				      unsigned int * inst)
368 {
369 	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
370 				     1,
371 				     0,
372 				     t_dst_index(vp, &vpi->DstReg),
373 				     t_dst_mask(vpi->DstReg.WriteMask),
374 				     t_dst_class(vpi->DstReg.File),
375                                      vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
376 	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
377 	inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
378 	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
379 }
380 
translate_vertex_program(struct radeon_compiler * c,void * user)381 static void translate_vertex_program(struct radeon_compiler *c, void *user)
382 {
383 	struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
384 	struct rc_instruction *rci;
385 
386 	unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
387 	unsigned loop_depth = 0;
388 	bool last_input_read_at_loop_end = false;
389 	bool last_pos_write_at_loop_end = false;
390 
391 	compiler->code->pos_end = 0;	/* Not supported yet */
392 	compiler->code->length = 0;
393 	compiler->code->num_temporaries = 0;
394 	compiler->code->last_input_read = 0;
395 	compiler->code->last_pos_write = 0;
396 
397 	compiler->SetHwInputOutput(compiler);
398 
399 	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
400 		struct rc_sub_instruction *vpi = &rci->U.I;
401 		unsigned int *inst = compiler->code->body.d + compiler->code->length;
402 		const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
403 
404 		/* Skip instructions writing to non-existing destination */
405 		if (!valid_dst(compiler->code, &vpi->DstReg))
406 			continue;
407 
408 		if (info->HasDstReg) {
409 			/* Neither is Saturate. */
410 			if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
411 				rc_error(&compiler->Base, "Vertex program does not support the Saturate "
412 					 "modifier (yet).\n");
413 			}
414 		}
415 
416 		if (compiler->code->length >= c->max_alu_insts * 4) {
417 			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
418 			return;
419 		}
420 
421 		assert(compiler->Base.is_r500 ||
422 		       (vpi->Opcode != RC_OPCODE_SEQ &&
423 			vpi->Opcode != RC_OPCODE_SNE));
424 
425 		switch (vpi->Opcode) {
426 		case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
427 		case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
428 		case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
429 		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
430 		case RC_OPCODE_CMP: ei_cmp(compiler->code, vpi, inst); break;
431 		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
432 		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
433 		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
434 		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
435 		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
436 		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
437 		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
438 		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
439 		case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
440 		case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
441 		case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
442 		case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
443 		case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
444 		case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
445 		case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
446 		case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
447 		case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
448 		case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
449 		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
450 		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
451 		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
452 		case RC_OPCODE_BGNLOOP:
453 		{
454 			if ((!compiler->Base.is_r500
455 				&& loop_depth >= R300_VS_MAX_LOOP_DEPTH)
456 				|| loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
457 				rc_error(&compiler->Base,
458 						"Loops are nested too deep.");
459 				return;
460 			}
461 			loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
462 			break;
463 		}
464 		case RC_OPCODE_ENDLOOP:
465 		{
466 			unsigned int act_addr;
467 			unsigned int last_addr;
468 			unsigned int ret_addr;
469 
470 			if (loop_depth == 1 && last_input_read_at_loop_end) {
471 				compiler->code->last_input_read = compiler->code->length / 4;
472 				last_input_read_at_loop_end = false;
473 			}
474 			if (loop_depth == 1 && last_pos_write_at_loop_end) {
475 				compiler->code->last_pos_write = compiler->code->length / 4;
476 				last_pos_write_at_loop_end = false;
477 			}
478 
479 			ret_addr = loops[--loop_depth];
480 			act_addr = ret_addr - 1;
481 			last_addr = (compiler->code->length / 4) - 1;
482 
483 			if (loop_depth >= R300_VS_MAX_FC_OPS) {
484 				rc_error(&compiler->Base,
485 					"Too many flow control instructions.");
486 				return;
487 			}
488 			/* Maximum of R500_PVS_FC_LOOP_CNT_JMP_INST is 0xff, here
489 			 * we reduce it to half to avoid occasional hangs on RV516
490 			 * and downclocked RV530.
491 			 */
492 			if (compiler->Base.is_r500) {
493 				compiler->code->fc_op_addrs.r500
494 					[compiler->code->num_fc_ops].lw =
495 					R500_PVS_FC_ACT_ADRS(act_addr)
496 					| R500_PVS_FC_LOOP_CNT_JMP_INST(0x0080)
497 					;
498 				compiler->code->fc_op_addrs.r500
499 					[compiler->code->num_fc_ops].uw =
500 					R500_PVS_FC_LAST_INST(last_addr)
501 					| R500_PVS_FC_RTN_INST(ret_addr)
502 					;
503 			} else {
504 				compiler->code->fc_op_addrs.r300
505 					[compiler->code->num_fc_ops] =
506 					R300_PVS_FC_ACT_ADRS(act_addr)
507 					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
508 					| R300_PVS_FC_LAST_INST(last_addr)
509 					| R300_PVS_FC_RTN_INST(ret_addr)
510 					;
511 			}
512 			compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
513 				R300_PVS_FC_LOOP_INIT_VAL(0x0)
514 				| R300_PVS_FC_LOOP_STEP_VAL(0x1)
515 				;
516 			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
517 						compiler->code->num_fc_ops);
518 			compiler->code->num_fc_ops++;
519 
520 			break;
521 		}
522 
523 		case RC_ME_PRED_SET_CLR:
524 			ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
525 			break;
526 
527 		case RC_ME_PRED_SET_INV:
528 			ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
529 			break;
530 
531 		case RC_ME_PRED_SET_POP:
532 			ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
533 			break;
534 
535 		case RC_ME_PRED_SET_RESTORE:
536 			ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
537 			break;
538 
539 		case RC_ME_PRED_SEQ:
540 			ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
541 			break;
542 
543 		case RC_ME_PRED_SNEQ:
544 			ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
545 			break;
546 
547 		case RC_VE_PRED_SNEQ_PUSH:
548 			ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
549 								vpi, inst);
550 			break;
551 
552 		default:
553 			rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
554 			return;
555 		}
556 
557 		if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
558 			inst[0] |= (PVS_DST_PRED_ENABLE_MASK
559 						<< PVS_DST_PRED_ENABLE_SHIFT);
560 			if (vpi->DstReg.Pred == RC_PRED_SET) {
561 				inst[0] |= (PVS_DST_PRED_SENSE_MASK
562 						<< PVS_DST_PRED_SENSE_SHIFT);
563 			}
564 		}
565 
566 		/* Update the number of temporaries. */
567 		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
568 		    vpi->DstReg.Index >= compiler->code->num_temporaries)
569 			compiler->code->num_temporaries = vpi->DstReg.Index + 1;
570 
571 		/* last instruction that writes position */
572 		if (info->HasDstReg && vpi->DstReg.File == RC_FILE_OUTPUT &&
573 		    t_dst_index(compiler->code, &vpi->DstReg) == 0) {
574 			if (loop_depth == 0)
575 				compiler->code->last_pos_write = compiler->code->length / 4;
576 			else
577 				last_pos_write_at_loop_end = true;
578 		}
579 
580 		for (unsigned i = 0; i < info->NumSrcRegs; i++) {
581 			if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
582 			    vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
583 				compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
584 			if (vpi->SrcReg[i].File == RC_FILE_INPUT) {
585 				if (loop_depth == 0)
586 					compiler->code->last_input_read = compiler->code->length / 4;
587 				else
588 					last_input_read_at_loop_end = true;
589 			}
590 
591 		}
592 
593 
594 		if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
595 			rc_error(&compiler->Base, "Too many temporaries.\n");
596 			return;
597 		}
598 
599 		compiler->code->length += 4;
600 
601 		if (compiler->Base.Error)
602 			return;
603 	}
604 }
605 
/**
 * Per-temporary bookkeeping used by get_reg(), which indexes an array of
 * these by the original (pre-allocation) temporary index.
 */
struct temporary_allocation {
	/* Set to 1 by get_reg() once HwTemp has been assigned. */
	unsigned int Allocated:1;
	/* Hardware temporary index chosen by get_reg(). */
	unsigned int HwTemp:15;
	/* NOTE(review): not referenced in the visible part of this file;
	 * presumably the last instruction reading the temporary - confirm. */
	struct rc_instruction * LastRead;
};
611 
get_reg(struct radeon_compiler * c,struct temporary_allocation * ta,bool * hwtemps,unsigned int orig)612 static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
613                    unsigned int orig)
614 {
615     if (!ta[orig].Allocated) {
616         int j;
617         for (j = 0; j < c->max_temp_regs; ++j)
618         {
619             if (!hwtemps[j])
620                 break;
621         }
622         ta[orig].Allocated = 1;
623         ta[orig].HwTemp = j;
624         hwtemps[ta[orig].HwTemp] = true;
625     }
626 
627     return ta[orig].HwTemp;
628 }
629 
allocate_temporary_registers(struct radeon_compiler * c,void * user)630 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
631 {
632 	unsigned int node_count, node_index;
633 	struct ra_class ** node_classes;
634 	struct rc_list * var_ptr;
635 	struct rc_list * variables;
636 	struct ra_graph * graph;
637 	const struct rc_regalloc_state *ra_state = c->regalloc_state;
638 
639 	rc_recompute_ips(c);
640 
641 	/* Get list of program variables */
642 	variables = rc_get_variables(c);
643 	node_count = rc_list_count(variables);
644 	node_classes = memory_pool_malloc(&c->Pool,
645 			node_count * sizeof(struct ra_class *));
646 
647 	for (var_ptr = variables, node_index = 0; var_ptr;
648 					var_ptr = var_ptr->Next, node_index++) {
649 		unsigned int class_index = 0;
650 		int index;
651 		/* Compute the live intervals */
652 		rc_variable_compute_live_intervals(var_ptr->Item);
653 		unsigned int writemask = rc_variable_writemask_sum(var_ptr->Item);
654 		index = rc_find_class(c->regalloc_state->class_list, writemask, 6);
655 		if (index > -1) {
656 			class_index = c->regalloc_state->class_list[index].ID;
657 		} else {
658 			rc_error(c,
659 				"Could not find class for index=%u mask=%u\n",
660 				((struct rc_variable *)var_ptr->Item)->Dst.Index, writemask);
661 		}
662 		node_classes[node_index] = ra_state->classes[class_index];
663 	}
664 
665 	graph = ra_alloc_interference_graph(ra_state->regs, node_count);
666 
667 	for (node_index = 0; node_index < node_count; node_index++) {
668 		ra_set_node_class(graph, node_index, node_classes[node_index]);
669 	}
670 
671 	rc_build_interference_graph(graph, variables);
672 
673 	if (!ra_allocate(graph)) {
674 		rc_error(c, "Ran out of hardware temporaries\n");
675                 ralloc_free(graph);
676 		return;
677 	}
678 
679 	/* Rewrite the registers */
680 	for (var_ptr = variables, node_index = 0; var_ptr;
681 				var_ptr = var_ptr->Next, node_index++) {
682 		int reg = ra_get_node_reg(graph, node_index);
683 		unsigned int writemask = reg_get_writemask(reg);
684 		unsigned int index = reg_get_index(reg);
685 		struct rc_variable * var = var_ptr->Item;
686 
687 		rc_variable_change_dst(var, index, writemask);
688 	}
689 
690 	ralloc_free(graph);
691 }
692 
693 /**
694  * Vertex engine cannot read two inputs or two constants at the same time.
695  * Introduce intermediate MOVs to temporary registers to account for this.
696  */
transform_source_conflicts(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)697 static int transform_source_conflicts(
698 	struct radeon_compiler *c,
699 	struct rc_instruction* inst,
700 	void* unused)
701 {
702 	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
703 
704 	if (opcode->NumSrcRegs == 3) {
705 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
706 		    || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
707 			int tmpreg = rc_find_free_temporary(c);
708 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
709 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
710 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
711 			inst_mov->U.I.DstReg.Index = tmpreg;
712 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
713 			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
714 			inst_mov->U.I.SrcReg[0].Negate = 0;
715 			inst_mov->U.I.SrcReg[0].Abs = 0;
716 
717 			inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
718 			inst->U.I.SrcReg[2].Index = tmpreg;
719 			inst->U.I.SrcReg[2].RelAddr = false;
720 		}
721 	}
722 
723 	if (opcode->NumSrcRegs >= 2) {
724 		if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
725 			int tmpreg = rc_find_free_temporary(c);
726 			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
727 			inst_mov->U.I.Opcode = RC_OPCODE_MOV;
728 			inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
729 			inst_mov->U.I.DstReg.Index = tmpreg;
730 			inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
731 			inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
732 			inst_mov->U.I.SrcReg[0].Negate = 0;
733 			inst_mov->U.I.SrcReg[0].Abs = 0;
734 
735 			inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
736 			inst->U.I.SrcReg[1].Index = tmpreg;
737 			inst->U.I.SrcReg[1].RelAddr = false;
738 		}
739 	}
740 
741 	return 1;
742 }
743 
rc_vs_add_artificial_outputs(struct radeon_compiler * c,void * user)744 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
745 {
746 	struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
747 	int i;
748 
749 	for(i = 0; i < 32; ++i) {
750 		if ((compiler->RequiredOutputs & (1U << i)) &&
751 		    !(compiler->Base.Program.OutputsWritten & (1U << i))) {
752 			struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
753 			inst->U.I.Opcode = RC_OPCODE_MOV;
754 
755 			inst->U.I.DstReg.File = RC_FILE_OUTPUT;
756 			inst->U.I.DstReg.Index = i;
757 			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
758 
759 			inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
760 			inst->U.I.SrcReg[0].Index = 0;
761 			inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
762 
763 			compiler->Base.Program.OutputsWritten |= 1U << i;
764 		}
765 	}
766 }
767 
/**
 * Swizzle capability callback for vertex programs: every swizzle is
 * reported as native, whatever the opcode or source register.
 *
 * \return always 1
 */
static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
{
	/* both arguments are intentionally ignored */
	(void) reg;
	(void) opcode;

	return 1;
}
775 
776 const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
777 	.IsNative = &swizzle_is_native,
778 	.Split = NULL /* should never be called */
779 };
780 
r3xx_compile_vertex_program(struct r300_vertex_program_compiler * c)781 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
782 {
783 	int is_r500 = c->Base.is_r500;
784 	int opt = !c->Base.disable_optimizations;
785 
786 	/* Lists of instruction transformations. */
787 	struct radeon_program_transformation alu_rewrite[] = {
788 		{ &r300_transform_vertex_alu, NULL },
789 		{ NULL, NULL }
790 	};
791 
792 	struct radeon_program_transformation resolve_src_conflicts[] = {
793 		{ &transform_source_conflicts, NULL },
794 		{ NULL, NULL }
795 	};
796 
797 	/* List of compiler passes. */
798 	struct radeon_compiler_pass vs_list[] = {
799 		/* NAME				DUMP PREDICATE	FUNCTION			PARAM */
800 		{"add artificial outputs",	0, 1,		rc_vs_add_artificial_outputs,	NULL},
801 		{"native rewrite",		1, 1,		rc_local_transform,		alu_rewrite},
802 		{"unused channels",		1, opt,		rc_mark_unused_channels,	NULL},
803 		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
804 		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
805 		/* This pass must be done after optimizations. */
806 		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
807 		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
808 		{"lower control flow opcodes",	1, is_r500,	rc_vert_fc,			NULL},
809 		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
810 		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
811 		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
812 		{NULL, 0, 0, NULL, NULL}
813 	};
814 
815 	c->Base.type = RC_VERTEX_PROGRAM;
816 	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
817 
818 	rc_run_compiler(&c->Base, vs_list);
819 
820 	c->code->InputsRead = c->Base.Program.InputsRead;
821 	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
822 	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
823 }
824