1 /*
2 * Copyright 2009 Nicolai Hähnle <[email protected]>
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "radeon_compiler.h"
7
8 #include <stdbool.h>
9 #include <stdio.h>
10
11 #include "r300_reg.h"
12
13 #include "radeon_compiler_util.h"
14 #include "radeon_dataflow.h"
15 #include "radeon_program.h"
16 #include "radeon_program_alu.h"
17 #include "radeon_swizzle.h"
18 #include "radeon_remove_constants.h"
19 #include "radeon_regalloc.h"
20 #include "radeon_list.h"
21
22 #include "util/compiler.h"
23
/*
 * Build a source operand that reads the constant selector `y` (a constant
 * ZERO or ONE swizzle, e.g. RC_SWIZZLE_ZERO) on all four channels.
 *
 * The register index, register class and relative-addressing bit are taken
 * from source operand `x` of the instruction being emitted, which must be
 * an already-setup and valid source.
 *
 * The expansion refers to `vp` and `vpi`, so both names must be in scope
 * wherever the macro is used.
 */
#define __CONST(x, y) \
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[(x)]), \
			 t_swizzle((y)), \
			 t_swizzle((y)), \
			 t_swizzle((y)), \
			 t_swizzle((y)), \
			 t_src_class(vpi->SrcReg[(x)].File), \
			 RC_MASK_NONE) | (vpi->SrcReg[(x)].RelAddr << 4))
36
37
t_dst_mask(unsigned int mask)38 static unsigned long t_dst_mask(unsigned int mask)
39 {
40 /* RC_MASK_* is equivalent to VSF_FLAG_* */
41 return mask & RC_MASK_XYZW;
42 }
43
/**
 * Map a compiler register file to the hardware destination register class.
 *
 * Only the temporary, output and address files are accepted. Any other
 * file is reported on stderr and then treated like a temporary.
 */
static unsigned long t_dst_class(rc_register_file file)
{
	if (file == RC_FILE_OUTPUT)
		return PVS_DST_REG_OUT;

	if (file == RC_FILE_ADDRESS)
		return PVS_DST_REG_A0;

	/* Everything else ends up as a temporary; complain if it is not one. */
	if (file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __func__, file);

	return PVS_DST_REG_TEMPORARY;
}
58
t_dst_index(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)59 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
60 struct rc_dst_register *dst)
61 {
62 if (dst->File == RC_FILE_OUTPUT)
63 return vp->outputs[dst->Index];
64
65 return dst->Index;
66 }
67
/**
 * Map a compiler register file to the hardware source register class.
 *
 * RC_FILE_NONE and RC_FILE_TEMPORARY both map to the temporary class. Any
 * file other than none/temporary/input/constant is reported on stderr and
 * then treated like a temporary.
 */
static unsigned long t_src_class(rc_register_file file)
{
	if (file == RC_FILE_INPUT)
		return PVS_SRC_REG_INPUT;

	if (file == RC_FILE_CONSTANT)
		return PVS_SRC_REG_CONSTANT;

	if (file != RC_FILE_NONE && file != RC_FILE_TEMPORARY)
		fprintf(stderr, "%s: Bad register file %i\n", __func__, file);

	return PVS_SRC_REG_TEMPORARY;
}
83
t_src_conflict(struct rc_src_register a,struct rc_src_register b)84 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
85 {
86 unsigned long aclass = t_src_class(a.File);
87 unsigned long bclass = t_src_class(b.File);
88
89 if (aclass != bclass)
90 return 0;
91 if (aclass == PVS_SRC_REG_TEMPORARY)
92 return 0;
93
94 if (a.RelAddr || b.RelAddr)
95 return 1;
96 if (a.Index != b.Index)
97 return 1;
98
99 return 0;
100 }
101
/**
 * Translate a compiler swizzle selector into the hardware encoding.
 *
 * This is in fact a no-op: the RC_SWIZZLE_* values are all identical to
 * the VSF_IN_COMPONENT_* values.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	const unsigned long hw_swizzle = swizzle;

	return hw_swizzle;
}
107
t_src_index(struct r300_vertex_program_code * vp,struct rc_src_register * src)108 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
109 struct rc_src_register *src)
110 {
111 if (src->File == RC_FILE_INPUT) {
112 assert(vp->inputs[src->Index] != -1);
113 return vp->inputs[src->Index];
114 } else {
115 if (src->Index < 0) {
116 fprintf(stderr,
117 "negative offsets for indirect addressing do not work.\n");
118 return 0;
119 }
120 return src->Index;
121 }
122 }
123
124 /* these two functions should probably be merged... */
125
t_src(struct r300_vertex_program_code * vp,struct rc_src_register * src)126 static unsigned long t_src(struct r300_vertex_program_code *vp,
127 struct rc_src_register *src)
128 {
129 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
130 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
131 */
132 return PVS_SRC_OPERAND(t_src_index(vp, src),
133 t_swizzle(GET_SWZ(src->Swizzle, 0)),
134 t_swizzle(GET_SWZ(src->Swizzle, 1)),
135 t_swizzle(GET_SWZ(src->Swizzle, 2)),
136 t_swizzle(GET_SWZ(src->Swizzle, 3)),
137 t_src_class(src->File),
138 src->Negate) |
139 (src->RelAddr << 4) | (src->Abs << 3);
140 }
141
t_src_scalar(struct r300_vertex_program_code * vp,struct rc_src_register * src)142 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
143 struct rc_src_register *src)
144 {
145 /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
146 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
147 */
148 unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
149
150 return PVS_SRC_OPERAND(t_src_index(vp, src),
151 t_swizzle(swz),
152 t_swizzle(swz),
153 t_swizzle(swz),
154 t_swizzle(swz),
155 t_src_class(src->File),
156 src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
157 (src->RelAddr << 4) | (src->Abs << 3);
158 }
159
valid_dst(struct r300_vertex_program_code * vp,struct rc_dst_register * dst)160 static int valid_dst(struct r300_vertex_program_code *vp,
161 struct rc_dst_register *dst)
162 {
163 if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
164 return 0;
165 } else if (dst->File == RC_FILE_ADDRESS) {
166 assert(dst->Index == 0);
167 }
168
169 return 1;
170 }
171
ei_vector1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)172 static void ei_vector1(struct r300_vertex_program_code *vp,
173 unsigned int hw_opcode,
174 struct rc_sub_instruction *vpi,
175 unsigned int * inst)
176 {
177 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
178 0,
179 0,
180 t_dst_index(vp, &vpi->DstReg),
181 t_dst_mask(vpi->DstReg.WriteMask),
182 t_dst_class(vpi->DstReg.File),
183 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
184 inst[1] = t_src(vp, &vpi->SrcReg[0]);
185 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
186 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
187 }
188
ei_vector2(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)189 static void ei_vector2(struct r300_vertex_program_code *vp,
190 unsigned int hw_opcode,
191 struct rc_sub_instruction *vpi,
192 unsigned int * inst)
193 {
194 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
195 0,
196 0,
197 t_dst_index(vp, &vpi->DstReg),
198 t_dst_mask(vpi->DstReg.WriteMask),
199 t_dst_class(vpi->DstReg.File),
200 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
201 inst[1] = t_src(vp, &vpi->SrcReg[0]);
202 inst[2] = t_src(vp, &vpi->SrcReg[1]);
203 inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
204 }
205
ei_math1(struct r300_vertex_program_code * vp,unsigned int hw_opcode,struct rc_sub_instruction * vpi,unsigned int * inst)206 static void ei_math1(struct r300_vertex_program_code *vp,
207 unsigned int hw_opcode,
208 struct rc_sub_instruction *vpi,
209 unsigned int * inst)
210 {
211 inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
212 1,
213 0,
214 t_dst_index(vp, &vpi->DstReg),
215 t_dst_mask(vpi->DstReg.WriteMask),
216 t_dst_class(vpi->DstReg.File),
217 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
218 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
219 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
220 inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
221 }
222
ei_cmp(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)223 static void ei_cmp(struct r300_vertex_program_code *vp,
224 struct rc_sub_instruction *vpi,
225 unsigned int * inst)
226 {
227 inst[0] = PVS_OP_DST_OPERAND(VE_COND_MUX_GTE,
228 0,
229 0,
230 t_dst_index(vp, &vpi->DstReg),
231 t_dst_mask(vpi->DstReg.WriteMask),
232 t_dst_class(vpi->DstReg.File),
233 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
234
235 /* Arguments with constant swizzles still count as a unique
236 * temporary, so we should make sure these arguments share a
237 * register index with one of the other arguments. */
238 for (unsigned i = 0; i < 3; i++) {
239 unsigned j = (i + 1) % 3;
240 if (vpi->SrcReg[i].File == RC_FILE_NONE &&
241 (vpi->SrcReg[j].File == RC_FILE_NONE ||
242 vpi->SrcReg[j].File == RC_FILE_TEMPORARY)) {
243 vpi->SrcReg[i].Index = vpi->SrcReg[j].Index;
244 break;
245 }
246 }
247
248 inst[1] = t_src(vp, &vpi->SrcReg[0]);
249 inst[2] = t_src(vp, &vpi->SrcReg[2]);
250 inst[3] = t_src(vp, &vpi->SrcReg[1]);
251 }
252
ei_lit(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)253 static void ei_lit(struct r300_vertex_program_code *vp,
254 struct rc_sub_instruction *vpi,
255 unsigned int * inst)
256 {
257 //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
258
259 inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
260 1,
261 0,
262 t_dst_index(vp, &vpi->DstReg),
263 t_dst_mask(vpi->DstReg.WriteMask),
264 t_dst_class(vpi->DstReg.File),
265 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
266 /* NOTE: Users swizzling might not work. */
267 inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
268 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
269 PVS_SRC_SELECT_FORCE_0, // Z
270 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
271 t_src_class(vpi->SrcReg[0].File),
272 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
273 (vpi->SrcReg[0].RelAddr << 4);
274 inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
275 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
276 PVS_SRC_SELECT_FORCE_0, // Z
277 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
278 t_src_class(vpi->SrcReg[0].File),
279 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
280 (vpi->SrcReg[0].RelAddr << 4);
281 inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)), // Y
282 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)), // X
283 PVS_SRC_SELECT_FORCE_0, // Z
284 t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)), // W
285 t_src_class(vpi->SrcReg[0].File),
286 vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
287 (vpi->SrcReg[0].RelAddr << 4);
288 }
289
ei_mad(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)290 static void ei_mad(struct r300_vertex_program_code *vp,
291 struct rc_sub_instruction *vpi,
292 unsigned int * inst)
293 {
294 unsigned int i;
295 /* Remarks about hardware limitations of MAD
296 * (please preserve this comment, as this information is _NOT_
297 * in the documentation provided by AMD).
298 *
299 * As described in the documentation, MAD with three unique temporary
300 * source registers requires the use of the macro version.
301 *
302 * However (and this is not mentioned in the documentation), apparently
303 * the macro version is _NOT_ a full superset of the normal version.
304 * In particular, the macro version does not always work when relative
305 * addressing is used in the source operands.
306 *
307 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
308 * assembly shader path when using medium quality animations
309 * (i.e. animations with matrix blending instead of quaternion blending).
310 *
311 * Unfortunately, I (nha) have been unable to extract a Piglit regression
312 * test for this issue - for some reason, it is possible to have vertex
313 * programs whose prefix is *exactly* the same as the prefix of the
314 * offending program in Sauerbraten up to the offending instruction
315 * without causing any trouble.
316 *
317 * Bottom line: Only use the macro version only when really necessary;
318 * according to AMD docs, this should improve performance by one clock
319 * as a nice side bonus.
320 */
321 if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
322 vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
323 vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
324 vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
325 vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
326 vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
327 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
328 0,
329 1,
330 t_dst_index(vp, &vpi->DstReg),
331 t_dst_mask(vpi->DstReg.WriteMask),
332 t_dst_class(vpi->DstReg.File),
333 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
334 } else {
335 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
336 0,
337 0,
338 t_dst_index(vp, &vpi->DstReg),
339 t_dst_mask(vpi->DstReg.WriteMask),
340 t_dst_class(vpi->DstReg.File),
341 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
342
343 /* Arguments with constant swizzles still count as a unique
344 * temporary, so we should make sure these arguments share a
345 * register index with one of the other arguments. */
346 for (i = 0; i < 3; i++) {
347 unsigned int j;
348 if (vpi->SrcReg[i].File != RC_FILE_NONE)
349 continue;
350
351 for (j = 0; j < 3; j++) {
352 if (i != j) {
353 vpi->SrcReg[i].Index =
354 vpi->SrcReg[j].Index;
355 break;
356 }
357 }
358 }
359 }
360 inst[1] = t_src(vp, &vpi->SrcReg[0]);
361 inst[2] = t_src(vp, &vpi->SrcReg[1]);
362 inst[3] = t_src(vp, &vpi->SrcReg[2]);
363 }
364
ei_pow(struct r300_vertex_program_code * vp,struct rc_sub_instruction * vpi,unsigned int * inst)365 static void ei_pow(struct r300_vertex_program_code *vp,
366 struct rc_sub_instruction *vpi,
367 unsigned int * inst)
368 {
369 inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
370 1,
371 0,
372 t_dst_index(vp, &vpi->DstReg),
373 t_dst_mask(vpi->DstReg.WriteMask),
374 t_dst_class(vpi->DstReg.File),
375 vpi->SaturateMode == RC_SATURATE_ZERO_ONE);
376 inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
377 inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
378 inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
379 }
380
translate_vertex_program(struct radeon_compiler * c,void * user)381 static void translate_vertex_program(struct radeon_compiler *c, void *user)
382 {
383 struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
384 struct rc_instruction *rci;
385
386 unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
387 unsigned loop_depth = 0;
388 bool last_input_read_at_loop_end = false;
389 bool last_pos_write_at_loop_end = false;
390
391 compiler->code->pos_end = 0; /* Not supported yet */
392 compiler->code->length = 0;
393 compiler->code->num_temporaries = 0;
394 compiler->code->last_input_read = 0;
395 compiler->code->last_pos_write = 0;
396
397 compiler->SetHwInputOutput(compiler);
398
399 for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
400 struct rc_sub_instruction *vpi = &rci->U.I;
401 unsigned int *inst = compiler->code->body.d + compiler->code->length;
402 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
403
404 /* Skip instructions writing to non-existing destination */
405 if (!valid_dst(compiler->code, &vpi->DstReg))
406 continue;
407
408 if (info->HasDstReg) {
409 /* Neither is Saturate. */
410 if (vpi->SaturateMode != RC_SATURATE_NONE && !c->is_r500) {
411 rc_error(&compiler->Base, "Vertex program does not support the Saturate "
412 "modifier (yet).\n");
413 }
414 }
415
416 if (compiler->code->length >= c->max_alu_insts * 4) {
417 rc_error(&compiler->Base, "Vertex program has too many instructions\n");
418 return;
419 }
420
421 assert(compiler->Base.is_r500 ||
422 (vpi->Opcode != RC_OPCODE_SEQ &&
423 vpi->Opcode != RC_OPCODE_SNE));
424
425 switch (vpi->Opcode) {
426 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
427 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
428 case RC_OPCODE_ARR: ei_vector1(compiler->code, VE_FLT2FIX_DX_RND, vpi, inst); break;
429 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
430 case RC_OPCODE_CMP: ei_cmp(compiler->code, vpi, inst); break;
431 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
432 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
433 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
434 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
435 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
436 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
437 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
438 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
439 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
440 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
441 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
442 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
443 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
444 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
445 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
446 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
447 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
448 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
449 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
450 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
451 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
452 case RC_OPCODE_BGNLOOP:
453 {
454 if ((!compiler->Base.is_r500
455 && loop_depth >= R300_VS_MAX_LOOP_DEPTH)
456 || loop_depth >= R500_PVS_MAX_LOOP_DEPTH) {
457 rc_error(&compiler->Base,
458 "Loops are nested too deep.");
459 return;
460 }
461 loops[loop_depth++] = ((compiler->code->length)/ 4) + 1;
462 break;
463 }
464 case RC_OPCODE_ENDLOOP:
465 {
466 unsigned int act_addr;
467 unsigned int last_addr;
468 unsigned int ret_addr;
469
470 if (loop_depth == 1 && last_input_read_at_loop_end) {
471 compiler->code->last_input_read = compiler->code->length / 4;
472 last_input_read_at_loop_end = false;
473 }
474 if (loop_depth == 1 && last_pos_write_at_loop_end) {
475 compiler->code->last_pos_write = compiler->code->length / 4;
476 last_pos_write_at_loop_end = false;
477 }
478
479 ret_addr = loops[--loop_depth];
480 act_addr = ret_addr - 1;
481 last_addr = (compiler->code->length / 4) - 1;
482
483 if (loop_depth >= R300_VS_MAX_FC_OPS) {
484 rc_error(&compiler->Base,
485 "Too many flow control instructions.");
486 return;
487 }
488 /* Maximum of R500_PVS_FC_LOOP_CNT_JMP_INST is 0xff, here
489 * we reduce it to half to avoid occasional hangs on RV516
490 * and downclocked RV530.
491 */
492 if (compiler->Base.is_r500) {
493 compiler->code->fc_op_addrs.r500
494 [compiler->code->num_fc_ops].lw =
495 R500_PVS_FC_ACT_ADRS(act_addr)
496 | R500_PVS_FC_LOOP_CNT_JMP_INST(0x0080)
497 ;
498 compiler->code->fc_op_addrs.r500
499 [compiler->code->num_fc_ops].uw =
500 R500_PVS_FC_LAST_INST(last_addr)
501 | R500_PVS_FC_RTN_INST(ret_addr)
502 ;
503 } else {
504 compiler->code->fc_op_addrs.r300
505 [compiler->code->num_fc_ops] =
506 R300_PVS_FC_ACT_ADRS(act_addr)
507 | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
508 | R300_PVS_FC_LAST_INST(last_addr)
509 | R300_PVS_FC_RTN_INST(ret_addr)
510 ;
511 }
512 compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
513 R300_PVS_FC_LOOP_INIT_VAL(0x0)
514 | R300_PVS_FC_LOOP_STEP_VAL(0x1)
515 ;
516 compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
517 compiler->code->num_fc_ops);
518 compiler->code->num_fc_ops++;
519
520 break;
521 }
522
523 case RC_ME_PRED_SET_CLR:
524 ei_math1(compiler->code, ME_PRED_SET_CLR, vpi, inst);
525 break;
526
527 case RC_ME_PRED_SET_INV:
528 ei_math1(compiler->code, ME_PRED_SET_INV, vpi, inst);
529 break;
530
531 case RC_ME_PRED_SET_POP:
532 ei_math1(compiler->code, ME_PRED_SET_POP, vpi, inst);
533 break;
534
535 case RC_ME_PRED_SET_RESTORE:
536 ei_math1(compiler->code, ME_PRED_SET_RESTORE, vpi, inst);
537 break;
538
539 case RC_ME_PRED_SEQ:
540 ei_math1(compiler->code, ME_PRED_SET_EQ, vpi, inst);
541 break;
542
543 case RC_ME_PRED_SNEQ:
544 ei_math1(compiler->code, ME_PRED_SET_NEQ, vpi, inst);
545 break;
546
547 case RC_VE_PRED_SNEQ_PUSH:
548 ei_vector2(compiler->code, VE_PRED_SET_NEQ_PUSH,
549 vpi, inst);
550 break;
551
552 default:
553 rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
554 return;
555 }
556
557 if (vpi->DstReg.Pred != RC_PRED_DISABLED) {
558 inst[0] |= (PVS_DST_PRED_ENABLE_MASK
559 << PVS_DST_PRED_ENABLE_SHIFT);
560 if (vpi->DstReg.Pred == RC_PRED_SET) {
561 inst[0] |= (PVS_DST_PRED_SENSE_MASK
562 << PVS_DST_PRED_SENSE_SHIFT);
563 }
564 }
565
566 /* Update the number of temporaries. */
567 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
568 vpi->DstReg.Index >= compiler->code->num_temporaries)
569 compiler->code->num_temporaries = vpi->DstReg.Index + 1;
570
571 /* last instruction that writes position */
572 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_OUTPUT &&
573 t_dst_index(compiler->code, &vpi->DstReg) == 0) {
574 if (loop_depth == 0)
575 compiler->code->last_pos_write = compiler->code->length / 4;
576 else
577 last_pos_write_at_loop_end = true;
578 }
579
580 for (unsigned i = 0; i < info->NumSrcRegs; i++) {
581 if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
582 vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
583 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
584 if (vpi->SrcReg[i].File == RC_FILE_INPUT) {
585 if (loop_depth == 0)
586 compiler->code->last_input_read = compiler->code->length / 4;
587 else
588 last_input_read_at_loop_end = true;
589 }
590
591 }
592
593
594 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
595 rc_error(&compiler->Base, "Too many temporaries.\n");
596 return;
597 }
598
599 compiler->code->length += 4;
600
601 if (compiler->Base.Error)
602 return;
603 }
604 }
605
/*
 * Per-temporary bookkeeping used by get_reg() to map a program temporary
 * onto a hardware temporary.
 */
struct temporary_allocation {
	unsigned Allocated : 1;		/* non-zero once HwTemp has been assigned */
	unsigned HwTemp : 15;		/* hardware temporary chosen by get_reg() */
	struct rc_instruction *LastRead;	/* not referenced in the code visible here */
};
611
get_reg(struct radeon_compiler * c,struct temporary_allocation * ta,bool * hwtemps,unsigned int orig)612 static int get_reg(struct radeon_compiler *c, struct temporary_allocation *ta, bool *hwtemps,
613 unsigned int orig)
614 {
615 if (!ta[orig].Allocated) {
616 int j;
617 for (j = 0; j < c->max_temp_regs; ++j)
618 {
619 if (!hwtemps[j])
620 break;
621 }
622 ta[orig].Allocated = 1;
623 ta[orig].HwTemp = j;
624 hwtemps[ta[orig].HwTemp] = true;
625 }
626
627 return ta[orig].HwTemp;
628 }
629
allocate_temporary_registers(struct radeon_compiler * c,void * user)630 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
631 {
632 unsigned int node_count, node_index;
633 struct ra_class ** node_classes;
634 struct rc_list * var_ptr;
635 struct rc_list * variables;
636 struct ra_graph * graph;
637 const struct rc_regalloc_state *ra_state = c->regalloc_state;
638
639 rc_recompute_ips(c);
640
641 /* Get list of program variables */
642 variables = rc_get_variables(c);
643 node_count = rc_list_count(variables);
644 node_classes = memory_pool_malloc(&c->Pool,
645 node_count * sizeof(struct ra_class *));
646
647 for (var_ptr = variables, node_index = 0; var_ptr;
648 var_ptr = var_ptr->Next, node_index++) {
649 unsigned int class_index = 0;
650 int index;
651 /* Compute the live intervals */
652 rc_variable_compute_live_intervals(var_ptr->Item);
653 unsigned int writemask = rc_variable_writemask_sum(var_ptr->Item);
654 index = rc_find_class(c->regalloc_state->class_list, writemask, 6);
655 if (index > -1) {
656 class_index = c->regalloc_state->class_list[index].ID;
657 } else {
658 rc_error(c,
659 "Could not find class for index=%u mask=%u\n",
660 ((struct rc_variable *)var_ptr->Item)->Dst.Index, writemask);
661 }
662 node_classes[node_index] = ra_state->classes[class_index];
663 }
664
665 graph = ra_alloc_interference_graph(ra_state->regs, node_count);
666
667 for (node_index = 0; node_index < node_count; node_index++) {
668 ra_set_node_class(graph, node_index, node_classes[node_index]);
669 }
670
671 rc_build_interference_graph(graph, variables);
672
673 if (!ra_allocate(graph)) {
674 rc_error(c, "Ran out of hardware temporaries\n");
675 ralloc_free(graph);
676 return;
677 }
678
679 /* Rewrite the registers */
680 for (var_ptr = variables, node_index = 0; var_ptr;
681 var_ptr = var_ptr->Next, node_index++) {
682 int reg = ra_get_node_reg(graph, node_index);
683 unsigned int writemask = reg_get_writemask(reg);
684 unsigned int index = reg_get_index(reg);
685 struct rc_variable * var = var_ptr->Item;
686
687 rc_variable_change_dst(var, index, writemask);
688 }
689
690 ralloc_free(graph);
691 }
692
693 /**
694 * Vertex engine cannot read two inputs or two constants at the same time.
695 * Introduce intermediate MOVs to temporary registers to account for this.
696 */
transform_source_conflicts(struct radeon_compiler * c,struct rc_instruction * inst,void * unused)697 static int transform_source_conflicts(
698 struct radeon_compiler *c,
699 struct rc_instruction* inst,
700 void* unused)
701 {
702 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
703
704 if (opcode->NumSrcRegs == 3) {
705 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
706 || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
707 int tmpreg = rc_find_free_temporary(c);
708 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
709 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
710 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
711 inst_mov->U.I.DstReg.Index = tmpreg;
712 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
713 inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
714 inst_mov->U.I.SrcReg[0].Negate = 0;
715 inst_mov->U.I.SrcReg[0].Abs = 0;
716
717 inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
718 inst->U.I.SrcReg[2].Index = tmpreg;
719 inst->U.I.SrcReg[2].RelAddr = false;
720 }
721 }
722
723 if (opcode->NumSrcRegs >= 2) {
724 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
725 int tmpreg = rc_find_free_temporary(c);
726 struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
727 inst_mov->U.I.Opcode = RC_OPCODE_MOV;
728 inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
729 inst_mov->U.I.DstReg.Index = tmpreg;
730 inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
731 inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
732 inst_mov->U.I.SrcReg[0].Negate = 0;
733 inst_mov->U.I.SrcReg[0].Abs = 0;
734
735 inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
736 inst->U.I.SrcReg[1].Index = tmpreg;
737 inst->U.I.SrcReg[1].RelAddr = false;
738 }
739 }
740
741 return 1;
742 }
743
rc_vs_add_artificial_outputs(struct radeon_compiler * c,void * user)744 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
745 {
746 struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
747 int i;
748
749 for(i = 0; i < 32; ++i) {
750 if ((compiler->RequiredOutputs & (1U << i)) &&
751 !(compiler->Base.Program.OutputsWritten & (1U << i))) {
752 struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
753 inst->U.I.Opcode = RC_OPCODE_MOV;
754
755 inst->U.I.DstReg.File = RC_FILE_OUTPUT;
756 inst->U.I.DstReg.Index = i;
757 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
758
759 inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
760 inst->U.I.SrcReg[0].Index = 0;
761 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
762
763 compiler->Base.Program.OutputsWritten |= 1U << i;
764 }
765 }
766 }
767
/**
 * Swizzle capability callback: reports every swizzle as native.
 *
 * Both arguments are ignored.
 *
 * @return always 1.
 */
static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
{
	const int is_native = 1;

	(void) reg;
	(void) opcode;

	return is_native;
}
775
776 const struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
777 .IsNative = &swizzle_is_native,
778 .Split = NULL /* should never be called */
779 };
780
r3xx_compile_vertex_program(struct r300_vertex_program_compiler * c)781 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
782 {
783 int is_r500 = c->Base.is_r500;
784 int opt = !c->Base.disable_optimizations;
785
786 /* Lists of instruction transformations. */
787 struct radeon_program_transformation alu_rewrite[] = {
788 { &r300_transform_vertex_alu, NULL },
789 { NULL, NULL }
790 };
791
792 struct radeon_program_transformation resolve_src_conflicts[] = {
793 { &transform_source_conflicts, NULL },
794 { NULL, NULL }
795 };
796
797 /* List of compiler passes. */
798 struct radeon_compiler_pass vs_list[] = {
799 /* NAME DUMP PREDICATE FUNCTION PARAM */
800 {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL},
801 {"native rewrite", 1, 1, rc_local_transform, alu_rewrite},
802 {"unused channels", 1, opt, rc_mark_unused_channels, NULL},
803 {"dataflow optimize", 1, opt, rc_optimize, NULL},
804 {"dead constants", 1, 1, rc_remove_unused_constants, &c->code->constants_remap_table},
805 /* This pass must be done after optimizations. */
806 {"source conflict resolve", 1, 1, rc_local_transform, resolve_src_conflicts},
807 {"register allocation", 1, opt, allocate_temporary_registers, NULL},
808 {"lower control flow opcodes", 1, is_r500, rc_vert_fc, NULL},
809 {"final code validation", 0, 1, rc_validate_final_shader, NULL},
810 {"machine code generation", 0, 1, translate_vertex_program, NULL},
811 {"dump machine code", 0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump, NULL},
812 {NULL, 0, 0, NULL, NULL}
813 };
814
815 c->Base.type = RC_VERTEX_PROGRAM;
816 c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
817
818 rc_run_compiler(&c->Base, vs_list);
819
820 c->code->InputsRead = c->Base.Program.InputsRead;
821 c->code->OutputsWritten = c->Base.Program.OutputsWritten;
822 rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
823 }
824