1 /*
2 * Copyright 2009 Nicolai Haehnle.
3 * Copyright 2010 Tom Stellard <[email protected]>
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "util/u_math.h"
8
9 #include "radeon_dataflow.h"
10
11 #include "radeon_compiler.h"
12 #include "radeon_compiler_util.h"
13 #include "radeon_list.h"
14 #include "radeon_swizzle.h"
15 #include "radeon_variable.h"
16
/**
 * Context handed to src_clobbered_reads_cb(). It describes one register
 * write (file, index and channel mask) together with the reader scan state
 * whose AbortOnRead field is raised when the scanned writer instruction
 * reads the register being written.
 */
struct src_clobbered_reads_cb_data {
    rc_register_file File;               /* register file of the write */
    unsigned int Index;                  /* register index of the write */
    unsigned int Mask;                   /* channels touched by the write */
    struct rc_reader_data * ReaderData;  /* scan state to flag on a clash */
};
23
/**
 * Callback invoked by presub_helper() once per matching reader source.
 * Arguments, in order: the instruction being turned into a presubtract
 * operation, the reader instruction, and the index of the reader's source
 * register that must be rewritten.
 */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *,
                                     struct rc_instruction *,
                                     unsigned int);
27
chain_srcregs(struct rc_src_register outer,struct rc_src_register inner)28 static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
29 {
30 struct rc_src_register combine;
31 combine.File = inner.File;
32 combine.Index = inner.Index;
33 combine.RelAddr = inner.RelAddr;
34 if (outer.Abs) {
35 combine.Abs = 1;
36 combine.Negate = outer.Negate;
37 } else {
38 combine.Abs = inner.Abs;
39 combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
40 combine.Negate ^= outer.Negate;
41 }
42 combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
43 return combine;
44 }
45
copy_propagate_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)46 static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
47 struct rc_src_register * src)
48 {
49 rc_register_file file = src->File;
50 struct rc_reader_data * reader_data = data;
51
52 if(!rc_inst_can_use_presub(reader_data->C,
53 inst,
54 reader_data->Writer->U.I.PreSub.Opcode,
55 rc_swizzle_to_writemask(src->Swizzle),
56 src,
57 &reader_data->Writer->U.I.PreSub.SrcReg[0],
58 &reader_data->Writer->U.I.PreSub.SrcReg[1])) {
59 reader_data->Abort = 1;
60 return;
61 }
62
63 /* XXX This could probably be handled better. */
64 if (file == RC_FILE_ADDRESS) {
65 reader_data->Abort = 1;
66 return;
67 }
68
69 /* R300/R400 is unhappy about propagating
70 * 0: MOV temp[1], -none.1111;
71 * 1: KIL temp[1];
72 * to
73 * 0: KIL -none.1111;
74 *
75 * R500 is fine with it.
76 */
77 if (!reader_data->C->is_r500 && inst->U.I.Opcode == RC_OPCODE_KIL &&
78 reader_data->Writer->U.I.SrcReg[0].File == RC_FILE_NONE) {
79 reader_data->Abort = 1;
80 return;
81 }
82
83 /* These instructions cannot read from the constants file.
84 * see radeonTransformTEX()
85 */
86 if(reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
87 reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_INPUT &&
88 reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_NONE &&
89 (inst->U.I.Opcode == RC_OPCODE_TEX ||
90 inst->U.I.Opcode == RC_OPCODE_TXB ||
91 inst->U.I.Opcode == RC_OPCODE_TXP ||
92 inst->U.I.Opcode == RC_OPCODE_TXD ||
93 inst->U.I.Opcode == RC_OPCODE_TXL ||
94 inst->U.I.Opcode == RC_OPCODE_KIL)){
95 reader_data->Abort = 1;
96 return;
97 }
98 }
99
src_clobbered_reads_cb(void * data,struct rc_instruction * inst,struct rc_src_register * src)100 static void src_clobbered_reads_cb(
101 void * data,
102 struct rc_instruction * inst,
103 struct rc_src_register * src)
104 {
105 struct src_clobbered_reads_cb_data * sc_data = data;
106
107 if (src->File == sc_data->File
108 && src->Index == sc_data->Index
109 && (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
110
111 sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
112 }
113
114 if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
115 sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
116 }
117 }
118
/**
 * Write callback for reader scans: packages the write (file, index, mask)
 * and checks it against every source read of the writer instruction via
 * src_clobbered_reads_cb().
 */
static void is_src_clobbered_scan_write(
    void * data,
    struct rc_instruction * inst,
    rc_register_file file,
    unsigned int index,
    unsigned int mask)
{
    struct rc_reader_data * reader_data = data;
    struct src_clobbered_reads_cb_data write_info = {
        .File = file,
        .Index = index,
        .Mask = mask,
        .ReaderData = reader_data,
    };

    rc_for_all_reads_src(reader_data->Writer,
                         src_clobbered_reads_cb, &write_info);
}
135
copy_propagate(struct radeon_compiler * c,struct rc_instruction * inst_mov)136 static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * inst_mov)
137 {
138 struct rc_reader_data reader_data;
139 unsigned int i;
140
141 if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY ||
142 inst_mov->U.I.WriteALUResult)
143 return;
144
145 /* Get a list of all the readers of this MOV instruction. */
146 reader_data.ExitOnAbort = 1;
147 rc_get_readers(c, inst_mov, &reader_data,
148 copy_propagate_scan_read, NULL,
149 is_src_clobbered_scan_write);
150
151 if (reader_data.Abort || reader_data.ReaderCount == 0)
152 return;
153
154 /* We can propagate SaturateMode if all the readers are MOV instructions
155 * without a presubtract operation, source negation and absolute.
156 * In that case, we just move SaturateMode to all readers. */
157 if (inst_mov->U.I.SaturateMode) {
158 for (i = 0; i < reader_data.ReaderCount; i++) {
159 struct rc_instruction * inst = reader_data.Readers[i].Inst;
160
161 if (inst->U.I.Opcode != RC_OPCODE_MOV ||
162 inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
163 inst->U.I.SrcReg[0].Abs ||
164 inst->U.I.SrcReg[0].Negate) {
165 return;
166 }
167 }
168 }
169
170 /* Propagate the MOV instruction. */
171 for (i = 0; i < reader_data.ReaderCount; i++) {
172 struct rc_instruction * inst = reader_data.Readers[i].Inst;
173 *reader_data.Readers[i].U.I.Src = chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
174
175 if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
176 inst->U.I.PreSub = inst_mov->U.I.PreSub;
177 if (!inst->U.I.SaturateMode)
178 inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
179 }
180
181 /* Finally, remove the original MOV instruction */
182 rc_remove_instruction(inst_mov);
183 }
184
185 /**
186 * Check if a source register is actually always the same
187 * swizzle constant.
188 */
is_src_uniform_constant(struct rc_src_register src,rc_swizzle * pswz,unsigned int * pnegate)189 static int is_src_uniform_constant(struct rc_src_register src,
190 rc_swizzle * pswz, unsigned int * pnegate)
191 {
192 int have_used = 0;
193
194 if (src.File != RC_FILE_NONE) {
195 *pswz = 0;
196 return 0;
197 }
198
199 for(unsigned int chan = 0; chan < 4; ++chan) {
200 unsigned int swz = GET_SWZ(src.Swizzle, chan);
201 if (swz < 4) {
202 *pswz = 0;
203 return 0;
204 }
205 if (swz == RC_SWIZZLE_UNUSED)
206 continue;
207
208 if (!have_used) {
209 *pswz = swz;
210 *pnegate = GET_BIT(src.Negate, chan);
211 have_used = 1;
212 } else {
213 if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
214 *pswz = 0;
215 return 0;
216 }
217 }
218 }
219
220 return 1;
221 }
222
223 /**
224 * Replace 0.0, 1.0 and 0.5 immediate constants by their
225 * respective swizzles. Simplify instructions like ADD dst, src, 0;
226 */
constant_folding(struct radeon_compiler * c,struct rc_instruction * inst)227 static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst)
228 {
229 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
230 unsigned int i;
231
232 /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
233 for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
234 struct rc_constant * constant;
235 struct rc_src_register newsrc;
236 int have_real_reference;
237 unsigned int chan;
238
239 /* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
240 for (chan = 0; chan < 4; ++chan)
241 if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
242 break;
243 if (chan == 4) {
244 inst->U.I.SrcReg[src].File = RC_FILE_NONE;
245 continue;
246 }
247
248 /* Convert immediates to swizzles. */
249 if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT ||
250 inst->U.I.SrcReg[src].RelAddr ||
251 inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
252 continue;
253
254 constant =
255 &c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
256
257 if (constant->Type != RC_CONSTANT_IMMEDIATE)
258 continue;
259
260 newsrc = inst->U.I.SrcReg[src];
261 have_real_reference = 0;
262 for (chan = 0; chan < 4; ++chan) {
263 unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
264 unsigned int newswz;
265 float imm;
266 float baseimm;
267
268 if (swz >= 4)
269 continue;
270
271 imm = constant->u.Immediate[swz];
272 baseimm = imm;
273 if (imm < 0.0)
274 baseimm = -baseimm;
275
276 if (baseimm == 0.0) {
277 newswz = RC_SWIZZLE_ZERO;
278 } else if (baseimm == 1.0) {
279 newswz = RC_SWIZZLE_ONE;
280 } else if (baseimm == 0.5 && c->has_half_swizzles) {
281 newswz = RC_SWIZZLE_HALF;
282 } else {
283 have_real_reference = 1;
284 continue;
285 }
286
287 SET_SWZ(newsrc.Swizzle, chan, newswz);
288 if (imm < 0.0 && !newsrc.Abs)
289 newsrc.Negate ^= 1 << chan;
290 }
291
292 if (!have_real_reference) {
293 newsrc.File = RC_FILE_NONE;
294 newsrc.Index = 0;
295 }
296
297 /* don't make the swizzle worse */
298 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc))
299 continue;
300
301 inst->U.I.SrcReg[src] = newsrc;
302 }
303
304 /* In case this instruction has been converted, make sure all of the
305 * registers that are no longer used are empty. */
306 opcode = rc_get_opcode_info(inst->U.I.Opcode);
307 for(i = opcode->NumSrcRegs; i < 3; i++) {
308 memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
309 }
310 }
311
312 /**
313 * If src and dst use the same register, this function returns a writemask that
314 * indicates which components are read by src. Otherwise zero is returned.
315 */
src_reads_dst_mask(struct rc_src_register src,struct rc_dst_register dst)316 static unsigned int src_reads_dst_mask(struct rc_src_register src,
317 struct rc_dst_register dst)
318 {
319 if (dst.File != src.File || dst.Index != src.Index) {
320 return 0;
321 }
322 return rc_swizzle_to_writemask(src.Swizzle);
323 }
324
325 /* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
326 * in any of its channels. Return 0 otherwise. */
src_has_const_swz(struct rc_src_register src)327 static int src_has_const_swz(struct rc_src_register src) {
328 int chan;
329 for(chan = 0; chan < 4; chan++) {
330 unsigned int swz = GET_SWZ(src.Swizzle, chan);
331 if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF
332 || swz == RC_SWIZZLE_ONE) {
333 return 1;
334 }
335 }
336 return 0;
337 }
338
presub_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)339 static void presub_scan_read(
340 void * data,
341 struct rc_instruction * inst,
342 struct rc_src_register * src)
343 {
344 struct rc_reader_data * reader_data = data;
345 rc_presubtract_op * presub_opcode = reader_data->CbData;
346
347 if (!rc_inst_can_use_presub(reader_data->C,
348 inst,
349 *presub_opcode,
350 reader_data->Writer->U.I.DstReg.WriteMask,
351 src,
352 &reader_data->Writer->U.I.SrcReg[0],
353 &reader_data->Writer->U.I.SrcReg[1])) {
354 reader_data->Abort = 1;
355 return;
356 }
357 }
358
/**
 * Shared driver for the presubtract peepholes.
 *
 * Collects the readers of `inst_add`; if the scan did not abort and at
 * least one reader exists, calls `presub_replace` for every reader source
 * register that reads the result.
 *
 * @return 1 if the readers were rewritten, 0 if nothing was changed.
 */
static int presub_helper(
    struct radeon_compiler * c,
    struct rc_instruction * inst_add,
    rc_presubtract_op presub_opcode,
    rc_presub_replace_fn presub_replace)
{
    struct rc_reader_data reader_data;
    rc_presubtract_op cb_op = presub_opcode;
    unsigned int r;

    reader_data.CbData = &cb_op;
    reader_data.ExitOnAbort = 1;
    rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL,
                   is_src_clobbered_scan_write);

    if (reader_data.Abort)
        return 0;
    if (reader_data.ReaderCount == 0)
        return 0;

    for (r = 0; r < reader_data.ReaderCount; r++) {
        struct rc_reader reader = reader_data.Readers[r];
        const struct rc_opcode_info * info =
            rc_get_opcode_info(reader.Inst->U.I.Opcode);
        unsigned int slot = 0;

        /* Locate the source slot(s) this reader entry points at. */
        while (slot < info->NumSrcRegs) {
            if (reader.U.I.Src == &reader.Inst->U.I.SrcReg[slot])
                presub_replace(inst_add, reader.Inst, slot);
            slot++;
        }
    }
    return 1;
}
390
presub_replace_add(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)391 static void presub_replace_add(
392 struct rc_instruction * inst_add,
393 struct rc_instruction * inst_reader,
394 unsigned int src_index)
395 {
396 rc_presubtract_op presub_opcode;
397
398 unsigned int negates = 0;
399 if (inst_add->U.I.SrcReg[0].Negate)
400 negates++;
401 if (inst_add->U.I.SrcReg[1].Negate)
402 negates++;
403 assert(negates != 2 || inst_add->U.I.SrcReg[1].Negate == inst_add->U.I.SrcReg[0].Negate);
404
405 if (negates == 1)
406 presub_opcode = RC_PRESUB_SUB;
407 else
408 presub_opcode = RC_PRESUB_ADD;
409
410 if (inst_add->U.I.SrcReg[1].Negate && negates == 1) {
411 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
412 inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
413 } else {
414 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
415 inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
416 }
417 /* If both sources are negative we can move the negate to the presub. */
418 unsigned negate_mask = negates == 1 ? 0 : inst_add->U.I.SrcReg[0].Negate;
419 inst_reader->U.I.PreSub.SrcReg[0].Negate = negate_mask;
420 inst_reader->U.I.PreSub.SrcReg[1].Negate = negate_mask;
421 inst_reader->U.I.PreSub.Opcode = presub_opcode;
422 inst_reader->U.I.SrcReg[src_index] =
423 chain_srcregs(inst_reader->U.I.SrcReg[src_index],
424 inst_reader->U.I.PreSub.SrcReg[0]);
425 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
426 inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
427 }
428
is_presub_candidate(struct radeon_compiler * c,struct rc_instruction * inst)429 static int is_presub_candidate(
430 struct radeon_compiler * c,
431 struct rc_instruction * inst)
432 {
433 const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
434 unsigned int i;
435 unsigned int is_constant[2] = {0, 0};
436
437 assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);
438
439 if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE
440 || inst->U.I.SaturateMode
441 || inst->U.I.WriteALUResult
442 || inst->U.I.Omod) {
443 return 0;
444 }
445
446 /* If first two sources use a constant swizzle, then we can't convert it to
447 * a presubtract operation. In fact for the ADD and SUB presubtract
448 * operations neither source can contain a constant swizzle. This
449 * specific case is checked in peephole_add_presub_add() when
450 * we make sure the swizzles for both sources are equal, so we
451 * don't need to worry about it here. */
452 for (i = 0; i < 2; i++) {
453 int chan;
454 for (chan = 0; chan < 4; chan++) {
455 rc_swizzle swz =
456 get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
457 if (swz == RC_SWIZZLE_ONE
458 || swz == RC_SWIZZLE_ZERO
459 || swz == RC_SWIZZLE_HALF) {
460 is_constant[i] = 1;
461 }
462 }
463 }
464 if (is_constant[0] && is_constant[1])
465 return 0;
466
467 for(i = 0; i < info->NumSrcRegs; i++) {
468 struct rc_src_register src = inst->U.I.SrcReg[i];
469 if (src_reads_dst_mask(src, inst->U.I.DstReg))
470 return 0;
471
472 src.File = RC_FILE_PRESUB;
473 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
474 return 0;
475 }
476 return 1;
477 }
478
peephole_add_presub_add(struct radeon_compiler * c,struct rc_instruction * inst_add)479 static int peephole_add_presub_add(
480 struct radeon_compiler * c,
481 struct rc_instruction * inst_add)
482 {
483 unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
484 unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
485 unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
486
487 if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
488 return 0;
489
490 /* src0 and src1 can't have absolute values */
491 if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
492 return 0;
493
494 /* if src0 is negative, at least all bits of dstmask have to be set */
495 if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
496 return 0;
497
498 /* if src1 is negative, at least all bits of dstmask have to be set */
499 if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
500 return 0;
501
502 if (!is_presub_candidate(c, inst_add))
503 return 0;
504
505 if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
506 rc_remove_instruction(inst_add);
507 return 1;
508 }
509 return 0;
510 }
511
presub_replace_inv(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)512 static void presub_replace_inv(
513 struct rc_instruction * inst_add,
514 struct rc_instruction * inst_reader,
515 unsigned int src_index)
516 {
517 /* We must be careful not to modify inst_add, since it
518 * is possible it will remain part of the program.*/
519 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
520 inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
521 inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
522 inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index],
523 inst_reader->U.I.PreSub.SrcReg[0]);
524
525 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
526 inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
527 }
528
presub_replace_bias(struct rc_instruction * inst_mad,struct rc_instruction * inst_reader,unsigned int src_index)529 static void presub_replace_bias(
530 struct rc_instruction * inst_mad,
531 struct rc_instruction * inst_reader,
532 unsigned int src_index)
533 {
534 /* We must be careful not to modify inst_mad, since it
535 * is possible it will remain part of the program.*/
536 inst_reader->U.I.PreSub.SrcReg[0] = inst_mad->U.I.SrcReg[0];
537 inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
538 inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
539 inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index],
540 inst_reader->U.I.PreSub.SrcReg[0]);
541
542 inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
543 inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_BIAS;
544 }
545
546 /**
547 * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
548 * Use the presubtract 1 - src0 for all readers of TEMP[0]. The first source
549 * of the add instruction must have the constant 1 swizzle. This function
550 * does not check const registers to see if their value is 1.0, so it should
551 * be called after the constant_folding optimization.
552 * @return
553 * 0 if the ADD instruction is still part of the program.
554 * 1 if the ADD instruction is no longer part of the program.
555 */
peephole_add_presub_inv(struct radeon_compiler * c,struct rc_instruction * inst_add)556 static int peephole_add_presub_inv(
557 struct radeon_compiler * c,
558 struct rc_instruction * inst_add)
559 {
560 unsigned int i, swz;
561
562 if (!is_presub_candidate(c, inst_add))
563 return 0;
564
565 /* Check if src0 is 1. */
566 /* XXX It would be nice to use is_src_uniform_constant here, but that
567 * function only works if the register's file is RC_FILE_NONE */
568 for(i = 0; i < 4; i++ ) {
569 if (!(inst_add->U.I.DstReg.WriteMask & (1 << i)))
570 continue;
571
572 swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
573 if (swz != RC_SWIZZLE_ONE || inst_add->U.I.SrcReg[0].Negate & (1 << i))
574 return 0;
575 }
576
577 /* Check src1. */
578 if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
579 inst_add->U.I.DstReg.WriteMask
580 || inst_add->U.I.SrcReg[1].Abs
581 || src_has_const_swz(inst_add->U.I.SrcReg[1])) {
582
583 return 0;
584 }
585
586 if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
587 rc_remove_instruction(inst_add);
588 return 1;
589 }
590 return 0;
591 }
592
593 /**
594 * PRESUB_BIAD: MAD -TEMP[0], 2.0, 1.0
595 * Use the presubtract 1 - 2*src0 for all readers of TEMP[0]. The first source
596 * of the add instruction must have the constant 1 swizzle. This function
597 * does not check const registers to see if their value is 1.0, so it should
598 * be called after the constant_folding optimization.
599 * @return
600 * 0 if the MAD instruction is still part of the program.
601 * 1 if the MAD instruction is no longer part of the program.
602 */
peephole_mad_presub_bias(struct radeon_compiler * c,struct rc_instruction * inst_mad)603 static int peephole_mad_presub_bias(
604 struct radeon_compiler * c,
605 struct rc_instruction * inst_mad)
606 {
607 unsigned int i, swz;
608
609 if (!is_presub_candidate(c, inst_mad))
610 return 0;
611
612 /* Check if src2 is 1. */
613 for(i = 0; i < 4; i++ ) {
614 if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
615 continue;
616
617 swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
618 if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
619 return 0;
620 }
621
622 /* Check if src1 is 2. */
623 struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
624 if ((src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs)
625 return 0;
626 if (src1_reg.File == RC_FILE_INLINE) {
627 if (rc_inline_to_float(src1_reg.Index) != 2.0f)
628 return 0;
629 } else {
630 if (src1_reg.File != RC_FILE_CONSTANT)
631 return 0;
632
633 struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
634 if (constant->Type != RC_CONSTANT_IMMEDIATE)
635 return 0;
636 for (i = 0; i < 4; i++) {
637 if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
638 continue;
639 swz = GET_SWZ(src1_reg.Swizzle, i);
640 if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
641 return 0;
642 }
643 }
644
645 /* Check src0. */
646 if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
647 inst_mad->U.I.DstReg.WriteMask
648 || inst_mad->U.I.SrcReg[0].Abs
649 || src_has_const_swz(inst_mad->U.I.SrcReg[0])) {
650
651 return 0;
652 }
653
654 if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
655 rc_remove_instruction(inst_mad);
656 return 1;
657 }
658 return 0;
659 }
660
/**
 * State shared with omod_filter_reader_cb() and omod_filter_writer_cb()
 * while peephole_mul_omod() scans the instructions between a writer and
 * the multiply.
 */
struct peephole_mul_cb_data {
    struct rc_dst_register * Writer;  /* destination register being checked */
    unsigned int Clobbered;           /* set to 1 when a scanned instruction
                                       * reads or writes that destination */
};
665
/**
 * Read-mask callback for peephole_mul_omod(): flags the scan as clobbered
 * when the reported read overlaps the destination stored in the callback
 * data.
 */
static void omod_filter_reader_cb(
    void * userdata,
    struct rc_instruction * inst,
    rc_register_file file,
    unsigned int index,
    unsigned int mask)
{
    struct peephole_mul_cb_data * d = userdata;
    const struct rc_dst_register * dst = d->Writer;

    if (!rc_src_reads_dst_mask(file, mask, index,
                               dst->File, dst->Index, dst->WriteMask))
        return;

    d->Clobbered = 1;
}
680
/**
 * Write-mask callback for peephole_mul_omod(): flags the scan as clobbered
 * when the reported write hits the same register as the destination stored
 * in the callback data and shares at least one channel with it.
 */
static void omod_filter_writer_cb(
    void * userdata,
    struct rc_instruction * inst,
    rc_register_file file,
    unsigned int index,
    unsigned int mask)
{
    struct peephole_mul_cb_data * d = userdata;
    const struct rc_dst_register * dst = d->Writer;

    if (file != dst->File || index != dst->Index)
        return;

    if (mask & dst->WriteMask)
        d->Clobbered = 1;
}
694
/**
 * Try to fold a multiplication by 2, 4, 8, 1/2, 1/4 or 1/8 into the output
 * modifier (Omod) of the instruction(s) that write the multiplied temporary,
 * redirect those writers to the multiply's destination, and remove the
 * multiply.
 *
 * The first two sources of inst_mul must be exactly one temporary and one
 * un-negated constant operand (an immediate constant, or a RC_FILE_NONE
 * source whose used channels are all the 0.5 swizzle) that reads the same
 * swizzle on every used channel.
 *
 * @param c         compiler context
 * @param inst_mul  multiply instruction to fold (the opcode is not checked
 *                  here; presumably the caller only passes MULs - confirm)
 * @param var_list  variable list used to look up the writers of the
 *                  temporary source
 * @return
 *  0 if inst_mul is still part of the program (nothing was modified).
 *  1 if inst_mul is no longer part of the program.
 */
static int peephole_mul_omod(
    struct radeon_compiler * c,
    struct rc_instruction * inst_mul,
    struct rc_list * var_list)
{
    unsigned int chan = 0, swz, i;
    int const_index = -1;
    int temp_index = -1;
    float const_value;
    rc_omod_op omod_op = RC_OMOD_DISABLE;
    struct rc_list * writer_list;
    struct rc_variable * var;
    struct peephole_mul_cb_data cb_data;
    unsigned writemask_sum;

    /* Classify the two sources: one must be a temporary (temp_index), the
     * other a constant-like operand (const_index). */
    for (i = 0; i < 2; i++) {
        unsigned int j;
        if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT
            && inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY
            && inst_mul->U.I.SrcReg[i].File != RC_FILE_NONE) {
            return 0;
        }

        /* The only relevant case with constant swizzles we should check for
         * is multiply by one half.
         */
        if (inst_mul->U.I.SrcReg[i].File == RC_FILE_NONE) {
            for (j = 0; j < 4; j++) {
                swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
                if (swz == RC_SWIZZLE_UNUSED) {
                    continue;
                }
                if (swz != RC_SWIZZLE_HALF) {
                    return 0;
                } else {
                    omod_op = RC_OMOD_DIV_2;
                }
            }
        }

        if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
            if (temp_index != -1) {
                /* The instruction has two temp sources */
                return 0;
            } else {
                temp_index = i;
                continue;
            }
        }
        /* If we get this far Src[i] must be a constant src */
        if (inst_mul->U.I.SrcReg[i].Negate) {
            return 0;
        }
        /* The constant src needs to read from the same swizzle */
        swz = RC_SWIZZLE_UNUSED;
        chan = 0;
        for (j = 0; j < 4; j++) {
            unsigned int j_swz =
                GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
            if (j_swz == RC_SWIZZLE_UNUSED) {
                continue;
            }
            if (swz == RC_SWIZZLE_UNUSED) {
                /* Remember the first used channel; its value is looked
                 * up below. */
                swz = j_swz;
                chan = j;
            } else if (j_swz != swz) {
                return 0;
            }
        }

        if (const_index != -1) {
            /* The instruction has two constant sources */
            return 0;
        } else {
            const_index = i;
        }
    }

    /* omod_op is already set when the constant was the 0.5 swizzle;
     * otherwise derive it from the immediate's value. */
    if (omod_op == RC_OMOD_DISABLE) {
        if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
                        inst_mul->U.I.SrcReg[const_index].Index)) {
            return 0;
        }
        const_value = rc_get_constant_value(c,
                inst_mul->U.I.SrcReg[const_index].Index,
                inst_mul->U.I.SrcReg[const_index].Swizzle,
                inst_mul->U.I.SrcReg[const_index].Negate,
                chan);

        if (const_value == 2.0f) {
            omod_op = RC_OMOD_MUL_2;
        } else if (const_value == 4.0f) {
            omod_op = RC_OMOD_MUL_4;
        } else if (const_value == 8.0f) {
            omod_op = RC_OMOD_MUL_8;
        } else if (const_value == (1.0f / 2.0f)) {
            omod_op = RC_OMOD_DIV_2;
        } else if (const_value == (1.0f / 4.0f)) {
            omod_op = RC_OMOD_DIV_4;
        } else if (const_value == (1.0f / 8.0f)) {
            omod_op = RC_OMOD_DIV_8;
        } else {
            return 0;
        }
    }

    writer_list = rc_variable_list_get_writers_one_reader(var_list,
        RC_INSTRUCTION_NORMAL, &inst_mul->U.I.SrcReg[temp_index]);

    if (!writer_list) {
        return 0;
    }

    /* Validate every writer, and make sure no instruction between a writer
     * and the multiply reads or writes the multiply's destination. */
    cb_data.Clobbered = 0;
    cb_data.Writer = &inst_mul->U.I.DstReg;
    for (var = writer_list->Item; var; var = var->Friend) {
        struct rc_instruction * inst;
        const struct rc_opcode_info * info = rc_get_opcode_info(
                var->Inst->U.I.Opcode);
        if (info->HasTexture) {
            return 0;
        }
        if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
            return 0;
        }

        /* Empirical testing shows that DDX/DDY directly into output
         * with non-identity omod is problematic.
         */
        if ((info->Opcode == RC_OPCODE_DDX || info->Opcode == RC_OPCODE_DDY) &&
            inst_mul->U.I.DstReg.File == RC_FILE_OUTPUT) {
            return 0;
        }

        for (inst = inst_mul->Prev; inst != var->Inst;
                    inst = inst->Prev) {
            rc_for_all_reads_mask(inst, omod_filter_reader_cb,
                                &cb_data);
            rc_for_all_writes_mask(inst, omod_filter_writer_cb,
                                &cb_data);
            /* This only leaves the inner scan; Clobbered stays set and
             * is acted upon after the outer loop. */
            if (cb_data.Clobbered) {
                break;
            }
        }
    }

    if (cb_data.Clobbered) {
        return 0;
    }

    writemask_sum = rc_variable_writemask_sum(writer_list->Item);

    /* rc_normal_rewrite_writemask can't expand a previous writemask to store
     * more channels replicated.
     */
    if (util_bitcount(writemask_sum) < util_bitcount(inst_mul->U.I.DstReg.WriteMask))
        return 0;

    /* Rewrite the instructions */
    for (var = writer_list->Item; var; var = var->Friend) {
        struct rc_variable * writer = var;
        unsigned conversion_swizzle = RC_SWIZZLE_UUUU;
        /* For each multiply channel that reads temp component `swz`,
         * record that channel in slot `swz` of the conversion swizzle. */
        for (chan = 0; chan < 4; chan++) {
            unsigned swz = GET_SWZ(inst_mul->U.I.SrcReg[temp_index].Swizzle, chan);
            if (swz <= RC_SWIZZLE_W)
                SET_SWZ(conversion_swizzle, swz, chan);
        }
        writer->Inst->U.I.Omod = omod_op;
        writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
        writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
        rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
        writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
    }

    rc_remove_instruction(inst_mul);

    return 1;
}
873
874 /**
875 * @return
876 * 0 if inst is still part of the program.
877 * 1 if inst is no longer part of the program.
878 */
879 int
rc_opt_presubtract(struct radeon_compiler * c,struct rc_instruction * inst,void * data)880 rc_opt_presubtract(struct radeon_compiler *c, struct rc_instruction *inst, void *data)
881 {
882 switch(inst->U.I.Opcode) {
883 case RC_OPCODE_ADD:
884 {
885 if (peephole_add_presub_inv(c, inst))
886 return 1;
887 if (peephole_add_presub_add(c, inst))
888 return 1;
889 break;
890 }
891 case RC_OPCODE_MAD:
892 {
893 if (peephole_mad_presub_bias(c, inst))
894 return 1;
895 break;
896 }
897 default:
898 break;
899 }
900 return 0;
901 }
902
merge_swizzles(unsigned int swz1,unsigned int swz2)903 static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2)
904 {
905 unsigned int new_swz = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
906 for (unsigned int chan = 0; chan < 4; chan++) {
907 unsigned int swz = GET_SWZ(swz1, chan);
908 if (swz != RC_SWIZZLE_UNUSED) {
909 SET_SWZ(new_swz, chan, swz);
910 continue;
911 }
912 swz = GET_SWZ(swz2, chan);
913 SET_SWZ(new_swz, chan, swz);
914 }
915 return new_swz;
916 }
917
918 /* Sets negate to 0 for unused channels. */
clean_negate(struct rc_src_register src)919 static unsigned int clean_negate(struct rc_src_register src)
920 {
921 unsigned int new_negate = 0;
922 for (unsigned int chan = 0; chan < 4; chan++) {
923 unsigned int swz = GET_SWZ(src.Swizzle, chan);
924 if (swz != RC_SWIZZLE_UNUSED)
925 new_negate |= src.Negate & (1 << chan);
926 }
927 return new_negate;
928 }
929
/* Union of the negate bits of both sources, counting used channels only. */
static unsigned int merge_negates(struct rc_src_register src1, struct rc_src_register src2)
{
    const unsigned int first = clean_negate(src1);
    const unsigned int second = clean_negate(src2);

    return first | second;
}
934
fill_swizzle(unsigned int orig_swz,unsigned int wmask,unsigned int const_swz)935 static unsigned int fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz)
936 {
937 for (unsigned int chan = 0; chan < 4; chan++) {
938 unsigned int swz = GET_SWZ(orig_swz, chan);
939 if (swz == RC_SWIZZLE_UNUSED && (wmask & (1 << chan))) {
940 SET_SWZ(orig_swz, chan, const_swz);
941 }
942 }
943 return orig_swz;
944 }
945
have_shared_source(struct rc_instruction * inst1,struct rc_instruction * inst2)946 static int have_shared_source(struct rc_instruction * inst1, struct rc_instruction * inst2)
947 {
948 int shared_src = -1;
949 const struct rc_opcode_info * opcode1 = rc_get_opcode_info(inst1->U.I.Opcode);
950 const struct rc_opcode_info * opcode2 = rc_get_opcode_info(inst2->U.I.Opcode);
951 for (unsigned i = 0; i < opcode1->NumSrcRegs; i++) {
952 for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) {
953 if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File &&
954 inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index &&
955 inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr &&
956 inst1->U.I.SrcReg[i].Abs == inst2->U.I.SrcReg[j].Abs)
957 shared_src = i;
958 }
959 }
960 return shared_src;
961 }
962
963 /**
964 * Merges two MOVs writing different channels of the same destination register
965 * with the use of the constant swizzles.
966 */
merge_movs(struct radeon_compiler * c,struct rc_instruction * inst,struct rc_instruction * cur)967 static bool merge_movs(
968 struct radeon_compiler * c,
969 struct rc_instruction * inst,
970 struct rc_instruction * cur)
971 {
972 /* We can merge two MOVs into MOV if one of them is from inline constant,
973 * i.e., constant swizzles and RC_FILE_NONE).
974 *
975 * For example
976 * MOV temp[0].x none.1___
977 * MOV temp[0].y input[0]._x__
978 *
979 * becomes
980 * MOV temp[0].xy input[0].1x__
981 */
982 unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
983 if (cur->U.I.SrcReg[0].File == RC_FILE_NONE ||
984 inst->U.I.SrcReg[0].File == RC_FILE_NONE) {
985 struct rc_src_register src;
986 if (cur->U.I.SrcReg[0].File == RC_FILE_NONE)
987 src = inst->U.I.SrcReg[0];
988 else
989 src = cur->U.I.SrcReg[0];
990 src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle,
991 inst->U.I.SrcReg[0].Swizzle);
992 src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
993 if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
994 cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
995 cur->U.I.SrcReg[0] = src;
996 rc_remove_instruction(inst);
997 return true;
998 }
999 }
1000
1001 /* Handle the trivial case where the MOVs share a source.
1002 *
1003 * For example
1004 * MOV temp[0].x const[0].x
1005 * MOV temp[0].y const[0].z
1006 *
1007 * becomes
1008 * MOV temp[0].xy const[0].xz
1009 */
1010 if (have_shared_source(inst, cur) == 0) {
1011 struct rc_src_register src = cur->U.I.SrcReg[0];
1012 src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
1013 src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle,
1014 inst->U.I.SrcReg[0].Swizzle);
1015
1016 if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
1017 cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
1018 cur->U.I.SrcReg[0] = src;
1019 rc_remove_instruction(inst);
1020 return true;
1021 }
1022 }
1023
1024 /* Otherwise, we can convert the MOVs into ADD.
1025 *
1026 * For example
1027 * MOV temp[0].x const[0].x
1028 * MOV temp[0].y input[0].y
1029 *
1030 * becomes
1031 * ADD temp[0].xy const[0].x0 input[0].0y
1032 */
1033 unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask;
1034 struct rc_src_register src0 = inst->U.I.SrcReg[0];
1035 struct rc_src_register src1 = cur->U.I.SrcReg[0];
1036
1037 src0.Swizzle = fill_swizzle(src0.Swizzle,
1038 wmask, RC_SWIZZLE_ZERO);
1039 src1.Swizzle = fill_swizzle(src1.Swizzle,
1040 wmask, RC_SWIZZLE_ZERO);
1041 if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) ||
1042 !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1))
1043 return false;
1044
1045 cur->U.I.DstReg.WriteMask = wmask;
1046 cur->U.I.Opcode = RC_OPCODE_ADD;
1047 cur->U.I.SrcReg[0] = src0;
1048 cur->U.I.SrcReg[1] = src1;
1049
1050 /* finally delete the original mov */
1051 rc_remove_instruction(inst);
1052 return true;
1053 }
1054
1055 /**
1056 * This function will try to merge MOV and ADD/MUL instructions with the same
1057 * destination, making use of the constant swizzles.
1058 *
1059 * For example:
1060 * MOV temp[0].x const[0].x
1061 * MUL temp[0].yz const[1].yz const[2].yz
1062 *
1063 * becomes
1064 * MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00
1065 */
merge_mov_add_mul(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1066 static int merge_mov_add_mul(
1067 struct radeon_compiler * c,
1068 struct rc_instruction * inst1,
1069 struct rc_instruction * inst2)
1070 {
1071 struct rc_instruction * inst, * mov;
1072 if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1073 mov = inst1;
1074 inst = inst2;
1075 } else {
1076 mov = inst2;
1077 inst = inst1;
1078 }
1079
1080 const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL;
1081 int shared_index = have_shared_source(inst, mov);
1082 unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask;
1083
1084 /* If there is a shared source, just merge the swizzles and be done with it. */
1085 if (shared_index != -1) {
1086 struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index];
1087 struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index];
1088
1089 shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src);
1090 shared_src.Swizzle = merge_swizzles(shared_src.Swizzle,
1091 mov->U.I.SrcReg[0].Swizzle);
1092 other_src.Negate = clean_negate(other_src);
1093 unsigned int swz = is_mul ? RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO;
1094 other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz);
1095
1096 if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) ||
1097 !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src))
1098 return 0;
1099
1100 inst2->U.I.Opcode = inst->U.I.Opcode;
1101 inst2->U.I.SrcReg[0] = shared_src;
1102 inst2->U.I.SrcReg[1] = other_src;
1103
1104 /* TODO: we can do a bit better in the special case when one of the sources is none.
1105 * Convert to MAD otherwise.
1106 */
1107 } else {
1108 struct rc_src_register src0, src1, src2;
1109 if (is_mul) {
1110 src2 = mov->U.I.SrcReg[0];
1111 src0 = inst->U.I.SrcReg[0];
1112 src1 = inst->U.I.SrcReg[1];
1113 } else {
1114 src0 = mov->U.I.SrcReg[0];
1115 src1 = inst->U.I.SrcReg[0];
1116 src2 = inst->U.I.SrcReg[1];
1117 }
1118 /* The following login expects that the unused channels have empty negate bits. */
1119 src0.Negate = clean_negate(src0);
1120 src1.Negate = clean_negate(src1);
1121 src2.Negate = clean_negate(src2);
1122
1123 src0.Swizzle = fill_swizzle(src0.Swizzle,
1124 wmask, RC_SWIZZLE_ONE);
1125 src1.Swizzle = fill_swizzle(src1.Swizzle,
1126 wmask, is_mul ? RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE);
1127 src2.Swizzle = fill_swizzle(src2.Swizzle,
1128 wmask, RC_SWIZZLE_ZERO);
1129 if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) ||
1130 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) ||
1131 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2))
1132 return 0;
1133
1134 inst2->U.I.Opcode = RC_OPCODE_MAD;
1135 inst2->U.I.SrcReg[0] = src0;
1136 inst2->U.I.SrcReg[1] = src1;
1137 inst2->U.I.SrcReg[2] = src2;
1138 }
1139 inst2->U.I.DstReg.WriteMask = wmask;
1140 /* finally delete the original instruction */
1141 rc_remove_instruction(inst1);
1142
1143 return 1;
1144 }
1145
1146 /**
1147 * This function will try to merge MOV and MAD instructions with the same
1148 * destination, making use of the constant swizzles. This only works
1149 * if there is a shared source or one of the sources is RC_FILE_NONE.
1150 *
1151 * For example:
1152 * MOV temp[0].x const[0].x
1153 * MAD temp[0].yz const[0].yz const[1].yz input[0].xw
1154 *
1155 * becomes
1156 * MAD temp[0].xyz const[0].xyz const[2].1yz input[0].0xw
1157 */
merge_mov_mad(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1158 static bool merge_mov_mad(
1159 struct radeon_compiler * c,
1160 struct rc_instruction * inst1,
1161 struct rc_instruction * inst2)
1162 {
1163 struct rc_instruction * mov, * mad;
1164 if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1165 mov = inst1;
1166 mad = inst2;
1167 } else {
1168 mov = inst2;
1169 mad = inst1;
1170 }
1171
1172 int shared_index = have_shared_source(mad, mov);
1173 unsigned wmask = mov->U.I.DstReg.WriteMask | mad->U.I.DstReg.WriteMask;
1174 struct rc_src_register src[3];
1175 src[0] = mad->U.I.SrcReg[0];
1176 src[1] = mad->U.I.SrcReg[1];
1177 src[2] = mad->U.I.SrcReg[2];
1178
1179 /* Shared source is the one for multiplication. */
1180 if (shared_index == 0 || shared_index == 1) {
1181 src[shared_index].Negate = merge_negates(src[shared_index], mov->U.I.SrcReg[0]);
1182 src[1 - shared_index].Negate = clean_negate(src[1 - shared_index]);
1183 src[shared_index].Swizzle = merge_swizzles(src[shared_index].Swizzle,
1184 mov->U.I.SrcReg[0].Swizzle);
1185 src[1 - shared_index].Swizzle = fill_swizzle(
1186 src[1 - shared_index].Swizzle, wmask, RC_SWIZZLE_ONE);
1187 src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1188
1189 /* Shared source is the one for used for addition, or it is none. Additionally,
1190 * if the mov SrcReg is none, we merge it with the addition (third) reg as well
1191 * because than we have the highest change the swizzles will be legal.
1192 */
1193 } else if (shared_index == 2 || mov->U.I.SrcReg[0].File == RC_FILE_NONE ||
1194 src[2].File == RC_FILE_NONE) {
1195 src[2].Negate = merge_negates(src[2], mov->U.I.SrcReg[0]);
1196 src[2].Swizzle = merge_swizzles(src[2].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1197 src[0].Swizzle = fill_swizzle(src[0].Swizzle, wmask, RC_SWIZZLE_ZERO);
1198 src[1].Swizzle = fill_swizzle(src[1].Swizzle, wmask, RC_SWIZZLE_ZERO);
1199 if (src[2].File == RC_FILE_NONE) {
1200 src[2].File = mov->U.I.SrcReg[0].File;
1201 src[2].Index = mov->U.I.SrcReg[0].Index;
1202 src[2].RelAddr = mov->U.I.SrcReg[0].RelAddr;
1203 src[2].Abs = mov->U.I.SrcReg[0].Abs;
1204 }
1205
1206 /* First or the second MAD source is RC_FILE_NONE, we merge the mov into it,
1207 * fill the other one with ones and the reg for addition with zeros.
1208 */
1209 } else if (src[0].File == RC_FILE_NONE || src[1].File == RC_FILE_NONE) {
1210 unsigned none_src = src[0].File == RC_FILE_NONE ? 0 : 1;
1211 src[none_src] = mov->U.I.SrcReg[0];
1212 src[none_src].Negate = merge_negates(src[none_src], mad->U.I.SrcReg[none_src]);
1213 src[none_src].Swizzle = merge_swizzles(src[none_src].Swizzle,
1214 mad->U.I.SrcReg[none_src].Swizzle);
1215 src[1 - none_src].Negate = clean_negate(src[1 - none_src]);
1216 src[1 - none_src].Swizzle = fill_swizzle(src[1 - none_src].Swizzle,
1217 wmask, RC_SWIZZLE_ONE);
1218 src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1219 } else {
1220 return false;
1221 }
1222
1223 if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[0]) ||
1224 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[1]) ||
1225 !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[2]))
1226 return false;
1227
1228 inst2->U.I.Opcode = RC_OPCODE_MAD;
1229 inst2->U.I.SrcReg[0] = src[0];
1230 inst2->U.I.SrcReg[1] = src[1];
1231 inst2->U.I.SrcReg[2] = src[2];
1232 inst2->U.I.DstReg.WriteMask = wmask;
1233 rc_remove_instruction(inst1);
1234 return true;
1235 }
1236
/**
 * Tells whether the two instructions use the given pair of opcodes,
 * in either order.
 */
static bool inst_combination(
	struct rc_instruction * inst1,
	struct rc_instruction * inst2,
	rc_opcode opcode1,
	rc_opcode opcode2)
{
	if (inst1->U.I.Opcode == opcode1 && inst2->U.I.Opcode == opcode2)
		return true;

	/* Same pair, operands swapped. */
	return inst2->U.I.Opcode == opcode1 && inst1->U.I.Opcode == opcode2;
}
1246
1247 /**
1248 * Searches for instructions writing different channels of the same register that could
1249 * be merged together with the use of constant swizzles.
1250 *
1251 * The potential candidates are combinations of MOVs, ADDs, MULs and MADs.
1252 */
merge_channels(struct radeon_compiler * c,struct rc_instruction * inst)1253 static void merge_channels(struct radeon_compiler * c, struct rc_instruction * inst)
1254 {
1255 unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
1256 unsigned int orig_dst_file = inst->U.I.DstReg.File;
1257 unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
1258 const struct rc_opcode_info * orig_opcode = rc_get_opcode_info(inst->U.I.Opcode);
1259
1260 struct rc_instruction * cur = inst;
1261 while (cur!= &c->Program.Instructions) {
1262 cur = cur->Next;
1263 const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);
1264
1265 /* Keep it simple for now and stop when encountering any
1266 * control flow.
1267 */
1268 if (opcode->IsFlowControl)
1269 return;
1270
1271 /* Stop when the original destination is overwritten */
1272 if (orig_dst_reg == cur->U.I.DstReg.Index &&
1273 orig_dst_file == cur->U.I.DstReg.File &&
1274 (orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0)
1275 return;
1276
1277 /* Stop the search when the original instruction destination
1278 * is used as a source for anything.
1279 */
1280 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
1281 if (cur->U.I.SrcReg[i].File == orig_dst_file &&
1282 cur->U.I.SrcReg[i].Index == orig_dst_reg)
1283 return;
1284 }
1285
1286 /* Stop the search when some of the original sources are touched. */
1287 for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) {
1288 if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File &&
1289 inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index)
1290 return;
1291 }
1292
1293 if (cur->U.I.DstReg.File == orig_dst_file &&
1294 cur->U.I.DstReg.Index == orig_dst_reg &&
1295 cur->U.I.SaturateMode == inst->U.I.SaturateMode &&
1296 (cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) {
1297
1298 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) {
1299 if (merge_movs(c, inst, cur))
1300 return;
1301 }
1302
1303 /* Skip the merge if one of the instructions writes just w channel
1304 * and we are compiling a fragment shader. We can pair-schedule it together
1305 * later anyway and it will also give the scheduler a bit more flexibility.
1306 * Only check this after merging MOVs as when we manage to merge two MOVs
1307 * into another MOV we can still copy propagate it away. So it is a win in
1308 * that case.
1309 */
1310 if (c->has_omod && (cur->U.I.DstReg.WriteMask == RC_MASK_W ||
1311 inst->U.I.DstReg.WriteMask == RC_MASK_W))
1312 continue;
1313
1314 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) ||
1315 inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) {
1316 if (merge_mov_add_mul(c, inst, cur))
1317 return;
1318 }
1319
1320 if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MAD)) {
1321 if (merge_mov_mad(c, inst, cur))
1322 return;
1323 }
1324 }
1325 }
1326 }
1327
1328 /**
1329 * Searches for duplicate ARLs/ARRs
1330 *
1331 * Only a very trivial case is now optimized where if a second one is detected which reads from
1332 * the same register as the first one and source is the same, just remove the second one.
1333 */
merge_A0_loads(struct radeon_compiler * c,struct rc_instruction * inst,bool is_ARL)1334 static void merge_A0_loads(
1335 struct radeon_compiler * c,
1336 struct rc_instruction * inst,
1337 bool is_ARL)
1338 {
1339 unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index;
1340 unsigned int A0_src_file = inst->U.I.SrcReg[0].File;
1341 unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle;
1342 int cf_depth = 0;
1343
1344 struct rc_instruction * cur = inst;
1345 while (cur != &c->Program.Instructions) {
1346 cur = cur->Next;
1347 const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);
1348
1349 /* Keep it simple for now and stop when encountering any
1350 * control flow besides simple ifs.
1351 */
1352 if (opcode->IsFlowControl) {
1353 switch (cur->U.I.Opcode) {
1354 case RC_OPCODE_IF:
1355 {
1356 cf_depth++;
1357 break;
1358 }
1359 case RC_OPCODE_ELSE:
1360 {
1361 if (cf_depth < 1)
1362 return;
1363 break;
1364 }
1365 case RC_OPCODE_ENDIF:
1366 {
1367 cf_depth--;
1368 break;
1369 }
1370 default:
1371 return;
1372 }
1373 }
1374
1375 /* Stop when the original source is overwritten */
1376 if (A0_src_reg == cur->U.I.DstReg.Index &&
1377 A0_src_file == cur->U.I.DstReg.File &&
1378 cur->U.I.DstReg.WriteMask | rc_swizzle_to_writemask(A0_src_swizzle))
1379 return;
1380
1381 /* Wrong A0 load type. */
1382 if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) ||
1383 (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL))
1384 return;
1385
1386 if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) {
1387 if (A0_src_reg == cur->U.I.SrcReg[0].Index &&
1388 A0_src_file == cur->U.I.SrcReg[0].File &&
1389 A0_src_swizzle == cur->U.I.SrcReg[0].Swizzle) {
1390 struct rc_instruction * next = cur->Next;
1391 rc_remove_instruction(cur);
1392 cur = next;
1393 } else {
1394 return;
1395 }
1396 }
1397 }
1398 }
1399
1400 /**
1401 * According to the GLSL spec, round is only 1.30 and up
1402 * so the only reason why we should ever see round is if it actually
1403 * is lowered ARR (from nine->ttn). In that case we want to reconstruct
1404 * the ARR instead of lowering the round.
1405 */
transform_vertex_ROUND(struct radeon_compiler * c,struct rc_instruction * inst)1406 static void transform_vertex_ROUND(struct radeon_compiler* c,
1407 struct rc_instruction* inst)
1408 {
1409 struct rc_reader_data readers;
1410 rc_get_readers(c, inst, &readers, NULL, NULL, NULL);
1411
1412 assert(readers.ReaderCount > 0);
1413 for (unsigned i = 0; i < readers.ReaderCount; i++) {
1414 struct rc_instruction *reader = readers.Readers[i].Inst;
1415 if (reader->U.I.Opcode != RC_OPCODE_ARL) {
1416 assert(!"Unable to convert ROUND+ARL to ARR\n");
1417 return;
1418 }
1419 }
1420
1421 /* Only ARL readers, convert all to ARR */
1422 for (unsigned i = 0; i < readers.ReaderCount; i++) {
1423 readers.Readers[i].Inst->U.I.Opcode = RC_OPCODE_ARR;
1424 }
1425 /* Switch ROUND to MOV and let copy propagate sort it out later. */
1426 inst->U.I.Opcode = RC_OPCODE_MOV;
1427 }
1428
1429 /**
1430 * Apply various optimizations specific to the A0 address register loads.
1431 */
optimize_A0_loads(struct radeon_compiler * c)1432 static void optimize_A0_loads(struct radeon_compiler * c) {
1433 struct rc_instruction * inst = c->Program.Instructions.Next;
1434
1435 while (inst != &c->Program.Instructions) {
1436 struct rc_instruction * cur = inst;
1437 inst = inst->Next;
1438 if (cur->U.I.Opcode == RC_OPCODE_ARL) {
1439 merge_A0_loads(c, cur, true);
1440 } else if (cur->U.I.Opcode == RC_OPCODE_ARR) {
1441 merge_A0_loads(c, cur, false);
1442 } else if (cur->U.I.Opcode == RC_OPCODE_ROUND) {
1443 transform_vertex_ROUND(c, cur);
1444 }
1445 }
1446 }
1447
rc_optimize(struct radeon_compiler * c,void * user)1448 void rc_optimize(struct radeon_compiler * c, void *user)
1449 {
1450 struct rc_instruction * inst = c->Program.Instructions.Next;
1451 while(inst != &c->Program.Instructions) {
1452 struct rc_instruction * cur = inst;
1453 inst = inst->Next;
1454 constant_folding(c, cur);
1455 }
1456
1457 /* Copy propagate simple movs away. */
1458 inst = c->Program.Instructions.Next;
1459 while(inst != &c->Program.Instructions) {
1460 struct rc_instruction * cur = inst;
1461 inst = inst->Next;
1462 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1463 copy_propagate(c, cur);
1464 }
1465 }
1466
1467 if (c->type == RC_VERTEX_PROGRAM) {
1468 optimize_A0_loads(c);
1469 }
1470
1471 /* Merge MOVs to same source in different channels using the constant
1472 * swizzle.
1473 */
1474 if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) {
1475 inst = c->Program.Instructions.Next;
1476 while(inst != &c->Program.Instructions) {
1477 struct rc_instruction * cur = inst;
1478 inst = inst->Next;
1479 if (cur->U.I.Opcode == RC_OPCODE_MOV ||
1480 cur->U.I.Opcode == RC_OPCODE_ADD ||
1481 cur->U.I.Opcode == RC_OPCODE_MAD ||
1482 cur->U.I.Opcode == RC_OPCODE_MUL)
1483 merge_channels(c, cur);
1484 }
1485 }
1486
1487 /* Copy propagate few extra movs from the merge_channels pass. */
1488 inst = c->Program.Instructions.Next;
1489 while(inst != &c->Program.Instructions) {
1490 struct rc_instruction * cur = inst;
1491 inst = inst->Next;
1492 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1493 copy_propagate(c, cur);
1494 }
1495 }
1496
1497 if (c->type != RC_FRAGMENT_PROGRAM) {
1498 return;
1499 }
1500
1501 /* Output modifiers. */
1502 inst = c->Program.Instructions.Next;
1503 struct rc_list * var_list = NULL;
1504 while(inst != &c->Program.Instructions) {
1505 struct rc_instruction * cur = inst;
1506 inst = inst->Next;
1507 if (cur->U.I.Opcode == RC_OPCODE_MUL) {
1508 if (!var_list)
1509 var_list = rc_get_variables(c);
1510 if (peephole_mul_omod(c, cur, var_list))
1511 var_list = NULL;
1512 }
1513 }
1514 }
1515