xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/r300/compiler/radeon_optimize.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2009 Nicolai Haehnle.
3  * Copyright 2010 Tom Stellard <[email protected]>
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "util/u_math.h"
8 
9 #include "radeon_dataflow.h"
10 
11 #include "radeon_compiler.h"
12 #include "radeon_compiler_util.h"
13 #include "radeon_list.h"
14 #include "radeon_swizzle.h"
15 #include "radeon_variable.h"
16 
17 struct src_clobbered_reads_cb_data {
18 	rc_register_file File;
19 	unsigned int Index;
20 	unsigned int Mask;
21 	struct rc_reader_data * ReaderData;
22 };
23 
/* Callback used by presub_helper(): called with the ADD/MAD instruction being
 * folded, one reader instruction and the index of the reader's source slot
 * that has to be rewritten. */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *inst_writer,
				     struct rc_instruction *inst_reader,
				     unsigned int src_index);
27 
chain_srcregs(struct rc_src_register outer,struct rc_src_register inner)28 static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
29 {
30 	struct rc_src_register combine;
31 	combine.File = inner.File;
32 	combine.Index = inner.Index;
33 	combine.RelAddr = inner.RelAddr;
34 	if (outer.Abs) {
35 		combine.Abs = 1;
36 		combine.Negate = outer.Negate;
37 	} else {
38 		combine.Abs = inner.Abs;
39 		combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
40 		combine.Negate ^= outer.Negate;
41 	}
42 	combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
43 	return combine;
44 }
45 
copy_propagate_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)46 static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
47 						struct rc_src_register * src)
48 {
49 	rc_register_file file = src->File;
50 	struct rc_reader_data * reader_data = data;
51 
52 	if(!rc_inst_can_use_presub(reader_data->C,
53 				inst,
54 				reader_data->Writer->U.I.PreSub.Opcode,
55 				rc_swizzle_to_writemask(src->Swizzle),
56 				src,
57 				&reader_data->Writer->U.I.PreSub.SrcReg[0],
58 				&reader_data->Writer->U.I.PreSub.SrcReg[1])) {
59 		reader_data->Abort = 1;
60 		return;
61 	}
62 
63 	/* XXX This could probably be handled better. */
64 	if (file == RC_FILE_ADDRESS) {
65 		reader_data->Abort = 1;
66 		return;
67 	}
68 
69 	/* R300/R400 is unhappy about propagating
70 	 *  0: MOV temp[1], -none.1111;
71 	 *  1: KIL temp[1];
72 	 * to
73 	 *  0: KIL -none.1111;
74 	 *
75 	 * R500 is fine with it.
76 	 */
77 	if (!reader_data->C->is_r500 && inst->U.I.Opcode == RC_OPCODE_KIL &&
78 		reader_data->Writer->U.I.SrcReg[0].File == RC_FILE_NONE) {
79 		reader_data->Abort = 1;
80 		return;
81 	}
82 
83 	/* These instructions cannot read from the constants file.
84 	 * see radeonTransformTEX()
85 	 */
86 	if(reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
87 			reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_INPUT &&
88 			reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_NONE &&
89 				(inst->U.I.Opcode == RC_OPCODE_TEX ||
90 				inst->U.I.Opcode == RC_OPCODE_TXB ||
91 				inst->U.I.Opcode == RC_OPCODE_TXP ||
92 				inst->U.I.Opcode == RC_OPCODE_TXD ||
93 				inst->U.I.Opcode == RC_OPCODE_TXL ||
94 				inst->U.I.Opcode == RC_OPCODE_KIL)){
95 		reader_data->Abort = 1;
96 		return;
97 	}
98 }
99 
src_clobbered_reads_cb(void * data,struct rc_instruction * inst,struct rc_src_register * src)100 static void src_clobbered_reads_cb(
101 	void * data,
102 	struct rc_instruction * inst,
103 	struct rc_src_register * src)
104 {
105 	struct src_clobbered_reads_cb_data * sc_data = data;
106 
107 	if (src->File == sc_data->File
108 	    && src->Index == sc_data->Index
109 	    && (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
110 
111 		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
112 	}
113 
114 	if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
115 		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
116 	}
117 }
118 
/**
 * Write callback for rc_get_readers(): packages the write (file, index, mask)
 * and runs src_clobbered_reads_cb() over every source of the scanned writer.
 */
static void is_src_clobbered_scan_write(
	void * data,
	struct rc_instruction * inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct rc_reader_data *rd = data;
	struct src_clobbered_reads_cb_data write_info = {
		.File = file,
		.Index = index,
		.Mask = mask,
		.ReaderData = rd,
	};

	rc_for_all_reads_src(rd->Writer, src_clobbered_reads_cb, &write_info);
}
135 
copy_propagate(struct radeon_compiler * c,struct rc_instruction * inst_mov)136 static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * inst_mov)
137 {
138 	struct rc_reader_data reader_data;
139 	unsigned int i;
140 
141 	if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY ||
142 	    inst_mov->U.I.WriteALUResult)
143 		return;
144 
145 	/* Get a list of all the readers of this MOV instruction. */
146 	reader_data.ExitOnAbort = 1;
147 	rc_get_readers(c, inst_mov, &reader_data,
148 		       copy_propagate_scan_read, NULL,
149 		       is_src_clobbered_scan_write);
150 
151 	if (reader_data.Abort || reader_data.ReaderCount == 0)
152 		return;
153 
154 	/* We can propagate SaturateMode if all the readers are MOV instructions
155 	 * without a presubtract operation, source negation and absolute.
156 	 * In that case, we just move SaturateMode to all readers. */
157         if (inst_mov->U.I.SaturateMode) {
158 		for (i = 0; i < reader_data.ReaderCount; i++) {
159 			struct rc_instruction * inst = reader_data.Readers[i].Inst;
160 
161 			if (inst->U.I.Opcode != RC_OPCODE_MOV ||
162 			    inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
163 			    inst->U.I.SrcReg[0].Abs ||
164 			    inst->U.I.SrcReg[0].Negate) {
165 				return;
166 			}
167 		}
168 	}
169 
170 	/* Propagate the MOV instruction. */
171 	for (i = 0; i < reader_data.ReaderCount; i++) {
172 		struct rc_instruction * inst = reader_data.Readers[i].Inst;
173 		*reader_data.Readers[i].U.I.Src = chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
174 
175 		if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
176 			inst->U.I.PreSub = inst_mov->U.I.PreSub;
177 		if (!inst->U.I.SaturateMode)
178 			inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
179 	}
180 
181 	/* Finally, remove the original MOV instruction */
182 	rc_remove_instruction(inst_mov);
183 }
184 
185 /**
186  * Check if a source register is actually always the same
187  * swizzle constant.
188  */
is_src_uniform_constant(struct rc_src_register src,rc_swizzle * pswz,unsigned int * pnegate)189 static int is_src_uniform_constant(struct rc_src_register src,
190 		rc_swizzle * pswz, unsigned int * pnegate)
191 {
192 	int have_used = 0;
193 
194 	if (src.File != RC_FILE_NONE) {
195 		*pswz = 0;
196 		return 0;
197 	}
198 
199 	for(unsigned int chan = 0; chan < 4; ++chan) {
200 		unsigned int swz = GET_SWZ(src.Swizzle, chan);
201 		if (swz < 4) {
202 			*pswz = 0;
203 			return 0;
204 		}
205 		if (swz == RC_SWIZZLE_UNUSED)
206 			continue;
207 
208 		if (!have_used) {
209 			*pswz = swz;
210 			*pnegate = GET_BIT(src.Negate, chan);
211 			have_used = 1;
212 		} else {
213 			if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
214 				*pswz = 0;
215 				return 0;
216 			}
217 		}
218 	}
219 
220 	return 1;
221 }
222 
223 /**
224  * Replace 0.0, 1.0 and 0.5 immediate constants by their
225  * respective swizzles. Simplify instructions like ADD dst, src, 0;
226  */
constant_folding(struct radeon_compiler * c,struct rc_instruction * inst)227 static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst)
228 {
229 	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
230 	unsigned int i;
231 
232 	/* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
233 	for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
234 		struct rc_constant * constant;
235 		struct rc_src_register newsrc;
236 		int have_real_reference;
237 		unsigned int chan;
238 
239 		/* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
240 		for (chan = 0; chan < 4; ++chan)
241 			if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
242 				break;
243 		if (chan == 4) {
244 			inst->U.I.SrcReg[src].File = RC_FILE_NONE;
245 			continue;
246 		}
247 
248 		/* Convert immediates to swizzles. */
249 		if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT ||
250 		    inst->U.I.SrcReg[src].RelAddr ||
251 		    inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
252 			continue;
253 
254 		constant =
255 			&c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
256 
257 		if (constant->Type != RC_CONSTANT_IMMEDIATE)
258 			continue;
259 
260 		newsrc = inst->U.I.SrcReg[src];
261 		have_real_reference = 0;
262 		for (chan = 0; chan < 4; ++chan) {
263 			unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
264 			unsigned int newswz;
265 			float imm;
266 			float baseimm;
267 
268 			if (swz >= 4)
269 				continue;
270 
271 			imm = constant->u.Immediate[swz];
272 			baseimm = imm;
273 			if (imm < 0.0)
274 				baseimm = -baseimm;
275 
276 			if (baseimm == 0.0) {
277 				newswz = RC_SWIZZLE_ZERO;
278 			} else if (baseimm == 1.0) {
279 				newswz = RC_SWIZZLE_ONE;
280 			} else if (baseimm == 0.5 && c->has_half_swizzles) {
281 				newswz = RC_SWIZZLE_HALF;
282 			} else {
283 				have_real_reference = 1;
284 				continue;
285 			}
286 
287 			SET_SWZ(newsrc.Swizzle, chan, newswz);
288 			if (imm < 0.0 && !newsrc.Abs)
289 				newsrc.Negate ^= 1 << chan;
290 		}
291 
292 		if (!have_real_reference) {
293 			newsrc.File = RC_FILE_NONE;
294 			newsrc.Index = 0;
295 		}
296 
297 		/* don't make the swizzle worse */
298 		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc))
299 			continue;
300 
301 		inst->U.I.SrcReg[src] = newsrc;
302 	}
303 
304 	/* In case this instruction has been converted, make sure all of the
305 	 * registers that are no longer used are empty. */
306 	opcode = rc_get_opcode_info(inst->U.I.Opcode);
307 	for(i = opcode->NumSrcRegs; i < 3; i++) {
308 		memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
309 	}
310 }
311 
312 /**
313  * If src and dst use the same register, this function returns a writemask that
314  * indicates which components are read by src.  Otherwise zero is returned.
315  */
src_reads_dst_mask(struct rc_src_register src,struct rc_dst_register dst)316 static unsigned int src_reads_dst_mask(struct rc_src_register src,
317 						struct rc_dst_register dst)
318 {
319 	if (dst.File != src.File || dst.Index != src.Index) {
320 		return 0;
321 	}
322 	return rc_swizzle_to_writemask(src.Swizzle);
323 }
324 
325 /* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
326  * in any of its channels.  Return 0 otherwise. */
src_has_const_swz(struct rc_src_register src)327 static int src_has_const_swz(struct rc_src_register src) {
328 	int chan;
329 	for(chan = 0; chan < 4; chan++) {
330 		unsigned int swz = GET_SWZ(src.Swizzle, chan);
331 		if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF
332 						|| swz == RC_SWIZZLE_ONE) {
333 			return 1;
334 		}
335 	}
336 	return 0;
337 }
338 
presub_scan_read(void * data,struct rc_instruction * inst,struct rc_src_register * src)339 static void presub_scan_read(
340 	void * data,
341 	struct rc_instruction * inst,
342 	struct rc_src_register * src)
343 {
344 	struct rc_reader_data * reader_data = data;
345 	rc_presubtract_op * presub_opcode = reader_data->CbData;
346 
347 	if (!rc_inst_can_use_presub(reader_data->C,
348 			inst,
349 			*presub_opcode,
350 			reader_data->Writer->U.I.DstReg.WriteMask,
351 			src,
352 			&reader_data->Writer->U.I.SrcReg[0],
353 			&reader_data->Writer->U.I.SrcReg[1])) {
354 		reader_data->Abort = 1;
355 		return;
356 	}
357 }
358 
/**
 * Try to fold `inst_add` into all of its readers as a presubtract operation.
 *
 * Every reader is checked with presub_scan_read(); if none objects,
 * `presub_replace` is invoked for each reader source slot that reads the
 * result of `inst_add`.
 *
 * @return 1 if the readers were rewritten, 0 if nothing was changed (scan
 *         aborted or no readers).  The instruction itself is not removed.
 */
static int presub_helper(
	struct radeon_compiler * c,
	struct rc_instruction * inst_add,
	rc_presubtract_op presub_opcode,
	rc_presub_replace_fn presub_replace)
{
	struct rc_reader_data rd;
	rc_presubtract_op op_for_scan = presub_opcode;
	unsigned int r;

	rd.CbData = &op_for_scan;
	rd.ExitOnAbort = 1;
	rc_get_readers(c, inst_add, &rd, presub_scan_read, NULL,
		       is_src_clobbered_scan_write);

	if (rd.Abort)
		return 0;
	if (rd.ReaderCount == 0)
		return 0;

	for (r = 0; r < rd.ReaderCount; r++) {
		struct rc_instruction *user = rd.Readers[r].Inst;
		struct rc_src_register *hit = rd.Readers[r].U.I.Src;
		const struct rc_opcode_info *user_info =
			rc_get_opcode_info(user->U.I.Opcode);
		unsigned int s;

		/* Translate the source pointer back into a slot index. */
		for (s = 0; s < user_info->NumSrcRegs; s++) {
			if (hit == &user->U.I.SrcReg[s])
				presub_replace(inst_add, user, s);
		}
	}
	return 1;
}
390 
presub_replace_add(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)391 static void presub_replace_add(
392 	struct rc_instruction * inst_add,
393 	struct rc_instruction * inst_reader,
394 	unsigned int src_index)
395 {
396 	rc_presubtract_op presub_opcode;
397 
398 	unsigned int negates = 0;
399 	if (inst_add->U.I.SrcReg[0].Negate)
400 		negates++;
401 	if (inst_add->U.I.SrcReg[1].Negate)
402 		negates++;
403 	assert(negates != 2 || inst_add->U.I.SrcReg[1].Negate == inst_add->U.I.SrcReg[0].Negate);
404 
405 	if (negates == 1)
406 		presub_opcode = RC_PRESUB_SUB;
407 	else
408 		presub_opcode = RC_PRESUB_ADD;
409 
410 	if (inst_add->U.I.SrcReg[1].Negate && negates == 1) {
411 		inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
412 		inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
413 	} else {
414 		inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
415 		inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
416 	}
417 	/* If both sources are negative we can move the negate to the presub. */
418 	unsigned negate_mask = negates == 1 ? 0 : inst_add->U.I.SrcReg[0].Negate;
419 	inst_reader->U.I.PreSub.SrcReg[0].Negate = negate_mask;
420 	inst_reader->U.I.PreSub.SrcReg[1].Negate = negate_mask;
421 	inst_reader->U.I.PreSub.Opcode = presub_opcode;
422 	inst_reader->U.I.SrcReg[src_index] =
423 			chain_srcregs(inst_reader->U.I.SrcReg[src_index],
424 					inst_reader->U.I.PreSub.SrcReg[0]);
425 	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
426 	inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
427 }
428 
is_presub_candidate(struct radeon_compiler * c,struct rc_instruction * inst)429 static int is_presub_candidate(
430 	struct radeon_compiler * c,
431 	struct rc_instruction * inst)
432 {
433 	const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
434 	unsigned int i;
435 	unsigned int is_constant[2] = {0, 0};
436 
437 	assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);
438 
439 	if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE
440 			|| inst->U.I.SaturateMode
441 			|| inst->U.I.WriteALUResult
442 			|| inst->U.I.Omod) {
443 		return 0;
444 	}
445 
446 	/* If first two sources use a constant swizzle, then we can't convert it to
447 	 * a presubtract operation.  In fact for the ADD and SUB presubtract
448 	 * operations neither source can contain a constant swizzle.  This
449 	 * specific case is checked in peephole_add_presub_add() when
450 	 * we make sure the swizzles for both sources are equal, so we
451 	 * don't need to worry about it here. */
452 	for (i = 0; i < 2; i++) {
453 		int chan;
454 		for (chan = 0; chan < 4; chan++) {
455 			rc_swizzle swz =
456 				get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
457 			if (swz == RC_SWIZZLE_ONE
458 					|| swz == RC_SWIZZLE_ZERO
459 					|| swz == RC_SWIZZLE_HALF) {
460 				is_constant[i] = 1;
461 			}
462 		}
463 	}
464 	if (is_constant[0] && is_constant[1])
465 		return 0;
466 
467 	for(i = 0; i < info->NumSrcRegs; i++) {
468 		struct rc_src_register src = inst->U.I.SrcReg[i];
469 		if (src_reads_dst_mask(src, inst->U.I.DstReg))
470 			return 0;
471 
472 		src.File = RC_FILE_PRESUB;
473 		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
474 			return 0;
475 	}
476 	return 1;
477 }
478 
peephole_add_presub_add(struct radeon_compiler * c,struct rc_instruction * inst_add)479 static int peephole_add_presub_add(
480 	struct radeon_compiler * c,
481 	struct rc_instruction * inst_add)
482 {
483 	unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
484         unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
485         unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
486 
487 	if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
488 		return 0;
489 
490 	/* src0 and src1 can't have absolute values */
491 	if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
492 	        return 0;
493 
494         /* if src0 is negative, at least all bits of dstmask have to be set */
495         if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
496 	        return 0;
497 
498         /* if src1 is negative, at least all bits of dstmask have to be set */
499         if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
500 	        return 0;
501 
502 	if (!is_presub_candidate(c, inst_add))
503 		return 0;
504 
505 	if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
506 		rc_remove_instruction(inst_add);
507 		return 1;
508 	}
509 	return 0;
510 }
511 
presub_replace_inv(struct rc_instruction * inst_add,struct rc_instruction * inst_reader,unsigned int src_index)512 static void presub_replace_inv(
513 	struct rc_instruction * inst_add,
514 	struct rc_instruction * inst_reader,
515 	unsigned int src_index)
516 {
517 	/* We must be careful not to modify inst_add, since it
518 	 * is possible it will remain part of the program.*/
519 	inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
520 	inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
521 	inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
522 	inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index],
523 						inst_reader->U.I.PreSub.SrcReg[0]);
524 
525 	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
526 	inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
527 }
528 
presub_replace_bias(struct rc_instruction * inst_mad,struct rc_instruction * inst_reader,unsigned int src_index)529 static void presub_replace_bias(
530 	struct rc_instruction * inst_mad,
531 	struct rc_instruction * inst_reader,
532 	unsigned int src_index)
533 {
534 	/* We must be careful not to modify inst_mad, since it
535 	 * is possible it will remain part of the program.*/
536 	inst_reader->U.I.PreSub.SrcReg[0] = inst_mad->U.I.SrcReg[0];
537 	inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
538 	inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
539 	inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index],
540 						inst_reader->U.I.PreSub.SrcReg[0]);
541 
542 	inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
543 	inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_BIAS;
544 }
545 
546 /**
547  * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
548  * Use the presubtract 1 - src0 for all readers of TEMP[0].  The first source
549  * of the add instruction must have the constant 1 swizzle.  This function
550  * does not check const registers to see if their value is 1.0, so it should
551  * be called after the constant_folding optimization.
552  * @return
553  * 	0 if the ADD instruction is still part of the program.
554  * 	1 if the ADD instruction is no longer part of the program.
555  */
peephole_add_presub_inv(struct radeon_compiler * c,struct rc_instruction * inst_add)556 static int peephole_add_presub_inv(
557 	struct radeon_compiler * c,
558 	struct rc_instruction * inst_add)
559 {
560 	unsigned int i, swz;
561 
562 	if (!is_presub_candidate(c, inst_add))
563 		return 0;
564 
565 	/* Check if src0 is 1. */
566 	/* XXX It would be nice to use is_src_uniform_constant here, but that
567 	 * function only works if the register's file is RC_FILE_NONE */
568 	for(i = 0; i < 4; i++ ) {
569 		if (!(inst_add->U.I.DstReg.WriteMask & (1 << i)))
570 			continue;
571 
572 		swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
573 		if (swz != RC_SWIZZLE_ONE || inst_add->U.I.SrcReg[0].Negate & (1 << i))
574 			return 0;
575 	}
576 
577 	/* Check src1. */
578 	if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
579 						inst_add->U.I.DstReg.WriteMask
580 		|| inst_add->U.I.SrcReg[1].Abs
581 		|| src_has_const_swz(inst_add->U.I.SrcReg[1])) {
582 
583 		return 0;
584 	}
585 
586 	if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
587 		rc_remove_instruction(inst_add);
588 		return 1;
589 	}
590 	return 0;
591 }
592 
593 /**
594  * PRESUB_BIAD: MAD -TEMP[0], 2.0, 1.0
595  * Use the presubtract 1 - 2*src0 for all readers of TEMP[0].  The first source
596  * of the add instruction must have the constant 1 swizzle.  This function
597  * does not check const registers to see if their value is 1.0, so it should
598  * be called after the constant_folding optimization.
599  * @return
600  * 	0 if the MAD instruction is still part of the program.
601  * 	1 if the MAD instruction is no longer part of the program.
602  */
peephole_mad_presub_bias(struct radeon_compiler * c,struct rc_instruction * inst_mad)603 static int peephole_mad_presub_bias(
604 	struct radeon_compiler * c,
605 	struct rc_instruction * inst_mad)
606 {
607 	unsigned int i, swz;
608 
609 	if (!is_presub_candidate(c, inst_mad))
610 		return 0;
611 
612 	/* Check if src2 is 1. */
613 	for(i = 0; i < 4; i++ ) {
614 		if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
615 			continue;
616 
617 		swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
618 		if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
619 			return 0;
620 	}
621 
622 	/* Check if src1 is 2. */
623 	struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
624 	if ((src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs)
625 		return 0;
626 	if (src1_reg.File == RC_FILE_INLINE) {
627 		if (rc_inline_to_float(src1_reg.Index) != 2.0f)
628 			 return 0;
629 	} else {
630 		if (src1_reg.File != RC_FILE_CONSTANT)
631 			return 0;
632 
633 		struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
634 		if (constant->Type != RC_CONSTANT_IMMEDIATE)
635 			return 0;
636 	        for (i = 0; i < 4; i++) {
637 			if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
638 				continue;
639 			swz = GET_SWZ(src1_reg.Swizzle, i);
640 			if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
641 				return 0;
642 		}
643 	}
644 
645 	/* Check src0. */
646 	if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
647 						inst_mad->U.I.DstReg.WriteMask
648 		|| inst_mad->U.I.SrcReg[0].Abs
649 		|| src_has_const_swz(inst_mad->U.I.SrcReg[0])) {
650 
651 		return 0;
652 	}
653 
654 	if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
655 		rc_remove_instruction(inst_mad);
656 		return 1;
657 	}
658 	return 0;
659 }
660 
/**
 * State shared by omod_filter_reader_cb() and omod_filter_writer_cb().
 */
struct peephole_mul_cb_data {
	struct rc_dst_register *Writer; /* destination register being tracked */
	unsigned Clobbered;             /* set to 1 once that register is read or written */
};
665 
/**
 * Read callback: flags the tracked destination as clobbered when the read
 * register (file, index, mask) overlaps it according to
 * rc_src_reads_dst_mask().
 */
static void omod_filter_reader_cb(
	void * userdata,
	struct rc_instruction * inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct peephole_mul_cb_data *state = userdata;
	const struct rc_dst_register *tracked = state->Writer;

	if (!rc_src_reads_dst_mask(file, mask, index,
				   tracked->File, tracked->Index,
				   tracked->WriteMask))
		return;

	state->Clobbered = 1;
}
680 
/**
 * Write callback: flags the tracked destination as clobbered when the same
 * register is written on at least one of the tracked channels.
 */
static void omod_filter_writer_cb(
	void * userdata,
	struct rc_instruction * inst,
	rc_register_file file,
	unsigned int index,
	unsigned int mask)
{
	struct peephole_mul_cb_data *state = userdata;
	const struct rc_dst_register *tracked = state->Writer;

	if (file != tracked->File)
		return;
	if (index != tracked->Index)
		return;
	if (!(mask & tracked->WriteMask))
		return;

	state->Clobbered = 1;
}
694 
peephole_mul_omod(struct radeon_compiler * c,struct rc_instruction * inst_mul,struct rc_list * var_list)695 static int peephole_mul_omod(
696 	struct radeon_compiler * c,
697 	struct rc_instruction * inst_mul,
698 	struct rc_list * var_list)
699 {
700 	unsigned int chan = 0, swz, i;
701 	int const_index = -1;
702 	int temp_index = -1;
703 	float const_value;
704 	rc_omod_op omod_op = RC_OMOD_DISABLE;
705 	struct rc_list * writer_list;
706 	struct rc_variable * var;
707 	struct peephole_mul_cb_data cb_data;
708 	unsigned writemask_sum;
709 
710 	for (i = 0; i < 2; i++) {
711 		unsigned int j;
712 		if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT
713 			&& inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY
714 			&& inst_mul->U.I.SrcReg[i].File != RC_FILE_NONE) {
715 			return 0;
716 		}
717 
718 		/* The only relevant case with constant swizzles we should check for
719 		 * is multiply by one half.
720 		 */
721 		if (inst_mul->U.I.SrcReg[i].File == RC_FILE_NONE) {
722 			for (j = 0; j < 4; j++) {
723 				swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
724 				if (swz == RC_SWIZZLE_UNUSED) {
725 					continue;
726 				}
727 				if (swz != RC_SWIZZLE_HALF) {
728 					return 0;
729 				} else {
730 					omod_op = RC_OMOD_DIV_2;
731 				}
732 			}
733 		}
734 
735 		if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
736 			if (temp_index != -1) {
737 				/* The instruction has two temp sources */
738 				return 0;
739 			} else {
740 				temp_index = i;
741 				continue;
742 			}
743 		}
744 		/* If we get this far Src[i] must be a constant src */
745 		if (inst_mul->U.I.SrcReg[i].Negate) {
746 			return 0;
747 		}
748 		/* The constant src needs to read from the same swizzle */
749 		swz = RC_SWIZZLE_UNUSED;
750 		chan = 0;
751 		for (j = 0; j < 4; j++) {
752 			unsigned int j_swz =
753 				GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
754 			if (j_swz == RC_SWIZZLE_UNUSED) {
755 				continue;
756 			}
757 			if (swz == RC_SWIZZLE_UNUSED) {
758 				swz = j_swz;
759 				chan = j;
760 			} else if (j_swz != swz) {
761 				return 0;
762 			}
763 		}
764 
765 		if (const_index != -1) {
766 			/* The instruction has two constant sources */
767 			return 0;
768 		} else {
769 			const_index = i;
770 		}
771 	}
772 
773 	if (omod_op == RC_OMOD_DISABLE) {
774 		if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
775 					inst_mul->U.I.SrcReg[const_index].Index)) {
776 			return 0;
777 		}
778 		const_value = rc_get_constant_value(c,
779 				inst_mul->U.I.SrcReg[const_index].Index,
780 				inst_mul->U.I.SrcReg[const_index].Swizzle,
781 				inst_mul->U.I.SrcReg[const_index].Negate,
782 				chan);
783 
784 		if (const_value == 2.0f) {
785 			omod_op = RC_OMOD_MUL_2;
786 		} else if (const_value == 4.0f) {
787 			omod_op = RC_OMOD_MUL_4;
788 		} else if (const_value == 8.0f) {
789 			omod_op = RC_OMOD_MUL_8;
790 		} else if (const_value == (1.0f / 2.0f)) {
791 			omod_op = RC_OMOD_DIV_2;
792 		} else if (const_value == (1.0f / 4.0f)) {
793 			omod_op = RC_OMOD_DIV_4;
794 		} else if (const_value == (1.0f / 8.0f)) {
795 			omod_op = RC_OMOD_DIV_8;
796 		} else {
797 			return 0;
798 		}
799 	}
800 
801 	writer_list = rc_variable_list_get_writers_one_reader(var_list,
802 		RC_INSTRUCTION_NORMAL, &inst_mul->U.I.SrcReg[temp_index]);
803 
804 	if (!writer_list) {
805 		return 0;
806 	}
807 
808 	cb_data.Clobbered = 0;
809 	cb_data.Writer = &inst_mul->U.I.DstReg;
810 	for (var = writer_list->Item; var; var = var->Friend) {
811 		struct rc_instruction * inst;
812 		const struct rc_opcode_info * info = rc_get_opcode_info(
813 				var->Inst->U.I.Opcode);
814 		if (info->HasTexture) {
815 			return 0;
816 		}
817 		if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
818 			return 0;
819 		}
820 
821 		/* Empirical testing shows that DDX/DDY directly into output
822 		 * with non-identity omod is problematic.
823 		 */
824 		if ((info->Opcode == RC_OPCODE_DDX || info->Opcode == RC_OPCODE_DDY) &&
825 			inst_mul->U.I.DstReg.File == RC_FILE_OUTPUT) {
826 			return 0;
827 		}
828 
829 		for (inst = inst_mul->Prev; inst != var->Inst;
830 							inst = inst->Prev) {
831 			rc_for_all_reads_mask(inst, omod_filter_reader_cb,
832 								&cb_data);
833 			rc_for_all_writes_mask(inst, omod_filter_writer_cb,
834 								&cb_data);
835 			if (cb_data.Clobbered) {
836 				break;
837 			}
838 		}
839 	}
840 
841 	if (cb_data.Clobbered) {
842 		return 0;
843 	}
844 
845 	writemask_sum = rc_variable_writemask_sum(writer_list->Item);
846 
847 	/* rc_normal_rewrite_writemask can't expand a previous writemask to store
848 	 * more channels replicated.
849 	 */
850 	if (util_bitcount(writemask_sum) < util_bitcount(inst_mul->U.I.DstReg.WriteMask))
851 		return 0;
852 
853 	/* Rewrite the instructions */
854 	for (var = writer_list->Item; var; var = var->Friend) {
855 		struct rc_variable * writer = var;
856 		unsigned conversion_swizzle = RC_SWIZZLE_UUUU;
857 		for (chan = 0; chan < 4; chan++) {
858 			unsigned swz = GET_SWZ(inst_mul->U.I.SrcReg[temp_index].Swizzle, chan);
859 			if (swz <= RC_SWIZZLE_W)
860 				SET_SWZ(conversion_swizzle, swz, chan);
861 		}
862 		writer->Inst->U.I.Omod = omod_op;
863 		writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
864 		writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
865 		rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
866 		writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
867 	}
868 
869 	rc_remove_instruction(inst_mul);
870 
871 	return 1;
872 }
873 
874 /**
875  * @return
876  * 	0 if inst is still part of the program.
877  * 	1 if inst is no longer part of the program.
878  */
879 int
rc_opt_presubtract(struct radeon_compiler * c,struct rc_instruction * inst,void * data)880 rc_opt_presubtract(struct radeon_compiler *c, struct rc_instruction *inst, void *data)
881 {
882 	switch(inst->U.I.Opcode) {
883 	case RC_OPCODE_ADD:
884 	{
885 		if (peephole_add_presub_inv(c, inst))
886 			return 1;
887 		if (peephole_add_presub_add(c, inst))
888 			return 1;
889 		break;
890 	}
891 	case RC_OPCODE_MAD:
892 	{
893 		if (peephole_mad_presub_bias(c, inst))
894 			return 1;
895 		break;
896 	}
897 	default:
898 		break;
899 	}
900 	return 0;
901 }
902 
merge_swizzles(unsigned int swz1,unsigned int swz2)903 static unsigned int merge_swizzles(unsigned int swz1, unsigned int swz2)
904 {
905 	unsigned int new_swz = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);
906 	for (unsigned int chan = 0; chan < 4; chan++) {
907 		unsigned int swz = GET_SWZ(swz1, chan);
908 		if (swz != RC_SWIZZLE_UNUSED) {
909 			SET_SWZ(new_swz, chan, swz);
910 			continue;
911 		}
912 		swz = GET_SWZ(swz2, chan);
913 		SET_SWZ(new_swz, chan, swz);
914 	}
915 	return new_swz;
916 }
917 
918 /* Sets negate to 0 for unused channels. */
clean_negate(struct rc_src_register src)919 static unsigned int clean_negate(struct rc_src_register src)
920 {
921 	unsigned int new_negate = 0;
922 	for (unsigned int chan = 0; chan < 4; chan++) {
923 		unsigned int swz = GET_SWZ(src.Swizzle, chan);
924 		if (swz != RC_SWIZZLE_UNUSED)
925 			new_negate |= src.Negate & (1 << chan);
926 	}
927 	return new_negate;
928 }
929 
/* Union of the negate masks of both sources, counting only used channels. */
static unsigned int merge_negates(struct rc_src_register src1, struct rc_src_register src2)
{
	const unsigned int first = clean_negate(src1);
	const unsigned int second = clean_negate(src2);

	return first | second;
}
934 
fill_swizzle(unsigned int orig_swz,unsigned int wmask,unsigned int const_swz)935 static unsigned int fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz)
936 {
937 	for (unsigned int chan = 0; chan < 4; chan++) {
938 		unsigned int swz = GET_SWZ(orig_swz, chan);
939 		if (swz == RC_SWIZZLE_UNUSED && (wmask & (1 << chan))) {
940 			SET_SWZ(orig_swz, chan, const_swz);
941 		}
942 	}
943 	return orig_swz;
944 }
945 
have_shared_source(struct rc_instruction * inst1,struct rc_instruction * inst2)946 static int have_shared_source(struct rc_instruction * inst1, struct rc_instruction * inst2)
947 {
948 	int shared_src = -1;
949 	const struct rc_opcode_info * opcode1 = rc_get_opcode_info(inst1->U.I.Opcode);
950 	const struct rc_opcode_info * opcode2 = rc_get_opcode_info(inst2->U.I.Opcode);
951 	for (unsigned i = 0; i < opcode1->NumSrcRegs; i++) {
952 		for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) {
953 			if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File &&
954 				inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index &&
955 				inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr &&
956 				inst1->U.I.SrcReg[i].Abs == inst2->U.I.SrcReg[j].Abs)
957 				shared_src = i;
958 		}
959 	}
960 	return shared_src;
961 }
962 
963 /**
964  * Merges two MOVs writing different channels of the same destination register
965  * with the use of the constant swizzles.
966  */
merge_movs(struct radeon_compiler * c,struct rc_instruction * inst,struct rc_instruction * cur)967 static bool merge_movs(
968 	struct radeon_compiler * c,
969 	struct rc_instruction * inst,
970 	struct rc_instruction * cur)
971 {
972 	/* We can merge two MOVs into MOV if one of them is from inline constant,
973 	 * i.e., constant swizzles and RC_FILE_NONE).
974 	 *
975 	 * For example
976 	 *   MOV temp[0].x none.1___
977 	 *   MOV temp[0].y input[0]._x__
978 	 *
979 	 * becomes
980 	 *   MOV temp[0].xy input[0].1x__
981 	 */
982 	unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
983 	if (cur->U.I.SrcReg[0].File == RC_FILE_NONE ||
984 		inst->U.I.SrcReg[0].File == RC_FILE_NONE) {
985 		struct rc_src_register src;
986 		if (cur->U.I.SrcReg[0].File == RC_FILE_NONE)
987 			src = inst->U.I.SrcReg[0];
988 		else
989 			src = cur->U.I.SrcReg[0];
990 		src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle,
991 						inst->U.I.SrcReg[0].Swizzle);
992 		src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
993 		if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
994 			cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
995 			cur->U.I.SrcReg[0] = src;
996 			rc_remove_instruction(inst);
997 			return true;
998 		}
999 	}
1000 
1001 	/* Handle the trivial case where the MOVs share a source.
1002 	 *
1003 	 * For example
1004 	 *   MOV temp[0].x const[0].x
1005 	 *   MOV temp[0].y const[0].z
1006 	 *
1007 	 * becomes
1008 	 *   MOV temp[0].xy const[0].xz
1009 	 */
1010 	if (have_shared_source(inst, cur) == 0) {
1011 		struct rc_src_register src = cur->U.I.SrcReg[0];
1012 		src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
1013 		src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle,
1014 						inst->U.I.SrcReg[0].Swizzle);
1015 
1016                 if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
1017                         cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
1018                         cur->U.I.SrcReg[0] = src;
1019                         rc_remove_instruction(inst);
1020                         return true;
1021                 }
1022 	}
1023 
1024 	/* Otherwise, we can convert the MOVs into ADD.
1025 	 *
1026 	 * For example
1027 	 *   MOV temp[0].x const[0].x
1028 	 *   MOV temp[0].y input[0].y
1029 	 *
1030 	 * becomes
1031 	 *   ADD temp[0].xy const[0].x0 input[0].0y
1032 	 */
1033 	unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask;
1034 	struct rc_src_register src0 = inst->U.I.SrcReg[0];
1035 	struct rc_src_register src1 = cur->U.I.SrcReg[0];
1036 
1037 	src0.Swizzle = fill_swizzle(src0.Swizzle,
1038 				wmask, RC_SWIZZLE_ZERO);
1039 	src1.Swizzle = fill_swizzle(src1.Swizzle,
1040 				wmask, RC_SWIZZLE_ZERO);
1041 	if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) ||
1042 		!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1))
1043 		return false;
1044 
1045 	cur->U.I.DstReg.WriteMask = wmask;
1046 	cur->U.I.Opcode = RC_OPCODE_ADD;
1047 	cur->U.I.SrcReg[0] = src0;
1048 	cur->U.I.SrcReg[1] = src1;
1049 
1050 	/* finally delete the original mov */
1051 	rc_remove_instruction(inst);
1052 	return true;
1053 }
1054 
1055 /**
1056  * This function will try to merge MOV and ADD/MUL instructions with the same
1057  * destination, making use of the constant swizzles.
1058  *
1059  * For example:
1060  *   MOV temp[0].x const[0].x
1061  *   MUL temp[0].yz const[1].yz const[2].yz
1062  *
1063  * becomes
1064  *   MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00
1065  */
merge_mov_add_mul(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1066 static int merge_mov_add_mul(
1067 	struct radeon_compiler * c,
1068 	struct rc_instruction * inst1,
1069 	struct rc_instruction * inst2)
1070 {
1071 	struct rc_instruction * inst, * mov;
1072 	if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1073 		mov = inst1;
1074 		inst = inst2;
1075 	} else {
1076 		mov = inst2;
1077 		inst = inst1;
1078 	}
1079 
1080 	const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL;
1081 	int shared_index = have_shared_source(inst, mov);
1082 	unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask;
1083 
1084 	/* If there is a shared source, just merge the swizzles and be done with it. */
1085 	if (shared_index != -1) {
1086 		struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index];
1087 		struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index];
1088 
1089 		shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src);
1090 		shared_src.Swizzle = merge_swizzles(shared_src.Swizzle,
1091 					mov->U.I.SrcReg[0].Swizzle);
1092 		other_src.Negate = clean_negate(other_src);
1093 		unsigned int swz = is_mul ? RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO;
1094 		other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz);
1095 
1096 		if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) ||
1097 			!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src))
1098 			return 0;
1099 
1100 		inst2->U.I.Opcode = inst->U.I.Opcode;
1101 		inst2->U.I.SrcReg[0] = shared_src;
1102 		inst2->U.I.SrcReg[1] = other_src;
1103 
1104 	/* TODO: we can do a bit better in the special case when one of the sources is none.
1105 	 * Convert to MAD otherwise.
1106 	 */
1107 	} else {
1108 		struct rc_src_register src0, src1, src2;
1109 		if (is_mul) {
1110 			src2 = mov->U.I.SrcReg[0];
1111 			src0 = inst->U.I.SrcReg[0];
1112 			src1 = inst->U.I.SrcReg[1];
1113 		} else {
1114 			src0 = mov->U.I.SrcReg[0];
1115 			src1 = inst->U.I.SrcReg[0];
1116 			src2 = inst->U.I.SrcReg[1];
1117 		}
1118 		/* The following login expects that the unused channels have empty negate bits. */
1119 		src0.Negate = clean_negate(src0);
1120 		src1.Negate = clean_negate(src1);
1121 		src2.Negate = clean_negate(src2);
1122 
1123 		src0.Swizzle = fill_swizzle(src0.Swizzle,
1124 					wmask, RC_SWIZZLE_ONE);
1125 		src1.Swizzle = fill_swizzle(src1.Swizzle,
1126 					wmask, is_mul ? RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE);
1127 		src2.Swizzle = fill_swizzle(src2.Swizzle,
1128 					wmask, RC_SWIZZLE_ZERO);
1129 		if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) ||
1130 			!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) ||
1131 			!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2))
1132 			return 0;
1133 
1134 		inst2->U.I.Opcode = RC_OPCODE_MAD;
1135 		inst2->U.I.SrcReg[0] = src0;
1136 		inst2->U.I.SrcReg[1] = src1;
1137 		inst2->U.I.SrcReg[2] = src2;
1138 	}
1139 	inst2->U.I.DstReg.WriteMask = wmask;
1140 	/* finally delete the original instruction */
1141 	rc_remove_instruction(inst1);
1142 
1143 	return 1;
1144 }
1145 
1146 /**
1147  * This function will try to merge MOV and MAD instructions with the same
1148  * destination, making use of the constant swizzles. This only works
1149  * if there is a shared source or one of the sources is RC_FILE_NONE.
1150  *
1151  * For example:
1152  *   MOV temp[0].x const[0].x
1153  *   MAD temp[0].yz const[0].yz const[1].yz input[0].xw
1154  *
1155  * becomes
1156  *   MAD temp[0].xyz const[0].xyz const[2].1yz input[0].0xw
1157  */
merge_mov_mad(struct radeon_compiler * c,struct rc_instruction * inst1,struct rc_instruction * inst2)1158 static bool merge_mov_mad(
1159 	struct radeon_compiler * c,
1160 	struct rc_instruction * inst1,
1161 	struct rc_instruction * inst2)
1162 {
1163 	struct rc_instruction * mov, * mad;
1164 	if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
1165 		mov = inst1;
1166 		mad = inst2;
1167 	} else {
1168 		mov = inst2;
1169 		mad = inst1;
1170 	}
1171 
1172 	int shared_index = have_shared_source(mad, mov);
1173 	unsigned wmask = mov->U.I.DstReg.WriteMask | mad->U.I.DstReg.WriteMask;
1174 	struct rc_src_register src[3];
1175 	src[0] = mad->U.I.SrcReg[0];
1176 	src[1] = mad->U.I.SrcReg[1];
1177 	src[2] = mad->U.I.SrcReg[2];
1178 
1179 	/* Shared source is the one for multiplication. */
1180 	if (shared_index == 0 || shared_index == 1) {
1181 		src[shared_index].Negate = merge_negates(src[shared_index], mov->U.I.SrcReg[0]);
1182 		src[1 - shared_index].Negate = clean_negate(src[1 - shared_index]);
1183 		src[shared_index].Swizzle = merge_swizzles(src[shared_index].Swizzle,
1184 				mov->U.I.SrcReg[0].Swizzle);
1185 		src[1 - shared_index].Swizzle = fill_swizzle(
1186 				src[1 - shared_index].Swizzle, wmask, RC_SWIZZLE_ONE);
1187 		src[2].Swizzle =  fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1188 
1189 	/* Shared source is the one for used for addition, or it is none. Additionally,
1190 	 * if the mov SrcReg is none, we merge it with the addition (third) reg as well
1191 	 * because than we have the highest change the swizzles will be legal.
1192 	 */
1193 	} else if (shared_index == 2 || mov->U.I.SrcReg[0].File == RC_FILE_NONE ||
1194 			src[2].File == RC_FILE_NONE) {
1195 		src[2].Negate = merge_negates(src[2], mov->U.I.SrcReg[0]);
1196 		src[2].Swizzle = merge_swizzles(src[2].Swizzle, mov->U.I.SrcReg[0].Swizzle);
1197 		src[0].Swizzle = fill_swizzle(src[0].Swizzle, wmask, RC_SWIZZLE_ZERO);
1198 		src[1].Swizzle = fill_swizzle(src[1].Swizzle, wmask, RC_SWIZZLE_ZERO);
1199 		if (src[2].File == RC_FILE_NONE) {
1200 			src[2].File = mov->U.I.SrcReg[0].File;
1201 			src[2].Index = mov->U.I.SrcReg[0].Index;
1202 			src[2].RelAddr = mov->U.I.SrcReg[0].RelAddr;
1203 			src[2].Abs = mov->U.I.SrcReg[0].Abs;
1204 		}
1205 
1206 	/* First or the second MAD source is RC_FILE_NONE, we merge the mov into it,
1207 	 * fill the other one with ones and the reg for addition with zeros.
1208 	 */
1209 	} else if (src[0].File == RC_FILE_NONE || src[1].File == RC_FILE_NONE) {
1210 		unsigned none_src = src[0].File == RC_FILE_NONE ? 0 : 1;
1211 		src[none_src] = mov->U.I.SrcReg[0];
1212 		src[none_src].Negate = merge_negates(src[none_src], mad->U.I.SrcReg[none_src]);
1213 		src[none_src].Swizzle = merge_swizzles(src[none_src].Swizzle,
1214 				mad->U.I.SrcReg[none_src].Swizzle);
1215 		src[1 - none_src].Negate = clean_negate(src[1 - none_src]);
1216 		src[1 - none_src].Swizzle = fill_swizzle(src[1 - none_src].Swizzle,
1217 				wmask, RC_SWIZZLE_ONE);
1218 		src[2].Swizzle =  fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
1219 	} else {
1220 		return false;
1221 	}
1222 
1223 	if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[0]) ||
1224 		!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[1]) ||
1225 		!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[2]))
1226 		return false;
1227 
1228 	inst2->U.I.Opcode = RC_OPCODE_MAD;
1229 	inst2->U.I.SrcReg[0] = src[0];
1230 	inst2->U.I.SrcReg[1] = src[1];
1231 	inst2->U.I.SrcReg[2] = src[2];
1232 	inst2->U.I.DstReg.WriteMask = wmask;
1233 	rc_remove_instruction(inst1);
1234 	return true;
1235 }
1236 
/**
 * Returns true when the two instructions have the opcodes opcode1 and
 * opcode2, in either order.
 */
static bool inst_combination(
	struct rc_instruction * inst1,
	struct rc_instruction * inst2,
	rc_opcode opcode1,
	rc_opcode opcode2)
{
	if (inst1->U.I.Opcode == opcode1 && inst2->U.I.Opcode == opcode2)
		return true;

	return inst2->U.I.Opcode == opcode1 && inst1->U.I.Opcode == opcode2;
}
1246 
1247 /**
1248  * Searches for instructions writing different channels of the same register that could
1249  * be merged together with the use of constant swizzles.
1250  *
1251  * The potential candidates are combinations of MOVs, ADDs, MULs and MADs.
1252  */
merge_channels(struct radeon_compiler * c,struct rc_instruction * inst)1253 static void merge_channels(struct radeon_compiler * c, struct rc_instruction * inst)
1254 {
1255 	unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
1256 	unsigned int orig_dst_file = inst->U.I.DstReg.File;
1257 	unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
1258 	const struct rc_opcode_info * orig_opcode = rc_get_opcode_info(inst->U.I.Opcode);
1259 
1260 	struct rc_instruction * cur = inst;
1261 	while (cur!= &c->Program.Instructions) {
1262 		cur = cur->Next;
1263 		const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);
1264 
1265 		/* Keep it simple for now and stop when encountering any
1266 		 * control flow.
1267 		 */
1268 		if (opcode->IsFlowControl)
1269 			return;
1270 
1271 		/* Stop when the original destination is overwritten */
1272 		if (orig_dst_reg == cur->U.I.DstReg.Index &&
1273 			orig_dst_file == cur->U.I.DstReg.File &&
1274 			(orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0)
1275 			return;
1276 
1277 		/* Stop the search when the original instruction destination
1278 		 * is used as a source for anything.
1279 		 */
1280 		for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
1281 			if (cur->U.I.SrcReg[i].File == orig_dst_file &&
1282 				cur->U.I.SrcReg[i].Index == orig_dst_reg)
1283 				return;
1284 		}
1285 
1286 		/* Stop the search when some of the original sources are touched. */
1287 		for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) {
1288 			if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File &&
1289 				inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index)
1290 				return;
1291 		}
1292 
1293 		if (cur->U.I.DstReg.File == orig_dst_file &&
1294 			cur->U.I.DstReg.Index == orig_dst_reg &&
1295 			cur->U.I.SaturateMode == inst->U.I.SaturateMode &&
1296 			(cur->U.I.DstReg.WriteMask & orig_dst_wmask) == 0) {
1297 
1298 			if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) {
1299 				if (merge_movs(c, inst, cur))
1300 					return;
1301 			}
1302 
1303 			/* Skip the merge if one of the instructions writes just w channel
1304 			 * and we are compiling a fragment shader. We can pair-schedule it together
1305 			 * later anyway and it will also give the scheduler a bit more flexibility.
1306 			 * Only check this after merging MOVs as when we manage to merge two MOVs
1307 			 * into another MOV we can still copy propagate it away. So it is a win in
1308 			 * that case.
1309 			 */
1310 			if (c->has_omod && (cur->U.I.DstReg.WriteMask == RC_MASK_W ||
1311 				inst->U.I.DstReg.WriteMask == RC_MASK_W))
1312 				continue;
1313 
1314 			if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) ||
1315 				inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) {
1316 				if (merge_mov_add_mul(c, inst, cur))
1317 					return;
1318 			}
1319 
1320 			if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MAD)) {
1321 				if (merge_mov_mad(c, inst, cur))
1322 					return;
1323 			}
1324 		}
1325 	}
1326 }
1327 
1328 /**
1329  * Searches for duplicate ARLs/ARRs
1330  *
1331  * Only a very trivial case is now optimized where if a second one is detected which reads from
1332  * the same register as the first one and source is the same, just remove the second one.
1333  */
merge_A0_loads(struct radeon_compiler * c,struct rc_instruction * inst,bool is_ARL)1334 static void merge_A0_loads(
1335 	struct radeon_compiler * c,
1336 	struct rc_instruction * inst,
1337 	bool is_ARL)
1338 {
1339 	unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index;
1340 	unsigned int A0_src_file = inst->U.I.SrcReg[0].File;
1341 	unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle;
1342 	int cf_depth = 0;
1343 
1344 	struct rc_instruction * cur = inst;
1345 	while (cur != &c->Program.Instructions) {
1346 		cur = cur->Next;
1347 		const struct rc_opcode_info * opcode = rc_get_opcode_info(cur->U.I.Opcode);
1348 
1349 		/* Keep it simple for now and stop when encountering any
1350 		 * control flow besides simple ifs.
1351 		 */
1352 		if (opcode->IsFlowControl) {
1353 			switch (cur->U.I.Opcode) {
1354 			case RC_OPCODE_IF:
1355 			{
1356 				cf_depth++;
1357 				break;
1358 			}
1359 			case RC_OPCODE_ELSE:
1360 			{
1361 				if (cf_depth < 1)
1362 					return;
1363 				break;
1364 			}
1365 			case RC_OPCODE_ENDIF:
1366 			{
1367                                 cf_depth--;
1368                                 break;
1369 			}
1370 			default:
1371 				return;
1372 			}
1373 		}
1374 
1375 		/* Stop when the original source is overwritten */
1376 		if (A0_src_reg == cur->U.I.DstReg.Index &&
1377 			A0_src_file == cur->U.I.DstReg.File &&
1378 			cur->U.I.DstReg.WriteMask | rc_swizzle_to_writemask(A0_src_swizzle))
1379 			return;
1380 
1381 		/* Wrong A0 load type. */
1382 		if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) ||
1383 		    (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL))
1384 			return;
1385 
1386 		if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) {
1387 			if (A0_src_reg == cur->U.I.SrcReg[0].Index &&
1388 			    A0_src_file == cur->U.I.SrcReg[0].File &&
1389 			    A0_src_swizzle == cur->U.I.SrcReg[0].Swizzle) {
1390 				struct rc_instruction * next = cur->Next;
1391 				rc_remove_instruction(cur);
1392 				cur = next;
1393 			} else {
1394 				return;
1395 			}
1396 		}
1397 	}
1398 }
1399 
1400 /**
1401  * According to the GLSL spec, round is only 1.30 and up
1402  * so the only reason why we should ever see round is if it actually
1403  * is lowered ARR (from nine->ttn). In that case we want to reconstruct
1404  * the ARR instead of lowering the round.
1405  */
transform_vertex_ROUND(struct radeon_compiler * c,struct rc_instruction * inst)1406 static void transform_vertex_ROUND(struct radeon_compiler* c,
1407 	struct rc_instruction* inst)
1408 {
1409 	struct rc_reader_data readers;
1410 	rc_get_readers(c, inst, &readers, NULL, NULL, NULL);
1411 
1412 	assert(readers.ReaderCount > 0);
1413 	for (unsigned i = 0; i < readers.ReaderCount; i++) {
1414 		struct rc_instruction *reader = readers.Readers[i].Inst;
1415 		if (reader->U.I.Opcode != RC_OPCODE_ARL) {
1416 			assert(!"Unable to convert ROUND+ARL to ARR\n");
1417 			return;
1418 		}
1419 	}
1420 
1421 	/* Only ARL readers, convert all to ARR */
1422 	for (unsigned i = 0; i < readers.ReaderCount; i++) {
1423 		readers.Readers[i].Inst->U.I.Opcode = RC_OPCODE_ARR;
1424 	}
1425 	/* Switch ROUND to MOV and let copy propagate sort it out later. */
1426 	inst->U.I.Opcode = RC_OPCODE_MOV;
1427 }
1428 
1429 /**
1430  * Apply various optimizations specific to the A0 address register loads.
1431  */
optimize_A0_loads(struct radeon_compiler * c)1432 static void optimize_A0_loads(struct radeon_compiler * c) {
1433 	struct rc_instruction * inst = c->Program.Instructions.Next;
1434 
1435 	while (inst != &c->Program.Instructions) {
1436 		struct rc_instruction * cur = inst;
1437 		inst = inst->Next;
1438 		if (cur->U.I.Opcode == RC_OPCODE_ARL) {
1439 			merge_A0_loads(c, cur, true);
1440 		} else if (cur->U.I.Opcode == RC_OPCODE_ARR) {
1441 			merge_A0_loads(c, cur, false);
1442 		} else if (cur->U.I.Opcode == RC_OPCODE_ROUND) {
1443 			transform_vertex_ROUND(c, cur);
1444 		}
1445 	}
1446 }
1447 
rc_optimize(struct radeon_compiler * c,void * user)1448 void rc_optimize(struct radeon_compiler * c, void *user)
1449 {
1450 	struct rc_instruction * inst = c->Program.Instructions.Next;
1451 	while(inst != &c->Program.Instructions) {
1452 		struct rc_instruction * cur = inst;
1453 		inst = inst->Next;
1454 		constant_folding(c, cur);
1455 	}
1456 
1457 	/* Copy propagate simple movs away. */
1458 	inst = c->Program.Instructions.Next;
1459 	while(inst != &c->Program.Instructions) {
1460 		struct rc_instruction * cur = inst;
1461 		inst = inst->Next;
1462 		if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1463 			copy_propagate(c, cur);
1464 		}
1465 	}
1466 
1467 	if (c->type == RC_VERTEX_PROGRAM) {
1468 		optimize_A0_loads(c);
1469 	}
1470 
1471 	/* Merge MOVs to same source in different channels using the constant
1472 	 * swizzle.
1473 	 */
1474 	if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) {
1475 		inst = c->Program.Instructions.Next;
1476 		while(inst != &c->Program.Instructions) {
1477 			struct rc_instruction * cur = inst;
1478 			inst = inst->Next;
1479 			if (cur->U.I.Opcode == RC_OPCODE_MOV ||
1480 				cur->U.I.Opcode == RC_OPCODE_ADD ||
1481 				cur->U.I.Opcode == RC_OPCODE_MAD ||
1482 				cur->U.I.Opcode == RC_OPCODE_MUL)
1483 				merge_channels(c, cur);
1484 		}
1485 	}
1486 
1487 	/* Copy propagate few extra movs from the merge_channels pass. */
1488 	inst = c->Program.Instructions.Next;
1489 	while(inst != &c->Program.Instructions) {
1490 		struct rc_instruction * cur = inst;
1491 		inst = inst->Next;
1492 		if (cur->U.I.Opcode == RC_OPCODE_MOV) {
1493 			copy_propagate(c, cur);
1494 		}
1495 	}
1496 
1497 	if (c->type != RC_FRAGMENT_PROGRAM) {
1498 		return;
1499 	}
1500 
1501 	/* Output modifiers. */
1502 	inst = c->Program.Instructions.Next;
1503 	struct rc_list * var_list = NULL;
1504 	while(inst != &c->Program.Instructions) {
1505 		struct rc_instruction * cur = inst;
1506 		inst = inst->Next;
1507 		if (cur->U.I.Opcode == RC_OPCODE_MUL) {
1508 			if (!var_list)
1509 				var_list = rc_get_variables(c);
1510 			if (peephole_mul_omod(c, cur, var_list))
1511 				var_list = NULL;
1512 		}
1513 	}
1514 }
1515