/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nv50_ir.h"
#include "nv50_ir_target.h"
#include "nv50_ir_build_util.h"

extern "C" {
#include "util/u_math.h"
}

namespace nv50_ir {

bool
Instruction::isNop() const
{
   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE)
      return true;
   if (terminator || join) // XXX: should terminator imply flow ?
      return false;
   if (op == OP_ATOM)
      return false;
   if (!fixed && op == OP_NOP)
      return true;

   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
      for (int d = 1; defExists(d); ++d)
         if (def(d).rep()->reg.data.id >= 0)
            WARN("part of vector result is unused !\n");
      return true;
   }

   if (op == OP_MOV || op == OP_UNION) {
      if (!getDef(0)->equals(getSrc(0)))
         return false;
      if (op == OP_UNION)
         if (!getDef(0)->equals(getSrc(1)))
            return false;
      return true;
   }

   return false;
}

bool Instruction::isDead() const
{
   if (op == OP_STORE ||
       op == OP_EXPORT ||
       op == OP_ATOM ||
       op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB)
      return false;

   for (int d = 0; defExists(d); ++d)
      if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
         return false;

   if (terminator || asFlow())
      return false;
   if (fixed)
      return false;

   return true;
};

// =============================================================================

class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// Propagate all MOVs forward to make subsequent optimization easier, except if
// the sources stem from a phi, in which case we don't want to mess up potential
// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
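// Illustrative example of the hazard:
//   $r1 = phi ...
//   $r2 = mov $r1
// Forwarding $r1 into $r2's uses would extend $r1's live range across the
// phi copy and block a later coalesce of $r1 and $r2.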
bool
CopyPropagation::visit(BasicBlock *bb)
{
   Instruction *mov, *si, *next;

   for (mov = bb->getEntry(); mov; mov = next) {
      next = mov->next;
      if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
         continue;
      if (mov->getPredicate())
         continue;
      if (mov->def(0).getFile() != mov->src(0).getFile())
         continue;
      si = mov->getSrc(0)->getInsn();
      if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
         // propagate
         mov->def(0).replace(mov->getSrc(0), false);
         delete_Instruction(prog, mov);
      }
   }
   return true;
}

// =============================================================================

class MergeSplits : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// For SPLIT / MERGE pairs that operate on the same registers, replace the
// post-merge def with the SPLIT's source.
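// Illustrative example:
//   (lo, hi) = split x        // x is 64-bit
//   y        = merge lo, hi
// -> every use of y can read x directly.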
bool
MergeSplits::visit(BasicBlock *bb)
{
   Instruction *i, *next, *si;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
         continue;
      si = i->getSrc(0)->getInsn();
      if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
         continue;
      if (i->getSrc(0) != si->getDef(0) || i->getSrc(1) != si->getDef(1))
         continue;
      i->def(0).replace(si->getSrc(0), false);
      delete_Instruction(prog, i);
   }

   return true;
}

// =============================================================================

class LoadPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void checkSwapSrc01(Instruction *);

   bool isCSpaceLoad(Instruction *);
   bool isImmdLoad(Instruction *);
   bool isAttribOrSharedLoad(Instruction *);
};

bool
LoadPropagation::isCSpaceLoad(Instruction *ld)
{
   return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
}

bool
LoadPropagation::isImmdLoad(Instruction *ld)
{
   if (!ld || (ld->op != OP_MOV) ||
       ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
      return false;

   // A 0 can be replaced with a register, so it doesn't count as an immediate.
   ImmediateValue val;
   return ld->src(0).getImmediate(val) && !val.isInteger(0);
}

bool
LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
{
   return ld &&
      (ld->op == OP_VFETCH ||
       (ld->op == OP_LOAD &&
        (ld->src(0).getFile() == FILE_SHADER_INPUT ||
         ld->src(0).getFile() == FILE_MEMORY_SHARED)));
}

void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
   const Target *targ = prog->getTarget();
   if (!targ->getOpInfo(insn).commutative) {
      if (insn->op != OP_SET && insn->op != OP_SLCT &&
          insn->op != OP_SUB && insn->op != OP_XMAD)
         return;
      // XMAD is only commutative if both the CBCC and MRG flags are not set.
      if (insn->op == OP_XMAD &&
          (insn->subOp & NV50_IR_SUBOP_XMAD_CMODE_MASK) == NV50_IR_SUBOP_XMAD_CBCC)
         return;
      if (insn->op == OP_XMAD && (insn->subOp & NV50_IR_SUBOP_XMAD_MRG))
         return;
   }
   if (insn->src(1).getFile() != FILE_GPR)
      return;
   // This is the special OP_SET used for alphatesting, we can't reverse its
   // arguments as that will confuse the fixup code.
   if (insn->op == OP_SET && insn->subOp)
      return;

   Instruction *i0 = insn->getSrc(0)->getInsn();
   Instruction *i1 = insn->getSrc(1)->getInsn();

   // Swap sources to inline the less frequently used source. That way,
   // optimistically, it will eventually be able to remove the instruction.
   int i0refs = insn->getSrc(0)->refCount();
   int i1refs = insn->getSrc(1)->refCount();

   if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
      if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
          !targ->insnCanLoad(insn, 1, i1) ||
          i0refs < i1refs)
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isAttribOrSharedLoad(i1)) {
      if (!isAttribOrSharedLoad(i0))
         insn->swapSources(0, 1);
      else
         return;
   } else {
      return;
   }

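   // The sources were swapped above; patch up the order-dependent semantics.
   // Sketch of the reasoning: SET wants the mirrored condition (a < b equals
   // b > a), SLCT selects the other operand so its condition is logically
   // inverted, SUB negates both operands, and XMAD swaps its per-source
   // half-word selection flags.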
   if (insn->op == OP_SET || insn->op == OP_SET_AND ||
       insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SLCT)
      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SUB) {
      insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
   } else
   if (insn->op == OP_XMAD) {
      // swap h1 flags
      uint16_t h1 = (insn->subOp >> 1 & NV50_IR_SUBOP_XMAD_H1(0)) |
                    (insn->subOp << 1 & NV50_IR_SUBOP_XMAD_H1(1));
      insn->subOp = (insn->subOp & ~NV50_IR_SUBOP_XMAD_H1_MASK) | h1;
   }
}

bool
LoadPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->op == OP_CALL) // calls have args as sources, they must be in regs
         continue;

      if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
         continue;

      if (i->srcExists(1))
         checkSwapSrc01(i);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *ld = i->getSrc(s)->getInsn();

         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
            continue;
         if (ld->op == OP_LOAD && ld->subOp == NV50_IR_SUBOP_LOAD_LOCKED)
            continue;
         if (!targ->insnCanLoad(i, s, ld))
            continue;

         // propagate !
         i->setSrc(s, ld->getSrc(0));
         if (ld->src(0).isIndirect(0))
            i->setIndirect(s, 0, ld->getIndirect(0, 0));

         if (ld->getDef(0)->refCount() == 0)
            delete_Instruction(prog, ld);
      }
   }
   return true;
}

// =============================================================================

class IndirectPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   BuildUtil bld;
};

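// Fold simple address arithmetic into the indirect access itself where the
// target can encode it, e.g. (illustrative) an "$a1 = add $a0, 0x10" feeding
// "ld $r0 c0[$a1]" becomes "ld $r0 c0[$a0+0x10]" and the ADD can be DCE'd.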
bool
IndirectPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      bld.setPosition(i, false);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *insn;
         ImmediateValue imm;
         if (!i->src(s).isIndirect(0))
            continue;
         insn = i->getIndirect(s, 0)->getInsn();
         if (!insn)
            continue;
         if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
            if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
                !insn->src(1).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, insn->getSrc(0));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
         } else if (insn->op == OP_MOV) {
            if (!insn->src(0).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, NULL);
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         } else if (insn->op == OP_SHLADD) {
            if (!insn->src(2).getImmediate(imm) ||
                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
               continue;
            i->setIndirect(s, 0, bld.mkOp2v(
               OP_SHL, TYPE_U32, bld.getSSA(), insn->getSrc(0), insn->getSrc(1)));
            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         }
      }
   }
   return true;
}

// =============================================================================

// Evaluate constant expressions.
class ConstantFolding : public Pass
{
public:
   ConstantFolding() : foldCount(0) {}
   bool foldAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
   void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
   /* true if i was deleted */
   bool opnd(Instruction *i, ImmediateValue&, int s);
   void opnd3(Instruction *, ImmediateValue&);

   void unary(Instruction *, const ImmediateValue&);

   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);

   CmpInstruction *findOriginForTestWithZero(Value *);

   bool createMul(DataType ty, Value *def, Value *a, int64_t b, Value *c);

   unsigned int foldCount;

   BuildUtil bld;
};

// TODO: remember generated immediates and only revisit these
bool
ConstantFolding::foldAll(Program *prog)
{
   unsigned int iterCount = 0;
   do {
      foldCount = 0;
      if (!run(prog))
         return false;
   } while (foldCount && ++iterCount < 2);
   return true;
}

bool
ConstantFolding::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;
      if (i->op == OP_MOV || i->op == OP_CALL)
         continue;

      ImmediateValue src0, src1, src2;

      if (i->srcExists(2) &&
          i->src(0).getImmediate(src0) &&
          i->src(1).getImmediate(src1) &&
          i->src(2).getImmediate(src2)) {
         expr(i, src0, src1, src2);
      } else
      if (i->srcExists(1) &&
          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1)) {
         expr(i, src0, src1);
      } else
      if (i->srcExists(0) && i->src(0).getImmediate(src0)) {
         if (opnd(i, src0, 0))
            continue;
      } else
      if (i->srcExists(1) && i->src(1).getImmediate(src1)) {
         if (opnd(i, src1, 1))
            continue;
      }
      if (i->srcExists(2) && i->src(2).getImmediate(src2))
         opnd3(i, src2);
   }
   return true;
}

CmpInstruction *
ConstantFolding::findOriginForTestWithZero(Value *value)
{
   if (!value)
      return NULL;
   Instruction *insn = value->getInsn();
   if (!insn)
      return NULL;

   if (insn->asCmp() && insn->op != OP_SLCT)
      return insn->asCmp();

   /* Sometimes mov's will sneak in as a result of other folding. This gets
    * cleaned up later.
    */
   if (insn->op == OP_MOV)
      return findOriginForTestWithZero(insn->getSrc(0));

   /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
   if (insn->op == OP_AND) {
      int s = 0;
      ImmediateValue imm;
      if (!insn->src(s).getImmediate(imm)) {
         s = 1;
         if (!insn->src(s).getImmediate(imm))
            return NULL;
      }
      if (imm.reg.data.f32 != 1.0f)
         return NULL;
      /* TODO: Come up with a way to handle the condition being inverted */
      if (insn->src(!s).mod != Modifier(0))
         return NULL;
      return findOriginForTestWithZero(insn->getSrc(!s));
   }

   return NULL;
}

void
Modifier::applyTo(ImmediateValue& imm) const
{
   if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
      return;
   switch (imm.reg.type) {
   case TYPE_F32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f32 = -imm.reg.data.f32;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f32 < 0.0f)
            imm.reg.data.f32 = 0.0f;
         else
         if (imm.reg.data.f32 > 1.0f)
            imm.reg.data.f32 = 1.0f;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   case TYPE_S8: // NOTE: will be extended
   case TYPE_S16:
   case TYPE_S32:
   case TYPE_U8: // NOTE: treated as signed
   case TYPE_U16:
   case TYPE_U32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
            imm.reg.data.s32 : -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.s32 = -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NOT)
         imm.reg.data.s32 = ~imm.reg.data.s32;
      break;

   case TYPE_F64:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f64 = fabs(imm.reg.data.f64);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f64 = -imm.reg.data.f64;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f64 < 0.0)
            imm.reg.data.f64 = 0.0;
         else
         if (imm.reg.data.f64 > 1.0)
            imm.reg.data.f64 = 1.0;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   default:
      assert(!"invalid/unhandled type");
      imm.reg.data.u64 = 0;
      break;
   }
}

operation
Modifier::getOp() const
{
   switch (bits) {
   case NV50_IR_MOD_ABS: return OP_ABS;
   case NV50_IR_MOD_NEG: return OP_NEG;
   case NV50_IR_MOD_SAT: return OP_SAT;
   case NV50_IR_MOD_NOT: return OP_NOT;
   case 0:
      return OP_MOV;
   default:
      return OP_CVT;
   }
}

void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0, ImmediateValue &imm1)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
   struct Storage res;
   DataType type = i->dType;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_SGXT: {
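      // Sign-extend the low `bits` bits of a. Worked example: a = 0xff,
      // bits = 8 -> bit 7 is set, so 0x100 is subtracted and the result
      // is 0xffffffff (-1). With bits == 0 the result stays 0 (memset).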
      int bits = b->data.u32;
      if (bits) {
         uint32_t data = a->data.u32 & (0xffffffff >> (32 - bits));
         if (bits < 32 && (data & (1 << (bits - 1))))
            data = data - (1 << bits);
         res.data.u32 = data;
      }
      break;
   }
   case OP_BMSK:
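      // A mask of b ones starting at bit a, e.g. a = 4, b = 3 gives
      // ((1 << 3) - 1) << 4 = 0x70.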
      res.data.u32 = ((1 << b->data.u32) - 1) << a->data.u32;
      break;
   case OP_MAD:
   case OP_FMA:
   case OP_MUL:
      if (i->dnz && i->dType == TYPE_F32) {
         if (!isfinite(a->data.f32))
            a->data.f32 = 0.0f;
         if (!isfinite(b->data.f32))
            b->data.f32 = 0.0f;
      }
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
         break;
      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
            break;
         }
         FALLTHROUGH;
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_DIV:
      if (b->data.u32 == 0)
         break;
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_ADD:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SUB:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_MAX:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_MIN:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_AND:
      res.data.u64 = a->data.u64 & b->data.u64;
      break;
   case OP_OR:
      res.data.u64 = a->data.u64 | b->data.u64;
      break;
   case OP_XOR:
      res.data.u64 = a->data.u64 ^ b->data.u64;
      break;
   case OP_SHL:
      res.data.u32 = a->data.u32 << b->data.u32;
      break;
   case OP_SHR:
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SLCT:
      if (a->data.u32 != b->data.u32)
         return;
      res.data.u32 = a->data.u32;
      break;
   case OP_EXTBF: {
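      // src1 packs the field as offset in bits [7:0] and width in bits
      // [15:8]; e.g. 0x0408 extracts the 4-bit field at bits [11:8] of a.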
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      int rshift = offset;
      int lshift = 0;
      if (width == 0) {
         res.data.u32 = 0;
         break;
      }
      if (width + offset < 32) {
         rshift = 32 - width;
         lshift = 32 - width - offset;
      }
      if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
         res.data.u32 = util_bitreverse(a->data.u32);
      else
         res.data.u32 = a->data.u32;
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
      case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
      default:
         return;
      }
      break;
   }
   case OP_POPCNT:
      res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
      break;
   case OP_PFETCH:
      // The two arguments to pfetch are logically added together. Normally
      // the second argument will not be constant, but that can happen.
      res.data.u32 = a->data.u32 + b->data.u32;
      type = TYPE_U32;
      break;
   case OP_MERGE:
      switch (i->dType) {
      case TYPE_U64:
      case TYPE_S64:
      case TYPE_F64:
         res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
         break;
      default:
         return;
      }
      break;
   default:
      return;
   }
   ++foldCount;

   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->postFactor = 0;

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = type;
   i->getSrc(0)->reg.size = typeSizeof(type);

   switch (i->op) {
   case OP_MAD:
   case OP_FMA: {
      ImmediateValue src0, src1;
      src1 = *i->getSrc(0)->asImm();

      // Move the immediate into position 1, where we know it might be
      // emittable. However it might not be anyways, as there may be other
      // restrictions, so move it into a separate LValue.
      bld.setPosition(i, false);
      i->op = OP_ADD;
      i->dnz = 0;
      i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
      i->setSrc(0, i->getSrc(2));
      i->src(0).mod = i->src(2).mod;
      i->setSrc(2, NULL);

      if (i->src(0).getImmediate(src0))
         expr(i, src0, src1);
      else
         opnd(i, src1, 1);
      break;
   }
   case OP_PFETCH:
      // Leave PFETCH alone... we just folded its 2 args into 1.
      break;
   default:
      i->op = i->saturate ? OP_SAT : OP_MOV;
      if (i->saturate)
         unary(i, *i->getSrc(0)->asImm());
      break;
   }
   i->subOp = 0;
}

void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0,
                      ImmediateValue &imm1,
                      ImmediateValue &imm2)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
   struct Storage res;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_LOP3_LUT:
      for (int n = 0; n < 32; n++) {
         uint8_t lut = ((a->data.u32 >> n) & 1) << 2 |
                       ((b->data.u32 >> n) & 1) << 1 |
                       ((c->data.u32 >> n) & 1);
         res.data.u32 |= !!(i->subOp & (1 << lut)) << n;
      }
      break;
   case OP_PERMT:
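      // Byte permute: each nibble of b picks one byte of the 8-byte pair
      // {c,a} for the corresponding result byte, so e.g. b = 0x3210
      // reproduces a unchanged.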
      if (!i->subOp) {
         uint64_t input = (uint64_t)c->data.u32 << 32 | a->data.u32;
         uint16_t permt = b->data.u32;
         for (int n = 0 ; n < 4; n++, permt >>= 4)
            res.data.u32 |= ((input >> ((permt & 0xf) * 8)) & 0xff) << n * 8;
      } else
         return;
      break;
   case OP_INSBF: {
      int offset = b->data.u32 & 0xff;
      int width = (b->data.u32 >> 8) & 0xff;
      unsigned bitmask = ((1 << width) - 1) << offset;
      res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
      break;
   }
   case OP_MAD:
   case OP_FMA: {
      switch (i->dType) {
      case TYPE_F32:
         res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
            c->data.f32;
         break;
      case TYPE_F64:
         res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
         break;
      case TYPE_S32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
            break;
         }
         FALLTHROUGH;
      case TYPE_U32:
         if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
            res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
            break;
         }
         res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
         break;
      default:
         return;
      }
      break;
   }
   case OP_SHLADD:
      res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
      break;
   default:
      return;
   }

   ++foldCount;
   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);
   i->src(2).mod = Modifier(0);

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);
   i->setSrc(2, NULL);

   i->getSrc(0)->reg.data = res.data;
   i->getSrc(0)->reg.type = i->dType;
   i->getSrc(0)->reg.size = typeSizeof(i->dType);

   i->op = OP_MOV;
}

void
ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
{
   Storage res;

   if (i->dType != TYPE_F32)
      return;
   switch (i->op) {
   case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
   case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
   case OP_SAT: res.data.f32 = SATURATE(imm.reg.data.f32); break;
   case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
   case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
   case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
   case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
   case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
   case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
   case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
   case OP_PRESIN:
   case OP_PREEX2:
      // these should be handled in subsequent OP_SIN/COS/EX2
      res.data.f32 = imm.reg.data.f32;
      break;
   default:
      return;
   }
   i->op = OP_MOV;
   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
   i->src(0).mod = Modifier(0);
}

void
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1;
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0;
   float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   if (mul2->getSrc(t)->refCount() == 1) {
      insn = mul2->getSrc(t)->getInsn();
      if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
            mul1->saturate = mul2->saturate;
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
            mul1->saturate = mul2->saturate;
         }
         return;
      }
   }
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c -> d = mul_x_imm a, c
      int s2, t2;
      insn = (*mul2->getDef(0)->uses.begin())->getInsn();
      if (!insn)
         return;
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t));
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}

void
ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
{
   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
      if (imm2.isInteger(0)) {
         i->op = OP_MUL;
         i->setSrc(2, NULL);
         foldCount++;
         return;
      }
      break;
   case OP_SHLADD:
      if (imm2.isInteger(0)) {
         i->op = OP_SHL;
         i->setSrc(2, NULL);
         foldCount++;
         return;
      }
      break;
   default:
      return;
   }
}

bool
ConstantFolding::createMul(DataType ty, Value *def, Value *a, int64_t b, Value *c)
{
   const Target *target = prog->getTarget();
   int64_t absB = llabs(b);

   //a * (2^shl) -> a << shl
   if (b >= 0 && util_is_power_of_two_or_zero64(b)) {
      int shl = util_logbase2_64(b);

      Value *res = c ? bld.getSSA(typeSizeof(ty)) : def;
      bld.mkOp2(OP_SHL, ty, res, a, bld.mkImm(shl));
      if (c)
         bld.mkOp2(OP_ADD, ty, def, res, c);

      return true;
   }

   //a * (2^shl + 1) -> a << shl + a
   //a * -(2^shl + 1) -> -a << shl + a
   //a * (2^shl - 1) -> a << shl - a
   //a * -(2^shl - 1) -> -a << shl - a
   if (typeSizeof(ty) == 4 &&
       (util_is_power_of_two_or_zero64(absB - 1) ||
        util_is_power_of_two_or_zero64(absB + 1)) &&
       target->isOpSupported(OP_SHLADD, TYPE_U32)) {
      bool subA = util_is_power_of_two_or_zero64(absB + 1);
      int shl = subA ? util_logbase2_64(absB + 1) : util_logbase2_64(absB - 1);

      Value *res = c ? bld.getSSA() : def;
      Instruction *insn = bld.mkOp3(OP_SHLADD, TYPE_U32, res, a, bld.mkImm(shl), a);
      if (b < 0)
         insn->src(0).mod = Modifier(NV50_IR_MOD_NEG);
      if (subA)
         insn->src(2).mod = Modifier(NV50_IR_MOD_NEG);

      if (c)
         bld.mkOp2(OP_ADD, TYPE_U32, def, res, c);

      return true;
   }

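   // Fallback for small positive b, assuming the usual XMAD 16x16+32
   // semantics: tmp = lo16(a) * b + c, then the second XMAD adds
   // (hi16(a) * b) << 16 via the PSL flag, i.e. a * b + c overall.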
   if (typeSizeof(ty) == 4 && b >= 0 && b <= 0xffff &&
       target->isOpSupported(OP_XMAD, TYPE_U32)) {
      Value *tmp = bld.mkOp3v(OP_XMAD, TYPE_U32, bld.getSSA(),
                              a, bld.mkImm((uint32_t)b), c ? c : bld.mkImm(0));
      bld.mkOp3(OP_XMAD, TYPE_U32, def, a, bld.mkImm((uint32_t)b), tmp)->subOp =
         NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_H1(0);

      return true;
   }

   return false;
}

bool
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
   const int t = !s;
   const operation op = i->op;
   Instruction *newi = i;
   bool deleted = false;

   switch (i->op) {
   case OP_SPLIT: {
      bld.setPosition(i, false);

      uint8_t size = i->getDef(0)->reg.size;
      uint8_t bitsize = size * 8;
      uint32_t mask = (1ULL << bitsize) - 1;
      assert(bitsize <= 32);

      uint64_t val = imm0.reg.data.u64;
      for (int8_t d = 0; i->defExists(d); ++d) {
         Value *def = i->getDef(d);
         assert(def->reg.size == size);

         newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)),
                          typeOfSize(size));
         val >>= bitsize;
      }
      delete_Instruction(prog, i);
      deleted = true;
      break;
   }
   case OP_MUL:
      if (i->dType == TYPE_F32 && !i->precise)
         tryCollapseChainedMULs(i, s, imm0);

      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
         assert(!isFloatType(i->sType));
         if (imm0.isInteger(1) && i->dType == TYPE_S32) {
            bld.setPosition(i, false);
            // Need to set to the sign value, which is a compare.
            newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
                             TYPE_S32, i->getSrc(t), bld.mkImm(0));
            delete_Instruction(prog, i);
            deleted = true;
         } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
            // The high bits can't be set in this case (either mul by 0 or
            // unsigned by 1)
            i->op = OP_MOV;
            i->subOp = 0;
            i->setSrc(0, new_ImmediateValue(prog, 0u));
            i->src(0).mod = Modifier(0);
            i->setSrc(1, NULL);
         } else if (!imm0.isNegative() && imm0.isPow2()) {
            // Translate into a shift
            imm0.applyLog2();
            i->op = OP_SHR;
            i->subOp = 0;
            imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
            i->setSrc(0, i->getSrc(t));
            i->src(0).mod = i->src(t).mod;
            i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
            i->src(1).mod = 0;
         }
      } else
      if (imm0.isInteger(0)) {
         i->dnz = 0;
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->postFactor = 0;
         i->setSrc(1, NULL);
      } else
      if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->dnz = 0;
         i->op = i->src(t).mod.getOp();
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            i->src(1).mod = 0;
         }
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
         i->setSrc(1, NULL);
      } else
      if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = OP_ADD;
         i->dnz = 0;
         i->setSrc(s, i->getSrc(t));
         i->src(s).mod = i->src(t).mod;
      } else
      if (!isFloatType(i->dType) && !i->src(t).mod) {
         bld.setPosition(i, false);
         int64_t b = typeSizeof(i->dType) == 8 ? imm0.reg.data.s64 : imm0.reg.data.s32;
         if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, NULL)) {
            delete_Instruction(prog, i);
            deleted = true;
         }
      } else
      if (i->postFactor && i->sType == TYPE_F32) {
         /* Can't emit a postfactor with an immediate, have to fold it in */
         i->setSrc(s, new_ImmediateValue(
            prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
         i->postFactor = 0;
      }
      break;
   case OP_FMA:
   case OP_MAD:
      if (imm0.isInteger(0)) {
         i->setSrc(0, i->getSrc(2));
         i->src(0).mod = i->src(2).mod;
         i->setSrc(1, NULL);
         i->setSrc(2, NULL);
         i->dnz = 0;
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
      } else
      if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
          (imm0.isInteger(1) || imm0.isInteger(-1))) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
         }
         i->setSrc(1, i->getSrc(2));
         i->src(1).mod = i->src(2).mod;
         i->setSrc(2, NULL);
         i->dnz = 0;
         i->op = OP_ADD;
      } else
      if (!isFloatType(i->dType) && !i->subOp && !i->src(t).mod && !i->src(2).mod) {
         bld.setPosition(i, false);
         int64_t b = typeSizeof(i->dType) == 8 ? imm0.reg.data.s64 : imm0.reg.data.s32;
         if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, i->getSrc(2))) {
            delete_Instruction(prog, i);
            deleted = true;
         }
      }
      break;
   case OP_SUB:
      if (imm0.isInteger(0) && s == 0 && typeSizeof(i->dType) == 8 &&
          !isFloatType(i->dType))
         break;
      FALLTHROUGH;
   case OP_ADD:
      if (i->usesFlags())
         break;
      if (imm0.isInteger(0)) {
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            if (i->op == OP_SUB)
               i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
         }
         i->setSrc(1, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = Modifier(0);
      }
      break;

   case OP_DIV:
      if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
         break;
      bld.setPosition(i, false);
      if (imm0.reg.data.u32 == 0) {
         break;
      } else
      if (imm0.reg.data.u32 == 1) {
         i->op = OP_MOV;
         i->setSrc(1, NULL);
      } else
      if (i->dType == TYPE_U32 && imm0.isPow2()) {
         i->op = OP_SHR;
         i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
      } else
      if (i->dType == TYPE_U32) {
         Instruction *mul;
         Value *tA, *tB;
         const uint32_t d = imm0.reg.data.u32;
         uint32_t m;
         int r, s;
         uint32_t l = util_logbase2(d);
         if (((uint32_t)1 << l) < d)
            ++l;
         m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
         r = l ? 1 : 0;
         s = l ? (l - 1) : 0;
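         // Division by invariant multiplication: the multiplier m encodes
         // roughly ceil(2^(32+l) / d) - 2^32, and the SUB/SHR/ADD sequence
         // below folds the implicit 2^32 term back in without overflowing
         // 32 bits (Granlund-Montgomery style "magic number" division;
         // illustratively, d = 7 -> l = 3, m = (2^32 * 1) / 7 + 1).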

         tA = bld.getSSA();
         tB = bld.getSSA();
         mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
                         bld.loadImm(NULL, m));
         mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
         bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
         tA = bld.getSSA();
         if (r)
            bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
         else
            tA = tB;
         tB = s ? bld.getSSA() : i->getDef(0);
         newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
         if (s)
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));

         delete_Instruction(prog, i);
         deleted = true;
      } else
      if (imm0.reg.data.s32 == -1) {
         i->op = OP_NEG;
         i->setSrc(1, NULL);
      } else {
         LValue *tA, *tB;
         LValue *tD;
         const int32_t d = imm0.reg.data.s32;
         int32_t m;
         int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
         if ((1 << l) < abs(d))
            ++l;
         if (!l)
            l = 1;
         m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);

         tA = bld.getSSA();
         tB = bld.getSSA();
         bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
                   i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
         if (l > 1)
            bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
         else
            tB = tA;
         tA = bld.getSSA();
         bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
         tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
         newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
         if (d < 0)
            bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);

         delete_Instruction(prog, i);
         deleted = true;
      }
      break;

   case OP_MOD:
      if (s == 1 && imm0.isPow2()) {
         bld.setPosition(i, false);
         if (i->sType == TYPE_U32) {
            i->op = OP_AND;
            i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
         } else if (i->sType == TYPE_S32) {
            // Do it on the absolute value of the input, and then restore the
            // sign. The only odd case is MIN_INT, but that should work out
            // as well, since MIN_INT mod any power of 2 is 0.
            //
            // Technically we don't have to do any of this since MOD is
            // undefined with negative arguments in GLSL, but this seems like
            // the nice thing to do.
            Value *abs = bld.mkOp1v(OP_ABS, TYPE_S32, bld.getSSA(), i->getSrc(0));
            Value *neg, *v1, *v2;
            bld.mkCmp(OP_SET, CC_LT, TYPE_S32,
                      (neg = bld.getSSA(1, prog->getTarget()->nativeFile(FILE_PREDICATE))),
                      TYPE_S32, i->getSrc(0), bld.loadImm(NULL, 0));
            Value *mod = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), abs,
                                    bld.loadImm(NULL, imm0.reg.data.u32 - 1));
            bld.mkOp1(OP_NEG, TYPE_S32, (v1 = bld.getSSA()), mod)
               ->setPredicate(CC_P, neg);
            bld.mkOp1(OP_MOV, TYPE_S32, (v2 = bld.getSSA()), mod)
               ->setPredicate(CC_NOT_P, neg);
            newi = bld.mkOp2(OP_UNION, TYPE_S32, i->getDef(0), v1, v2);

            delete_Instruction(prog, i);
            deleted = true;
         }
      } else if (s == 1) {
         // In this case, we still want the optimized lowering that we get
         // from having division by an immediate.
         //
         // a % b == a - (a/b) * b
         bld.setPosition(i, false);
         Value *div = bld.mkOp2v(OP_DIV, i->sType, bld.getSSA(),
                                 i->getSrc(0), i->getSrc(1));
         newi = bld.mkOp2(OP_ADD, i->sType, i->getDef(0), i->getSrc(0),
                          bld.mkOp2v(OP_MUL, i->sType, bld.getSSA(), div, i->getSrc(1)));
         // TODO: Check that target supports this. In this case, we know that
         // all backends do.
         newi->src(1).mod = Modifier(NV50_IR_MOD_NEG);

         delete_Instruction(prog, i);
         deleted = true;
      }
      break;

   case OP_SET: // TODO: SET_AND,OR,XOR
   {
      /* This optimizes the case where the output of a set is being compared
       * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
       * can be a lot cleverer in our comparison.
       */
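      /* Illustrative: "$p = set ne (set lt a, b), 0" can test a < b
       * directly, so this SET adopts the inner comparison and operands.
       */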
      CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
      CondCode cc, ccZ;
      if (imm0.reg.data.u32 != 0 || !si)
         return false;
      cc = si->setCond;
      ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
      // We do everything assuming var (cmp) 0, reverse the condition if 0 is
      // first.
      if (s == 0)
         ccZ = reverseCondCode(ccZ);
      // If there is a negative modifier, we need to undo that, by flipping
      // the comparison to zero.
      if (i->src(t).mod.neg())
         ccZ = reverseCondCode(ccZ);
      // If this is a signed comparison, we expect the input to be a regular
      // boolean, i.e. 0/-1. However the rest of the logic assumes that true
      // is positive, so just flip the sign.
      if (i->sType == TYPE_S32) {
         assert(!isFloatType(si->dType));
         ccZ = reverseCondCode(ccZ);
      }
      switch (ccZ) {
      case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
      case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
      case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
      case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
      case CC_GT: break; // bool > 0 -- bool
      case CC_NE: break; // bool != 0 -- bool
      default:
         return false;
      }

      // Update the condition of this SET to be identical to the origin set,
      // but with the updated condition code. The original SET should get
      // DCE'd, ideally.
      i->op = si->op;
      i->asCmp()->setCond = cc;
      i->setSrc(0, si->src(0));
      i->setSrc(1, si->src(1));
      if (si->srcExists(2))
         i->setSrc(2, si->src(2));
      i->sType = si->sType;
   }
   break;

   case OP_AND:
   {
      Instruction *src = i->getSrc(t)->getInsn();
      ImmediateValue imm1;
      if (imm0.reg.data.u32 == 0) {
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->setSrc(1, NULL);
      } else if (imm0.reg.data.u32 == ~0U) {
         i->op = i->src(t).mod.getOp();
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->src(0).mod = i->src(t).mod;
         }
         i->setSrc(1, NULL);
      } else if (src->asCmp()) {
         CmpInstruction *cmp = src->asCmp();
         if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
            return false;
         if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
            return false;
         if (imm0.reg.data.f32 != 1.0)
            return false;
         if (cmp->dType != TYPE_U32)
            return false;

         cmp->dType = TYPE_F32;
         if (i->src(t).mod != Modifier(0)) {
            assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
            i->src(t).mod = Modifier(0);
            cmp->setCond = inverseCondCode(cmp->setCond);
         }
         i->op = OP_MOV;
         i->setSrc(s, NULL);
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->setSrc(t, NULL);
         }
      } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
                 src->op == OP_SHR &&
                 src->src(1).getImmediate(imm1) &&
                 i->src(t).mod == Modifier(0) &&
                 util_is_power_of_two_or_zero(imm0.reg.data.u32 + 1)) {
         // low byte = offset, high byte = width
         uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
         i->op = OP_EXTBF;
         i->setSrc(0, src->getSrc(0));
         i->setSrc(1, new_ImmediateValue(prog, ext));
      } else if (src->op == OP_SHL &&
                 src->src(1).getImmediate(imm1) &&
                 i->src(t).mod == Modifier(0) &&
                 util_is_power_of_two_or_zero(~imm0.reg.data.u32 + 1) &&
                 util_last_bit(~imm0.reg.data.u32) <= imm1.reg.data.u32) {
         i->op = OP_MOV;
         i->setSrc(s, NULL);
         if (t) {
            i->setSrc(0, i->getSrc(t));
            i->setSrc(t, NULL);
         }
      }
   }
   break;

   case OP_SHL:
   {
      if (s != 1 || i->src(0).mod != Modifier(0))
         break;

      if (imm0.reg.data.u32 == 0) {
         i->op = OP_MOV;
         i->setSrc(1, NULL);
         break;
      }
      // try to concatenate shifts
      Instruction *si = i->getSrc(0)->getInsn();
      if (!si)
         break;
      ImmediateValue imm1;
      switch (si->op) {
      case OP_SHL:
         if (si->src(1).getImmediate(imm1)) {
            bld.setPosition(i, false);
            i->setSrc(0, si->getSrc(0));
            i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
         }
         break;
      case OP_SHR:
         if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
            bld.setPosition(i, false);
            i->op = OP_AND;
            i->setSrc(0, si->getSrc(0));
            i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
         }
         break;
      case OP_MUL:
         int muls;
         if (isFloatType(si->dType))
            return false;
         if (si->subOp)
            return false;
         if (si->src(1).getImmediate(imm1))
            muls = 1;
         else if (si->src(0).getImmediate(imm1))
            muls = 0;
         else
            return false;

         bld.setPosition(i, false);
         i->op = OP_MUL;
         i->subOp = 0;
         i->dType = si->dType;
         i->sType = si->sType;
         i->setSrc(0, si->getSrc(!muls));
         i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
         break;
      case OP_SUB:
      case OP_ADD:
         int adds;
         if (isFloatType(si->dType))
            return false;
         if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
            adds = 0;
         else if (si->src(1).getImmediate(imm1))
            adds = 1;
         else
            return false;
         if (si->src(!adds).mod != Modifier(0))
            return false;
         // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))

         // This is more operations, but if one of x, y is an immediate, then
         // we can get a situation where (a) we can use ISCADD, or (b)
         // propagate the add bit into an indirect load.
         bld.setPosition(i, false);
         i->op = si->op;
         i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
         i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
                                     bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
                                     si->getSrc(!adds),
                                     bld.mkImm(imm0.reg.data.u32)));
         break;
      default:
         return false;
      }
   }
   break;

   case OP_ABS:
   case OP_NEG:
   case OP_SAT:
   case OP_LG2:
   case OP_RCP:
   case OP_SQRT:
   case OP_RSQ:
   case OP_PRESIN:
   case OP_SIN:
   case OP_COS:
   case OP_PREEX2:
   case OP_EX2:
      unary(i, imm0);
      break;
   case OP_BFIND: {
      int32_t res;
      switch (i->dType) {
      case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
      case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
      default:
         return false;
      }
      if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
         res = 31 - res;
      bld.setPosition(i, false); /* make sure bld is init'ed */
      i->setSrc(0, bld.mkImm(res));
      i->setSrc(1, NULL);
      i->op = OP_MOV;
      i->subOp = 0;
      break;
   }
   case OP_BREV: {
      uint32_t res = util_bitreverse(imm0.reg.data.u32);
      i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
      i->op = OP_MOV;
      break;
   }
   case OP_POPCNT: {
      // Only deal with 1-arg POPCNT here
      if (i->srcExists(1))
         break;
      uint32_t res = util_bitcount(imm0.reg.data.u32);
      i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
      i->setSrc(1, NULL);
      i->op = OP_MOV;
      break;
   }
   case OP_CVT: {
      Storage res;

      // TODO: handle 64-bit values properly
      if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
         return false;

      // TODO: handle single byte/word extractions
      if (i->subOp)
         return false;

      bld.setPosition(i, true); /* make sure bld is init'ed */

#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
   case type: \
      switch (i->sType) { \
      case TYPE_F64: \
         res.data.dst = util_iround(i->saturate ? \
                                    CLAMP(imm0.reg.data.f64, fmin, fmax) : \
                                    imm0.reg.data.f64); \
         break; \
      case TYPE_F32: \
         res.data.dst = util_iround(i->saturate ? \
                                    CLAMP(imm0.reg.data.f32, fmin, fmax) : \
                                    imm0.reg.data.f32); \
         break; \
      case TYPE_S32: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.s32, imin, imax) : \
                        imm0.reg.data.s32; \
         break; \
      case TYPE_U32: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.u32, umin, umax) : \
                        imm0.reg.data.u32; \
         break; \
      case TYPE_S16: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.s16, imin, imax) : \
                        imm0.reg.data.s16; \
         break; \
      case TYPE_U16: \
         res.data.dst = i->saturate ? \
                        CLAMP(imm0.reg.data.u16, umin, umax) : \
                        imm0.reg.data.u16; \
         break; \
      default: return false; \
      } \
      i->setSrc(0, bld.mkImm(res.data.dst)); \
      break

      switch(i->dType) {
      CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
      CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
      CASE(TYPE_U32, u32, 0, (float)UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
      CASE(TYPE_S32, s32, (float)INT32_MIN, (float)INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
      case TYPE_F32:
         switch (i->sType) {
         case TYPE_F64:
            res.data.f32 = i->saturate ?
               SATURATE(imm0.reg.data.f64) :
               imm0.reg.data.f64;
            break;
         case TYPE_F32:
            res.data.f32 = i->saturate ?
               SATURATE(imm0.reg.data.f32) :
               imm0.reg.data.f32;
            break;
         case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
         case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
         case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
         case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
         default:
            return false;
         }
         i->setSrc(0, bld.mkImm(res.data.f32));
         break;
      case TYPE_F64:
         switch (i->sType) {
         case TYPE_F64:
            res.data.f64 = i->saturate ?
               SATURATE(imm0.reg.data.f64) :
               imm0.reg.data.f64;
            break;
         case TYPE_F32:
            res.data.f64 = i->saturate ?
               SATURATE(imm0.reg.data.f32) :
               imm0.reg.data.f32;
            break;
         case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
         case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
         case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
         case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
         default:
            return false;
         }
         i->setSrc(0, bld.mkImm(res.data.f64));
         break;
      default:
         return false;
      }
#undef CASE

      i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
      i->op = OP_MOV;
      i->saturate = 0;
      i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
      break;
   }
   default:
      return false;
   }

   // This can get left behind some of the optimizations which simplify
   // saturatable values.
   if (newi->op == OP_MOV && newi->saturate) {
      ImmediateValue tmp;
      newi->saturate = 0;
      newi->op = OP_SAT;
      if (newi->src(0).getImmediate(tmp))
         unary(newi, tmp);
   }

   if (newi->op != op)
      foldCount++;
   return deleted;
}

// =============================================================================

// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
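// Illustrative example: "b = neg a; c = add b, d" becomes "c = add -a, d"
// when the target can encode a NEG modifier on that ADD source.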
1718 class ModifierFolding : public Pass
1719 {
1720 private:
1721 virtual bool visit(BasicBlock *);
1722 };
1723
1724 bool
visit(BasicBlock * bb)1725 ModifierFolding::visit(BasicBlock *bb)
1726 {
1727 const Target *target = prog->getTarget();
1728
1729 Instruction *i, *next, *mi;
1730 Modifier mod;
1731
1732 for (i = bb->getEntry(); i; i = next) {
1733 next = i->next;
1734
1735 if (false && i->op == OP_SUB) {
1736 // turn "sub" into "add neg" (do we really want this ?)
1737 i->op = OP_ADD;
1738 i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1739 }
1740
1741 for (int s = 0; s < 3 && i->srcExists(s); ++s) {
1742 mi = i->getSrc(s)->getInsn();
1743 if (!mi ||
1744 mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
1745 continue;
1746 if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
1747 if ((i->op != OP_ADD &&
1748 i->op != OP_MUL) ||
1749 (mi->op != OP_ABS &&
1750 mi->op != OP_NEG))
1751 continue;
1752 } else
1753 if (i->sType != mi->dType) {
1754 continue;
1755 }
1756 if ((mod = Modifier(mi->op)) == Modifier(0))
1757 continue;
1758 mod *= mi->src(0).mod;
1759
1760 if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
1761 // abs neg [abs] = abs
1762 mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
1763 } else
1764 if ((i->op == OP_NEG) && mod.neg()) {
1765 assert(s == 0);
1766 // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         if (target->isModSupported(i, s, mod)) {
            i->setSrc(s, mi->getSrc(0));
            i->src(s).mod *= mod;
         }
      }

      if (i->op == OP_SAT) {
         mi = i->getSrc(0)->getInsn();
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}

// =============================================================================

// MUL + ADD -> MAD/FMA
// MIN/MAX(a, a) -> a, etc.
// SLCT(a, b, const) -> cc(const) ? a : b
// RCP(RCP(a)) -> a
// MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
// EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
class AlgebraicOpt : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void handleABS(Instruction *);
   bool handleADD(Instruction *);
   bool tryADDToMADOrSAD(Instruction *, operation toOp);
   void handleMINMAX(Instruction *);
   void handleRCP(Instruction *);
   void handleSLCT(Instruction *);
   void handleLOGOP(Instruction *);
   void handleCVT_NEG(Instruction *);
   void handleCVT_CVT(Instruction *);
   void handleCVT_EXTBF(Instruction *);
   void handleSUCLAMP(Instruction *);
   void handleNEG(Instruction *);
   void handleEXTBF_RDSV(Instruction *);

   BuildUtil bld;
};

void
AlgebraicOpt::handleABS(Instruction *abs)
{
   Instruction *sub = abs->getSrc(0)->getInsn();
   DataType ty;
   if (!sub ||
       !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
      return;
   // hidden conversion ?
   ty = intTypeToSigned(sub->dType);
   if (abs->dType != abs->sType || ty != abs->sType)
      return;

   if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
       sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
       sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
      return;

   Value *src0 = sub->getSrc(0);
   Value *src1 = sub->getSrc(1);

   if (sub->op == OP_ADD) {
      Instruction *neg = sub->getSrc(1)->getInsn();
      if (neg && neg->op != OP_NEG) {
         neg = sub->getSrc(0)->getInsn();
         src0 = sub->getSrc(1);
      }
      if (!neg || neg->op != OP_NEG ||
          neg->dType != neg->sType || neg->sType != ty)
         return;
      src1 = neg->getSrc(0);
   }

   // found ABS(SUB)
   abs->moveSources(1, 2); // move sources >=1 up by 2
   abs->op = OP_SAD;
   abs->setType(sub->dType);
   abs->setSrc(0, src0);
   abs->setSrc(1, src1);
   bld.setPosition(abs, false);
   abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
}

bool
AlgebraicOpt::handleADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return false;

   bool changed = false;
   // we can't optimize to MAD if the add is precise
   if (!add->precise && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
      changed = tryADDToMADOrSAD(add, OP_MAD);
   if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
      changed = tryADDToMADOrSAD(add, OP_SAD);
   return changed;
}

// ADD(SAD(a,b,0), c) -> SAD(a,b,c)
// ADD(MUL(a,b), c) -> MAD(a,b,c)
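// An illustrative trace (a sketch, assuming a single-use MUL result and no
// disallowed source modifiers):
//   mul $t, a, b
//   add d, $t, c
// is rewritten in place below as
//   mad d, a, b, c
// with the ADD's modifier on $t xor'd into the first MAD source.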
bool
AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   Value *src;
   int s;
   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
   Modifier mod[4];

   if (src0->refCount() == 1 &&
       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
      s = 0;
   else
   if (src1->refCount() == 1 &&
       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
      s = 1;
   else
      return false;

   src = add->getSrc(s);

   if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
      return false;

   if (src->getInsn()->saturate || src->getInsn()->postFactor ||
       src->getInsn()->dnz || src->getInsn()->precise)
      return false;

   if (toOp == OP_SAD) {
      ImmediateValue imm;
      if (!src->getInsn()->src(2).getImmediate(imm))
         return false;
      if (!imm.isInteger(0))
         return false;
   }

   if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
       isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
      return false;

   mod[0] = add->src(0).mod;
   mod[1] = add->src(1).mod;
   mod[2] = src->getUniqueInsn()->src(0).mod;
   mod[3] = src->getUniqueInsn()->src(1).mod;

   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
      return false;

   add->op = toOp;
   add->subOp = src->getInsn()->subOp; // potentially mul-high
   add->dnz = src->getInsn()->dnz;
   add->dType = src->getInsn()->dType; // sign matters for imad hi
   add->sType = src->getInsn()->sType;

   add->setSrc(2, add->src(s ? 0 : 1));

   add->setSrc(0, src->getInsn()->getSrc(0));
   add->src(0).mod = mod[2] ^ mod[s];
   add->setSrc(1, src->getInsn()->getSrc(1));
   add->src(1).mod = mod[3];

   return true;
}

void
AlgebraicOpt::handleMINMAX(Instruction *minmax)
{
   Value *src0 = minmax->getSrc(0);
   Value *src1 = minmax->getSrc(1);

   if (src0 != src1 || src0->reg.file != FILE_GPR)
      return;
   if (minmax->src(0).mod == minmax->src(1).mod) {
      if (minmax->def(0).mayReplace(minmax->src(0))) {
         minmax->def(0).replace(minmax->src(0), false);
         delete_Instruction(prog, minmax);
      } else {
         minmax->op = OP_CVT;
         minmax->setSrc(1, NULL);
      }
   } else {
      // TODO:
      // min(x, -x) = -abs(x)
      // min(x, -abs(x)) = -abs(x)
      // min(x, abs(x)) = x
      // max(x, -abs(x)) = x
      // max(x, abs(x)) = abs(x)
      // max(x, -x) = abs(x)
   }
}

// rcp(rcp(a)) = a
// rcp(sqrt(a)) = rsq(a)
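// Source modifiers multiply through, so e.g. rcp(-rcp(a)) folds to neg(a)
// and plain rcp(rcp(a)) degenerates to a mov.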
void
AlgebraicOpt::handleRCP(Instruction *rcp)
{
   Instruction *si = rcp->getSrc(0)->getUniqueInsn();

   if (!si)
      return;

   if (si->op == OP_RCP) {
      Modifier mod = rcp->src(0).mod * si->src(0).mod;
      rcp->op = mod.getOp();
      rcp->setSrc(0, si->getSrc(0));
   } else if (si->op == OP_SQRT) {
      rcp->op = OP_RSQ;
      rcp->setSrc(0, si->getSrc(0));
      rcp->src(0).mod = rcp->src(0).mod * si->src(0).mod;
   }
}

void
AlgebraicOpt::handleSLCT(Instruction *slct)
{
   if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
      if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
         slct->setSrc(0, slct->getSrc(1));
   } else
   if (slct->getSrc(0) != slct->getSrc(1)) {
      return;
   }
   slct->op = OP_MOV;
   slct->setSrc(1, NULL);
   slct->setSrc(2, NULL);
}

void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}

// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
// nv50:
// F2I(NEG(I2F(ABS(SET))))
void
AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
      return;
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src(0).mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();

   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
   if (insn && insn->op == OP_CVT &&
       insn->dType == TYPE_F32 &&
       insn->sType == TYPE_S32) {
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
          insn->src(0).mod)
         return;
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
         return;
   } else
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
      return;
   }

   Instruction *bset = cloneShallow(func, insn);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}

// F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
// does a type conversion, this becomes trickier as there might be range
// changes/etc. We could handle those in theory as long as the range was being
// reduced or kept the same.
void
AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();

   if (!insn ||
       insn->saturate ||
       insn->subOp ||
       insn->dType != insn->sType ||
       insn->dType != cvt->sType)
      return;

   RoundMode rnd = insn->rnd;
   switch (insn->op) {
   case OP_CEIL:
      rnd = ROUND_PI;
      break;
   case OP_FLOOR:
      rnd = ROUND_MI;
      break;
   case OP_TRUNC:
      rnd = ROUND_ZI;
      break;
   case OP_CVT:
      break;
   default:
      return;
   }

   if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
      rnd = (RoundMode)(rnd & 3);

   cvt->rnd = rnd;
   cvt->setSrc(0, insn->getSrc(0));
   cvt->src(0).mod *= insn->src(0).mod;
   cvt->sType = insn->sType;
}

// Some shaders extract packed bytes out of words and convert them to
// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
// nv50 for word sizes.
//
// CVT(EXTBF(x, byte/word))
// CVT(AND(bytemask, x))
// CVT(AND(bytemask, SHR(x, 8/16/24)))
// CVT(SHR(x, 16/24))
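//
// A worked case (a sketch): for the second byte of a word,
//   cvt f32 $d, s32 and(0xff, shr u32 $x, 8)
// collapses into a single byte-addressed conversion,
//   cvt f32 $d, u8 $x   (subOp = offset >> 3 = 1)
// where the AND also forces the source type to unsigned.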
void
AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   ImmediateValue imm;
   Value *arg = NULL;
   unsigned width, offset = 0;
   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
      return;
   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
      width = (imm.reg.data.u32 >> 8) & 0xff;
      offset = imm.reg.data.u32 & 0xff;
      arg = insn->getSrc(0);

      if (width != 8 && width != 16)
         return;
      if (width == 8 && offset & 0x7)
         return;
      if (width == 16 && offset & 0xf)
         return;
   } else if (insn->op == OP_AND) {
      int s;
      if (insn->src(0).getImmediate(imm))
         s = 0;
      else if (insn->src(1).getImmediate(imm))
         s = 1;
      else
         return;

      if (imm.reg.data.u32 == 0xff)
         width = 8;
      else if (imm.reg.data.u32 == 0xffff)
         width = 16;
      else
         return;

      arg = insn->getSrc(!s);
      Instruction *shift = arg->getInsn();

      if (shift && shift->op == OP_SHR &&
          shift->sType == cvt->sType &&
          shift->src(1).getImmediate(imm) &&
          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
         arg = shift->getSrc(0);
         offset = imm.reg.data.u32;
      }
      // We just AND'd the high bits away, which means this is effectively an
      // unsigned value.
      cvt->sType = TYPE_U32;
   } else if (insn->op == OP_SHR &&
              insn->sType == cvt->sType &&
              insn->src(1).getImmediate(imm)) {
      arg = insn->getSrc(0);
      if (imm.reg.data.u32 == 24) {
         width = 8;
         offset = 24;
      } else if (imm.reg.data.u32 == 16) {
         width = 16;
         offset = 16;
      } else {
         return;
      }
   }

   if (!arg)
      return;

   // Irrespective of what came earlier, we can undo a shift on the argument
   // by adjusting the offset.
   Instruction *shift = arg->getInsn();
   if (shift && shift->op == OP_SHL &&
       shift->src(1).getImmediate(imm) &&
       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
       imm.reg.data.u32 <= offset) {
      arg = shift->getSrc(0);
      offset -= imm.reg.data.u32;
   }

   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
   // annoying to detect them.

   if (width == 8) {
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
   } else {
      assert(width == 16);
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
   }
   cvt->setSrc(0, arg);
   cvt->subOp = offset >> 3;
}

// SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
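// E.g. SUCLAMP dst, (ADD b 3), k, 0 becomes SUCLAMP dst, b, k, 3, provided
// the summed immediate stays within the signed 6-bit range [-32, 31].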
void
AlgebraicOpt::handleSUCLAMP(Instruction *insn)
{
   ImmediateValue imm;
   int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
   int s;
   Instruction *add;

   assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);

   // look for ADD (TODO: only count references by non-SUCLAMP)
   if (insn->getSrc(0)->refCount() > 1)
      return;
   add = insn->getSrc(0)->getInsn();
   if (!add || add->op != OP_ADD ||
       (add->dType != TYPE_U32 &&
        add->dType != TYPE_S32))
      return;

   // look for immediate
   for (s = 0; s < 2; ++s)
      if (add->src(s).getImmediate(imm))
         break;
   if (s >= 2)
      return;
   s = s ? 0 : 1;
   // determine if immediate fits
   val += imm.reg.data.s32;
   if (val > 31 || val < -32)
      return;
   // determine if other addend fits
   if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
      return;

   bld.setPosition(insn, false); // make sure bld is init'ed
   // replace sources
   insn->setSrc(2, bld.mkImm(val));
   insn->setSrc(0, add->getSrc(s));
}

// NEG(AND(SET, 1)) -> SET
void
AlgebraicOpt::handleNEG(Instruction *i)
{
   Instruction *src = i->getSrc(0)->getInsn();
   ImmediateValue imm;
   int b;

   if (isFloatType(i->sType) || !src || src->op != OP_AND)
      return;

   if (src->src(0).getImmediate(imm))
      b = 1;
   else if (src->src(1).getImmediate(imm))
      b = 0;
   else
      return;

   if (!imm.isInteger(1))
      return;

   Instruction *set = src->getSrc(b)->getInsn();
   if ((set->op == OP_SET || set->op == OP_SET_AND ||
        set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
       !isFloatType(set->dType)) {
      i->def(0).replace(set->getDef(0), false);
   }
}

// EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
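// The immediates below are assumed to be (width << 8 | offset) bitfield
// descriptors into the packed COMBINED_TID value: 0x1000 = bits [0,16)
// for tid.x, 0x0a10 = bits [16,26) for tid.y, 0x061a = bits [26,32) for
// tid.z.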
void
AlgebraicOpt::handleEXTBF_RDSV(Instruction *i)
{
   Instruction *rdsv = i->getSrc(0)->getUniqueInsn();
   if (rdsv->op != OP_RDSV ||
       rdsv->getSrc(0)->asSym()->reg.data.sv.sv != SV_COMBINED_TID)
      return;
   // Avoid creating more RDSV instructions
   if (rdsv->getDef(0)->refCount() > 1)
      return;

   ImmediateValue imm;
   if (!i->src(1).getImmediate(imm))
      return;

   int index;
   if (imm.isInteger(0x1000))
      index = 0;
   else
   if (imm.isInteger(0x0a10))
      index = 1;
   else
   if (imm.isInteger(0x061a))
      index = 2;
   else
      return;

   bld.setPosition(i, false);

   i->op = OP_RDSV;
   i->setSrc(0, bld.mkSysVal(SV_TID, index));
   i->setSrc(1, NULL);
}

bool
AlgebraicOpt::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;
      switch (i->op) {
      case OP_ABS:
         handleABS(i);
         break;
      case OP_ADD:
         handleADD(i);
         break;
      case OP_RCP:
         handleRCP(i);
         break;
      case OP_MIN:
      case OP_MAX:
         handleMINMAX(i);
         break;
      case OP_SLCT:
         handleSLCT(i);
         break;
      case OP_AND:
      case OP_OR:
      case OP_XOR:
         handleLOGOP(i);
         break;
      case OP_CVT:
         handleCVT_NEG(i);
         handleCVT_CVT(i);
         if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
            handleCVT_EXTBF(i);
         break;
      case OP_SUCLAMP:
         handleSUCLAMP(i);
         break;
      case OP_NEG:
         handleNEG(i);
         break;
      case OP_EXTBF:
         handleEXTBF_RDSV(i);
         break;
      default:
         break;
      }
   }

   return true;
}

// =============================================================================

// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
// MUL(a, b) -> a few XMADs
// MAD/FMA(a, b, c) -> a few XMADs
class LateAlgebraicOpt : public Pass
{
private:
   virtual bool visit(Instruction *);

   void handleADD(Instruction *);
   void handleMULMAD(Instruction *);
   bool tryADDToSHLADD(Instruction *);

   BuildUtil bld;
};

void
LateAlgebraicOpt::handleADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (prog->getTarget()->isOpSupported(OP_SHLADD, add->dType))
      tryADDToSHLADD(add);
}

// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
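// E.g. (a sketch): shl $t, a, 4 ; add d, $t, c  ->  shladd d, a, 4, c,
// i.e. d = (a << 4) + c, when the shift amount is an immediate.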
bool
LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   ImmediateValue imm;
   Instruction *shl;
   Value *src;
   int s;

   if (add->saturate || add->usesFlags() || typeSizeof(add->dType) == 8
       || isFloatType(add->dType))
      return false;

   if (src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_SHL)
      s = 0;
   else
   if (src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_SHL)
      s = 1;
   else
      return false;

   src = add->getSrc(s);
   shl = src->getUniqueInsn();

   if (shl->bb != add->bb || shl->usesFlags() || shl->subOp || shl->src(0).mod)
      return false;

   if (!shl->src(1).getImmediate(imm))
      return false;

   add->op = OP_SHLADD;
   add->setSrc(2, add->src(!s));
   // SHL can't have any modifiers, but the ADD source may have had
   // one. Preserve it.
   add->setSrc(0, shl->getSrc(0));
   if (s == 1)
      add->src(0).mod = add->src(1).mod;
   add->setSrc(1, new_ImmediateValue(shl->bb->getProgram(), imm.reg.data.u32));
   add->src(1).mod = Modifier(0);

   return true;
}

// MUL(a, b) -> a few XMADs
// MAD/FMA(a, b, c) -> a few XMADs
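// The expansion below relies on the usual 16x16-bit decomposition of a
// 32-bit multiply (a sketch of the arithmetic, mod 2^32):
//   a * b = a.lo * b.lo + ((a.lo * b.hi + a.hi * b.lo) << 16)
// tmp0 accumulates the low product (plus c), tmp1 gathers partial products,
// and the final XMAD.PSL.CBCC shifts and sums the halves.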
void
LateAlgebraicOpt::handleMULMAD(Instruction *i)
{
   // TODO: handle NV50_IR_SUBOP_MUL_HIGH
   if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
      return;
   if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
      return;
   if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
      return;

   assert(!i->src(0).mod);
   assert(!i->src(1).mod);
   assert(i->op == OP_MUL ? 1 : !i->src(2).mod);

   bld.setPosition(i, false);

   Value *a = i->getSrc(0);
   Value *b = i->getSrc(1);
   Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);

   Value *tmp0 = bld.getSSA();
   Value *tmp1 = bld.getSSA();

   Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c);
   insn->setPredicate(i->cc, i->getPredicate());

   insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0));
   insn->setPredicate(i->cc, i->getPredicate());
   insn->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);

   Value *pred = i->getPredicate();
   i->setPredicate(i->cc, NULL);

   i->op = OP_XMAD;
   i->setSrc(0, b);
   i->setSrc(1, tmp1);
   i->setSrc(2, tmp0);
   i->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
   i->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);

   i->setPredicate(i->cc, pred);
}

bool
LateAlgebraicOpt::visit(Instruction *i)
{
   switch (i->op) {
   case OP_ADD:
      handleADD(i);
      break;
   case OP_MUL:
   case OP_MAD:
   case OP_FMA:
      handleMULMAD(i);
      break;
   default:
      break;
   }

   return true;
}

// =============================================================================

// Split 64-bit MUL and MAD
class Split64BitOpPreRA : public Pass
{
private:
   virtual bool visit(BasicBlock *);
   void split64MulMad(Function *, Instruction *, DataType);

   BuildUtil bld;
};

bool
Split64BitOpPreRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;
   Modifier mod;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;

      DataType hTy;
      switch (i->dType) {
      case TYPE_U64: hTy = TYPE_U32; break;
      case TYPE_S64: hTy = TYPE_S32; break;
      default:
         continue;
      }

      if (i->op == OP_MAD || i->op == OP_MUL)
         split64MulMad(func, i, hTy);
   }

   return true;
}

void
Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
{
   assert(i->op == OP_MAD || i->op == OP_MUL);
   assert(!isFloatType(i->dType) && !isFloatType(i->sType));
   assert(typeSizeof(hTy) == 4);

   bld.setPosition(i, true);

   Value *zero = bld.mkImm(0u);
   Value *carry = bld.getSSA(1, FILE_FLAGS);

   // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
   // values (a, b and c might be 32-bit values), using 32-bit operations. This
   // gives the following operations:
   // * `d.low = low(a.low * b.low) (+ c.low)?`
   // * `d.high = low(a.high * b.low) + low(a.low * b.high)
   //           + high(a.low * b.low) (+ c.high)?`
   //
   // To compute the high bits, we can split in the following operations:
   // * `tmp1 = low(a.high * b.low) (+ c.high)?`
   // * `tmp2 = low(a.low * b.high) + tmp1`
   // * `d.high = high(a.low * b.low) + tmp2`
   //
   // mkSplit put lower bits at index 0 and higher bits at index 1
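   //
   // Sanity check with small numbers (a sketch): for a = 2^32 + 3
   // (a.high = 1, a.low = 3) and b = 5 (b.high = 0, b.low = 5):
   //   d.low  = low(3 * 5) = 15
   //   d.high = low(1 * 5) + low(3 * 0) + high(3 * 5) = 5
   // giving d = 5 * 2^32 + 15 = a * b, as expected.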

   Value *op1[2];
   if (i->getSrc(0)->reg.size == 8)
      bld.mkSplit(op1, 4, i->getSrc(0));
   else {
      op1[0] = i->getSrc(0);
      op1[1] = zero;
   }
   Value *op2[2];
   if (i->getSrc(1)->reg.size == 8)
      bld.mkSplit(op2, 4, i->getSrc(1));
   else {
      op2[0] = i->getSrc(1);
      op2[1] = zero;
   }

   Value *op3[2] = { NULL, NULL };
   if (i->op == OP_MAD) {
      if (i->getSrc(2)->reg.size == 8)
         bld.mkSplit(op3, 4, i->getSrc(2));
      else {
         op3[0] = i->getSrc(2);
         op3[1] = zero;
      }
   }

   Value *tmpRes1Hi = bld.getSSA();
   if (i->op == OP_MAD)
      bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
   else
      bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);

   Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], tmpRes1Hi);

   Value *def[2] = { bld.getSSA(), bld.getSSA() };

   // If it was a MAD, add the carry from the low bits
   // It is not needed if it was a MUL, since we added high(a.low * b.low) to
   // d.high
   if (i->op == OP_MAD)
      bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, carry);
   else
      bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]);

   Instruction *hiPart3 = bld.mkOp3(OP_MAD, hTy, def[1], op1[0], op2[0], tmpRes2Hi);
   hiPart3->subOp = NV50_IR_SUBOP_MUL_HIGH;
   if (i->op == OP_MAD)
      hiPart3->setFlagsSrc(3, carry);

   bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);

   delete_Instruction(fn->getProgram(), i);
}

// =============================================================================

static inline void
updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
{
   if (offset != ldst->getSrc(0)->reg.data.offset) {
      if (ldst->getSrc(0)->refCount() > 1)
         ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
      ldst->getSrc(0)->reg.data.offset = offset;
   }
}

// Combine loads and stores, forward stores to loads where possible.
class MemoryOpt : public Pass
{
private:
   class Record
   {
   public:
      Record *next;
      Instruction *insn;
      const Value *rel[2];
      const Value *base;
      int32_t offset;
      int8_t fileIndex;
      uint8_t size;
      bool locked;
      Record *prev;

      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   MemoryPool recordPool;

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();

private:
   Record *prevRecord;
};

MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
{
   for (int i = 0; i < DATA_FILE_COUNT; ++i) {
      loads[i] = NULL;
      stores[i] = NULL;
   }
   prevRecord = NULL;
}

void
MemoryOpt::reset()
{
   for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
      Record *it, *next;
      for (it = loads[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      loads[i] = NULL;
      for (it = stores[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      stores[i] = NULL;
   }
}

bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;
   // for compute indirect loads are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   // lock any stores that overlap with the load being merged into the
   // existing record.
   lockStores(ld);

   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}

bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int s = sizeSt / 4;
   int size = sizeRc + sizeSt;
   int j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (!prog->getTarget()->
       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned stores
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;
   // for compute indirect stores are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   // There's really no great place to put this in a generic manner. Seemingly
   // wide stores at 0x60 don't work in GS shaders on SM50+. Don't combine
   // those.
   if (prog->getTarget()->getChipset() >= NVISA_GM107_CHIPSET &&
       prog->getType() == Program::TYPE_GEOMETRY &&
       st->getSrc(0)->reg.file == FILE_SHADER_OUTPUT &&
       rec->rel[0] == NULL &&
       MIN2(offRc, offSt) == 0x60)
      return false;

   // remove any existing load/store records for the store being merged into
   // the existing record.
   purgeRecords(st, DATA_FILE_COUNT);

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= rec->insn->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      updateLdStOffset(st, offRc, func);
   } else {
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   delete_Instruction(prog, rec->insn);
   rec->insn = st;
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}

void
MemoryOpt::Record::set(const Instruction *ldst)
{
   const Symbol *mem = ldst->getSrc(0)->asSym();
   fileIndex = mem->reg.fileIndex;
   rel[0] = ldst->getIndirect(0, 0);
   rel[1] = ldst->getIndirect(0, 1);
   offset = mem->reg.data.offset;
   base = mem->getBase();
   size = typeSizeof(ldst->sType);
}

void
MemoryOpt::Record::link(Record **list)
{
   next = *list;
   if (next)
      next->prev = this;
   prev = NULL;
   *list = this;
}

void
MemoryOpt::Record::unlink(Record **list)
{
   if (next)
      next->prev = prev;
   if (prev)
      prev->next = next;
   else
      *list = next;
}

MemoryOpt::Record **
MemoryOpt::getList(const Instruction *insn)
{
   if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
      return &loads[insn->src(0).getFile()];
   return &stores[insn->src(0).getFile()];
}

void
MemoryOpt::addRecord(Instruction *i)
{
   Record **list = getList(i);
   Record *it = reinterpret_cast<Record *>(recordPool.allocate());

   it->link(list);
   it->set(i);
   it->insn = i;
   it->locked = false;
}

MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      if (it->locked && insn->op != OP_LOAD && insn->op != OP_VFETCH)
         continue;
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            if (!isAdj)
               return it;
            if (!(it->offset & 0x7))
               rec = it;
         }
      } else {
         isAdj = it->offset != sym->reg.data.offset;
         if (size <= it->size && !isAdj)
            return it;
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it;
      }
   }
   return rec;
}

bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def(d).replace(st->src(s), false);
   }
   ld->bb->remove(ld);
   return true;
}

bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   assert(offR <= offE);
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def(dE).replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}

bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   st->takeExtraSources(0, extra);

   if (offR < offS) {
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= ARRAY_SIZE(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}

bool
MemoryOpt::Record::overlaps(const Instruction *ldst) const
{
   Record that;
   that.set(ldst);

   // This assumes that images/buffers can't overlap. They can.
   // TODO: Plumb the restrict logic through, and only skip when it's a
   // restrict situation, or there can implicitly be no writes.
   if (this->fileIndex != that.fileIndex && this->rel[1] == that.rel[1])
      return false;

   if (this->rel[0] || that.rel[0])
      return this->base == that.base;

   return
      (this->offset < that.offset + that.size) &&
      (this->offset + this->size > that.offset);
}

// We must not eliminate stores that affect the result of @ld if
// we find later stores to the same location, and we may no longer
// merge them with later stores.
// The stored value can, however, still be used to determine the value
// returned by future loads.
void
MemoryOpt::lockStores(Instruction *const ld)
{
   for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
      if (!r->locked && r->overlaps(ld))
         r->locked = true;
}

// Prior loads from the location of @st are no longer valid.
// Stores to the location of @st may no longer be used to derive
// the value at it nor be coalesced into later stores.
void
MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
{
   if (st)
      f = st->src(0).getFile();

   for (Record *r = loads[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&loads[f]);

   for (Record *r = stores[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&stores[f]);
}

bool
MemoryOpt::visit(BasicBlock *bb)
{
   bool ret = runOpt(bb);
   // Run again; one pass won't combine four 32-bit ld/st into a single
   // 128-bit ld/st where 96-bit memory operations are forbidden.
   if (ret)
      ret = runOpt(bb);
   return ret;
}

bool
MemoryOpt::runOpt(BasicBlock *bb)
{
   Instruction *ldst, *next;
   Record *rec;
   bool isAdjacent = true;

   for (ldst = bb->getEntry(); ldst; ldst = next) {
      bool keep = true;
      bool isLoad = true;
      next = ldst->next;

      // TODO: Handle combining sub 4-bytes loads/stores.
      if (ldst->op == OP_STORE && typeSizeof(ldst->dType) < 4) {
         purgeRecords(ldst, ldst->src(0).getFile());
         continue;
      }

      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
         if (ldst->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
            purgeRecords(ldst, ldst->src(0).getFile());
            continue;
         }
         if (ldst->isDead()) {
            // might have been produced by earlier optimization
            delete_Instruction(prog, ldst);
            continue;
         }
      } else
      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
         if (ldst->subOp == NV50_IR_SUBOP_STORE_UNLOCKED) {
            purgeRecords(ldst, ldst->src(0).getFile());
            continue;
         }
         if (typeSizeof(ldst->dType) == 4 &&
             ldst->src(1).getFile() == FILE_GPR &&
             ldst->getSrc(1)->getInsn()->op == OP_NOP) {
            delete_Instruction(prog, ldst);
            continue;
         }
         isLoad = false;
      } else {
         // TODO: maybe have all fixed ops act as barrier ?
         if (ldst->op == OP_CALL ||
             ldst->op == OP_BAR ||
             ldst->op == OP_MEMBAR) {
            purgeRecords(NULL, FILE_MEMORY_LOCAL);
            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
            purgeRecords(NULL, FILE_MEMORY_SHARED);
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         } else
         if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
            if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
               purgeRecords(NULL, FILE_MEMORY_LOCAL);
               purgeRecords(NULL, FILE_MEMORY_GLOBAL);
               purgeRecords(NULL, FILE_MEMORY_SHARED);
            } else {
               purgeRecords(NULL, ldst->src(0).getFile());
            }
         } else
         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         }
         continue;
      }

      DataFile file = ldst->src(0).getFile();
      if (file != FILE_MEMORY_CONST &&
          file != FILE_SHADER_INPUT &&
          file != FILE_SHADER_OUTPUT)
         continue;

      if (ldst->getPredicate()) // TODO: handle predicated ld/st
         continue;
      if (ldst->perPatch) // TODO: create separate per-patch lists
         continue;

      if (isLoad) {
         // if ld l[]/g[] look for previous store to eliminate the reload
         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
            // TODO: shared memory ?
            rec = findRecord(ldst, false, isAdjacent);
            if (rec && !isAdjacent)
               keep = !replaceLdFromSt(ldst, rec);
         }

         // or look for ld from the same location and replace this one
         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
         if (rec) {
            if (!isAdjacent)
               keep = !replaceLdFromLd(ldst, rec);
            else
               // or combine a previous load with this one
               keep = !combineLd(rec, ldst);
         }
         if (keep)
            lockStores(ldst);
      } else {
         rec = findRecord(ldst, false, isAdjacent);
         if (rec) {
            if (!isAdjacent)
               keep = !replaceStFromSt(ldst, rec);
            else
               keep = !combineSt(rec, ldst);
         }
         if (keep)
            purgeRecords(ldst, DATA_FILE_COUNT);
      }
      if (keep)
         addRecord(ldst);
   }
   reset();

   return true;
}

// =============================================================================

// Turn control flow into predicated instructions (after register allocation !).
// TODO:
// Could move this to before register allocation on NVC0 and also handle nested
// constructs.
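// E.g. (a sketch): a small if/else diamond such as
//   if (p) x = a; else x = b;
// becomes straight-line predicated code,
//   (+p) mov $x, a
//   (-p) mov $x, b
// trading the branches for executing both sides.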
class FlatteningPass : public Pass
{
public:
   FlatteningPass() : gpr_unit(0) {}

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   bool tryPredicateConditional(BasicBlock *);
   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
   void tryPropagateBranch(BasicBlock *);
   inline bool isConstantCondition(Value *pred);
   inline bool mayPredicate(const Instruction *, const Value *pred) const;
   inline void removeFlow(Instruction *);

   uint8_t gpr_unit;
};

bool
FlatteningPass::isConstantCondition(Value *pred)
{
   Instruction *insn = pred->getUniqueInsn();
   assert(insn);
   if (insn->op != OP_SET || insn->srcExists(2))
      return false;

   for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
      Instruction *ld = insn->getSrc(s)->getUniqueInsn();
      DataFile file;
      if (ld) {
         if (ld->op != OP_MOV && ld->op != OP_LOAD)
            return false;
         if (ld->src(0).isIndirect(0))
            return false;
         file = ld->src(0).getFile();
      } else {
         file = insn->src(s).getFile();
         // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
         // in register "units", which can vary between targets.
         if (file == FILE_GPR) {
            Value *v = insn->getSrc(s);
            int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
            int units = bytes >> gpr_unit;
            if (units > prog->maxGPR)
               file = FILE_IMMEDIATE;
         }
      }
      if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
         return false;
   }
   return true;
}

void
FlatteningPass::removeFlow(Instruction *insn)
{
   FlowInstruction *term = insn ? insn->asFlow() : NULL;
   if (!term)
      return;
   Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();

   if (term->op == OP_BRA) {
      // TODO: this might get more difficult when we get arbitrary BRAs
      if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
         return;
   } else
   if (term->op != OP_JOIN)
      return;

   Value *pred = term->getPredicate();

   delete_Instruction(prog, term);

   if (pred && pred->refCount() == 0) {
      Instruction *pSet = pred->getUniqueInsn();
      pred->join->reg.data.id = -1; // deallocate
      if (pSet->isDead())
         delete_Instruction(prog, pSet);
   }
}

void
FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
{
   for (Instruction *i = bb->getEntry(); i; i = i->next) {
      if (i->isNop())
         continue;
      assert(!i->getPredicate());
      i->setPredicate(cc, pred);
   }
   removeFlow(bb->getExit());
}

bool
FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
{
   if (insn->isPseudo())
      return true;
   // TODO: calls where we don't know which registers are modified

   if (!prog->getTarget()->mayPredicate(insn, pred))
      return false;
   for (int d = 0; insn->defExists(d); ++d)
      if (insn->getDef(d)->equals(pred))
         return false;
   return true;
}

// If we jump to BRA/RET/EXIT, replace the jump with it.
// NOTE: We do not update the CFG anymore here !
//
// TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
// BB:0
// @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
// BB:1
// bra BB:3
// BB:2
// ...
// BB:3
// ...
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
      BasicBlock *bf = i->asFlow()->target.bb;

      if (bf->getInsnCount() != 1)
         continue;

      FlowInstruction *bra = i->asFlow();
      FlowInstruction *rep = bf->getExit()->asFlow();

      if (!rep || rep->getPredicate())
         continue;
      if (rep->op != OP_BRA &&
          rep->op != OP_JOIN &&
          rep->op != OP_EXIT)
         continue;

      // TODO: If there are multiple branches to @rep, only the first would
      // be replaced, so only remove them after this pass is done ?
      // Also, need to check all incident blocks for fall-through exits and
      // add the branch there.
      bra->op = rep->op;
      bra->target.bb = rep->target.bb;
      if (bf->cfg.incidentCount() == 1)
         bf->remove(rep);
   }
}

bool
FlatteningPass::visit(Function *fn)
{
   gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);

   return true;
}

bool
FlatteningPass::visit(BasicBlock *bb)
{
   if (tryPredicateConditional(bb))
      return true;

   // try to attach join to previous instruction
   if (prog->getTarget()->hasJoin) {
      Instruction *insn = bb->getExit();
      if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
         insn = insn->prev;
         if (insn && !insn->getPredicate() &&
             !insn->asFlow() &&
             insn->op != OP_DISCARD &&
             insn->op != OP_TEXBAR &&
             !isTextureOp(insn->op) && // probably just nve4
             !isSurfaceOp(insn->op) && // not confirmed
             insn->op != OP_LINTERP && // probably just nve4
             insn->op != OP_PINTERP && // probably just nve4
             ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
              (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
             !insn->isNop()) {
            insn->join = 1;
            bb->remove(bb->getExit());
            return true;
         }
      }
   }

   tryPropagateBranch(bb);

   return true;
}

bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}

// =============================================================================

// Fold Immediate into MAD; must be done after register allocation due to
// constraint SDST == SSRC2
// TODO:
// Does NVC0+ have other situations where this pass makes sense?
class PostRaLoadPropagation : public Pass
{
private:
   virtual bool visit(Instruction *);

   void handleMADforNV50(Instruction *);
   void handleMADforNVC0(Instruction *);
};

static bool
post_ra_dead(Instruction *i)
{
   for (int d = 0; i->defExists(d); ++d)
      if (i->getDef(d)->refCount())
         return false;
   return true;
}

// Fold Immediate into MAD; must be done after register allocation due to
// constraint SDST == SSRC2
void
PostRaLoadPropagation::handleMADforNV50(Instruction *i)
{
   if (i->def(0).getFile() != FILE_GPR ||
       i->src(0).getFile() != FILE_GPR ||
       i->src(1).getFile() != FILE_GPR ||
       i->src(2).getFile() != FILE_GPR ||
       i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
      return;

   if (i->getDef(0)->reg.data.id >= 64 ||
       i->getSrc(0)->reg.data.id >= 64)
      return;

   if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
      return;

   if (i->getPredicate())
      return;

   Value *vtmp;
   Instruction *def = i->getSrc(1)->getInsn();

   if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
      def = def->getSrc(0)->getInsn();
   if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
      vtmp = i->getSrc(1);
      if (isFloatType(i->sType)) {
         i->setSrc(1, def->getSrc(0));
      } else {
         ImmediateValue val;
         // getImmediate() has side-effects on the argument so this *shouldn't*
         // be folded into the assert()
         ASSERTED bool ret = def->src(0).getImmediate(val);
         assert(ret);
         if (i->getSrc(1)->reg.data.id & 1)
            val.reg.data.u32 >>= 16;
         val.reg.data.u32 &= 0xffff;
         i->setSrc(1, new_ImmediateValue(prog, val.reg.data.u32));
      }

      /* There's no post-RA dead code elimination, so do it here
       * XXX: if we add more code-removing post-RA passes, we might
       *      want to create a post-RA dead-code elim pass */
      if (post_ra_dead(vtmp->getInsn())) {
         Value *src = vtmp->getInsn()->getSrc(0);
         // Careful -- splits will have already been removed from the
         // functions. Don't double-delete.
         if (vtmp->getInsn()->bb)
            delete_Instruction(prog, vtmp->getInsn());
         if (src->getInsn() && post_ra_dead(src->getInsn()))
            delete_Instruction(prog, src->getInsn());
      }
   }
}
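
// Illustrative case (a sketch) for the integer path above: if src1 was
// loaded as "mov $r5, 0x12340042" and $r5 is an odd half-register, the
// immediate is shifted down so that 0x1234 (its upper 16 bits) is what
// gets folded, matching the 16-bit multiplier slots of the nv50 MAD.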

void
PostRaLoadPropagation::handleMADforNVC0(Instruction *i)
{
   if (i->def(0).getFile() != FILE_GPR ||
       i->src(0).getFile() != FILE_GPR ||
       i->src(1).getFile() != FILE_GPR ||
       i->src(2).getFile() != FILE_GPR ||
       i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
      return;

   // TODO: gm107 can also do this for S32, maybe other chipsets as well
   if (i->dType != TYPE_F32)
      return;

   if ((i->src(2).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
      return;

   ImmediateValue val;
   int s;

   if (i->src(0).getImmediate(val))
      s = 1;
   else if (i->src(1).getImmediate(val))
      s = 0;
   else
      return;

   if ((i->src(s).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
      return;

   if (s == 1)
      i->swapSources(0, 1);

   Instruction *imm = i->getSrc(1)->getInsn();
   i->setSrc(1, imm->getSrc(0));
   if (post_ra_dead(imm))
      delete_Instruction(prog, imm);
}

bool
PostRaLoadPropagation::visit(Instruction *i)
{
   switch (i->op) {
   case OP_FMA:
   case OP_MAD:
      if (prog->getTarget()->getChipset() < 0xc0)
         handleMADforNV50(i);
      else
         handleMADforNVC0(i);
      break;
   default:
      break;
   }

   return true;
}

// =============================================================================

// Common subexpression elimination. Stupid O(n^2) implementation.
class LocalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   inline bool tryReplace(Instruction **, Instruction *);

   DLList ops[OP_LAST + 1];
};

class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
Instruction::isActionEqual(const Instruction *that) const
{
   if (this->op != that->op ||
       this->dType != that->dType ||
       this->sType != that->sType)
      return false;
   if (this->cc != that->cc)
      return false;

   if (this->asTex()) {
      if (memcmp(&this->asTex()->tex,
                 &that->asTex()->tex,
                 sizeof(this->asTex()->tex)))
         return false;
   } else
   if (this->asCmp()) {
      if (this->asCmp()->setCond != that->asCmp()->setCond)
         return false;
   } else
   if (this->asFlow()) {
      return false;
   } else
   if (this->op == OP_PHI && this->bb != that->bb) {
      /* TODO: we could probably be a bit smarter here by following the
       * control flow, but honestly, it is quite painful to check */
      return false;
   } else {
      if (this->ipa != that->ipa ||
          this->lanes != that->lanes ||
          this->perPatch != that->perPatch)
         return false;
      if (this->postFactor != that->postFactor)
         return false;
   }

   if (this->subOp != that->subOp ||
       this->saturate != that->saturate ||
       this->rnd != that->rnd ||
       this->ftz != that->ftz ||
       this->dnz != that->dnz ||
       this->cache != that->cache ||
       this->mask != that->mask)
      return false;

   return true;
}

bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
         return false;
   }
   if (that->defExists(d))
      return false;

   for (s = 0; this->srcExists(s); ++s) {
      if (!that->srcExists(s))
         return false;
      if (this->src(s).mod != that->src(s).mod)
         return false;
      if (!this->getSrc(s)->equals(that->getSrc(s), true))
         return false;
   }
   if (that->srcExists(s))
      return false;

   if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
      switch (src(0).getFile()) {
      case FILE_MEMORY_CONST:
      case FILE_SHADER_INPUT:
         return true;
      case FILE_SHADER_OUTPUT:
         return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
      default:
         return false;
      }
   }

   return true;
}

// pull through common expressions from different in-blocks
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // TODO: maybe do this with OP_UNION, too

   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
      if (ik->defCount(0xff) > 1)
         continue; // too painful to check if we can really push this forward
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn() ||
             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      if (!phi->srcExists(s)) {
         assert(ik->op != OP_PHI);
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}
3830
3831 bool
tryReplace(Instruction ** ptr,Instruction * i)3832 LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
3833 {
3834 Instruction *old = *ptr;
3835
3836 // TODO: maybe relax this later (causes trouble with OP_UNION)
3837 if (i->isPredicated())
3838 return false;
3839
3840 if (!old->isResultEqual(i))
3841 return false;
3842
3843 for (int d = 0; old->defExists(d); ++d)
3844 old->def(d).replace(i->getDef(d), false);
3845 delete_Instruction(prog, old);
3846 *ptr = NULL;
3847 return true;
3848 }
3849
3850 bool
visit(BasicBlock * bb)3851 LocalCSE::visit(BasicBlock *bb)
3852 {
3853 unsigned int replaced;
3854
3855 do {
3856 Instruction *ir, *next;
3857
3858 replaced = 0;
3859
3860 // will need to know the order of instructions
3861 int serial = 0;
3862 for (ir = bb->getFirst(); ir; ir = ir->next)
3863 ir->serial = serial++;
3864
3865 for (ir = bb->getFirst(); ir; ir = next) {
3866 int s;
3867 Value *src = NULL;
3868
3869 next = ir->next;
3870
3871 if (ir->fixed) {
3872 ops[ir->op].insert(ir);
3873 continue;
3874 }
3875
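         // Prefer the LValue source with the fewest uses: scanning its use
         // list is the cheapest way to find an earlier instruction in this
         // block that may compute the same result.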
         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}

// =============================================================================

// Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   DeadCodeElim() : deadCount(0) {}
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   unsigned int deadCount;
};

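// Run elimination to a fixed point: removing one dead instruction may drop
// the last use of another def, making its producer dead in the next round.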
bool
DeadCodeElim::buryAll(Program *prog)
{
   do {
      deadCount = 0;
      if (!this->run(prog, false, false))
         return false;
   } while (deadCount);

   return true;
}

bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *prev;

   for (Instruction *i = bb->getExit(); i; i = prev) {
      prev = i->prev;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) &&
          i->subOp == 0 &&
          (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         checkSplitLoad(i);
      } else
      if (i->defExists(0) && !i->getDef(0)->refCount()) {
         if (i->op == OP_ATOM ||
             i->op == OP_SUREDP ||
             i->op == OP_SUREDB) {
            const Target *targ = prog->getTarget();
            if (targ->getChipset() >= NVISA_GF100_CHIPSET ||
                i->subOp != NV50_IR_SUBOP_ATOM_CAS)
               i->setDef(0, NULL);
            if (i->op == OP_ATOM && i->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
               i->cache = CACHE_CV;
               i->op = OP_STORE;
               i->subOp = 0;
            }
         } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
            i->setDef(0, i->getDef(1));
            i->setDef(1, NULL);
         }
      }
   }
   return true;
}

// Each load can go into up to 4 destinations, any of which might potentially
// be dead (i.e. a hole). These can always be split into 2 loads, independent
// of where the holes are. We find the first contiguous region, put it into
// the first load, and then put the second contiguous region into the second
// load. There can be at most 2 contiguous regions.
//
// Note that there are some restrictions, for example it's not possible to do
// a 64-bit load that's not 64-bit aligned, so such a load has to be split
// up. Also, the hardware doesn't support 96-bit loads, so those too have to
// be split into a 64-bit and a 32-bit load.
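//
// For example (illustrative, assuming a 64-bit aligned base address): a
// 4 x 32-bit load where only components 0, 2 and 3 are live becomes a 32-bit
// load at +0x0 plus a 64-bit load at +0x8; the hole at component 1 keeps the
// second load 64-bit aligned.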
void
DeadCodeElim::checkSplitLoad(Instruction *ld1)
{
   Instruction *ld2 = NULL; // can get at most 2 loads
   Value *def1[4];
   Value *def2[4];
   int32_t addr1, addr2;
   int32_t size1, size2;
   int d, n1, n2;
   uint32_t mask = 0xffffffff;

   for (d = 0; ld1->defExists(d); ++d)
      if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
         mask &= ~(1 << d);
   if (mask == 0xffffffff)
      return;

   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;

   // Compute address/width for first load
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
            break;
         def1[n1] = ld1->getDef(d);
         size1 += def1[n1++]->reg.size;
      } else
      if (!n1) {
         addr1 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }

   // Scale back the size of the first load until it can be loaded. This
   // typically happens for TYPE_B96 loads.
   while (n1 &&
          !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
                                                typeOfSize(size1))) {
      size1 -= def1[--n1]->reg.size;
      d--;
   }

   // Compute address/width for second load
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         assert(!size2 || !(addr2 & 0x7));
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
      } else if (!n2) {
         addr2 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }

   // Make sure that we've processed all the values
   for (; ld1->defExists(d); ++d)
      assert(!(mask & (1 << d)));

   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
      ld1->setDef(d, (d < n1) ? def1[d] : NULL);

   if (!n2)
      return;

   ld2 = cloneShallow(func, ld1);
   updateLdStOffset(ld2, addr2, func);
   ld2->setType(typeOfSize(size2));
   for (d = 0; d < 4; ++d)
      ld2->setDef(d, (d < n2) ? def2[d] : NULL);

   ld1->bb->insertAfter(ld1, ld2);
}

// =============================================================================

#define RUN_PASS(l, n, f)                    \
   if (level >= (l)) {                       \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)  \
         INFO("PEEPHOLE: %s\n", #n);         \
      n pass;                                \
      if (!pass.f(this))                     \
         return false;                       \
   }

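// Each pass is gated on the requested optimization level; the level-0 passes
// (Split64BitOpPreRA and the final DeadCodeElim) thus run unconditionally.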
bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(1, MergeSplits, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> fewer checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(0, Split64BitOpPreRA, run);
   RUN_PASS(2, LateAlgebraicOpt, run);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(1, IndirectPropagation, run);
   RUN_PASS(4, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);

   return true;
}

bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   RUN_PASS(2, PostRaLoadPropagation, run);

   return true;
}

} // namespace nv50_ir