xref: /aosp_15_r20/external/mesa3d/src/nouveau/codegen/nv50_ir_peephole.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2011 Christoph Bumiller
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "nv50_ir.h"
24 #include "nv50_ir_target.h"
25 #include "nv50_ir_build_util.h"
26 
27 extern "C" {
28 #include "util/u_math.h"
29 }
30 
31 namespace nv50_ir {
32 
// Returns true if this instruction has no observable effect and may be
// dropped: register-allocation pseudo-ops (PHI/SPLIT/MERGE), removable NOPs,
// instructions whose entire result is unreferenced, and identity MOV/UNION.
33 bool
isNop() const34 Instruction::isNop() const
35 {
36    if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE)
37       return true;
38    if (terminator || join) // XXX: should terminator imply flow ?
39       return false;
   // Atomics have memory side effects even if the result is unused.
40    if (op == OP_ATOM)
41       return false;
42    if (!fixed && op == OP_NOP)
43       return true;
44 
   // reg.data.id < 0 means the def was never assigned a register, i.e. the
   // result is unused; for vector results all defs should agree on that.
45    if (defExists(0) && def(0).rep()->reg.data.id < 0) {
46       for (int d = 1; defExists(d); ++d)
47          if (def(d).rep()->reg.data.id >= 0)
48             WARN("part of vector result is unused !\n");
49       return true;
50    }
51 
   // A MOV (or UNION) whose destination equals its source(s) does nothing.
52    if (op == OP_MOV || op == OP_UNION) {
53       if (!getDef(0)->equals(getSrc(0)))
54          return false;
55       if (op == OP_UNION)
56          if (!getDef(0)->equals(getSrc(1)))
57             return false;
58       return true;
59    }
60 
61    return false;
62 }
63 
isDead() const64 bool Instruction::isDead() const
65 {
66    if (op == OP_STORE ||
67        op == OP_EXPORT ||
68        op == OP_ATOM ||
69        op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB)
70       return false;
71 
72    for (int d = 0; defExists(d); ++d)
73       if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
74          return false;
75 
76    if (terminator || asFlow())
77       return false;
78    if (fixed)
79       return false;
80 
81    return true;
82 };
83 
84 // =============================================================================
85 
// Pass that forwards the sources of plain MOVs to the MOVs' users, then
// deletes the MOVs (see the comment above visit() for the restrictions).
86 class CopyPropagation : public Pass
87 {
88 private:
89    virtual bool visit(BasicBlock *);
90 };
91 
92 // Propagate all MOVs forward to make subsequent optimization easier, except if
93 // the sources stem from a phi, in which case we don't want to mess up potential
94 // swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
95 bool
visit(BasicBlock * bb)96 CopyPropagation::visit(BasicBlock *bb)
97 {
98    Instruction *mov, *si, *next;
99 
100    for (mov = bb->getEntry(); mov; mov = next) {
101       next = mov->next;
102       if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
103          continue;
104       if (mov->getPredicate())
105          continue;
106       if (mov->def(0).getFile() != mov->src(0).getFile())
107          continue;
108       si = mov->getSrc(0)->getInsn();
109       if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
110          // propagate
111          mov->def(0).replace(mov->getSrc(0), false);
112          delete_Instruction(prog, mov);
113       }
114    }
115    return true;
116 }
117 
118 // =============================================================================
119 
// Pass that cancels out back-to-back SPLIT/MERGE pairs operating on the
// same values (see the comment above visit()).
120 class MergeSplits : public Pass
121 {
122 private:
123    virtual bool visit(BasicBlock *);
124 };
125 
126 // For SPLIT / MERGE pairs that operate on the same registers, replace the
127 // post-merge def with the SPLIT's source.
128 bool
visit(BasicBlock * bb)129 MergeSplits::visit(BasicBlock *bb)
130 {
131    Instruction *i, *next, *si;
132 
133    for (i = bb->getEntry(); i; i = next) {
134       next = i->next;
135       if (i->op != OP_MERGE || typeSizeof(i->dType) != 8)
136          continue;
137       si = i->getSrc(0)->getInsn();
138       if (si->op != OP_SPLIT || si != i->getSrc(1)->getInsn())
139          continue;
140       if (i->getSrc(0) != si->getDef(0) || i->getSrc(1) != si->getDef(1))
141          continue;
142       i->def(0).replace(si->getSrc(0), false);
143       delete_Instruction(prog, i);
144    }
145 
146    return true;
147 }
148 
149 // =============================================================================
150 
// Pass that folds loads / immediates directly into the source slots of
// instructions that can accept them, swapping commutative sources if that
// enables the fold.
151 class LoadPropagation : public Pass
152 {
153 private:
154    virtual bool visit(BasicBlock *);
155 
   // Swap src0/src1 when that lets src1 take a constant-buffer/immediate load.
156    void checkSwapSrc01(Instruction *);
157 
158    bool isCSpaceLoad(Instruction *);
159    bool isImmdLoad(Instruction *);
160    bool isAttribOrSharedLoad(Instruction *);
161 };
162 
163 bool
isCSpaceLoad(Instruction * ld)164 LoadPropagation::isCSpaceLoad(Instruction *ld)
165 {
166    return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
167 }
168 
169 bool
isImmdLoad(Instruction * ld)170 LoadPropagation::isImmdLoad(Instruction *ld)
171 {
172    if (!ld || (ld->op != OP_MOV) ||
173        ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
174       return false;
175 
176    // A 0 can be replaced with a register, so it doesn't count as an immediate.
177    ImmediateValue val;
178    return ld->src(0).getImmediate(val) && !val.isInteger(0);
179 }
180 
181 bool
isAttribOrSharedLoad(Instruction * ld)182 LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
183 {
184    return ld &&
185       (ld->op == OP_VFETCH ||
186        (ld->op == OP_LOAD &&
187         (ld->src(0).getFile() == FILE_SHADER_INPUT ||
188          ld->src(0).getFile() == FILE_MEMORY_SHARED)));
189 }
190 
// Swap src0 and src1 when that allows src1 (the slot that can often encode a
// constant/immediate) to take a foldable load, compensating the semantics of
// non-commutative ops (SET/SLCT/SUB/XMAD) afterwards.
191 void
checkSwapSrc01(Instruction * insn)192 LoadPropagation::checkSwapSrc01(Instruction *insn)
193 {
194    const Target *targ = prog->getTarget();
   // Non-commutative ops are only handled if we know how to compensate below.
195    if (!targ->getOpInfo(insn).commutative) {
196       if (insn->op != OP_SET && insn->op != OP_SLCT &&
197           insn->op != OP_SUB && insn->op != OP_XMAD)
198          return;
199       // XMAD is only commutative if both the CBCC and MRG flags are not set.
200       if (insn->op == OP_XMAD &&
201           (insn->subOp & NV50_IR_SUBOP_XMAD_CMODE_MASK) == NV50_IR_SUBOP_XMAD_CBCC)
202          return;
203       if (insn->op == OP_XMAD && (insn->subOp & NV50_IR_SUBOP_XMAD_MRG))
204          return;
205    }
   // Only swap if src1 is currently a plain GPR (i.e. the slot is free).
206    if (insn->src(1).getFile() != FILE_GPR)
207       return;
208    // This is the special OP_SET used for alphatesting, we can't reverse its
209    // arguments as that will confuse the fixup code.
210    if (insn->op == OP_SET && insn->subOp)
211       return;
212 
213    Instruction *i0 = insn->getSrc(0)->getInsn();
214    Instruction *i1 = insn->getSrc(1)->getInsn();
215 
216    // Swap sources to inline the less frequently used source. That way,
217    // optimistically, it will eventually be able to remove the instruction.
218    int i0refs = insn->getSrc(0)->refCount();
219    int i1refs = insn->getSrc(1)->refCount();
220 
221    if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
222       if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
223           !targ->insnCanLoad(insn, 1, i1) ||
224           i0refs < i1refs)
225          insn->swapSources(0, 1);
226       else
227          return;
228    } else
229    if (isAttribOrSharedLoad(i1)) {
230       if (!isAttribOrSharedLoad(i0))
231          insn->swapSources(0, 1);
232       else
233          return;
234    } else {
235       return;
236    }
237 
   // A swap happened: compensate the operation's semantics.
   // SET variants: reverse the comparison (a<b  ->  b>a).
238    if (insn->op == OP_SET || insn->op == OP_SET_AND ||
239        insn->op == OP_SET_OR || insn->op == OP_SET_XOR)
240       insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
241    else
   // SLCT: selecting the other operand means inverting the condition.
242    if (insn->op == OP_SLCT)
243       insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
244    else
   // SUB: a-b == (-a)+(+b) with negation modifiers toggled on both sources.
245    if (insn->op == OP_SUB) {
246       insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
247       insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
248    } else
249    if (insn->op == OP_XMAD) {
250       // swap h1 flags
251       uint16_t h1 = (insn->subOp >> 1 & NV50_IR_SUBOP_XMAD_H1(0)) |
252                     (insn->subOp << 1 & NV50_IR_SUBOP_XMAD_H1(1));
253       insn->subOp = (insn->subOp & ~NV50_IR_SUBOP_XMAD_H1_MASK) | h1;
254    }
255 }
256 
// For every source of every instruction, try to fold the defining LOAD/MOV
// directly into the source slot when the target can encode it, deleting the
// load if it becomes unused.
257 bool
visit(BasicBlock * bb)258 LoadPropagation::visit(BasicBlock *bb)
259 {
260    const Target *targ = prog->getTarget();
261    Instruction *next;
262 
263    for (Instruction *i = bb->getEntry(); i; i = next) {
264       next = i->next;
265 
266       if (i->op == OP_CALL) // calls have args as sources, they must be in regs
267          continue;
268 
269       if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
270          continue;
271 
      // Try to move foldable loads into the src1 slot first.
272       if (i->srcExists(1))
273          checkSwapSrc01(i);
274 
275       for (int s = 0; i->srcExists(s); ++s) {
276          Instruction *ld = i->getSrc(s)->getInsn();
277 
278          if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
279             continue;
         // Locked loads carry ordering semantics and must stay separate.
280          if (ld->op == OP_LOAD && ld->subOp == NV50_IR_SUBOP_LOAD_LOCKED)
281             continue;
282          if (!targ->insnCanLoad(i, s, ld))
283             continue;
284 
285          // propagate !
286          i->setSrc(s, ld->getSrc(0));
287          if (ld->src(0).isIndirect(0))
288             i->setIndirect(s, 0, ld->getIndirect(0, 0));
289 
         // Remove the load once nothing references its result any more.
290          if (ld->getDef(0)->refCount() == 0)
291             delete_Instruction(prog, ld);
292       }
293    }
294    return true;
295 }
296 
297 // =============================================================================
298 
// Pass that folds constant offsets computed by ADD/SUB/MOV/SHLADD on an
// indirect address into the instruction's immediate offset field.
299 class IndirectPropagation : public Pass
300 {
301 private:
302    virtual bool visit(BasicBlock *);
303 
   // Used to materialize helper instructions (e.g. the SHL for SHLADD).
304    BuildUtil bld;
305 };
306 
// For each indirect source, look at the instruction computing the address;
// if it adds/subtracts a constant (or is a plain immediate MOV, or a SHLADD
// with constant addend), fold that constant into the source's offset and
// re-point the indirect at the remaining variable part.
307 bool
visit(BasicBlock * bb)308 IndirectPropagation::visit(BasicBlock *bb)
309 {
310    const Target *targ = prog->getTarget();
311    Instruction *next;
312 
313    for (Instruction *i = bb->getEntry(); i; i = next) {
314       next = i->next;
315 
316       bld.setPosition(i, false);
317 
318       for (int s = 0; i->srcExists(s); ++s) {
319          Instruction *insn;
320          ImmediateValue imm;
321          if (!i->src(s).isIndirect(0))
322             continue;
323          insn = i->getIndirect(s, 0)->getInsn();
324          if (!insn)
325             continue;
         // addr + const: keep addr as indirect, add const to the offset.
326          if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
327             if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
328                 !insn->src(1).getImmediate(imm) ||
329                 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
330                continue;
331             i->setIndirect(s, 0, insn->getSrc(0));
            // Clone the symbol so other users of it keep the old offset.
332             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
333             i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         // addr - const: same, with the offset subtracted.
334          } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
335             if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
336                 !insn->src(1).getImmediate(imm) ||
337                 !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
338                continue;
339             i->setIndirect(s, 0, insn->getSrc(0));
340             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
341             i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
         // Pure immediate address: drop the indirect entirely.
342          } else if (insn->op == OP_MOV) {
343             if (!insn->src(0).getImmediate(imm) ||
344                 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
345                continue;
346             i->setIndirect(s, 0, NULL);
347             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
348             i->src(s).get()->reg.data.offset += imm.reg.data.u32;
         // (a << b) + const: emit the SHL, fold the constant addend.
349          } else if (insn->op == OP_SHLADD) {
350             if (!insn->src(2).getImmediate(imm) ||
351                 !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
352                continue;
353             i->setIndirect(s, 0, bld.mkOp2v(
354                OP_SHL, TYPE_U32, bld.getSSA(), insn->getSrc(0), insn->getSrc(1)));
355             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
356             i->src(s).get()->reg.data.offset += imm.reg.data.u32;
357          }
358       }
359    }
360    return true;
361 }
362 
363 // =============================================================================
364 
365 // Evaluate constant expressions.
366 class ConstantFolding : public Pass
367 {
368 public:
ConstantFolding()369    ConstantFolding() : foldCount(0) {}
   // Run the folding pass repeatedly until it converges (bounded).
370    bool foldAll(Program *);
371 
372 private:
373    virtual bool visit(BasicBlock *);
374 
   // Fold an instruction whose first 2 / 3 sources are all immediates.
375    void expr(Instruction *, ImmediateValue&, ImmediateValue&);
376    void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
377    /* true if i was deleted */
378    bool opnd(Instruction *i, ImmediateValue&, int s);
   // Simplify based on an immediate in the third source slot only.
379    void opnd3(Instruction *, ImmediateValue&);
380 
   // Fold single-source float ops (NEG/ABS/RCP/...) on an immediate.
381    void unary(Instruction *, const ImmediateValue&);
382 
383    void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
384 
385    CmpInstruction *findOriginForTestWithZero(Value *);
386 
387    bool createMul(DataType ty, Value *def, Value *a, int64_t b, Value *c);
388 
   // Number of folds performed in the current run; drives foldAll()'s loop.
389    unsigned int foldCount;
390 
391    BuildUtil bld;
392 };
393 
394 // TODO: remember generated immediates and only revisit these
395 bool
foldAll(Program * prog)396 ConstantFolding::foldAll(Program *prog)
397 {
398    unsigned int iterCount = 0;
399    do {
400       foldCount = 0;
401       if (!run(prog))
402          return false;
403    } while (foldCount && ++iterCount < 2);
404    return true;
405 }
406 
// Dispatch each instruction to the appropriate folding helper depending on
// which of its sources are immediates: all-immediate -> expr(), a single
// immediate in src0/src1 -> opnd(), and finally an immediate src2 -> opnd3().
407 bool
visit(BasicBlock * bb)408 ConstantFolding::visit(BasicBlock *bb)
409 {
410    Instruction *i, *next;
411 
412    for (i = bb->getEntry(); i; i = next) {
413       next = i->next;
      // MOVs of immediates are already folded; calls can't be folded.
414       if (i->op == OP_MOV || i->op == OP_CALL)
415          continue;
416 
417       ImmediateValue src0, src1, src2;
418 
419       if (i->srcExists(2) &&
420           i->src(0).getImmediate(src0) &&
421           i->src(1).getImmediate(src1) &&
422           i->src(2).getImmediate(src2)) {
423          expr(i, src0, src1, src2);
424       } else
425       if (i->srcExists(1) &&
426           i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1)) {
427          expr(i, src0, src1);
428       } else
429       if (i->srcExists(0) && i->src(0).getImmediate(src0)) {
         // opnd() returns true if it deleted i; skip the opnd3 step then.
430          if (opnd(i, src0, 0))
431             continue;
432       } else
433       if (i->srcExists(1) && i->src(1).getImmediate(src1)) {
434          if (opnd(i, src1, 1))
435             continue;
436       }
437       if (i->srcExists(2) && i->src(2).getImmediate(src2))
438          opnd3(i, src2);
439    }
440    return true;
441 }
442 
// Walk back through MOVs and AND-with-1.0 wrappers to find the comparison
// instruction whose boolean result is being tested against zero; returns
// NULL if the chain does not end in a usable comparison.
443 CmpInstruction *
findOriginForTestWithZero(Value * value)444 ConstantFolding::findOriginForTestWithZero(Value *value)
445 {
446    if (!value)
447       return NULL;
448    Instruction *insn = value->getInsn();
449    if (!insn)
450       return NULL;
451 
   // SLCT is a CmpInstruction but does not produce a plain boolean.
452    if (insn->asCmp() && insn->op != OP_SLCT)
453       return insn->asCmp();
454 
455    /* Sometimes mov's will sneak in as a result of other folding. This gets
456     * cleaned up later.
457     */
458    if (insn->op == OP_MOV)
459       return findOriginForTestWithZero(insn->getSrc(0));
460 
461    /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
462    if (insn->op == OP_AND) {
      // Find which of the two sources is the immediate operand.
463       int s = 0;
464       ImmediateValue imm;
465       if (!insn->src(s).getImmediate(imm)) {
466          s = 1;
467          if (!insn->src(s).getImmediate(imm))
468             return NULL;
469       }
470       if (imm.reg.data.f32 != 1.0f)
471          return NULL;
472       /* TODO: Come up with a way to handle the condition being inverted */
473       if (insn->src(!s).mod != Modifier(0))
474          return NULL;
      // Recurse through the non-immediate operand.
475       return findOriginForTestWithZero(insn->getSrc(!s));
476    }
477 
478    return NULL;
479 }
480 
// Apply this modifier's ABS/NEG/SAT/NOT bits to an immediate in place,
// interpreting the data according to imm.reg.type.
481 void
applyTo(ImmediateValue & imm) const482 Modifier::applyTo(ImmediateValue& imm) const
483 {
484    if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
485       return;
486    switch (imm.reg.type) {
487    case TYPE_F32:
488       if (bits & NV50_IR_MOD_ABS)
489          imm.reg.data.f32 = fabsf(imm.reg.data.f32);
490       if (bits & NV50_IR_MOD_NEG)
491          imm.reg.data.f32 = -imm.reg.data.f32;
      // SAT clamps to [0, 1].
492       if (bits & NV50_IR_MOD_SAT) {
493          if (imm.reg.data.f32 < 0.0f)
494             imm.reg.data.f32 = 0.0f;
495          else
496          if (imm.reg.data.f32 > 1.0f)
497             imm.reg.data.f32 = 1.0f;
498       }
      // Bitwise NOT makes no sense on floats.
499       assert(!(bits & NV50_IR_MOD_NOT));
500       break;
501 
502    case TYPE_S8: // NOTE: will be extended
503    case TYPE_S16:
504    case TYPE_S32:
505    case TYPE_U8: // NOTE: treated as signed
506    case TYPE_U16:
507    case TYPE_U32:
508       if (bits & NV50_IR_MOD_ABS)
509          imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
510             imm.reg.data.s32 : -imm.reg.data.s32;
511       if (bits & NV50_IR_MOD_NEG)
512          imm.reg.data.s32 = -imm.reg.data.s32;
513       if (bits & NV50_IR_MOD_NOT)
514          imm.reg.data.s32 = ~imm.reg.data.s32;
515       break;
516 
517    case TYPE_F64:
518       if (bits & NV50_IR_MOD_ABS)
519          imm.reg.data.f64 = fabs(imm.reg.data.f64);
520       if (bits & NV50_IR_MOD_NEG)
521          imm.reg.data.f64 = -imm.reg.data.f64;
522       if (bits & NV50_IR_MOD_SAT) {
523          if (imm.reg.data.f64 < 0.0)
524             imm.reg.data.f64 = 0.0;
525          else
526          if (imm.reg.data.f64 > 1.0)
527             imm.reg.data.f64 = 1.0;
528       }
529       assert(!(bits & NV50_IR_MOD_NOT));
530       break;
531 
532    default:
533       assert(!"invalid/unhandled type");
534       imm.reg.data.u64 = 0;
535       break;
536    }
537 }
538 
539 operation
getOp() const540 Modifier::getOp() const
541 {
542    switch (bits) {
543    case NV50_IR_MOD_ABS: return OP_ABS;
544    case NV50_IR_MOD_NEG: return OP_NEG;
545    case NV50_IR_MOD_SAT: return OP_SAT;
546    case NV50_IR_MOD_NOT: return OP_NOT;
547    case 0:
548       return OP_MOV;
549    default:
550       return OP_CVT;
551    }
552 }
553 
// Fold an instruction whose first two sources are both immediates: compute
// the result at compile time, turn the instruction into a MOV (or SAT) of
// the result, and rewrite MAD/FMA into an ADD of the remaining source.
554 void
expr(Instruction * i,ImmediateValue & imm0,ImmediateValue & imm1)555 ConstantFolding::expr(Instruction *i,
556                       ImmediateValue &imm0, ImmediateValue &imm1)
557 {
558    struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
559    struct Storage res;
560    DataType type = i->dType;
561 
562    memset(&res.data, 0, sizeof(res.data));
563 
564    switch (i->op) {
   // Sign-extend the low b bits of a.
565    case OP_SGXT: {
566       int bits = b->data.u32;
567       if (bits) {
568          uint32_t data = a->data.u32 & (0xffffffff >> (32 - bits));
569          if (bits < 32 && (data & (1 << (bits - 1))))
570             data = data - (1 << bits);
571          res.data.u32 = data;
572       }
573       break;
574    }
   // Bit mask of b bits starting at bit a.
575    case OP_BMSK:
576       res.data.u32 = ((1 << b->data.u32) - 1) << a->data.u32;
577       break;
578    case OP_MAD:
579    case OP_FMA:
580    case OP_MUL:
      // Denormals-are-zero: flush non-finite float inputs before multiplying.
581       if (i->dnz && i->dType == TYPE_F32) {
582          if (!isfinite(a->data.f32))
583             a->data.f32 = 0.0f;
584          if (!isfinite(b->data.f32))
585             b->data.f32 = 0.0f;
586       }
587       switch (i->dType) {
588       case TYPE_F32:
589          res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor);
590          break;
591       case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
592       case TYPE_S32:
593          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
594             res.data.s32 = ((int64_t)a->data.s32 * b->data.s32) >> 32;
595             break;
596          }
597          FALLTHROUGH;
598       case TYPE_U32:
599          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
600             res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32) >> 32;
601             break;
602          }
603          res.data.u32 = a->data.u32 * b->data.u32; break;
604       default:
605          return;
606       }
607       break;
608    case OP_DIV:
      // Do not fold division by zero; leave it to run-time behavior.
609       if (b->data.u32 == 0)
610          break;
611       switch (i->dType) {
612       case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
613       case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
614       case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
615       case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
616       default:
617          return;
618       }
619       break;
620    case OP_ADD:
621       switch (i->dType) {
622       case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
623       case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
624       case TYPE_S32:
625       case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
626       default:
627          return;
628       }
629       break;
630    case OP_SUB:
631       switch (i->dType) {
632       case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
633       case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
634       case TYPE_S32:
635       case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
636       default:
637          return;
638       }
639       break;
640    case OP_MAX:
641       switch (i->dType) {
642       case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
643       case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
644       case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
645       case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
646       default:
647          return;
648       }
649       break;
650    case OP_MIN:
651       switch (i->dType) {
652       case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
653       case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
654       case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
655       case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
656       default:
657          return;
658       }
659       break;
660    case OP_AND:
661       res.data.u64 = a->data.u64 & b->data.u64;
662       break;
663    case OP_OR:
664       res.data.u64 = a->data.u64 | b->data.u64;
665       break;
666    case OP_XOR:
667       res.data.u64 = a->data.u64 ^ b->data.u64;
668       break;
669    case OP_SHL:
670       res.data.u32 = a->data.u32 << b->data.u32;
671       break;
672    case OP_SHR:
      // Signed type selects arithmetic shift, unsigned selects logical.
673       switch (i->dType) {
674       case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
675       case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
676       default:
677          return;
678       }
679       break;
680    case OP_SLCT:
      // Only foldable when both selectable values are identical.
681       if (a->data.u32 != b->data.u32)
682          return;
683       res.data.u32 = a->data.u32;
684       break;
   // Extract (optionally bit-reversed) bitfield described by b (offset|width<<8).
685    case OP_EXTBF: {
686       int offset = b->data.u32 & 0xff;
687       int width = (b->data.u32 >> 8) & 0xff;
688       int rshift = offset;
689       int lshift = 0;
690       if (width == 0) {
691          res.data.u32 = 0;
692          break;
693       }
694       if (width + offset < 32) {
695          rshift = 32 - width;
696          lshift = 32 - width - offset;
697       }
698       if (i->subOp == NV50_IR_SUBOP_EXTBF_REV)
699          res.data.u32 = util_bitreverse(a->data.u32);
700       else
701          res.data.u32 = a->data.u32;
702       switch (i->dType) {
703       case TYPE_S32: res.data.s32 = (res.data.s32 << lshift) >> rshift; break;
704       case TYPE_U32: res.data.u32 = (res.data.u32 << lshift) >> rshift; break;
705       default:
706          return;
707       }
708       break;
709    }
710    case OP_POPCNT:
711       res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
712       break;
713    case OP_PFETCH:
714       // The two arguments to pfetch are logically added together. Normally
715       // the second argument will not be constant, but that can happen.
716       res.data.u32 = a->data.u32 + b->data.u32;
717       type = TYPE_U32;
718       break;
719    case OP_MERGE:
720       switch (i->dType) {
721       case TYPE_U64:
722       case TYPE_S64:
723       case TYPE_F64:
724          res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
725          break;
726       default:
727          return;
728       }
729       break;
730    default:
731       return;
732    }
733    ++foldCount;
734 
   // Replace the two immediate sources with a single folded immediate.
735    i->src(0).mod = Modifier(0);
736    i->src(1).mod = Modifier(0);
737    i->postFactor = 0;
738 
739    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
740    i->setSrc(1, NULL);
741 
742    i->getSrc(0)->reg.data = res.data;
743    i->getSrc(0)->reg.type = type;
744    i->getSrc(0)->reg.size = typeSizeof(type);
745 
746    switch (i->op) {
747    case OP_MAD:
748    case OP_FMA: {
749       ImmediateValue src0, src1;
750       src1 = *i->getSrc(0)->asImm();
751 
752       // Move the immediate into position 1, where we know it might be
753       // emittable. However it might not be anyways, as there may be other
754       // restrictions, so move it into a separate LValue.
755       bld.setPosition(i, false);
756       i->op = OP_ADD;
757       i->dnz = 0;
758       i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
759       i->setSrc(0, i->getSrc(2));
760       i->src(0).mod = i->src(2).mod;
761       i->setSrc(2, NULL);
762 
      // Re-fold the resulting ADD if its other source is constant too.
763       if (i->src(0).getImmediate(src0))
764          expr(i, src0, src1);
765       else
766          opnd(i, src1, 1);
767       break;
768    }
769    case OP_PFETCH:
770       // Leave PFETCH alone... we just folded its 2 args into 1.
771       break;
772    default:
773       i->op = i->saturate ? OP_SAT : OP_MOV;
774       if (i->saturate)
775          unary(i, *i->getSrc(0)->asImm());
776       break;
777    }
778    i->subOp = 0;
779 }
780 
// Fold an instruction whose first three sources are all immediates into a
// MOV of the computed result (LOP3, PERMT, INSBF, MAD/FMA, SHLADD).
781 void
expr(Instruction * i,ImmediateValue & imm0,ImmediateValue & imm1,ImmediateValue & imm2)782 ConstantFolding::expr(Instruction *i,
783                       ImmediateValue &imm0,
784                       ImmediateValue &imm1,
785                       ImmediateValue &imm2)
786 {
787    struct Storage *const a = &imm0.reg, *const b = &imm1.reg, *const c = &imm2.reg;
788    struct Storage res;
789 
790    memset(&res.data, 0, sizeof(res.data));
791 
792    switch (i->op) {
   // Evaluate the 3-input LUT (subOp is the 8-entry truth table) per bit.
793    case OP_LOP3_LUT:
794       for (int n = 0; n < 32; n++) {
795          uint8_t lut = ((a->data.u32 >> n) & 1) << 2 |
796                        ((b->data.u32 >> n) & 1) << 1 |
797                        ((c->data.u32 >> n) & 1);
798          res.data.u32 |= !!(i->subOp & (1 << lut)) << n;
799       }
800       break;
   // Byte-permute {c,a} by the nibble selectors in b (plain mode only).
801    case OP_PERMT:
802       if (!i->subOp) {
803          uint64_t input = (uint64_t)c->data.u32 << 32 | a->data.u32;
804          uint16_t permt = b->data.u32;
805          for (int n = 0 ; n < 4; n++, permt >>= 4)
806             res.data.u32 |= ((input >> ((permt & 0xf) * 8)) & 0xff) << n * 8;
807       } else
808          return;
809       break;
   // Insert the low bits of a into c at the bitfield described by b.
810    case OP_INSBF: {
811       int offset = b->data.u32 & 0xff;
812       int width = (b->data.u32 >> 8) & 0xff;
813       unsigned bitmask = ((1 << width) - 1) << offset;
814       res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
815       break;
816    }
817    case OP_MAD:
818    case OP_FMA: {
819       switch (i->dType) {
820       case TYPE_F32:
821          res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
822             c->data.f32;
823          break;
824       case TYPE_F64:
825          res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
826          break;
827       case TYPE_S32:
828          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
829             res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
830             break;
831          }
832          FALLTHROUGH;
833       case TYPE_U32:
834          if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
835             res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
836             break;
837          }
838          res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
839          break;
840       default:
841          return;
842       }
843       break;
844    }
845    case OP_SHLADD:
846       res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
847       break;
848    default:
849       return;
850    }
851 
   // Collapse the instruction into a MOV of the folded immediate.
852    ++foldCount;
853    i->src(0).mod = Modifier(0);
854    i->src(1).mod = Modifier(0);
855    i->src(2).mod = Modifier(0);
856 
857    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
858    i->setSrc(1, NULL);
859    i->setSrc(2, NULL);
860 
861    i->getSrc(0)->reg.data = res.data;
862    i->getSrc(0)->reg.type = i->dType;
863    i->getSrc(0)->reg.size = typeSizeof(i->dType);
864 
865    i->op = OP_MOV;
866 }
867 
868 void
unary(Instruction * i,const ImmediateValue & imm)869 ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
870 {
871    Storage res;
872 
873    if (i->dType != TYPE_F32)
874       return;
875    switch (i->op) {
876    case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
877    case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
878    case OP_SAT: res.data.f32 = SATURATE(imm.reg.data.f32); break;
879    case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
880    case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
881    case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
882    case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
883    case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
884    case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
885    case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
886    case OP_PRESIN:
887    case OP_PREEX2:
888       // these should be handled in subsequent OP_SIN/COS/EX2
889       res.data.f32 = imm.reg.data.f32;
890       break;
891    default:
892       return;
893    }
894    i->op = OP_MOV;
895    i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
896    i->src(0).mod = Modifier(0);
897 }
898 
// Given mul2 = MUL(x, imm2) with the immediate in slot s, try to merge the
// multiplication into an adjacent MUL: either fold imm2 into a feeding MUL's
// immediate / post-factor, or push the factor into the single consuming MUL.
899 void
tryCollapseChainedMULs(Instruction * mul2,const int s,ImmediateValue & imm2)900 ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
901                                         const int s, ImmediateValue& imm2)
902 {
903    const int t = s ? 0 : 1;
904    Instruction *insn;
905    Instruction *mul1 = NULL; // mul1 before mul2
906    int e = 0;
   // Effective constant factor including mul2's existing post-factor.
907    float f = imm2.reg.data.f32 * exp2f(mul2->postFactor);
908    ImmediateValue imm1;
909 
910    assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);
911 
   // Case 1: the non-immediate source is produced by a single-use F32 MUL.
912    if (mul2->getSrc(t)->refCount() == 1) {
913       insn = mul2->getSrc(t)->getInsn();
914       if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
915          mul1 = insn;
916       if (mul1 && !mul1->saturate) {
917          int s1;
918 
919          if (mul1->src(s1 = 0).getImmediate(imm1) ||
920              mul1->src(s1 = 1).getImmediate(imm1)) {
921             bld.setPosition(mul1, false);
922             // a = mul r, imm1
923             // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
924             mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
925             mul1->src(s1).mod = Modifier(0);
926             mul2->def(0).replace(mul1->getDef(0), false);
927             mul1->saturate = mul2->saturate;
928          } else
929          if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
930             // c = mul a, b
931             // d = mul c, imm   -> d = mul_x_imm a, b
932             mul1->postFactor = e;
933             mul2->def(0).replace(mul1->getDef(0), false);
            // A negative factor becomes a NEG modifier plus |f| post-factor.
934             if (f < 0)
935                mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
936             mul1->saturate = mul2->saturate;
937          }
938          return;
939       }
940    }
   // Case 2: mul2's result has a single user that is itself an F32 MUL.
941    if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
942       // b = mul a, imm
943       // d = mul b, c   -> d = mul_x_imm a, c
944       int s2, t2;
945       insn = (*mul2->getDef(0)->uses.begin())->getInsn();
946       if (!insn)
947          return;
      // Re-use the variable names: mul1 is now the producer, mul2 the user.
948       mul1 = mul2;
949       mul2 = NULL;
950       s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
951       t2 = s2 ? 0 : 1;
952       if (insn->op == OP_MUL && insn->dType == TYPE_F32)
953          if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
954             mul2 = insn;
955       if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
956          mul2->postFactor = e;
957          mul2->setSrc(s2, mul1->src(t));
958          if (f < 0)
959             mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
960       }
961    }
962 }
963 
964 void
opnd3(Instruction * i,ImmediateValue & imm2)965 ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
966 {
967    switch (i->op) {
968    case OP_MAD:
969    case OP_FMA:
970       if (imm2.isInteger(0)) {
971          i->op = OP_MUL;
972          i->setSrc(2, NULL);
973          foldCount++;
974          return;
975       }
976       break;
977    case OP_SHLADD:
978       if (imm2.isInteger(0)) {
979          i->op = OP_SHL;
980          i->setSrc(2, NULL);
981          foldCount++;
982          return;
983       }
984       break;
985    default:
986       return;
987    }
988 }
989 
990 bool
createMul(DataType ty,Value * def,Value * a,int64_t b,Value * c)991 ConstantFolding::createMul(DataType ty, Value *def, Value *a, int64_t b, Value *c)
992 {
993    const Target *target = prog->getTarget();
994    int64_t absB = llabs(b);
995 
996    //a * (2^shl) -> a << shl
997    if (b >= 0 && util_is_power_of_two_or_zero64(b)) {
998       int shl = util_logbase2_64(b);
999 
1000       Value *res = c ? bld.getSSA(typeSizeof(ty)) : def;
1001       bld.mkOp2(OP_SHL, ty, res, a, bld.mkImm(shl));
1002       if (c)
1003          bld.mkOp2(OP_ADD, ty, def, res, c);
1004 
1005       return true;
1006    }
1007 
1008    //a * (2^shl + 1) -> a << shl + a
1009    //a * -(2^shl + 1) -> -a << shl + a
1010    //a * (2^shl - 1) -> a << shl - a
1011    //a * -(2^shl - 1) -> -a << shl - a
1012    if (typeSizeof(ty) == 4 &&
1013        (util_is_power_of_two_or_zero64(absB - 1) ||
1014         util_is_power_of_two_or_zero64(absB + 1)) &&
1015        target->isOpSupported(OP_SHLADD, TYPE_U32)) {
1016       bool subA = util_is_power_of_two_or_zero64(absB + 1);
1017       int shl = subA ? util_logbase2_64(absB + 1) : util_logbase2_64(absB - 1);
1018 
1019       Value *res = c ? bld.getSSA() : def;
1020       Instruction *insn = bld.mkOp3(OP_SHLADD, TYPE_U32, res, a, bld.mkImm(shl), a);
1021       if (b < 0)
1022          insn->src(0).mod = Modifier(NV50_IR_MOD_NEG);
1023       if (subA)
1024          insn->src(2).mod = Modifier(NV50_IR_MOD_NEG);
1025 
1026       if (c)
1027          bld.mkOp2(OP_ADD, TYPE_U32, def, res, c);
1028 
1029       return true;
1030    }
1031 
1032    if (typeSizeof(ty) == 4 && b >= 0 && b <= 0xffff &&
1033        target->isOpSupported(OP_XMAD, TYPE_U32)) {
1034       Value *tmp = bld.mkOp3v(OP_XMAD, TYPE_U32, bld.getSSA(),
1035                               a, bld.mkImm((uint32_t)b), c ? c : bld.mkImm(0));
1036       bld.mkOp3(OP_XMAD, TYPE_U32, def, a, bld.mkImm((uint32_t)b), tmp)->subOp =
1037          NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_H1(0);
1038 
1039       return true;
1040    }
1041 
1042    return false;
1043 }
1044 
1045 bool
opnd(Instruction * i,ImmediateValue & imm0,int s)1046 ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
1047 {
1048    const int t = !s;
1049    const operation op = i->op;
1050    Instruction *newi = i;
1051    bool deleted = false;
1052 
1053    switch (i->op) {
1054    case OP_SPLIT: {
1055       bld.setPosition(i, false);
1056 
1057       uint8_t size = i->getDef(0)->reg.size;
1058       uint8_t bitsize = size * 8;
1059       uint32_t mask = (1ULL << bitsize) - 1;
1060       assert(bitsize <= 32);
1061 
1062       uint64_t val = imm0.reg.data.u64;
1063       for (int8_t d = 0; i->defExists(d); ++d) {
1064          Value *def = i->getDef(d);
1065          assert(def->reg.size == size);
1066 
1067          newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)),
1068                           typeOfSize(size));
1069          val >>= bitsize;
1070       }
1071       delete_Instruction(prog, i);
1072       deleted = true;
1073       break;
1074    }
1075    case OP_MUL:
1076       if (i->dType == TYPE_F32 && !i->precise)
1077          tryCollapseChainedMULs(i, s, imm0);
1078 
1079       if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
1080          assert(!isFloatType(i->sType));
1081          if (imm0.isInteger(1) && i->dType == TYPE_S32) {
1082             bld.setPosition(i, false);
1083             // Need to set to the sign value, which is a compare.
1084             newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
1085                              TYPE_S32, i->getSrc(t), bld.mkImm(0));
1086             delete_Instruction(prog, i);
1087             deleted = true;
1088          } else if (imm0.isInteger(0) || imm0.isInteger(1)) {
1089             // The high bits can't be set in this case (either mul by 0 or
1090             // unsigned by 1)
1091             i->op = OP_MOV;
1092             i->subOp = 0;
1093             i->setSrc(0, new_ImmediateValue(prog, 0u));
1094             i->src(0).mod = Modifier(0);
1095             i->setSrc(1, NULL);
1096          } else if (!imm0.isNegative() && imm0.isPow2()) {
1097             // Translate into a shift
1098             imm0.applyLog2();
1099             i->op = OP_SHR;
1100             i->subOp = 0;
1101             imm0.reg.data.u32 = 32 - imm0.reg.data.u32;
1102             i->setSrc(0, i->getSrc(t));
1103             i->src(0).mod = i->src(t).mod;
1104             i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
1105             i->src(1).mod = 0;
1106          }
1107       } else
1108       if (imm0.isInteger(0)) {
1109          i->dnz = 0;
1110          i->op = OP_MOV;
1111          i->setSrc(0, new_ImmediateValue(prog, 0u));
1112          i->src(0).mod = Modifier(0);
1113          i->postFactor = 0;
1114          i->setSrc(1, NULL);
1115       } else
1116       if (!i->postFactor && (imm0.isInteger(1) || imm0.isInteger(-1))) {
1117          if (imm0.isNegative())
1118             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1119          i->dnz = 0;
1120          i->op = i->src(t).mod.getOp();
1121          if (s == 0) {
1122             i->setSrc(0, i->getSrc(1));
1123             i->src(0).mod = i->src(1).mod;
1124             i->src(1).mod = 0;
1125          }
1126          if (i->op != OP_CVT)
1127             i->src(0).mod = 0;
1128          i->setSrc(1, NULL);
1129       } else
1130       if (!i->postFactor && (imm0.isInteger(2) || imm0.isInteger(-2))) {
1131          if (imm0.isNegative())
1132             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1133          i->op = OP_ADD;
1134          i->dnz = 0;
1135          i->setSrc(s, i->getSrc(t));
1136          i->src(s).mod = i->src(t).mod;
1137       } else
1138       if (!isFloatType(i->dType) && !i->src(t).mod) {
1139          bld.setPosition(i, false);
1140          int64_t b = typeSizeof(i->dType) == 8 ? imm0.reg.data.s64 : imm0.reg.data.s32;
1141          if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, NULL)) {
1142             delete_Instruction(prog, i);
1143             deleted = true;
1144          }
1145       } else
1146       if (i->postFactor && i->sType == TYPE_F32) {
1147          /* Can't emit a postfactor with an immediate, have to fold it in */
1148          i->setSrc(s, new_ImmediateValue(
1149                       prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
1150          i->postFactor = 0;
1151       }
1152       break;
1153    case OP_FMA:
1154    case OP_MAD:
1155       if (imm0.isInteger(0)) {
1156          i->setSrc(0, i->getSrc(2));
1157          i->src(0).mod = i->src(2).mod;
1158          i->setSrc(1, NULL);
1159          i->setSrc(2, NULL);
1160          i->dnz = 0;
1161          i->op = i->src(0).mod.getOp();
1162          if (i->op != OP_CVT)
1163             i->src(0).mod = 0;
1164       } else
1165       if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
1166           (imm0.isInteger(1) || imm0.isInteger(-1))) {
1167          if (imm0.isNegative())
1168             i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
1169          if (s == 0) {
1170             i->setSrc(0, i->getSrc(1));
1171             i->src(0).mod = i->src(1).mod;
1172          }
1173          i->setSrc(1, i->getSrc(2));
1174          i->src(1).mod = i->src(2).mod;
1175          i->setSrc(2, NULL);
1176          i->dnz = 0;
1177          i->op = OP_ADD;
1178       } else
1179       if (!isFloatType(i->dType) && !i->subOp && !i->src(t).mod && !i->src(2).mod) {
1180          bld.setPosition(i, false);
1181          int64_t b = typeSizeof(i->dType) == 8 ? imm0.reg.data.s64 : imm0.reg.data.s32;
1182          if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, i->getSrc(2))) {
1183             delete_Instruction(prog, i);
1184             deleted = true;
1185          }
1186       }
1187       break;
1188    case OP_SUB:
1189       if (imm0.isInteger(0) && s == 0 && typeSizeof(i->dType) == 8 &&
1190           !isFloatType(i->dType))
1191          break;
1192       FALLTHROUGH;
1193    case OP_ADD:
1194       if (i->usesFlags())
1195          break;
1196       if (imm0.isInteger(0)) {
1197          if (s == 0) {
1198             i->setSrc(0, i->getSrc(1));
1199             i->src(0).mod = i->src(1).mod;
1200             if (i->op == OP_SUB)
1201                i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1202          }
1203          i->setSrc(1, NULL);
1204          i->op = i->src(0).mod.getOp();
1205          if (i->op != OP_CVT)
1206             i->src(0).mod = Modifier(0);
1207       }
1208       break;
1209 
1210    case OP_DIV:
1211       if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
1212          break;
1213       bld.setPosition(i, false);
1214       if (imm0.reg.data.u32 == 0) {
1215          break;
1216       } else
1217       if (imm0.reg.data.u32 == 1) {
1218          i->op = OP_MOV;
1219          i->setSrc(1, NULL);
1220       } else
1221       if (i->dType == TYPE_U32 && imm0.isPow2()) {
1222          i->op = OP_SHR;
1223          i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
1224       } else
1225       if (i->dType == TYPE_U32) {
1226          Instruction *mul;
1227          Value *tA, *tB;
1228          const uint32_t d = imm0.reg.data.u32;
1229          uint32_t m;
1230          int r, s;
1231          uint32_t l = util_logbase2(d);
1232          if (((uint32_t)1 << l) < d)
1233             ++l;
1234          m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
1235          r = l ? 1 : 0;
1236          s = l ? (l - 1) : 0;
1237 
1238          tA = bld.getSSA();
1239          tB = bld.getSSA();
1240          mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
1241                          bld.loadImm(NULL, m));
1242          mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
1243          bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
1244          tA = bld.getSSA();
1245          if (r)
1246             bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
1247          else
1248             tA = tB;
1249          tB = s ? bld.getSSA() : i->getDef(0);
1250          newi = bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
1251          if (s)
1252             bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
1253 
1254          delete_Instruction(prog, i);
1255          deleted = true;
1256       } else
1257       if (imm0.reg.data.s32 == -1) {
1258          i->op = OP_NEG;
1259          i->setSrc(1, NULL);
1260       } else {
1261          LValue *tA, *tB;
1262          LValue *tD;
1263          const int32_t d = imm0.reg.data.s32;
1264          int32_t m;
1265          int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
1266          if ((1 << l) < abs(d))
1267             ++l;
1268          if (!l)
1269             l = 1;
1270          m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);
1271 
1272          tA = bld.getSSA();
1273          tB = bld.getSSA();
1274          bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
1275                    i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
1276          if (l > 1)
1277             bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
1278          else
1279             tB = tA;
1280          tA = bld.getSSA();
1281          bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, TYPE_S32, i->getSrc(0), bld.mkImm(0));
1282          tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
1283          newi = bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
1284          if (d < 0)
1285             bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);
1286 
1287          delete_Instruction(prog, i);
1288          deleted = true;
1289       }
1290       break;
1291 
1292    case OP_MOD:
1293       if (s == 1 && imm0.isPow2()) {
1294          bld.setPosition(i, false);
1295          if (i->sType == TYPE_U32) {
1296             i->op = OP_AND;
1297             i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
1298          } else if (i->sType == TYPE_S32) {
1299             // Do it on the absolute value of the input, and then restore the
1300             // sign. The only odd case is MIN_INT, but that should work out
1301             // as well, since MIN_INT mod any power of 2 is 0.
1302             //
1303             // Technically we don't have to do any of this since MOD is
1304             // undefined with negative arguments in GLSL, but this seems like
1305             // the nice thing to do.
1306             Value *abs = bld.mkOp1v(OP_ABS, TYPE_S32, bld.getSSA(), i->getSrc(0));
1307             Value *neg, *v1, *v2;
1308             bld.mkCmp(OP_SET, CC_LT, TYPE_S32,
1309                       (neg = bld.getSSA(1, prog->getTarget()->nativeFile(FILE_PREDICATE))),
1310                       TYPE_S32, i->getSrc(0), bld.loadImm(NULL, 0));
1311             Value *mod = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), abs,
1312                                     bld.loadImm(NULL, imm0.reg.data.u32 - 1));
1313             bld.mkOp1(OP_NEG, TYPE_S32, (v1 = bld.getSSA()), mod)
1314                ->setPredicate(CC_P, neg);
1315             bld.mkOp1(OP_MOV, TYPE_S32, (v2 = bld.getSSA()), mod)
1316                ->setPredicate(CC_NOT_P, neg);
1317             newi = bld.mkOp2(OP_UNION, TYPE_S32, i->getDef(0), v1, v2);
1318 
1319             delete_Instruction(prog, i);
1320             deleted = true;
1321          }
1322       } else if (s == 1) {
1323          // In this case, we still want the optimized lowering that we get
1324          // from having division by an immediate.
1325          //
1326          // a % b == a - (a/b) * b
1327          bld.setPosition(i, false);
1328          Value *div = bld.mkOp2v(OP_DIV, i->sType, bld.getSSA(),
1329                                  i->getSrc(0), i->getSrc(1));
1330          newi = bld.mkOp2(OP_ADD, i->sType, i->getDef(0), i->getSrc(0),
1331                           bld.mkOp2v(OP_MUL, i->sType, bld.getSSA(), div, i->getSrc(1)));
1332          // TODO: Check that target supports this. In this case, we know that
1333          // all backends do.
1334          newi->src(1).mod = Modifier(NV50_IR_MOD_NEG);
1335 
1336          delete_Instruction(prog, i);
1337          deleted = true;
1338       }
1339       break;
1340 
1341    case OP_SET: // TODO: SET_AND,OR,XOR
1342    {
1343       /* This optimizes the case where the output of a set is being compared
1344        * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
1345        * can be a lot cleverer in our comparison.
1346        */
1347       CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
1348       CondCode cc, ccZ;
1349       if (imm0.reg.data.u32 != 0 || !si)
1350          return false;
1351       cc = si->setCond;
1352       ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
1353       // We do everything assuming var (cmp) 0, reverse the condition if 0 is
1354       // first.
1355       if (s == 0)
1356          ccZ = reverseCondCode(ccZ);
1357       // If there is a negative modifier, we need to undo that, by flipping
1358       // the comparison to zero.
1359       if (i->src(t).mod.neg())
1360          ccZ = reverseCondCode(ccZ);
1361       // If this is a signed comparison, we expect the input to be a regular
1362       // boolean, i.e. 0/-1. However the rest of the logic assumes that true
1363       // is positive, so just flip the sign.
1364       if (i->sType == TYPE_S32) {
1365          assert(!isFloatType(si->dType));
1366          ccZ = reverseCondCode(ccZ);
1367       }
1368       switch (ccZ) {
1369       case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
1370       case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
1371       case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
1372       case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
1373       case CC_GT: break; // bool > 0 -- bool
1374       case CC_NE: break; // bool != 0 -- bool
1375       default:
1376          return false;
1377       }
1378 
1379       // Update the condition of this SET to be identical to the origin set,
1380       // but with the updated condition code. The original SET should get
1381       // DCE'd, ideally.
1382       i->op = si->op;
1383       i->asCmp()->setCond = cc;
1384       i->setSrc(0, si->src(0));
1385       i->setSrc(1, si->src(1));
1386       if (si->srcExists(2))
1387          i->setSrc(2, si->src(2));
1388       i->sType = si->sType;
1389    }
1390       break;
1391 
1392    case OP_AND:
1393    {
1394       Instruction *src = i->getSrc(t)->getInsn();
1395       ImmediateValue imm1;
1396       if (imm0.reg.data.u32 == 0) {
1397          i->op = OP_MOV;
1398          i->setSrc(0, new_ImmediateValue(prog, 0u));
1399          i->src(0).mod = Modifier(0);
1400          i->setSrc(1, NULL);
1401       } else if (imm0.reg.data.u32 == ~0U) {
1402          i->op = i->src(t).mod.getOp();
1403          if (t) {
1404             i->setSrc(0, i->getSrc(t));
1405             i->src(0).mod = i->src(t).mod;
1406          }
1407          i->setSrc(1, NULL);
1408       } else if (src->asCmp()) {
1409          CmpInstruction *cmp = src->asCmp();
1410          if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
1411             return false;
1412          if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
1413             return false;
1414          if (imm0.reg.data.f32 != 1.0)
1415             return false;
1416          if (cmp->dType != TYPE_U32)
1417             return false;
1418 
1419          cmp->dType = TYPE_F32;
1420          if (i->src(t).mod != Modifier(0)) {
1421             assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
1422             i->src(t).mod = Modifier(0);
1423             cmp->setCond = inverseCondCode(cmp->setCond);
1424          }
1425          i->op = OP_MOV;
1426          i->setSrc(s, NULL);
1427          if (t) {
1428             i->setSrc(0, i->getSrc(t));
1429             i->setSrc(t, NULL);
1430          }
1431       } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
1432                  src->op == OP_SHR &&
1433                  src->src(1).getImmediate(imm1) &&
1434                  i->src(t).mod == Modifier(0) &&
1435                  util_is_power_of_two_or_zero(imm0.reg.data.u32 + 1)) {
1436          // low byte = offset, high byte = width
1437          uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
1438          i->op = OP_EXTBF;
1439          i->setSrc(0, src->getSrc(0));
1440          i->setSrc(1, new_ImmediateValue(prog, ext));
1441       } else if (src->op == OP_SHL &&
1442                  src->src(1).getImmediate(imm1) &&
1443                  i->src(t).mod == Modifier(0) &&
1444                  util_is_power_of_two_or_zero(~imm0.reg.data.u32 + 1) &&
1445                  util_last_bit(~imm0.reg.data.u32) <= imm1.reg.data.u32) {
1446          i->op = OP_MOV;
1447          i->setSrc(s, NULL);
1448          if (t) {
1449             i->setSrc(0, i->getSrc(t));
1450             i->setSrc(t, NULL);
1451          }
1452       }
1453    }
1454       break;
1455 
1456    case OP_SHL:
1457    {
1458       if (s != 1 || i->src(0).mod != Modifier(0))
1459          break;
1460 
1461       if (imm0.reg.data.u32 == 0) {
1462          i->op = OP_MOV;
1463          i->setSrc(1, NULL);
1464          break;
1465       }
1466       // try to concatenate shifts
1467       Instruction *si = i->getSrc(0)->getInsn();
1468       if (!si)
1469          break;
1470       ImmediateValue imm1;
1471       switch (si->op) {
1472       case OP_SHL:
1473          if (si->src(1).getImmediate(imm1)) {
1474             bld.setPosition(i, false);
1475             i->setSrc(0, si->getSrc(0));
1476             i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
1477          }
1478          break;
1479       case OP_SHR:
1480          if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
1481             bld.setPosition(i, false);
1482             i->op = OP_AND;
1483             i->setSrc(0, si->getSrc(0));
1484             i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
1485          }
1486          break;
1487       case OP_MUL:
1488          int muls;
1489          if (isFloatType(si->dType))
1490             return false;
1491          if (si->subOp)
1492             return false;
1493          if (si->src(1).getImmediate(imm1))
1494             muls = 1;
1495          else if (si->src(0).getImmediate(imm1))
1496             muls = 0;
1497          else
1498             return false;
1499 
1500          bld.setPosition(i, false);
1501          i->op = OP_MUL;
1502          i->subOp = 0;
1503          i->dType = si->dType;
1504          i->sType = si->sType;
1505          i->setSrc(0, si->getSrc(!muls));
1506          i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1507          break;
1508       case OP_SUB:
1509       case OP_ADD:
1510          int adds;
1511          if (isFloatType(si->dType))
1512             return false;
1513          if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
1514             adds = 0;
1515          else if (si->src(1).getImmediate(imm1))
1516             adds = 1;
1517          else
1518             return false;
1519          if (si->src(!adds).mod != Modifier(0))
1520             return false;
1521          // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
1522 
1523          // This is more operations, but if one of x, y is an immediate, then
1524          // we can get a situation where (a) we can use ISCADD, or (b)
1525          // propagate the add bit into an indirect load.
1526          bld.setPosition(i, false);
1527          i->op = si->op;
1528          i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
1529          i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
1530                                      bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
1531                                      si->getSrc(!adds),
1532                                      bld.mkImm(imm0.reg.data.u32)));
1533          break;
1534       default:
1535          return false;
1536       }
1537    }
1538       break;
1539 
1540    case OP_ABS:
1541    case OP_NEG:
1542    case OP_SAT:
1543    case OP_LG2:
1544    case OP_RCP:
1545    case OP_SQRT:
1546    case OP_RSQ:
1547    case OP_PRESIN:
1548    case OP_SIN:
1549    case OP_COS:
1550    case OP_PREEX2:
1551    case OP_EX2:
1552       unary(i, imm0);
1553       break;
1554    case OP_BFIND: {
1555       int32_t res;
1556       switch (i->dType) {
1557       case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
1558       case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
1559       default:
1560          return false;
1561       }
1562       if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
1563          res = 31 - res;
1564       bld.setPosition(i, false); /* make sure bld is init'ed */
1565       i->setSrc(0, bld.mkImm(res));
1566       i->setSrc(1, NULL);
1567       i->op = OP_MOV;
1568       i->subOp = 0;
1569       break;
1570    }
1571    case OP_BREV: {
1572       uint32_t res = util_bitreverse(imm0.reg.data.u32);
1573       i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
1574       i->op = OP_MOV;
1575       break;
1576    }
1577    case OP_POPCNT: {
1578       // Only deal with 1-arg POPCNT here
1579       if (i->srcExists(1))
1580          break;
1581       uint32_t res = util_bitcount(imm0.reg.data.u32);
1582       i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res));
1583       i->setSrc(1, NULL);
1584       i->op = OP_MOV;
1585       break;
1586    }
1587    case OP_CVT: {
1588       Storage res;
1589 
1590       // TODO: handle 64-bit values properly
1591       if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
1592          return false;
1593 
1594       // TODO: handle single byte/word extractions
1595       if (i->subOp)
1596          return false;
1597 
1598       bld.setPosition(i, true); /* make sure bld is init'ed */
1599 
1600 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
1601    case type: \
1602       switch (i->sType) { \
1603       case TYPE_F64: \
1604          res.data.dst = util_iround(i->saturate ? \
1605                                     CLAMP(imm0.reg.data.f64, fmin, fmax) : \
1606                                     imm0.reg.data.f64); \
1607          break; \
1608       case TYPE_F32: \
1609          res.data.dst = util_iround(i->saturate ? \
1610                                     CLAMP(imm0.reg.data.f32, fmin, fmax) : \
1611                                     imm0.reg.data.f32); \
1612          break; \
1613       case TYPE_S32: \
1614          res.data.dst = i->saturate ? \
1615                         CLAMP(imm0.reg.data.s32, imin, imax) : \
1616                         imm0.reg.data.s32; \
1617          break; \
1618       case TYPE_U32: \
1619          res.data.dst = i->saturate ? \
1620                         CLAMP(imm0.reg.data.u32, umin, umax) : \
1621                         imm0.reg.data.u32; \
1622          break; \
1623       case TYPE_S16: \
1624          res.data.dst = i->saturate ? \
1625                         CLAMP(imm0.reg.data.s16, imin, imax) : \
1626                         imm0.reg.data.s16; \
1627          break; \
1628       case TYPE_U16: \
1629          res.data.dst = i->saturate ? \
1630                         CLAMP(imm0.reg.data.u16, umin, umax) : \
1631                         imm0.reg.data.u16; \
1632          break; \
1633       default: return false; \
1634       } \
1635       i->setSrc(0, bld.mkImm(res.data.dst)); \
1636       break
1637 
1638       switch(i->dType) {
1639       CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
1640       CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
1641       CASE(TYPE_U32, u32, 0, (float)UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
1642       CASE(TYPE_S32, s32, (float)INT32_MIN, (float)INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
1643       case TYPE_F32:
1644          switch (i->sType) {
1645          case TYPE_F64:
1646             res.data.f32 = i->saturate ?
1647                SATURATE(imm0.reg.data.f64) :
1648                imm0.reg.data.f64;
1649             break;
1650          case TYPE_F32:
1651             res.data.f32 = i->saturate ?
1652                SATURATE(imm0.reg.data.f32) :
1653                imm0.reg.data.f32;
1654             break;
1655          case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
1656          case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
1657          case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
1658          case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
1659          default:
1660             return false;
1661          }
1662          i->setSrc(0, bld.mkImm(res.data.f32));
1663          break;
1664       case TYPE_F64:
1665          switch (i->sType) {
1666          case TYPE_F64:
1667             res.data.f64 = i->saturate ?
1668                SATURATE(imm0.reg.data.f64) :
1669                imm0.reg.data.f64;
1670             break;
1671          case TYPE_F32:
1672             res.data.f64 = i->saturate ?
1673                SATURATE(imm0.reg.data.f32) :
1674                imm0.reg.data.f32;
1675             break;
1676          case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
1677          case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
1678          case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
1679          case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
1680          default:
1681             return false;
1682          }
1683          i->setSrc(0, bld.mkImm(res.data.f64));
1684          break;
1685       default:
1686          return false;
1687       }
1688 #undef CASE
1689 
1690       i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
1691       i->op = OP_MOV;
1692       i->saturate = 0;
1693       i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
1694       break;
1695    }
1696    default:
1697       return false;
1698    }
1699 
1700    // This can get left behind some of the optimizations which simplify
1701    // saturatable values.
1702    if (newi->op == OP_MOV && newi->saturate) {
1703       ImmediateValue tmp;
1704       newi->saturate = 0;
1705       newi->op = OP_SAT;
1706       if (newi->src(0).getImmediate(tmp))
1707          unary(newi, tmp);
1708    }
1709 
1710    if (newi->op != op)
1711       foldCount++;
1712    return deleted;
1713 }
1714 
1715 // =============================================================================
1716 
1717 // Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
class ModifierFolding : public Pass
{
private:
   // Processes one basic block: merges modifier instructions into the
   // source modifiers of their users (see visit() below).
   virtual bool visit(BasicBlock *);
};
1723 
1724 bool
visit(BasicBlock * bb)1725 ModifierFolding::visit(BasicBlock *bb)
1726 {
1727    const Target *target = prog->getTarget();
1728 
1729    Instruction *i, *next, *mi;
1730    Modifier mod;
1731 
1732    for (i = bb->getEntry(); i; i = next) {
1733       next = i->next;
1734 
1735       if (false && i->op == OP_SUB) {
1736          // turn "sub" into "add neg" (do we really want this ?)
1737          i->op = OP_ADD;
1738          i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
1739       }
1740 
1741       for (int s = 0; s < 3 && i->srcExists(s); ++s) {
1742          mi = i->getSrc(s)->getInsn();
1743          if (!mi ||
1744              mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
1745             continue;
1746          if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
1747             if ((i->op != OP_ADD &&
1748                  i->op != OP_MUL) ||
1749                 (mi->op != OP_ABS &&
1750                  mi->op != OP_NEG))
1751                continue;
1752          } else
1753          if (i->sType != mi->dType) {
1754             continue;
1755          }
1756          if ((mod = Modifier(mi->op)) == Modifier(0))
1757             continue;
1758          mod *= mi->src(0).mod;
1759 
1760          if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
1761             // abs neg [abs] = abs
1762             mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
1763          } else
1764          if ((i->op == OP_NEG) && mod.neg()) {
1765             assert(s == 0);
1766             // neg as both opcode and modifier on same insn is prohibited
1767             // neg neg abs = abs, neg neg = identity
1768             mod = mod & Modifier(~NV50_IR_MOD_NEG);
1769             i->op = mod.getOp();
1770             mod = mod & Modifier(~NV50_IR_MOD_ABS);
1771             if (mod == Modifier(0))
1772                i->op = OP_MOV;
1773          }
1774 
1775          if (target->isModSupported(i, s, mod)) {
1776             i->setSrc(s, mi->getSrc(0));
1777             i->src(s).mod *= mod;
1778          }
1779       }
1780 
1781       if (i->op == OP_SAT) {
1782          mi = i->getSrc(0)->getInsn();
1783          if (mi &&
1784              mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
1785             mi->saturate = 1;
1786             mi->setDef(0, i->getDef(0));
1787             delete_Instruction(prog, i);
1788          }
1789       }
1790    }
1791 
1792    return true;
1793 }
1794 
1795 // =============================================================================
1796 
1797 // MUL + ADD -> MAD/FMA
1798 // MIN/MAX(a, a) -> a, etc.
1799 // SLCT(a, b, const) -> cc(const) ? a : b
1800 // RCP(RCP(a)) -> a
1801 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
1802 // EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
class AlgebraicOpt : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void handleABS(Instruction *);          // ABS(SUB) -> SAD
   bool handleADD(Instruction *);          // dispatch to MAD/SAD fusing
   bool tryADDToMADOrSAD(Instruction *, operation toOp);
   void handleMINMAX(Instruction *);       // MIN/MAX(a, a) -> a
   void handleRCP(Instruction *);          // RCP(RCP(a)) -> a, RCP(SQRT) -> RSQ
   void handleSLCT(Instruction *);         // SLCT with imm/equal sources -> MOV
   void handleLOGOP(Instruction *);        // AND/OR/XOR of SETs -> SET_op
   void handleCVT_NEG(Instruction *);      // F2I(NEG(SET)) -> integer SET
   void handleCVT_CVT(Instruction *);      // collapse CVT chains
   void handleCVT_EXTBF(Instruction *);    // byte/word extraction into CVT subOp
   void handleSUCLAMP(Instruction *);      // fold ADD immediate into SUCLAMP
   void handleNEG(Instruction *);          // NEG(AND(SET, 1)) -> SET
   void handleEXTBF_RDSV(Instruction *);   // EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)

   // used by handlers that need to emit helper instructions (immediates etc.)
   BuildUtil bld;
};
1824 
// ABS(SUB(a, b)) or ABS(ADD(a, NEG(b))) -> SAD(a, b, 0), i.e. sum of
// absolute differences with a zero accumulator, on targets that support SAD.
void
AlgebraicOpt::handleABS(Instruction *abs)
{
   Instruction *sub = abs->getSrc(0)->getInsn();
   DataType ty;
   if (!sub ||
       !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
      return;
   // hidden conversion ?
   ty = intTypeToSigned(sub->dType);
   if (abs->dType != abs->sType || ty != abs->sType)
      return;

   // operands must be plain, unmodified GPR values
   if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
       sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
       sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
         return;

   Value *src0 = sub->getSrc(0);
   Value *src1 = sub->getSrc(1);

   if (sub->op == OP_ADD) {
      // an ADD only qualifies if one operand is produced by a NEG of the
      // same (signed) type; figure out which side the NEG is on
      Instruction *neg = sub->getSrc(1)->getInsn();
      if (neg && neg->op != OP_NEG) {
         neg = sub->getSrc(0)->getInsn();
         src0 = sub->getSrc(1);
      }
      if (!neg || neg->op != OP_NEG ||
          neg->dType != neg->sType || neg->sType != ty)
         return;
      // use the NEG's operand directly: a + (-b) == a - b
      src1 = neg->getSrc(0);
   }

   // found ABS(SUB) — rewrite in place as SAD(src0, src1, 0)
   abs->moveSources(1, 2); // move sources >=1 up by 2
   abs->op = OP_SAD;
   abs->setType(sub->dType);
   abs->setSrc(0, src0);
   abs->setSrc(1, src1);
   bld.setPosition(abs, false);
   abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
}
1867 
1868 bool
handleADD(Instruction * add)1869 AlgebraicOpt::handleADD(Instruction *add)
1870 {
1871    Value *src0 = add->getSrc(0);
1872    Value *src1 = add->getSrc(1);
1873 
1874    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
1875       return false;
1876 
1877    bool changed = false;
1878    // we can't optimize to MAD if the add is precise
1879    if (!add->precise && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
1880       changed = tryADDToMADOrSAD(add, OP_MAD);
1881    if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
1882       changed = tryADDToMADOrSAD(add, OP_SAD);
1883    return changed;
1884 }
1885 
1886 // ADD(SAD(a,b,0), c) -> SAD(a,b,c)
1887 // ADD(MUL(a,b), c) -> MAD(a,b,c)
// ADD(SAD(a,b,0), c) -> SAD(a,b,c)
// ADD(MUL(a,b), c) -> MAD(a,b,c)
bool
AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   Value *src;
   int s;
   // the producer we look for: MUL feeds MAD, SAD feeds SAD
   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
   // mask of modifiers that would block the fuse: for MAD only NEG is
   // representable, for SAD no modifier at all
   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
   Modifier mod[4];

   // find which ADD operand is the sole use of a matching producer
   if (src0->refCount() == 1 &&
       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
      s = 0;
   else
   if (src1->refCount() == 1 &&
       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
      s = 1;
   else
      return false;

   src = add->getSrc(s);

   // only fuse within a single basic block
   if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
      return false;

   // producer flags that change results if folded away
   if (src->getInsn()->saturate || src->getInsn()->postFactor ||
       src->getInsn()->dnz || src->getInsn()->precise)
      return false;

   if (toOp == OP_SAD) {
      // only a SAD with a zero accumulator can absorb the addend
      ImmediateValue imm;
      if (!src->getInsn()->src(2).getImmediate(imm))
         return false;
      if (!imm.isInteger(0))
         return false;
   }

   // no hidden size or int/float conversions between producer and ADD
   if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
       isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
      return false;

   mod[0] = add->src(0).mod;
   mod[1] = add->src(1).mod;
   mod[2] = src->getUniqueInsn()->src(0).mod;
   mod[3] = src->getUniqueInsn()->src(1).mod;

   // bail if any involved modifier is not representable on the fused op
   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
      return false;

   add->op = toOp;
   add->subOp = src->getInsn()->subOp; // potentially mul-high
   add->dnz = src->getInsn()->dnz;
   add->dType = src->getInsn()->dType; // sign matters for imad hi
   add->sType = src->getInsn()->sType;

   // the other ADD operand becomes the accumulator
   add->setSrc(2, add->src(s ? 0 : 1));

   // take the producer's operands; a NEG on the consumed ADD source is
   // pushed into src(0) of the multiply (neg distributes over mul)
   add->setSrc(0, src->getInsn()->getSrc(0));
   add->src(0).mod = mod[2] ^ mod[s];
   add->setSrc(1, src->getInsn()->getSrc(1));
   add->src(1).mod = mod[3];

   return true;
}
1953 
1954 void
handleMINMAX(Instruction * minmax)1955 AlgebraicOpt::handleMINMAX(Instruction *minmax)
1956 {
1957    Value *src0 = minmax->getSrc(0);
1958    Value *src1 = minmax->getSrc(1);
1959 
1960    if (src0 != src1 || src0->reg.file != FILE_GPR)
1961       return;
1962    if (minmax->src(0).mod == minmax->src(1).mod) {
1963       if (minmax->def(0).mayReplace(minmax->src(0))) {
1964          minmax->def(0).replace(minmax->src(0), false);
1965          delete_Instruction(prog, minmax);
1966       } else {
1967          minmax->op = OP_CVT;
1968          minmax->setSrc(1, NULL);
1969       }
1970    } else {
1971       // TODO:
1972       // min(x, -x) = -abs(x)
1973       // min(x, -abs(x)) = -abs(x)
1974       // min(x, abs(x)) = x
1975       // max(x, -abs(x)) = x
1976       // max(x, abs(x)) = abs(x)
1977       // max(x, -x) = abs(x)
1978    }
1979 }
1980 
1981 // rcp(rcp(a)) = a
1982 // rcp(sqrt(a)) = rsq(a)
1983 void
handleRCP(Instruction * rcp)1984 AlgebraicOpt::handleRCP(Instruction *rcp)
1985 {
1986    Instruction *si = rcp->getSrc(0)->getUniqueInsn();
1987 
1988    if (!si)
1989       return;
1990 
1991    if (si->op == OP_RCP) {
1992       Modifier mod = rcp->src(0).mod * si->src(0).mod;
1993       rcp->op = mod.getOp();
1994       rcp->setSrc(0, si->getSrc(0));
1995    } else if (si->op == OP_SQRT) {
1996       rcp->op = OP_RSQ;
1997       rcp->setSrc(0, si->getSrc(0));
1998       rcp->src(0).mod = rcp->src(0).mod * si->src(0).mod;
1999    }
2000 }
2001 
2002 void
handleSLCT(Instruction * slct)2003 AlgebraicOpt::handleSLCT(Instruction *slct)
2004 {
2005    if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
2006       if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
2007          slct->setSrc(0, slct->getSrc(1));
2008    } else
2009    if (slct->getSrc(0) != slct->getSrc(1)) {
2010       return;
2011    }
2012    slct->op = OP_MOV;
2013    slct->setSrc(1, NULL);
2014    slct->setSrc(2, NULL);
2015 }
2016 
// AND/OR(a, a) -> a; AND/OR/XOR(SET, SET) -> SET with a set-logic subfunction
// chained through a predicate register.
void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      // x & x == x | x == x (not valid for XOR, which would yield 0)
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      // set1 must be a plain SET; swap so the accumulating one is set1
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      // set0 may itself already be a combined SET_op
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      // if both results have other users, cloning would duplicate work for
      // no net gain
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      // clone both SETs right after the logop (set0 ends up first), then
      // rewire: set0 produces a predicate that set1 combines via redOp
      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}
2080 
2081 // F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
2082 // nv50:
2083 //  F2I(NEG(I2F(ABS(SET))))
// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
// nv50:
//  F2I(NEG(I2F(ABS(SET))))
void
AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   // must be a plain f32 -> s32 conversion with no source modifier
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
      return;
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src(0).mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();

   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
   if (insn && insn->op == OP_CVT &&
       insn->dType == TYPE_F32 &&
       insn->sType == TYPE_S32) {
      // nv50 pattern: walk through I2F(ABS(SET))
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
          insn->src(0).mod)
         return;
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
         return;
   } else
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
      return;
   }

   // replace the whole chain by a clone of the SET producing the boolean
   // directly as an integer (-1/0)
   Instruction *bset = cloneShallow(func, insn);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}
2119 
2120 // F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
2121 // does a type conversion, this becomes trickier as there might be range
2122 // changes/etc. We could handle those in theory as long as the range was being
2123 // reduced or kept the same.
// F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
// does a type conversion, this becomes trickier as there might be range
// changes/etc. We could handle those in theory as long as the range was being
// reduced or kept the same.
void
AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();

   // the inner op must be a pure same-type rounding (no conversion, no
   // saturation, no subOp) feeding this CVT's source type
   if (!insn ||
       insn->saturate ||
       insn->subOp ||
       insn->dType != insn->sType ||
       insn->dType != cvt->sType)
      return;

   // translate the inner op into the equivalent rounding mode
   RoundMode rnd = insn->rnd;
   switch (insn->op) {
   case OP_CEIL:
      rnd = ROUND_PI;
      break;
   case OP_FLOOR:
      rnd = ROUND_MI;
      break;
   case OP_TRUNC:
      rnd = ROUND_ZI;
      break;
   case OP_CVT:
      break;
   default:
      return;
   }

   // if the outer conversion leaves float, the "to integer" rounding variants
   // don't apply; mask down to the plain rounding direction
   if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
      rnd = (RoundMode)(rnd & 3);

   // fold: read the inner op's source directly with the combined rounding
   cvt->rnd = rnd;
   cvt->setSrc(0, insn->getSrc(0));
   cvt->src(0).mod *= insn->src(0).mod;
   cvt->sType = insn->sType;
}
2161 
2162 // Some shaders extract packed bytes out of words and convert them to
2163 // e.g. float. The Fermi+ CVT instruction can extract those directly, as can
2164 // nv50 for word sizes.
2165 //
2166 // CVT(EXTBF(x, byte/word))
2167 // CVT(AND(bytemask, x))
2168 // CVT(AND(bytemask, SHR(x, 8/16/24)))
2169 // CVT(SHR(x, 16/24))
// Some shaders extract packed bytes out of words and convert them to
// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
// nv50 for word sizes.
//
// CVT(EXTBF(x, byte/word))
// CVT(AND(bytemask, x))
// CVT(AND(bytemask, SHR(x, 8/16/24)))
// CVT(SHR(x, 16/24))
void
AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
{
   Instruction *insn = cvt->getSrc(0)->getInsn();
   ImmediateValue imm;
   Value *arg = NULL;       // the word the byte/halfword is extracted from
   unsigned width, offset = 0;  // extraction size and bit offset within arg
   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
      return;
   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
      // EXTBF immediate encodes width in bits 8..15 and offset in bits 0..7
      width = (imm.reg.data.u32 >> 8) & 0xff;
      offset = imm.reg.data.u32 & 0xff;
      arg = insn->getSrc(0);

      // CVT can only address byte/halfword lanes at aligned positions
      if (width != 8 && width != 16)
         return;
      if (width == 8 && offset & 0x7)
         return;
      if (width == 16 && offset & 0xf)
         return;
   } else if (insn->op == OP_AND) {
      int s;
      if (insn->src(0).getImmediate(imm))
         s = 0;
      else if (insn->src(1).getImmediate(imm))
         s = 1;
      else
         return;

      // the mask determines the extraction width
      if (imm.reg.data.u32 == 0xff)
         width = 8;
      else if (imm.reg.data.u32 == 0xffff)
         width = 16;
      else
         return;

      arg = insn->getSrc(!s);
      Instruction *shift = arg->getInsn();

      // AND(mask, SHR(x, k)): absorb an aligned right shift as the offset
      if (shift && shift->op == OP_SHR &&
          shift->sType == cvt->sType &&
          shift->src(1).getImmediate(imm) &&
          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
         arg = shift->getSrc(0);
         offset = imm.reg.data.u32;
      }
      // We just AND'd the high bits away, which means this is effectively an
      // unsigned value.
      cvt->sType = TYPE_U32;
   } else if (insn->op == OP_SHR &&
              insn->sType == cvt->sType &&
              insn->src(1).getImmediate(imm)) {
      // a bare SHR reaching the top byte/halfword needs no mask
      arg = insn->getSrc(0);
      if (imm.reg.data.u32 == 24) {
         width = 8;
         offset = 24;
      } else if (imm.reg.data.u32 == 16) {
         width = 16;
         offset = 16;
      } else {
         return;
      }
   }

   if (!arg)
      return;

   // Irrespective of what came earlier, we can undo a shift on the argument
   // by adjusting the offset.
   Instruction *shift = arg->getInsn();
   if (shift && shift->op == OP_SHL &&
       shift->src(1).getImmediate(imm) &&
       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
       imm.reg.data.u32 <= offset) {
      arg = shift->getSrc(0);
      offset -= imm.reg.data.u32;
   }

   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
   // annoying to detect them.

   // narrow the CVT's source type and select the lane via subOp
   if (width == 8) {
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
   } else {
      assert(width == 16);
      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
   }
   cvt->setSrc(0, arg);
   cvt->subOp = offset >> 3;   // subOp counts bytes, offset counts bits
}
2262 
2263 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
// SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
void
AlgebraicOpt::handleSUCLAMP(Instruction *insn)
{
   ImmediateValue imm;
   // NOTE(review): src(2) is assumed to always be an immediate here
   // (asImm() not checked for NULL) — presumably guaranteed by how SUCLAMP
   // is emitted; confirm against the lowering passes.
   int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
   int s;
   Instruction *add;

   assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);

   // look for ADD (TODO: only count references by non-SUCLAMP)
   if (insn->getSrc(0)->refCount() > 1)
      return;
   add = insn->getSrc(0)->getInsn();
   if (!add || add->op != OP_ADD ||
       (add->dType != TYPE_U32 &&
        add->dType != TYPE_S32))
      return;

   // look for immediate
   for (s = 0; s < 2; ++s)
      if (add->src(s).getImmediate(imm))
         break;
   if (s >= 2)
      return;
   // s now indexes the non-immediate addend
   s = s ? 0 : 1;
   // determine if immediate fits
   val += imm.reg.data.s32;
   if (val > 31 || val < -32)   // signed 6-bit range of the SUCLAMP offset
      return;
   // determine if other addend fits
   if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
      return;

   bld.setPosition(insn, false); // make sure bld is init'ed
   // replace sources
   insn->setSrc(2, bld.mkImm(val));
   insn->setSrc(0, add->getSrc(s));
}
2303 
2304 // NEG(AND(SET, 1)) -> SET
2305 void
handleNEG(Instruction * i)2306 AlgebraicOpt::handleNEG(Instruction *i) {
2307    Instruction *src = i->getSrc(0)->getInsn();
2308    ImmediateValue imm;
2309    int b;
2310 
2311    if (isFloatType(i->sType) || !src || src->op != OP_AND)
2312       return;
2313 
2314    if (src->src(0).getImmediate(imm))
2315       b = 1;
2316    else if (src->src(1).getImmediate(imm))
2317       b = 0;
2318    else
2319       return;
2320 
2321    if (!imm.isInteger(1))
2322       return;
2323 
2324    Instruction *set = src->getSrc(b)->getInsn();
2325    if ((set->op == OP_SET || set->op == OP_SET_AND ||
2326        set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
2327        !isFloatType(set->dType)) {
2328       i->def(0).replace(set->getDef(0), false);
2329    }
2330 }
2331 
2332 // EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
2333 void
handleEXTBF_RDSV(Instruction * i)2334 AlgebraicOpt::handleEXTBF_RDSV(Instruction *i)
2335 {
2336    Instruction *rdsv = i->getSrc(0)->getUniqueInsn();
2337    if (rdsv->op != OP_RDSV ||
2338        rdsv->getSrc(0)->asSym()->reg.data.sv.sv != SV_COMBINED_TID)
2339       return;
2340    // Avoid creating more RDSV instructions
2341    if (rdsv->getDef(0)->refCount() > 1)
2342       return;
2343 
2344    ImmediateValue imm;
2345    if (!i->src(1).getImmediate(imm))
2346       return;
2347 
2348    int index;
2349    if (imm.isInteger(0x1000))
2350       index = 0;
2351    else
2352    if (imm.isInteger(0x0a10))
2353       index = 1;
2354    else
2355    if (imm.isInteger(0x061a))
2356       index = 2;
2357    else
2358       return;
2359 
2360    bld.setPosition(i, false);
2361 
2362    i->op = OP_RDSV;
2363    i->setSrc(0, bld.mkSysVal(SV_TID, index));
2364    i->setSrc(1, NULL);
2365 }
2366 
2367 bool
visit(BasicBlock * bb)2368 AlgebraicOpt::visit(BasicBlock *bb)
2369 {
2370    Instruction *next;
2371    for (Instruction *i = bb->getEntry(); i; i = next) {
2372       next = i->next;
2373       switch (i->op) {
2374       case OP_ABS:
2375          handleABS(i);
2376          break;
2377       case OP_ADD:
2378          handleADD(i);
2379          break;
2380       case OP_RCP:
2381          handleRCP(i);
2382          break;
2383       case OP_MIN:
2384       case OP_MAX:
2385          handleMINMAX(i);
2386          break;
2387       case OP_SLCT:
2388          handleSLCT(i);
2389          break;
2390       case OP_AND:
2391       case OP_OR:
2392       case OP_XOR:
2393          handleLOGOP(i);
2394          break;
2395       case OP_CVT:
2396          handleCVT_NEG(i);
2397          handleCVT_CVT(i);
2398          if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
2399              handleCVT_EXTBF(i);
2400          break;
2401       case OP_SUCLAMP:
2402          handleSUCLAMP(i);
2403          break;
2404       case OP_NEG:
2405          handleNEG(i);
2406          break;
2407       case OP_EXTBF:
2408          handleEXTBF_RDSV(i);
2409          break;
2410       default:
2411          break;
2412       }
2413    }
2414 
2415    return true;
2416 }
2417 
2418 // =============================================================================
2419 
2420 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
2421 // MUL(a, b) -> a few XMADs
2422 // MAD/FMA(a, b, c) -> a few XMADs
class LateAlgebraicOpt : public Pass
{
private:
   virtual bool visit(Instruction *);

   void handleADD(Instruction *);     // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
   void handleMULMAD(Instruction *);  // 32-bit MUL/MAD/FMA -> XMAD sequence
   bool tryADDToSHLADD(Instruction *);

   // used to materialize immediates for the XMAD expansion
   BuildUtil bld;
};
2434 
2435 void
handleADD(Instruction * add)2436 LateAlgebraicOpt::handleADD(Instruction *add)
2437 {
2438    Value *src0 = add->getSrc(0);
2439    Value *src1 = add->getSrc(1);
2440 
2441    if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
2442       return;
2443 
2444    if (prog->getTarget()->isOpSupported(OP_SHLADD, add->dType))
2445       tryADDToSHLADD(add);
2446 }
2447 
2448 // ADD(SHL(a, b), c) -> SHLADD(a, b, c)
// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
bool
LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   ImmediateValue imm;
   Instruction *shl;
   Value *src;
   int s;

   // SHLADD is a 32-bit integer op without saturation/flag support
   if (add->saturate || add->usesFlags() || typeSizeof(add->dType) == 8
       || isFloatType(add->dType))
      return false;

   // find which operand is (solely) produced by a SHL
   if (src0->getUniqueInsn() && src0->getUniqueInsn()->op == OP_SHL)
      s = 0;
   else
   if (src1->getUniqueInsn() && src1->getUniqueInsn()->op == OP_SHL)
      s = 1;
   else
      return false;

   src = add->getSrc(s);
   shl = src->getUniqueInsn();

   // only fuse a plain, same-block SHL with no modifier on its operand
   if (shl->bb != add->bb || shl->usesFlags() || shl->subOp || shl->src(0).mod)
      return false;

   // shift amount must be an immediate to be encodable in SHLADD
   if (!shl->src(1).getImmediate(imm))
      return false;

   add->op = OP_SHLADD;
   add->setSrc(2, add->src(!s));
   // SHL can't have any modifiers, but the ADD source may have had
   // one. Preserve it.
   add->setSrc(0, shl->getSrc(0));
   if (s == 1)
      add->src(0).mod = add->src(1).mod;
   add->setSrc(1, new_ImmediateValue(shl->bb->getProgram(), imm.reg.data.u32));
   add->src(1).mod = Modifier(0);

   return true;
}
2492 
2493 // MUL(a, b) -> a few XMADs
2494 // MAD/FMA(a, b, c) -> a few XMADs
// MUL(a, b) -> a few XMADs
// MAD/FMA(a, b, c) -> a few XMADs
void
LateAlgebraicOpt::handleMULMAD(Instruction *i)
{
   // TODO: handle NV50_IR_SUBOP_MUL_HIGH
   if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
      return;
   if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
      return;
   if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
      return;

   // earlier passes must have stripped modifiers from integer mul/mad
   assert(!i->src(0).mod);
   assert(!i->src(1).mod);
   assert(i->op == OP_MUL ? 1 : !i->src(2).mod);

   bld.setPosition(i, false);

   Value *a = i->getSrc(0);
   Value *b = i->getSrc(1);
   // a MUL is expanded like a MAD with a zero accumulator
   Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);

   Value *tmp0 = bld.getSSA();
   Value *tmp1 = bld.getSSA();

   // XMAD multiplies 16-bit halves; build the full 32x32 product from three
   // XMADs: lo*lo (+c), the merged cross terms, then the PSL.CBCC combine.
   Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c);
   insn->setPredicate(i->cc, i->getPredicate());

   insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0));
   insn->setPredicate(i->cc, i->getPredicate());
   insn->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);

   // temporarily drop the predicate so the rewrite of i itself is unaffected
   Value *pred = i->getPredicate();
   i->setPredicate(i->cc, NULL);

   i->op = OP_XMAD;
   i->setSrc(0, b);
   i->setSrc(1, tmp1);
   i->setSrc(2, tmp0);
   i->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
   i->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);

   i->setPredicate(i->cc, pred);
}
2538 
2539 bool
visit(Instruction * i)2540 LateAlgebraicOpt::visit(Instruction *i)
2541 {
2542    switch (i->op) {
2543    case OP_ADD:
2544       handleADD(i);
2545       break;
2546    case OP_MUL:
2547    case OP_MAD:
2548    case OP_FMA:
2549       handleMULMAD(i);
2550       break;
2551    default:
2552       break;
2553    }
2554 
2555    return true;
2556 }
2557 
2558 // =============================================================================
2559 
2560 // Split 64-bit MUL and MAD
class Split64BitOpPreRA : public Pass
{
private:
   virtual bool visit(BasicBlock *);
   // expand one 64-bit MUL/MAD into 32-bit ops of the given half type
   void split64MulMad(Function *, Instruction *, DataType);

   BuildUtil bld;
};
2569 
2570 bool
visit(BasicBlock * bb)2571 Split64BitOpPreRA::visit(BasicBlock *bb)
2572 {
2573    Instruction *i, *next;
2574    Modifier mod;
2575 
2576    for (i = bb->getEntry(); i; i = next) {
2577       next = i->next;
2578 
2579       DataType hTy;
2580       switch (i->dType) {
2581       case TYPE_U64: hTy = TYPE_U32; break;
2582       case TYPE_S64: hTy = TYPE_S32; break;
2583       default:
2584          continue;
2585       }
2586 
2587       if (i->op == OP_MAD || i->op == OP_MUL)
2588          split64MulMad(func, i, hTy);
2589    }
2590 
2591    return true;
2592 }
2593 
// Expand a 64-bit MUL/MAD into 32-bit MUL/MAD halves plus a MERGE, carrying
// the low-half carry into the high half for MAD.
void
Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
{
   assert(i->op == OP_MAD || i->op == OP_MUL);
   assert(!isFloatType(i->dType) && !isFloatType(i->sType));
   assert(typeSizeof(hTy) == 4);

   bld.setPosition(i, true);

   Value *zero = bld.mkImm(0u);
   Value *carry = bld.getSSA(1, FILE_FLAGS);

   // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
   // values (a, b and c might be 32-bit values), using 32-bit operations. This
   // gives the following operations:
   // * `d.low = low(a.low * b.low) (+ c.low)?`
   // * `d.high = low(a.high * b.low) + low(a.low * b.high)
   //           + high(a.low * b.low) (+ c.high)?`
   //
   // To compute the high bits, we can split in the following operations:
   // * `tmp1   = low(a.high * b.low) (+ c.high)?`
   // * `tmp2   = low(a.low * b.high) + tmp1`
   // * `d.high = high(a.low * b.low) + tmp2`
   //
   // mkSplit put lower bits at index 0 and higher bits at index 1

   // split each operand into {lo, hi}, padding 32-bit operands with zero
   Value *op1[2];
   if (i->getSrc(0)->reg.size == 8)
      bld.mkSplit(op1, 4, i->getSrc(0));
   else {
      op1[0] = i->getSrc(0);
      op1[1] = zero;
   }
   Value *op2[2];
   if (i->getSrc(1)->reg.size == 8)
      bld.mkSplit(op2, 4, i->getSrc(1));
   else {
      op2[0] = i->getSrc(1);
      op2[1] = zero;
   }

   Value *op3[2] = { NULL, NULL };
   if (i->op == OP_MAD) {
      if (i->getSrc(2)->reg.size == 8)
         bld.mkSplit(op3, 4, i->getSrc(2));
      else {
         op3[0] = i->getSrc(2);
         op3[1] = zero;
      }
   }

   // tmp1 = low(a.high * b.low) (+ c.high)
   Value *tmpRes1Hi = bld.getSSA();
   if (i->op == OP_MAD)
      bld.mkOp3(OP_MAD, hTy, tmpRes1Hi, op1[1], op2[0], op3[1]);
   else
      bld.mkOp2(OP_MUL, hTy, tmpRes1Hi, op1[1], op2[0]);

   // tmp2 = low(a.low * b.high) + tmp1
   Value *tmpRes2Hi = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), op1[0], op2[1], tmpRes1Hi);

   Value *def[2] = { bld.getSSA(), bld.getSSA() };

   // If it was a MAD, add the carry from the low bits
   // It is not needed if it was a MUL, since we added high(a.low * b.low) to
   // d.high
   if (i->op == OP_MAD)
      bld.mkOp3(OP_MAD, hTy, def[0], op1[0], op2[0], op3[0])->setFlagsDef(1, carry);
   else
      bld.mkOp2(OP_MUL, hTy, def[0], op1[0], op2[0]);

   // d.high = high(a.low * b.low) + tmp2 (+ carry from the low half for MAD)
   Instruction *hiPart3 = bld.mkOp3(OP_MAD, hTy, def[1], op1[0], op2[0], tmpRes2Hi);
   hiPart3->subOp = NV50_IR_SUBOP_MUL_HIGH;
   if (i->op == OP_MAD)
      hiPart3->setFlagsSrc(3, carry);

   // recombine the halves into the original 64-bit destination
   bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), def[0], def[1]);

   delete_Instruction(fn->getProgram(), i);
}
2672 
2673 // =============================================================================
2674 
2675 static inline void
updateLdStOffset(Instruction * ldst,int32_t offset,Function * fn)2676 updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
2677 {
2678    if (offset != ldst->getSrc(0)->reg.data.offset) {
2679       if (ldst->getSrc(0)->refCount() > 1)
2680          ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
2681       ldst->getSrc(0)->reg.data.offset = offset;
2682    }
2683 }
2684 
2685 // Combine loads and stores, forward stores to loads where possible.
// Combine loads and stores, forward stores to loads where possible.
class MemoryOpt : public Pass
{
private:
   // Bookkeeping entry for one load or store seen while scanning a block.
   class Record
   {
   public:
      Record *next;
      Instruction *insn;
      const Value *rel[2];    // indirect address components, if any
      const Value *base;
      int32_t offset;
      int8_t fileIndex;
      uint8_t size;
      bool locked;            // set when a later load overlaps this store
      Record *prev;

      // does @ldst access the same region as this record ?
      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   // active records per data file, for loads and stores respectively
   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   MemoryPool recordPool;

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();

private:
   // last record added, used to link adjacent accesses
   Record *prevRecord;
};
2741 
MemoryOpt()2742 MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
2743 {
2744    for (int i = 0; i < DATA_FILE_COUNT; ++i) {
2745       loads[i] = NULL;
2746       stores[i] = NULL;
2747    }
2748    prevRecord = NULL;
2749 }
2750 
2751 void
reset()2752 MemoryOpt::reset()
2753 {
2754    for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
2755       Record *it, *next;
2756       for (it = loads[i]; it; it = next) {
2757          next = it->next;
2758          recordPool.release(it);
2759       }
2760       loads[i] = NULL;
2761       for (it = stores[i]; it; it = next) {
2762          next = it->next;
2763          recordPool.release(it);
2764       }
2765       stores[i] = NULL;
2766    }
2767 }
2768 
// Widen the load recorded in @rec so that it also produces the values of the
// adjacent load @ld, then delete @ld. Returns false (leaving both loads
// untouched) if the combined access is unsupported or would be misaligned.
bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;
   // for compute indirect loads are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   // lock any stores that overlap with the load being merged into the
   // existing record.
   lockStores(ld);

   // count the defs of the recorded load (j = number of defs)
   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      // @ld covers the lower part: shift the record's defs up to make room
      // at the front and lower the record's offset accordingly.
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      // @ld covers the upper part: append its defs after the record's.
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}
2828 
// Merge the adjacent store @st with the store recorded in @rec into a single
// wider store (kept as @st), deleting the recorded instruction. Returns
// false if the combined access is unsupported or would be misaligned.
bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int s = sizeSt / 4;
   int size = sizeRc + sizeSt;
   int j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (!prog->getTarget()->
       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned stores
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;
   // for compute indirect stores are not guaranteed to be aligned
   if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
      return false;

   // There's really no great place to put this in a generic manner. Seemingly
   // wide stores at 0x60 don't work in GS shaders on SM50+. Don't combine
   // those.
   if (prog->getTarget()->getChipset() >= NVISA_GM107_CHIPSET &&
       prog->getType() == Program::TYPE_GEOMETRY &&
       st->getSrc(0)->reg.file == FILE_SHADER_OUTPUT &&
       rec->rel[0] == NULL &&
       MIN2(offRc, offSt) == 0x60)
      return false;

   // remove any existing load/store records for the store being merged into
   // the existing record.
   purgeRecords(st, DATA_FILE_COUNT);

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // recorded store covers the lower part: its sources go first
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= rec->insn->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      updateLdStOffset(st, offRc, func);
   } else {
      // @st covers the lower part: append the record's sources after @st's
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   delete_Instruction(prog, rec->insn);
   rec->insn = st;
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}
2902 
2903 void
set(const Instruction * ldst)2904 MemoryOpt::Record::set(const Instruction *ldst)
2905 {
2906    const Symbol *mem = ldst->getSrc(0)->asSym();
2907    fileIndex = mem->reg.fileIndex;
2908    rel[0] = ldst->getIndirect(0, 0);
2909    rel[1] = ldst->getIndirect(0, 1);
2910    offset = mem->reg.data.offset;
2911    base = mem->getBase();
2912    size = typeSizeof(ldst->sType);
2913 }
2914 
2915 void
link(Record ** list)2916 MemoryOpt::Record::link(Record **list)
2917 {
2918    next = *list;
2919    if (next)
2920       next->prev = this;
2921    prev = NULL;
2922    *list = this;
2923 }
2924 
2925 void
unlink(Record ** list)2926 MemoryOpt::Record::unlink(Record **list)
2927 {
2928    if (next)
2929       next->prev = prev;
2930    if (prev)
2931       prev->next = next;
2932    else
2933       *list = next;
2934 }
2935 
2936 MemoryOpt::Record **
getList(const Instruction * insn)2937 MemoryOpt::getList(const Instruction *insn)
2938 {
2939    if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
2940       return &loads[insn->src(0).getFile()];
2941    return &stores[insn->src(0).getFile()];
2942 }
2943 
2944 void
addRecord(Instruction * i)2945 MemoryOpt::addRecord(Instruction *i)
2946 {
2947    Record **list = getList(i);
2948    Record *it = reinterpret_cast<Record *>(recordPool.allocate());
2949 
2950    it->link(list);
2951    it->set(i);
2952    it->insn = i;
2953    it->locked = false;
2954 }
2955 
// Search the load or store list for a record that @insn could be combined
// with (adjacent access) or replaced from (overlapping access). Both
// accesses must share the same 16-byte aligned slot, indirects and file
// index. @isAdj is set to whether the returned record is merely adjacent
// rather than overlapping the queried access.
MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      // locked stores may still serve loads, but nothing else
      if (it->locked && insn->op != OP_LOAD && insn->op != OP_VFETCH)
         continue;
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            if (!isAdj)
               return it;
            // adjacency candidate only if the record is 8-byte aligned
            if (!(it->offset & 0x7))
               rec = it;
         }
      } else {
         isAdj = it->offset != sym->reg.data.offset;
         if (size <= it->size && !isAdj)
            return it;
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it;
      }
   }
   return rec;
}
2993 
// Forward the values of a previously recorded store to @ld: replace each
// def of the load with the matching GPR source of the store and remove the
// load. Fails if the store's sources do not line up exactly with the
// load's defs (offset or size mismatch) or are not plain GPRs.
bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   // advance to the store source that covers the load's start offset
   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def(d).replace(st->src(s), false);
   }
   ld->bb->remove(ld);
   return true;
}
3017 
// Eliminate the redundant load @ldE by forwarding the defs of the earlier
// recorded load @rec->insn, which covers (a superset of) the same range.
// Fails if the per-component sizes of the two loads don't line up.
bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   assert(offR <= offE);
   // advance to the def of @ldR that starts at @ldE's offset
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def(dE).replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}
3041 
// Merge the overlapping store @st with the recorded store @rec->insn into a
// single store (kept as @st), with @st's values taking precedence for the
// overlapped bytes. The recorded instruction is deleted.
bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   // the merged store covers the union of both ranges
   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   st->takeExtraSources(0, extra);

   if (offR < offS) {
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= ARRAY_SIZE(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      // @st fully covers the front of @ri: only append @ri's tail sources
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}
3096 
// Whether this record's memory range may overlap the access made by @ldst.
// Conservative with respect to indirect addressing: two indirect accesses
// are considered overlapping whenever they share the same base.
bool
MemoryOpt::Record::overlaps(const Instruction *ldst) const
{
   Record that;
   that.set(ldst);

   // This assumes that images/buffers can't overlap. They can.
   // TODO: Plumb the restrict logic through, and only skip when it's a
   // restrict situation, or there can implicitly be no writes.
   if (this->fileIndex != that.fileIndex && this->rel[1] == that.rel[1])
      return false;

   if (this->rel[0] || that.rel[0])
      return this->base == that.base;

   // direct accesses: plain interval intersection
   return
      (this->offset < that.offset + that.size) &&
      (this->offset + this->size > that.offset);
}
3116 
3117 // We must not eliminate stores that affect the result of @ld if
3118 // we find later stores to the same location, and we may no longer
3119 // merge them with later stores.
3120 // The stored value can, however, still be used to determine the value
3121 // returned by future loads.
3122 void
lockStores(Instruction * const ld)3123 MemoryOpt::lockStores(Instruction *const ld)
3124 {
3125    for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
3126       if (!r->locked && r->overlaps(ld))
3127          r->locked = true;
3128 }
3129 
3130 // Prior loads from the location of @st are no longer valid.
3131 // Stores to the location of @st may no longer be used to derive
3132 // the value at it nor be coalesced into later stores.
3133 void
purgeRecords(Instruction * const st,DataFile f)3134 MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
3135 {
3136    if (st)
3137       f = st->src(0).getFile();
3138 
3139    for (Record *r = loads[f]; r; r = r->next)
3140       if (!st || r->overlaps(st))
3141          r->unlink(&loads[f]);
3142 
3143    for (Record *r = stores[f]; r; r = r->next)
3144       if (!st || r->overlaps(st))
3145          r->unlink(&stores[f]);
3146 }
3147 
3148 bool
visit(BasicBlock * bb)3149 MemoryOpt::visit(BasicBlock *bb)
3150 {
3151    bool ret = runOpt(bb);
3152    // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
3153    // where 96 bit memory operations are forbidden.
3154    if (ret)
3155       ret = runOpt(bb);
3156    return ret;
3157 }
3158 
// Single forward walk over @bb: track loads/stores in per-file record lists
// and try to forward, replace or combine each new access against them.
// Non-memory instructions with side effects act as barriers that purge the
// affected record lists.
bool
MemoryOpt::runOpt(BasicBlock *bb)
{
   Instruction *ldst, *next;
   Record *rec;
   bool isAdjacent = true;

   for (ldst = bb->getEntry(); ldst; ldst = next) {
      bool keep = true;
      bool isLoad = true;
      next = ldst->next;

      // TODO: Handle combining sub 4-bytes loads/stores.
      if (ldst->op == OP_STORE && typeSizeof(ldst->dType) < 4) {
         purgeRecords(ldst, ldst->src(0).getFile());
         continue;
      }

      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
         // locked loads must not be touched; they invalidate overlapping
         // records instead
         if (ldst->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
            purgeRecords(ldst, ldst->src(0).getFile());
            continue;
         }
         if (ldst->isDead()) {
            // might have been produced by earlier optimization
            delete_Instruction(prog, ldst);
            continue;
         }
      } else
      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
         if (ldst->subOp == NV50_IR_SUBOP_STORE_UNLOCKED) {
            purgeRecords(ldst, ldst->src(0).getFile());
            continue;
         }
         // a 32-bit store of an undefined value (OP_NOP source) is useless
         if (typeSizeof(ldst->dType) == 4 &&
             ldst->src(1).getFile() == FILE_GPR &&
             ldst->getSrc(1)->getInsn()->op == OP_NOP) {
            delete_Instruction(prog, ldst);
            continue;
         }
         isLoad = false;
      } else {
         // TODO: maybe have all fixed ops act as barrier ?
         if (ldst->op == OP_CALL ||
             ldst->op == OP_BAR ||
             ldst->op == OP_MEMBAR) {
            purgeRecords(NULL, FILE_MEMORY_LOCAL);
            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
            purgeRecords(NULL, FILE_MEMORY_SHARED);
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         } else
         if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
            if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
               purgeRecords(NULL, FILE_MEMORY_LOCAL);
               purgeRecords(NULL, FILE_MEMORY_GLOBAL);
               purgeRecords(NULL, FILE_MEMORY_SHARED);
            } else {
               purgeRecords(NULL, ldst->src(0).getFile());
            }
         } else
         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         }
         continue;
      }

      // only const/input/output accesses are optimized below
      DataFile file = ldst->src(0).getFile();
      if (file != FILE_MEMORY_CONST &&
          file != FILE_SHADER_INPUT &&
          file != FILE_SHADER_OUTPUT)
         continue;

      if (ldst->getPredicate()) // TODO: handle predicated ld/st
         continue;
      if (ldst->perPatch) // TODO: create separate per-patch lists
         continue;

      if (isLoad) {
         // if ld l[]/g[] look for previous store to eliminate the reload
         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
            // TODO: shared memory ?
            rec = findRecord(ldst, false, isAdjacent);
            if (rec && !isAdjacent)
               keep = !replaceLdFromSt(ldst, rec);
         }

         // or look for ld from the same location and replace this one
         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
         if (rec) {
            if (!isAdjacent)
               keep = !replaceLdFromLd(ldst, rec);
            else
               // or combine a previous load with this one
               keep = !combineLd(rec, ldst);
         }
         if (keep)
            lockStores(ldst);
      } else {
         rec = findRecord(ldst, false, isAdjacent);
         if (rec) {
            if (!isAdjacent)
               keep = !replaceStFromSt(ldst, rec);
            else
               keep = !combineSt(rec, ldst);
         }
         if (keep)
            purgeRecords(ldst, DATA_FILE_COUNT);
      }
      if (keep)
         addRecord(ldst);
   }
   reset();

   return true;
}
3274 
3275 // =============================================================================
3276 
3277 // Turn control flow into predicated instructions (after register allocation !).
3278 // TODO:
3279 // Could move this to before register allocation on NVC0 and also handle nested
3280 // constructs.
class FlatteningPass : public Pass
{
public:
   FlatteningPass() : gpr_unit(0) {}

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   // convert a simple if/else at @bb's exit into predicated instructions
   bool tryPredicateConditional(BasicBlock *);
   // apply (@cc, @pred) to all instructions of the block, drop its flow exit
   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
   void tryPropagateBranch(BasicBlock *);
   // whether @pred only depends on immediates / constant buffer contents
   inline bool isConstantCondition(Value *pred);
   inline bool mayPredicate(const Instruction *, const Value *pred) const;
   inline void removeFlow(Instruction *);

   // GPR file unit of the target, cached by visit(Function)
   uint8_t gpr_unit;
};
3299 
// Whether @pred is an OP_SET whose operands come only from immediates or
// constant buffer contents. Used by tryPredicateConditional() to pick a
// smaller flattening limit for such conditions.
bool
FlatteningPass::isConstantCondition(Value *pred)
{
   Instruction *insn = pred->getUniqueInsn();
   assert(insn);
   if (insn->op != OP_SET || insn->srcExists(2))
      return false;

   for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
      Instruction *ld = insn->getSrc(s)->getUniqueInsn();
      DataFile file;
      if (ld) {
         // operand produced by an instruction: must be a direct MOV/LOAD
         if (ld->op != OP_MOV && ld->op != OP_LOAD)
            return false;
         if (ld->src(0).isIndirect(0))
            return false;
         file = ld->src(0).getFile();
      } else {
         file = insn->src(s).getFile();
         // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
         // in register "units", which can vary between targets.
         if (file == FILE_GPR) {
            Value *v = insn->getSrc(s);
            int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
            int units = bytes >> gpr_unit;
            if (units > prog->maxGPR)
               file = FILE_IMMEDIATE; // out-of-range reg reads as constant 0
         }
      }
      if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
         return false;
   }
   return true;
}
3334 
// Delete a BRA/JOIN flow instruction that became redundant through
// flattening. If its predicate value thereby loses its last use, its
// register is deallocated and the defining SET removed when dead.
void
FlatteningPass::removeFlow(Instruction *insn)
{
   FlowInstruction *term = insn ? insn->asFlow() : NULL;
   if (!term)
      return;
   Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();

   if (term->op == OP_BRA) {
      // TODO: this might get more difficult when we get arbitrary BRAs
      if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
         return;
   } else
   if (term->op != OP_JOIN)
      return;

   Value *pred = term->getPredicate();

   delete_Instruction(prog, term);

   if (pred && pred->refCount() == 0) {
      Instruction *pSet = pred->getUniqueInsn();
      pred->join->reg.data.id = -1; // deallocate
      if (pSet->isDead())
         delete_Instruction(prog, pSet);
   }
}
3362 
3363 void
predicateInstructions(BasicBlock * bb,Value * pred,CondCode cc)3364 FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
3365 {
3366    for (Instruction *i = bb->getEntry(); i; i = i->next) {
3367       if (i->isNop())
3368          continue;
3369       assert(!i->getPredicate());
3370       i->setPredicate(cc, pred);
3371    }
3372    removeFlow(bb->getExit());
3373 }
3374 
3375 bool
mayPredicate(const Instruction * insn,const Value * pred) const3376 FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
3377 {
3378    if (insn->isPseudo())
3379       return true;
3380    // TODO: calls where we don't know which registers are modified
3381 
3382    if (!prog->getTarget()->mayPredicate(insn, pred))
3383       return false;
3384    for (int d = 0; insn->defExists(d); ++d)
3385       if (insn->getDef(d)->equals(pred))
3386          return false;
3387    return true;
3388 }
3389 
3390 // If we jump to BRA/RET/EXIT, replace the jump with it.
3391 // NOTE: We do not update the CFG anymore here !
3392 //
3393 // TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
3394 //  BB:0
3395 //   @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
3396 //  BB1:
3397 //   bra BB:3
3398 //  BB2:
3399 //   ...
3400 //  BB3:
3401 //   ...
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   // walk BRAs at the end of @bb, back to front
   for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
      BasicBlock *bf = i->asFlow()->target.bb;

      // only consider targets consisting of a single flow instruction
      if (bf->getInsnCount() != 1)
         continue;

      FlowInstruction *bra = i->asFlow();
      FlowInstruction *rep = bf->getExit()->asFlow();

      if (!rep || rep->getPredicate())
         continue;
      if (rep->op != OP_BRA &&
          rep->op != OP_JOIN &&
          rep->op != OP_EXIT)
         continue;

      // TODO: If there are multiple branches to @rep, only the first would
      // be replaced, so only remove them after this pass is done ?
      // Also, need to check all incident blocks for fall-through exits and
      // add the branch there.
      bra->op = rep->op;
      bra->target.bb = rep->target.bb;
      if (bf->cfg.incidentCount() == 1)
         bf->remove(rep);
   }
}
3431 
bool
FlatteningPass::visit(Function *fn)
{
   // cache the GPR file unit for isConstantCondition()'s register-id checks
   gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);

   return true;
}
3439 
// Per-BB driver: first try to flatten a simple conditional; otherwise try
// to fold a trailing OP_JOIN into the preceding instruction's join flag, and
// finally attempt branch propagation.
bool
FlatteningPass::visit(BasicBlock *bb)
{
   if (tryPredicateConditional(bb))
      return true;

   // try to attach join to previous instruction
   if (prog->getTarget()->hasJoin) {
      Instruction *insn = bb->getExit();
      if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
         insn = insn->prev;
         // exclude flow, texturing, interpolation and wide/indirect memory
         // operations from carrying the join flag
         if (insn && !insn->getPredicate() &&
             !insn->asFlow() &&
             insn->op != OP_DISCARD &&
             insn->op != OP_TEXBAR &&
             !isTextureOp(insn->op) && // probably just nve4
             !isSurfaceOp(insn->op) && // not confirmed
             insn->op != OP_LINTERP && // probably just nve4
             insn->op != OP_PINTERP && // probably just nve4
             ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
              (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
             !insn->isNop()) {
            insn->join = 1;
            bb->remove(bb->getExit());
            return true;
         }
      }
   }

   tryPropagateBranch(bb);

   return true;
}
3473 
// If @bb ends in a simple two-way conditional, predicate the instructions of
// both sides instead of branching, provided each side is short enough and
// every instruction may be predicated. Returns whether flattening happened.
bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   // mask bit 0/1: left/right successor is a flattenable side
   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   // constant conditions don't diverge, so allow fewer instructions
   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   // taken side keeps the branch condition, other side gets its inverse
   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}
3534 
3535 // =============================================================================
3536 
// Fold Immediate into MAD; must be done after register allocation due to
// constraint SDST == SSRC2
// TODO:
// Does NVC0+ have other situations where this pass makes sense?
class PostRaLoadPropagation : public Pass
{
private:
   virtual bool visit(Instruction *);

   // chipset-specific folding of an immediate into a MAD/FMA source
   void handleMADforNV50(Instruction *);
   void handleMADforNVC0(Instruction *);
};
3549 
3550 static bool
post_ra_dead(Instruction * i)3551 post_ra_dead(Instruction *i)
3552 {
3553    for (int d = 0; i->defExists(d); ++d)
3554       if (i->getDef(d)->refCount())
3555          return false;
3556    return true;
3557 }
3558 
3559 // Fold Immediate into MAD; must be done after register allocation due to
3560 // constraint SDST == SSRC2
// Fold Immediate into MAD; must be done after register allocation due to
// constraint SDST == SSRC2
// For the integer case only a 16-bit half of the immediate is folded,
// selected by the parity of src1's register id.
void
PostRaLoadPropagation::handleMADforNV50(Instruction *i)
{
   // all operands and the result must be GPRs, with dst == src2
   if (i->def(0).getFile() != FILE_GPR ||
       i->src(0).getFile() != FILE_GPR ||
       i->src(1).getFile() != FILE_GPR ||
       i->src(2).getFile() != FILE_GPR ||
       i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
      return;

   // only registers 0..63 are encodable in this form
   if (i->getDef(0)->reg.data.id >= 64 ||
       i->getSrc(0)->reg.data.id >= 64)
      return;

   if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
      return;

   if (i->getPredicate())
      return;

   Value *vtmp;
   Instruction *def = i->getSrc(1)->getInsn();

   // look through a 32-bit SPLIT to the defining instruction
   if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
      def = def->getSrc(0)->getInsn();
   if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
      vtmp = i->getSrc(1);
      if (isFloatType(i->sType)) {
         i->setSrc(1, def->getSrc(0));
      } else {
         ImmediateValue val;
         // getImmediate() has side-effects on the argument so this *shouldn't*
         // be folded into the assert()
         ASSERTED bool ret = def->src(0).getImmediate(val);
         assert(ret);
         // pick the 16-bit half matching the register's parity
         if (i->getSrc(1)->reg.data.id & 1)
            val.reg.data.u32 >>= 16;
         val.reg.data.u32 &= 0xffff;
         i->setSrc(1, new_ImmediateValue(prog, val.reg.data.u32));
      }

      /* There's no post-RA dead code elimination, so do it here
       * XXX: if we add more code-removing post-RA passes, we might
       *      want to create a post-RA dead-code elim pass */
      if (post_ra_dead(vtmp->getInsn())) {
         Value *src = vtmp->getInsn()->getSrc(0);
         // Careful -- splits will have already been removed from the
         // functions. Don't double-delete.
         if (vtmp->getInsn()->bb)
            delete_Instruction(prog, vtmp->getInsn());
         if (src->getInsn() && post_ra_dead(src->getInsn()))
            delete_Instruction(prog, src->getInsn());
      }
   }
}
3616 
// NVC0+ variant: fold an immediate multiplicand of an F32 MAD/FMA directly
// into the instruction (moving it to src1), deleting the now-dead MOV.
void
PostRaLoadPropagation::handleMADforNVC0(Instruction *i)
{
   // all operands and the result must be GPRs, with dst == src2
   if (i->def(0).getFile() != FILE_GPR ||
       i->src(0).getFile() != FILE_GPR ||
       i->src(1).getFile() != FILE_GPR ||
       i->src(2).getFile() != FILE_GPR ||
       i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
      return;

   // TODO: gm107 can also do this for S32, maybe other chipsets as well
   if (i->dType != TYPE_F32)
      return;

   // src2 may carry at most a NEG modifier
   if ((i->src(2).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
      return;

   ImmediateValue val;
   int s;

   // @s is the surviving register source after folding the other one
   if (i->src(0).getImmediate(val))
      s = 1;
   else if (i->src(1).getImmediate(val))
      s = 0;
   else
      return;

   if ((i->src(s).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
      return;

   // the immediate has to end up in src1
   if (s == 1)
      i->swapSources(0, 1);

   Instruction *imm = i->getSrc(1)->getInsn();
   i->setSrc(1, imm->getSrc(0));
   if (post_ra_dead(imm))
      delete_Instruction(prog, imm);
}
3655 
3656 bool
visit(Instruction * i)3657 PostRaLoadPropagation::visit(Instruction *i)
3658 {
3659    switch (i->op) {
3660    case OP_FMA:
3661    case OP_MAD:
3662       if (prog->getTarget()->getChipset() < 0xc0)
3663          handleMADforNV50(i);
3664       else
3665          handleMADforNVC0(i);
3666       break;
3667    default:
3668       break;
3669    }
3670 
3671    return true;
3672 }
3673 
3674 // =============================================================================
3675 
// Common subexpression elimination. Stupid O^2 implementation.
class LocalCSE : public Pass
{
private:
   // Scans each basic block for instructions computing identical results
   // and merges them; iterates until a fixed point is reached.
   virtual bool visit(BasicBlock *);

   // If *ptr computes the same result as the (earlier) instruction, reuse
   // the earlier one, delete *ptr and set it to NULL; returns success.
   inline bool tryReplace(Instruction **, Instruction *);

   // Per-opcode lists of candidate instructions seen so far in the block.
   DLList ops[OP_LAST + 1];
};
3686 
// Global CSE: if all sources of a phi node are single-use values produced
// by equivalent instructions in the predecessor blocks, pull one of those
// instructions into the merge block and delete the phi.
class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
3692 
// Check whether two instructions perform the same operation with the same
// modifiers/flags. Sources and defs are NOT compared here; that is done by
// isResultEqual() on top of this check.
bool
Instruction::isActionEqual(const Instruction *that) const
{
   if (this->op != that->op ||
       this->dType != that->dType ||
       this->sType != that->sType)
      return false;
   if (this->cc != that->cc)
      return false;

   if (this->asTex()) {
      // Texture ops: the whole tex argument block must match bit for bit.
      if (memcmp(&this->asTex()->tex,
                 &that->asTex()->tex,
                 sizeof(this->asTex()->tex)))
         return false;
   } else
   if (this->asCmp()) {
      // Comparisons must additionally test the same condition.
      if (this->asCmp()->setCond != that->asCmp()->setCond)
         return false;
   } else
   if (this->asFlow()) {
      // Control flow instructions are never considered equal.
      return false;
   } else
   if (this->op == OP_PHI && this->bb != that->bb) {
      /* TODO: we could probably be a bit smarter here by following the
       * control flow, but honestly, it is quite painful to check */
      return false;
   } else {
      // Ordinary ALU/interp instructions: compare the remaining per-op
      // fields that affect the computed value.
      if (this->ipa != that->ipa ||
          this->lanes != that->lanes ||
          this->perPatch != that->perPatch)
         return false;
      if (this->postFactor != that->postFactor)
         return false;
   }

   // Common modifier bits relevant for all instruction kinds.
   if (this->subOp != that->subOp ||
       this->saturate != that->saturate ||
       this->rnd != that->rnd ||
       this->ftz != that->ftz ||
       this->dnz != that->dnz ||
       this->cache != that->cache ||
       this->mask != that->mask)
      return false;

   return true;
}
3740 
// Check whether this instruction is guaranteed to compute the same values
// as 'that': same action (isActionEqual), same predicate source, and defs
// and sources that match pairwise (sources including their modifiers).
// Memory accesses only qualify when reading files that this shader cannot
// have written in between.
bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   // Def lists must be equal element-wise and of the same length.
   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
         return false;
   }
   if (that->defExists(d))
      return false;

   // Likewise for sources, which must also agree on modifiers.
   for (s = 0; this->srcExists(s); ++s) {
      if (!that->srcExists(s))
         return false;
      if (this->src(s).mod != that->src(s).mod)
         return false;
      if (!this->getSrc(s)->equals(that->getSrc(s), true))
         return false;
   }
   if (that->srcExists(s))
      return false;

   // Loads can only be merged when the underlying memory is effectively
   // read-only from this program's point of view.
   if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
      switch (src(0).getFile()) {
      case FILE_MEMORY_CONST:
      case FILE_SHADER_INPUT:
         return true;
      case FILE_SHADER_OUTPUT:
         // Outputs only count as stable in tessellation eval programs.
         return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
      default:
         return false;
      }
   }

   return true;
}
3789 
// pull through common expressions from different in-blocks
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // TODO: maybe do this with OP_UNION, too

   for (phi = bb->getPhi(); phi && phi->op = =phi->op == OP_PHI ? phi->op : phi->op, phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
      if (ik->defCount(0xff) > 1)
         continue; // too painful to check if we can really push this forward
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn() ||
             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      if (!phi->srcExists(s)) {
         assert(ik->op != OP_PHI);
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}
3830 
3831 bool
tryReplace(Instruction ** ptr,Instruction * i)3832 LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
3833 {
3834    Instruction *old = *ptr;
3835 
3836    // TODO: maybe relax this later (causes trouble with OP_UNION)
3837    if (i->isPredicated())
3838       return false;
3839 
3840    if (!old->isResultEqual(i))
3841       return false;
3842 
3843    for (int d = 0; old->defExists(d); ++d)
3844       old->def(d).replace(i->getDef(d), false);
3845    delete_Instruction(prog, old);
3846    *ptr = NULL;
3847    return true;
3848 }
3849 
// Eliminate common subexpressions within a single basic block, repeating
// until no more replacements can be made (a replacement may expose new
// opportunities).
bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getFirst(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getFirst(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next;

         // Fixed instructions must stay, but can still serve as the
         // surviving copy for later duplicates.
         if (ir->fixed) {
            ops[ir->op].insert(ir);
            continue;
         }

         // Pick the LValue source with the fewest uses: walking its use
         // list is then the cheapest way to find duplicate candidates.
         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            // Only earlier instructions in this same block may replace ir.
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            // No LValue sources at all: fall back to scanning every prior
            // instruction of the same opcode.
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         // tryReplace() sets ir to NULL on success.
         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}
3910 
3911 // =============================================================================
3912 
// Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   DeadCodeElim() : deadCount(0) {}
   // Run DCE repeatedly until a pass removes nothing more.
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   // Number of instructions removed during the current run().
   unsigned int deadCount;
};
3927 
3928 bool
buryAll(Program * prog)3929 DeadCodeElim::buryAll(Program *prog)
3930 {
3931    do {
3932       deadCount = 0;
3933       if (!this->run(prog, false, false))
3934          return false;
3935    } while (deadCount);
3936 
3937    return true;
3938 }
3939 
// Sweep the block bottom-up deleting dead instructions (bottom-up so that
// removing a use can expose its producers as dead within the same sweep).
// Instructions kept for side effects but with unused results get their defs
// stripped, and special cases are rewritten to cheaper forms.
bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *prev;

   for (Instruction *i = bb->getExit(); i; i = prev) {
      prev = i->prev;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) &&
          i->subOp == 0 &&
          (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         // Multi-destination loads may be only partially dead.
         checkSplitLoad(i);
      } else
      if (i->defExists(0) && !i->getDef(0)->refCount()) {
         // Result is unused but the op has side effects; keep it, dropping
         // the def where possible.
         if (i->op == OP_ATOM ||
             i->op == OP_SUREDP ||
             i->op == OP_SUREDB) {
            const Target *targ = prog->getTarget();
            // NOTE(review): pre-GF100 atomic CAS keeps its destination --
            // presumably required by the encoding; confirm with ISA docs.
            if (targ->getChipset() >= NVISA_GF100_CHIPSET ||
                i->subOp != NV50_IR_SUBOP_ATOM_CAS)
               i->setDef(0, NULL);
            if (i->op == OP_ATOM && i->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
               // An exchange whose old value is ignored is just a store.
               i->cache = CACHE_CV;
               i->op = OP_STORE;
               i->subOp = 0;
            }
         } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
            // Value def unused: keep only def(1) (the lock-status def,
            // presumably), moving it into slot 0.
            i->setDef(0, i->getDef(1));
            i->setDef(1, NULL);
         }
      }
   }
   return true;
}
3977 
3978 // Each load can go into up to 4 destinations, any of which might potentially
3979 // be dead (i.e. a hole). These can always be split into 2 loads, independent
3980 // of where the holes are. We find the first contiguous region, put it into
3981 // the first load, and then put the second contiguous region into the second
3982 // load. There can be at most 2 contiguous regions.
3983 //
3984 // Note that there are some restrictions, for example it's not possible to do
3985 // a 64-bit load that's not 64-bit aligned, so such a load has to be split
3986 // up. Also hardware doesn't support 96-bit loads, so those also have to be
3987 // split into a 64-bit and 32-bit load.
3988 void
checkSplitLoad(Instruction * ld1)3989 DeadCodeElim::checkSplitLoad(Instruction *ld1)
3990 {
3991    Instruction *ld2 = NULL; // can get at most 2 loads
3992    Value *def1[4];
3993    Value *def2[4];
3994    int32_t addr1, addr2;
3995    int32_t size1, size2;
3996    int d, n1, n2;
3997    uint32_t mask = 0xffffffff;
3998 
3999    for (d = 0; ld1->defExists(d); ++d)
4000       if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
4001          mask &= ~(1 << d);
4002    if (mask == 0xffffffff)
4003       return;
4004 
4005    addr1 = ld1->getSrc(0)->reg.data.offset;
4006    n1 = n2 = 0;
4007    size1 = size2 = 0;
4008 
4009    // Compute address/width for first load
4010    for (d = 0; ld1->defExists(d); ++d) {
4011       if (mask & (1 << d)) {
4012          if (size1 && (addr1 & 0x7))
4013             break;
4014          def1[n1] = ld1->getDef(d);
4015          size1 += def1[n1++]->reg.size;
4016       } else
4017       if (!n1) {
4018          addr1 += ld1->getDef(d)->reg.size;
4019       } else {
4020          break;
4021       }
4022    }
4023 
4024    // Scale back the size of the first load until it can be loaded. This
4025    // typically happens for TYPE_B96 loads.
4026    while (n1 &&
4027           !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
4028                                                 typeOfSize(size1))) {
4029       size1 -= def1[--n1]->reg.size;
4030       d--;
4031    }
4032 
4033    // Compute address/width for second load
4034    for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
4035       if (mask & (1 << d)) {
4036          assert(!size2 || !(addr2 & 0x7));
4037          def2[n2] = ld1->getDef(d);
4038          size2 += def2[n2++]->reg.size;
4039       } else if (!n2) {
4040          assert(!n2);
4041          addr2 += ld1->getDef(d)->reg.size;
4042       } else {
4043          break;
4044       }
4045    }
4046 
4047    // Make sure that we've processed all the values
4048    for (; ld1->defExists(d); ++d)
4049       assert(!(mask & (1 << d)));
4050 
4051    updateLdStOffset(ld1, addr1, func);
4052    ld1->setType(typeOfSize(size1));
4053    for (d = 0; d < 4; ++d)
4054       ld1->setDef(d, (d < n1) ? def1[d] : NULL);
4055 
4056    if (!n2)
4057       return;
4058 
4059    ld2 = cloneShallow(func, ld1);
4060    updateLdStOffset(ld2, addr2, func);
4061    ld2->setType(typeOfSize(size2));
4062    for (d = 0; d < 4; ++d)
4063       ld2->setDef(d, (d < n2) ? def2[d] : NULL);
4064 
4065    ld1->bb->insertAfter(ld1, ld2);
4066 }
4067 
4068 // =============================================================================
4069 
// Instantiate pass class `n` and run its member function `f` on this
// Program, but only if the requested optimization level reaches `l`.
// Aborts optimization (returns false from the enclosing function) when the
// pass itself fails.
#define RUN_PASS(l, n, f)                       \
   if (level >= (l)) {                          \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)     \
         INFO("PEEPHOLE: %s\n", #n);            \
      n pass;                                   \
      if (!pass.f(this))                        \
         return false;                          \
   }
4078 
// Pre-RA optimization pipeline, run while the program is in SSA form.
// `level` gates how many passes run (see RUN_PASS); the pass order below
// is deliberate.
bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(1, MergeSplits, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(0, Split64BitOpPreRA, run);
   RUN_PASS(2, LateAlgebraicOpt, run);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(1, IndirectPropagation, run);
   RUN_PASS(4, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);

   return true;
}
4100 
// Optimization passes that run after register allocation.
bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   RUN_PASS(2, PostRaLoadPropagation, run);

   return true;
}
4109 
4110 }
4111