/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nv50_ir.h"
#include "nv50_ir_build_util.h"

#include "nv50_ir_target_nv50.h"

#define NV50_SU_INFO_SIZE_X       0x00
#define NV50_SU_INFO_SIZE_Y       0x04
#define NV50_SU_INFO_SIZE_Z       0x08
#define NV50_SU_INFO_BSIZE        0x0c
#define NV50_SU_INFO_STRIDE_Y     0x10
#define NV50_SU_INFO_MS_X         0x18
#define NV50_SU_INFO_MS_Y         0x1c
#define NV50_SU_INFO_TILE_SHIFT_X 0x20
#define NV50_SU_INFO_TILE_SHIFT_Y 0x24
#define NV50_SU_INFO_TILE_SHIFT_Z 0x28
#define NV50_SU_INFO_OFFSET_Z     0x2c

#define NV50_SU_INFO__STRIDE      0x30

#define NV50_SU_INFO_SIZE(i)       (0x00 + (i) * 4)
#define NV50_SU_INFO_MS(i)         (0x18 + (i) * 4)
#define NV50_SU_INFO_TILE_SHIFT(i) (0x20 + (i) * 4)

namespace nv50_ir {

// nv50 doesn't support 32 bit integer multiplication
//
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
//    -------------------
//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
// ah*bh 00 00                 (           carry1) << 16 + (  carry2)
//       al*bl
//    ah*bl 00
//
// fffe0001 + fffe0001
//
// Note that this sort of splitting doesn't work for signed values, so we
// compute the sign on those manually and then perform an unsigned multiply.
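//
// A worked example, with a = b = 0xffffffff (ah = al = bh = bl = 0xffff):
//   t1 = al*bh + ah*bl = 2 * 0xfffe0001 = 0x1fffc0002,
//        truncated to 0xfffc0002 with carry1 set
//   LO32 = (t1 << 16) + al*bl = 0x00020000 + 0xfffe0001
//        = 0x00000001 with carry2 set
//   HI32 = (t1 >> 16) + ah*bh + (carry1 << 16) + carry2
//        = 0x0000fffc + 0xfffe0001 + 0x00010000 + 1 = 0xfffffffe
// which matches 0xffffffff * 0xffffffff = 0xfffffffe00000001.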
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
   ImmediateValue src1;
   bool src1imm = mul->src(1).getImmediate(src1);

   DataType fTy; // full type
   switch (mul->sType) {
   case TYPE_S32: fTy = TYPE_U32; break;
   case TYPE_S64: fTy = TYPE_U64; break;
   default: fTy = mul->sType; break;
   }

   DataType hTy; // half type
   switch (fTy) {
   case TYPE_U32: hTy = TYPE_U16; break;
   case TYPE_U64: hTy = TYPE_U32; break;
   default:
      return false;
   }
   unsigned int fullSize = typeSizeof(fTy);
   unsigned int halfSize = typeSizeof(hTy);

   Instruction *i[9];

   bld->setPosition(mul, true);

   Value *s[2];
   Value *a[2], *b[2];
   Value *t[4];
   for (int j = 0; j < 4; ++j)
      t[j] = bld->getSSA(fullSize);

   if (isSignedType(mul->sType) && highResult) {
      s[0] = bld->getSSA(fullSize);
      s[1] = bld->getSSA(fullSize);
      bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
      bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
      src1.reg.data.s32 = abs(src1.reg.data.s32);
   } else {
      s[0] = mul->getSrc(0);
      s[1] = mul->getSrc(1);
   }

   // split sources into halves
   i[0] = bld->mkSplit(a, halfSize, s[0]);
   i[1] = bld->mkSplit(b, halfSize, s[1]);

   if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
      i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
                               bld->mkImm(src1.reg.data.u32 & 0xffff));
   } else {
      i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
                        src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
      if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
         i[3] = i[2];
         t[1] = t[0];
      } else {
         i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
      }
   }
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
   if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
      i[4] = i[3];
      t[3] = t[2];
   } else {
      i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
   }

   if (highResult) {
      Value *c[2];
      Value *r[5];
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
      c[0] = bld->getSSA(1, FILE_FLAGS);
      c[1] = bld->getSSA(1, FILE_FLAGS);
      for (int j = 0; j < 5; ++j)
         r[j] = bld->getSSA(fullSize);

      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
      bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
      i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);

      // set carry defs / sources
      i[3]->setFlagsDef(1, c[0]);
      // The actual result is required in the negative case, but ignored for
      // unsigned. For some reason the compiler ends up dropping the whole
      // instruction if the destination is unused but the flags are.
      if (isSignedType(mul->sType))
         i[4]->setFlagsDef(1, c[1]);
      else
         i[4]->setFlagsDef(0, c[1]);
      i[6]->setPredicate(CC_C, c[0]);
      i[5]->setFlagsSrc(3, c[1]);

      if (isSignedType(mul->sType)) {
         Value *cc[2];
         Value *rr[7];
         Value *one = bld->getSSA(fullSize);
         bld->loadImm(one, 1);
         for (int j = 0; j < 7; j++)
            rr[j] = bld->getSSA(fullSize);

         // NOTE: this logic uses predicates because splitting basic blocks is
         // ~impossible during the SSA phase. The RA relies on a correlation
         // between edge order and phi node sources.

         // Set the sign of the result based on the inputs
         bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
            ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));

         // 1s complement of 64-bit value
         bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
            ->setPredicate(CC_S, cc[0]);
         bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
            ->setPredicate(CC_S, cc[0]);

         // add to low 32-bits, keep track of the carry
         Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
         n->setPredicate(CC_S, cc[0]);
         n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));

         // If there was a carry, add 1 to the upper 32 bits
         // XXX: These get executed even if they shouldn't be
         bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
            ->setPredicate(CC_C, cc[1]);
         bld->mkMov(rr[3], rr[0])
            ->setPredicate(CC_NC, cc[1]);
         bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);

         // Merge the results from the negative and non-negative paths
         bld->mkMov(rr[5], rr[4])
            ->setPredicate(CC_S, cc[0]);
         bld->mkMov(rr[6], r[4])
            ->setPredicate(CC_NS, cc[0]);
         bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
      } else {
         bld->mkMov(mul->getDef(0), r[4]);
      }
   } else {
      bld->mkMov(mul->getDef(0), t[3]);
   }
   delete_Instruction(bld->getProgram(), mul);

   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
      if (i[j])
         i[j]->sType = hTy;

   return true;
}

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL     UR     LL     LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
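// For example, QUADOP(SUBR, SUBR, SUBR, SUBR) encodes to
// (1 << 6) | (1 << 4) | (1 << 2) | (1 << 0) = 0x55, i.e. every lane of the
// quad performs a reverse subtract.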

class NV50LegalizePostRA : public Pass
{
public:
   NV50LegalizePostRA() : r63(NULL) { }

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   void handlePRERET(FlowInstruction *);
   void replaceZero(Instruction *);

   BuildUtil bld;

   LValue *r63;
};

bool
NV50LegalizePostRA::visit(Function *fn)
{
   Program *prog = fn->getProgram();

   r63 = new_LValue(fn, FILE_GPR);
   // GPR units on nv50 are in half-regs
   if (prog->maxGPR < 126)
      r63->reg.data.id = 63;
   else
      r63->reg.data.id = 127;

   // this is actually per-program, but we can do it all on visiting main()
   std::list<Instruction *> *outWrites =
      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);

   if (outWrites) {
      for (std::list<Instruction *>::iterator it = outWrites->begin();
           it != outWrites->end(); ++it)
         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
      // instructions will be deleted on exit
      outWrites->clear();
   }

   return true;
}

void
NV50LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm && imm->reg.data.u64 == 0)
         i->setSrc(s, r63);
   }
}

// Emulate PRERET: jump to the target and call to the origin from there
//
// WARNING: atm only works if BBs are affected by at most a single PRERET
//
// BB:0
// preret BB:3
// (...)
// BB:3
// (...)
// --->
// BB:0
// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
// (...)
// BB:3
// bra BB:3 + n1 (skip the call)
// call BB:0 + n2 (skip bra at beginning of BB:0)
// (...)
void
NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
{
   BasicBlock *bbE = pre->bb;
   BasicBlock *bbT = pre->target.bb;

   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
   bbE->remove(pre);
   bbE->insertHead(pre);

   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);

   bbT->insertHead(call);
   bbT->insertHead(skip);

   // NOTE: maybe split blocks to prevent the instructions from moving ?

   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
}

bool
NV50LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->isNop()) {
         bb->remove(i);
      } else
      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
         handlePRERET(i->asFlow());
      } else {
         // TODO: We will want to do this before register allocation,
         // since we have to use a $c register for the carry flag.
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
            if (hi)
               next = hi;
         }

         if (i->op != OP_PFETCH && i->op != OP_BAR &&
             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   return true;
}

class NV50LegalizeSSA : public Pass
{
public:
   NV50LegalizeSSA(Program *);

   virtual bool visit(BasicBlock *bb);

private:
   void propagateWriteToOutput(Instruction *);
   void handleDIV(Instruction *);
   void handleMOD(Instruction *);
   void handleMUL(Instruction *);
   void handleAddrDef(Instruction *);

   inline bool isARL(const Instruction *) const;

   BuildUtil bld;

   std::list<Instruction *> *outWrites;
};

NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
{
   bld.setProgram(prog);

   if (prog->optLevel >= 2 &&
       (prog->getType() == Program::TYPE_GEOMETRY ||
        prog->getType() == Program::TYPE_VERTEX))
      outWrites =
         reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
   else
      outWrites = NULL;
}

void
NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
{
   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
      return;

   // check def instruction can store
   Instruction *di = st->getSrc(1)->defs.front()->getInsn();

   // TODO: move exports (if beneficial) in common opt pass
   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
      return;

   for (int s = 0; di->srcExists(s); ++s)
      if (di->src(s).getFile() == FILE_IMMEDIATE ||
          di->src(s).getFile() == FILE_MEMORY_LOCAL)
         return;

   if (prog->getType() == Program::TYPE_GEOMETRY) {
      // Only propagate output writes in geometry shaders when we can be sure
      // that we are propagating to the same output vertex.
      if (di->bb != st->bb)
         return;
      Instruction *i;
      for (i = di; i != st; i = i->next) {
         if (i->op == OP_EMIT || i->op == OP_RESTART)
            return;
      }
      assert(i); // st after di
   }

   // We cannot set defs to non-lvalues before register allocation, so
   // save & remove (to save registers) the exports and replace later.
   outWrites->push_back(st);
   st->bb->remove(st);
}

bool
NV50LegalizeSSA::isARL(const Instruction *i) const
{
   ImmediateValue imm;

   if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
      return false;
   if (!i->src(1).getImmediate(imm))
      return false;
   return imm.isInteger(0);
}

void
NV50LegalizeSSA::handleAddrDef(Instruction *i)
{
   Instruction *arl;

   i->getDef(0)->reg.size = 2; // $aX are only 16 bit

   // PFETCH can always write to $a
   if (i->op == OP_PFETCH)
      return;
   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
         return;
      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
         return;
   }

   // turn $a sources into $r sources (can't operate on $a)
   for (int s = 0; i->srcExists(s); ++s) {
      Value *a = i->getSrc(s);
      Value *r;
      if (a->reg.file == FILE_ADDRESS) {
         if (a->getInsn() && isARL(a->getInsn())) {
            i->setSrc(s, a->getInsn()->getSrc(0));
         } else {
            bld.setPosition(i, false);
            r = bld.getSSA();
            bld.mkMov(r, a);
            i->setSrc(s, r);
         }
      }
   }
   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
      return;

   // turn result back into $a
   bld.setPosition(i, true);
   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
   i->setDef(0, arl->getSrc(0));
}

void
NV50LegalizeSSA::handleMUL(Instruction *mul)
{
   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
      return;
   Value *def = mul->getDef(0);
   Value *pred = mul->getPredicate();
   CondCode cc = mul->cc;
   if (pred)
      mul->setPredicate(CC_ALWAYS, NULL);

   if (mul->op == OP_MAD) {
      Instruction *add = mul;
      bld.setPosition(add, false);
      Value *res = cloneShallow(func, mul->getDef(0));
      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
      add->op = OP_ADD;
      add->setSrc(0, mul->getDef(0));
      add->setSrc(1, add->getSrc(2));
      for (int s = 2; add->srcExists(s); ++s)
         add->setSrc(s, NULL);
      mul->subOp = add->subOp;
      add->subOp = 0;
   }
   expandIntegerMUL(&bld, mul);
   if (pred)
      def->getInsn()->setPredicate(cc, pred);
}

// Use f32 division: first compute an approximate result, use it to reduce
// the dividend, which should then be representable as f32, divide the reduced
// dividend, and add the quotients.
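//
// Roughly, for the unsigned case (sketched from the code below):
//   bf = rcp(float(b)) - 2 ulps      // bias down so q0 can't overshoot
//   q0 = trunc(float(a) * bf)        // approximate quotient
//   r  = a - q0 * b                  // residual dividend, fits in f32
//   q  = q0 + trunc(float(r) * bf)   // add the quotient of the residual
//   if (a - q * b >= b) q += 1       // final correction
// The SET result is 0 or -1, so the "q - s" below implements the +1.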
void
NV50LegalizeSSA::handleDIV(Instruction *div)
{
   const DataType ty = div->sType;

   if (ty != TYPE_U32 && ty != TYPE_S32)
      return;

   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;

   bld.setPosition(div, false);

   Value *a, *af = bld.getSSA();
   Value *b, *bf = bld.getSSA();

   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));

   if (isSignedType(ty)) {
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      a = bld.getSSA();
      b = bld.getSSA();
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
   } else {
      a = div->getSrc(0);
      b = div->getSrc(1);
   }

   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));

   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;

   // get error of 1st result
   expandIntegerMUL(&bld,
                    bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);

   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);

   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
      ->rnd = ROUND_Z;
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients

   // correction: if modulus >= divisor, add 1
   expandIntegerMUL(&bld,
                    bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
   if (!isSignedType(ty)) {
      div->op = OP_SUB;
      div->setSrc(0, q);
      div->setSrc(1, s);
   } else {
      t = q;
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
      s = bld.getSSA();
      t = bld.getSSA();
      // fix the sign
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);

      div->op = OP_UNION;
      div->setSrc(0, s);
      div->setSrc(1, t);
   }
}

void
NV50LegalizeSSA::handleMOD(Instruction *mod)
{
   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
      return;
   bld.setPosition(mod, false);

   Value *q = bld.getSSA();
   Value *m = bld.getSSA();

   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
   handleDIV(q->getInsn());

   bld.setPosition(mod, false);
   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));

   mod->op = OP_SUB;
   mod->setSrc(1, m);
}

bool
NV50LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *insn, *next;
   // skipping PHIs (don't pass them to handleAddrDef) !
   for (insn = bb->getEntry(); insn; insn = next) {
      next = insn->next;

      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
         handleAddrDef(insn);

      switch (insn->op) {
      case OP_EXPORT:
         if (outWrites)
            propagateWriteToOutput(insn);
         break;
      case OP_DIV:
         handleDIV(insn);
         break;
      case OP_MOD:
         handleMOD(insn);
         break;
      case OP_MAD:
      case OP_MUL:
         handleMUL(insn);
         break;
      default:
         break;
      }
   }
   return true;
}

class NV50LoweringPreSSA : public Pass
{
public:
   NV50LoweringPreSSA(Program *);

private:
   virtual bool visit(Instruction *);
   virtual bool visit(Function *);

   bool handleRDSV(Instruction *);

   bool handlePFETCH(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleLOAD(Instruction *);
   bool handleLDST(Instruction *);
   bool handleMEMBAR(Instruction *);
   bool handleSharedATOM(Instruction *);
   bool handleSULDP(TexInstruction *);
   bool handleSUREDP(TexInstruction *);
   bool handleSUSTP(TexInstruction *);
   Value *processSurfaceCoords(TexInstruction *);

   bool handleDIV(Instruction *);
   bool handleSQRT(Instruction *);

   bool handleSET(Instruction *);
   bool handleSLCT(CmpInstruction *);
   bool handleSELP(Instruction *);

   bool handleTEX(TexInstruction *);
   bool handleTXB(TexInstruction *); // I really
   bool handleTXL(TexInstruction *); // hate
   bool handleTXD(TexInstruction *); // these 3
   bool handleTXLQ(TexInstruction *);
   bool handleTXQ(TexInstruction *);
   bool handleSUQ(TexInstruction *);
   bool handleBUFQ(Instruction *);

   bool handleCALL(Instruction *);
   bool handlePRECONT(Instruction *);
   bool handleCONT(Instruction *);

   void checkPredicate(Instruction *);
   void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
   void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
   Value *loadSuInfo(int slot, uint32_t off);
   Value *loadSuInfo16(int slot, uint32_t off);

private:
   const Target *const targ;

   BuildUtil bld;

   Value *tid;
};

NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
   targ(prog->getTarget()), tid(NULL)
{
   bld.setProgram(prog);
}

bool
NV50LoweringPreSSA::visit(Function *f)
{
   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());

   if (prog->getType() == Program::TYPE_COMPUTE) {
      // Add implicit "thread id" argument in $r0 to the function
      Value *arg = new_LValue(func, FILE_GPR);
      arg->reg.data.id = 0;
      f->ins.push_back(arg);

      bld.setPosition(root, false);
      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
   }

   return true;
}

void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
                                       Value **ms_x, Value **ms_y) {
   // This loads the texture-indexed ms setting from the constant buffer
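   // Each stage seems to get its own array of 16 two-dword (ms_x, ms_y)
   // entries here; the additions below skip past the arrays of the stage
   // types preceding this program's type.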
   Value *tmp = new_LValue(func, FILE_GPR);
   uint8_t b = prog->driver->io.auxCBSlot;
   off += prog->driver->io.suInfoBase;
   if (prog->getType() > Program::TYPE_VERTEX)
      off += 16 * 2 * 4;
   if (prog->getType() > Program::TYPE_GEOMETRY)
      off += 16 * 2 * 4;
   if (prog->getType() > Program::TYPE_FRAGMENT)
      off += 16 * 2 * 4;
   *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
   *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                             FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
   *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
}

void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
   // Given a MS level, and a sample id, compute the delta x/y
   uint8_t b = prog->driver->io.msInfoCBSlot;
   Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);

   // The required information is at mslevel * 16 * 4 + sample * 8
   // = (mslevel * 8 + sample) * 8
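   // e.g. for ms level 2, sample 3: ((2 << 3) + 3) << 3 = 19 * 8 = 152 bytes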
   bld.mkOp2(OP_SHL,
             TYPE_U32,
             off,
             bld.mkOp2v(OP_ADD, TYPE_U32, t,
                        bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
                        s),
             bld.mkImm(3));
   *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                           FILE_MEMORY_CONST, b, TYPE_U32,
                           prog->driver->io.msInfoBase), off);
   *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                           FILE_MEMORY_CONST, b, TYPE_U32,
                           prog->driver->io.msInfoBase + 4), off);
}

Value *
NV50LoweringPreSSA::loadSuInfo(int slot, uint32_t off)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
   return bld.mkLoadv(TYPE_U32, bld.mkSymbol(
                            FILE_MEMORY_CONST, b, TYPE_U32, off), NULL);
}

Value *
NV50LoweringPreSSA::loadSuInfo16(int slot, uint32_t off)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += prog->driver->io.bufInfoBase + slot * NV50_SU_INFO__STRIDE;
   return bld.mkLoadv(TYPE_U16, bld.mkSymbol(
                            FILE_MEMORY_CONST, b, TYPE_U16, off), NULL);
}

bool
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
{
   const int arg = i->tex.target.getArgCount();
   const int dref = arg;
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;

   /* Only normalize in the non-explicit derivatives case.
    */
   if (i->tex.target.isCube() && i->op != OP_TXD) {
      Value *src[3], *val;
      int c;
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // handle MS, which means looking up the MS params for this texture, and
   // adjusting the input coordinates to point at the right sample.
   if (i->tex.target.isMS()) {
      Value *x = i->getSrc(0);
      Value *y = i->getSrc(1);
      Value *s = i->getSrc(arg - 1);
      Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
         *ms, *ms_x, *ms_y, *dx, *dy;

      i->tex.target.clearMS();

      loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
      loadMsInfo(ms, s, &dx, &dy);

      bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
      bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
      bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
      bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
      i->setSrc(0, tx);
      i->setSrc(1, ty);
      i->setSrc(arg - 1, bld.loadImm(NULL, 0));
   }

   // dref comes before bias/lod
   if (i->tex.target.isShadow())
      if (i->op == OP_TXB || i->op == OP_TXL)
         i->swapSources(dref, lod);

   if (i->tex.target.isArray()) {
      if (i->op != OP_TXF) {
         // array index must be converted to u32, but it's already an integer
         // for TXF
         Value *layer = i->getSrc(arg - 1);
         LValue *src = new_LValue(func, FILE_GPR);
         bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
         bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
         i->setSrc(arg - 1, src);
      }
      if (i->tex.target.isCube() && i->srcCount() > 4) {
         std::vector<Value *> acube, a2d;
         int c;

         acube.resize(4);
         for (c = 0; c < 4; ++c)
            acube[c] = i->getSrc(c);
         a2d.resize(4);
         for (c = 0; c < 3; ++c)
            a2d[c] = new_LValue(func, FILE_GPR);
         a2d[3] = NULL;

         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
                   a2d, acube)->asTex()->tex.mask = 0x7;

         for (c = 0; c < 3; ++c)
            i->setSrc(c, a2d[c]);
         for (; i->srcExists(c + 1); ++c)
            i->setSrc(c, i->getSrc(c + 1));
         i->setSrc(c, NULL);
         assert(c <= 4);

         i->tex.target = i->tex.target.isShadow() ?
            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
      }
   }

   // texel offsets are 3 immediate fields in the instruction,
   // nv50 cannot do textureGatherOffsets
   assert(i->tex.useOffsets <= 1);
   if (i->tex.useOffsets) {
      for (int c = 0; c < 3; ++c) {
         ImmediateValue val;
         if (!i->offset[0][c].getImmediate(val))
            assert(!"non-immediate offset");
         i->tex.offset[c] = val.reg.data.u32;
         i->offset[0][c].set(NULL);
      }
   }

   return true;
}

// Bias must be equal for all threads of a quad, or the LOD calculation will
// fail.
//
// The lanes of a quad are grouped by the bit they have set in the condition
// register, which is selected by their differing bias values.
// Move the input values for TEX into a new register set for each group and
// execute TEX only for a specific group.
// We always need to use 4 new registers for the inputs/outputs because the
// implicitly calculated derivatives must be correct.
//
// TODO: move to SSA phase so we can easily determine whether bias is constant
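//
// Rough shape of the lowered code: derive a per-lane group flag with
// quadops, clone the TEX once per group with a matching predicate, and
// merge the four result sets back together with predicated movs + OP_UNION.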
bool
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
{
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
   int l, d;

   // We can't actually apply bias *and* do a compare for a cube
   // texture. Since the compare has to be done before the filtering, just
   // drop the bias on the floor.
   if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
      i->op = OP_TEX;
      i->setSrc(3, i->getSrc(4));
      i->setSrc(4, NULL);
      return handleTEX(i);
   }

   handleTEX(i);
   Value *bias = i->getSrc(i->tex.target.getArgCount());
   if (bias->isUniform())
      return true;

   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
                                 bld.loadImm(NULL, 1));
   bld.setPosition(cond, false);

   for (l = 1; l < 4; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *bit = bld.getSSA();
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      Value *imm = bld.loadImm(NULL, (1 << l));
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
      cond->setSrc(l, bit);
   }
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;

   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
      bld.insert(tex[l]);
   }

   Value *res[4][4];
   for (d = 0; i->defExists(d); ++d)
      res[0][d] = tex[0]->getDef(d);
   for (l = 1; l < 4; ++l) {
      for (d = 0; tex[l]->defExists(d); ++d) {
         res[l][d] = cloneShallow(func, res[0][d]);
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
      }
   }

   for (d = 0; i->defExists(d); ++d) {
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
      for (l = 0; l < 4; ++l)
         dst->setSrc(l, res[l][d]);
   }
   delete_Instruction(prog, i);
   return true;
}

// LOD must be equal for all threads of a quad.
// Unlike with TXB, here we can just diverge since there's no LOD calculation
// that would require all 4 threads' sources to be set up properly.
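//
// Roughly: loop over the four lanes, branching each matching lane group off
// to the shared TEX block, with a JOINAT/JOIN pair bracketing the divergent
// region.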
bool
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
{
   handleTEX(i);
   Value *lod = i->getSrc(i->tex.target.getArgCount());
   if (lod->isUniform())
      return true;

   BasicBlock *currBB = i->bb;
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
   BasicBlock *joinBB = i->bb->splitAfter(i);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   for (int l = 0; l <= 3; ++l) {
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      bld.setPosition(currBB, true);
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
      if (l <= 2) {
         BasicBlock *laneBB = new BasicBlock(func);
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
         currBB = laneBB;
      }
   }
   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
   return true;
}

bool
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
{
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) },  // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) },  // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4];
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   handleTEX(i);
   i->op = OP_TEX; // no need to clone dPdx/dPdy later
   i->tex.derivAll = true;

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // normalize cube coordinates if necessary
      if (i->tex.target.isCube()) {
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, src[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
{
   handleTEX(i);
   bld.setPosition(i, true);

   /* The returned values are not quite what we want:
    * (a) convert from s32 to f32
    * (b) multiply by 1/256
    */
   for (int def = 0; def < 2; ++def) {
      if (!i->defExists(def))
         continue;
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   }
   return true;
}

bool
NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
{
   Value *ms, *ms_x, *ms_y;
   if (i->tex.query == TXQ_DIMS) {
      if (i->tex.target.isMS()) {
         bld.setPosition(i, true);
         loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
         int d = 0;
         if (i->tex.mask & 1) {
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_x);
            d++;
         }
         if (i->tex.mask & 2) {
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(d), i->getDef(d), ms_y);
            d++;
         }
      }
      return true;
   }
   assert(i->tex.query == TXQ_TYPE);
   assert(i->tex.mask == 4);

   loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
   bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
   i->bb->remove(i);

   return true;
}

bool
NV50LoweringPreSSA::handleSUQ(TexInstruction *suq)
{
   const int dim = suq->tex.target.getDim();
   const int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
   int mask = suq->tex.mask;
   int slot = suq->tex.r;
   int c, d;

   for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
      if (c >= arg || !(mask & 1))
         continue;

      int offset;

      if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
         offset = NV50_SU_INFO_SIZE(2);
      } else {
         offset = NV50_SU_INFO_SIZE(c);
      }
      bld.mkMov(suq->getDef(d++), loadSuInfo(slot, offset));
      if (c == 2 && suq->tex.target.isCube())
         bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
                   bld.loadImm(NULL, 6));
   }

   if (mask & 1) {
      if (suq->tex.target.isMS()) {
         Value *ms_x = loadSuInfo(slot, NV50_SU_INFO_MS(0));
         Value *ms_y = loadSuInfo(slot, NV50_SU_INFO_MS(1));
         Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
         bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
      } else {
         bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
      }
   }

   bld.remove(suq);
   return true;
}

bool
NV50LoweringPreSSA::handleBUFQ(Instruction *bufq)
{
   bufq->op = OP_MOV;
   bufq->setSrc(0, loadSuInfo(bufq->getSrc(0)->reg.fileIndex, NV50_SU_INFO_SIZE_X));
   bufq->setIndirect(0, 0, NULL);
   bufq->setIndirect(0, 1, NULL);
   return true;
}

bool
NV50LoweringPreSSA::handleSET(Instruction *i)
{
   if (i->dType == TYPE_F32) {
      bld.setPosition(i, true);
      i->dType = TYPE_U32;
      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
   }
   return true;
}

bool
NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
{
   Value *src0 = bld.getSSA();
   Value *src1 = bld.getSSA();
   Value *pred = bld.getScratch(1, FILE_FLAGS);

   Value *v0 = i->getSrc(0);
   Value *v1 = i->getSrc(1);
   // XXX: these probably shouldn't be immediates in the first place ...
   if (v0->asImm())
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   if (v1->asImm())
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);

   bld.setPosition(i, true);
   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);

   bld.setPosition(i, false);
   i->op = OP_SET;
   i->setFlagsDef(0, pred);
   i->dType = TYPE_U8;
   i->setSrc(0, i->getSrc(2));
   i->setSrc(2, NULL);
   i->setSrc(1, bld.loadImm(NULL, 0));

   return true;
}

bool
NV50LoweringPreSSA::handleSELP(Instruction *i)
{
   Value *src0 = bld.getSSA();
   Value *src1 = bld.getSSA();

   Value *v0 = i->getSrc(0);
   Value *v1 = i->getSrc(1);
   if (v0->asImm())
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   if (v1->asImm())
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);

   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
   delete_Instruction(prog, i);
   return true;
}

bool
NV50LoweringPreSSA::handleCALL(Instruction *i)
{
   if (prog->getType() == Program::TYPE_COMPUTE) {
      // Add implicit "thread id" argument in $r0 to the function
      i->setSrc(i->srcCount(), tid);
   }
   return true;
}

bool
NV50LoweringPreSSA::handlePRECONT(Instruction *i)
{
   delete_Instruction(prog, i);
   return true;
}

bool
NV50LoweringPreSSA::handleCONT(Instruction *i)
{
   i->op = OP_BRA;
   return true;
}

bool
NV50LoweringPreSSA::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
   Value *def = i->getDef(0);
   SVSemantic sv = sym->reg.data.sv.sv;
   int idx = sym->reg.data.sv.index;

   if (addr >= 0x400) // mov $sreg
      return true;

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      break;
   case SV_FACE:
      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
      if (i->dType == TYPE_F32) {
         bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
         bld.mkOp1(OP_NEG, TYPE_S32, def, def);
         bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
      }
      break;
   case SV_NCTAID:
   case SV_CTAID:
   case SV_NTID: {
      Value *x = bld.getSSA(2);
      bld.mkOp1(OP_LOAD, TYPE_U16, x,
                bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
      bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
      break;
   }
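   // tid packs x into bits [15:0], y into [25:16] and z into [31:26]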
   case SV_TID:
      if (idx == 0) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
      } else if (idx == 1) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
      } else if (idx == 2) {
         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
      } else {
         bld.mkMov(def, bld.mkImm(0));
      }
      break;
   case SV_COMBINED_TID:
      bld.mkMov(def, tid);
      break;
   case SV_SAMPLE_POS: {
      Value *off = new_LValue(func, FILE_ADDRESS);
      bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
      bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
      bld.mkLoad(TYPE_F32,
                 def,
                 bld.mkSymbol(
                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                       TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
                 off);
      break;
   }
   case SV_THREAD_KILL:
      // Not actually supported. But it's implementation-dependent, so we can
      // always just say it's not a helper.
      bld.mkMov(def, bld.loadImm(NULL, 0));
      break;
   default:
      bld.mkFetch(i->getDef(0), i->dType,
                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
      break;
   }
   bld.getBB()->remove(i);
   return true;
}

bool
NV50LoweringPreSSA::handleDIV(Instruction *i)
{
   if (!isFloatType(i->dType))
      return true;
   bld.setPosition(i, false);
   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
   i->op = OP_MUL;
   i->setSrc(1, rcp->getDef(0));
   return true;
}

bool
NV50LoweringPreSSA::handleSQRT(Instruction *i)
{
   bld.setPosition(i, true);
   i->op = OP_RSQ;
   bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));

   return true;
}

bool
NV50LoweringPreSSA::handleEXPORT(Instruction *i)
{
   if (prog->getType() == Program::TYPE_FRAGMENT) {
      if (i->getIndirect(0, 0)) {
         // TODO: redirect to l[] here, load to GPRs at exit
         return false;
      } else {
         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units

         i->op = OP_MOV;
         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
         i->src(0).set(i->src(1));
         i->setSrc(1, NULL);
         i->setDef(0, new_LValue(func, FILE_GPR));
         i->getDef(0)->reg.data.id = id;

         prog->maxGPR = MAX2(prog->maxGPR, id * 2);
      }
   }
   return true;
}

// Handle indirect addressing in geometry shaders:
//
// ld $r0 a[$a1][$a2+k] ->
// ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
//
bool
NV50LoweringPreSSA::handleLOAD(Instruction *i)
{
   ValueRef src = i->src(0);
   Symbol *sym = i->getSrc(0)->asSym();

   if (prog->getType() == Program::TYPE_COMPUTE) {
      if (sym->inFile(FILE_MEMORY_SHARED) ||
          sym->inFile(FILE_MEMORY_BUFFER) ||
          sym->inFile(FILE_MEMORY_GLOBAL)) {
         return handleLDST(i);
      }
   }

   if (src.isIndirect(1)) {
      assert(prog->getType() == Program::TYPE_GEOMETRY);
      Value *addr = i->getIndirect(0, 1);

      if (src.isIndirect(0)) {
         // base address is in an address register, so move to a GPR
         Value *base = bld.getScratch();
         bld.mkMov(base, addr);

         Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
         Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
         Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                    i->getIndirect(0, 0), bld.mkImm(2));

         // Calculate final address: addr = base + attr*vstride; use 16-bit
         // multiplication since 32-bit would be lowered to multiple
         // instructions, and we only need the low 16 bits of the result
         Value *a[2], *b[2];
         bld.mkSplit(a, 2, attrib);
         bld.mkSplit(b, 2, vstride);
         Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
                                 base);

         // move address from GPR into an address register
         addr = bld.getSSA(2, FILE_ADDRESS);
         bld.mkMov(addr, sum);
      }

      i->setIndirect(0, 1, NULL);
      i->setIndirect(0, 0, addr);
   }

   return true;
}

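// Emulate atomic operations on shared memory with a lock/store-unlock loop:
// load the current value (locked), compute the new value, store it back
// (unlocked), and retry until the lock was actually taken. On chips without
// load-locked/store-unlocked (< nva0), the flags are instead seeded with an
// immediate so that the store path is simply always taken.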
bool
NV50LoweringPreSSA::handleSharedATOM(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);
   BasicBlock *setAndUnlockBB = new BasicBlock(func);
   BasicBlock *failLockBB = new BasicBlock(func);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   Value *locked = bld.getSSA(1, FILE_FLAGS);
   if (prog->getTarget()->getChipset() >= 0xa0) {
      ld->setFlagsDef(1, locked);
      ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
   } else {
      bld.mkMov(locked, bld.loadImm(NULL, 2))
         ->flagsDef = 0;
   }

   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_LT, locked);
   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);

   tryLockBB->cfg.detach(&joinBB->cfg);
   bld.remove(atom);

   bld.setPosition(setAndUnlockBB, true);
   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_FLAGS),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));

      Instruction *selp =
         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), atom->getSrc(2),
                   ld->getDef(0), set->getDef(0));
      stVal = selp->getDef(0);

      handleSELP(selp);
   } else {
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return false;
      }

      Instruction *i =
         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(1));

      stVal = i->getDef(0);
   }

   Instruction *store = bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                                    atom->getIndirect(0, 0), stVal);
   if (prog->getTarget()->getChipset() >= 0xa0) {
      store->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
   }

   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);

   // Loop until the lock is acquired.
   bld.setPosition(failLockBB, true);
   bld.mkFlow(OP_BRA, tryLockBB, CC_GEU, locked);
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;

   return true;
}

bool
NV50LoweringPreSSA::handleLDST(Instruction *i)
{
   ValueRef src = i->src(0);
   Symbol *sym = i->getSrc(0)->asSym();

   if (prog->getType() != Program::TYPE_COMPUTE) {
      return true;
   }

   // Buffers just map directly to the different global memory spaces
   if (sym->inFile(FILE_MEMORY_BUFFER)) {
      sym->reg.file = FILE_MEMORY_GLOBAL;
   }

   if (sym->inFile(FILE_MEMORY_SHARED)) {

      if (src.isIndirect(0)) {
         Value *addr = i->getIndirect(0, 0);

         if (!addr->inFile(FILE_ADDRESS)) {
            // Move address from GPR into an address register
            Value *new_addr = bld.getSSA(2, FILE_ADDRESS);
            bld.mkMov(new_addr, addr);

            i->setIndirect(0, 0, new_addr);
         }
      }

      if (i->op == OP_ATOM)
         handleSharedATOM(i);
   } else if (sym->inFile(FILE_MEMORY_GLOBAL)) {
      // All global access must be indirect. There are no instruction forms
      // with direct access.
      Value *addr = i->getIndirect(0, 0);

      Value *offset = bld.loadImm(bld.getSSA(), sym->reg.data.offset);
      Value *sum;
      if (addr != NULL)
         sum = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), addr,
                          offset);
      else
         sum = offset;

      i->setIndirect(0, 0, sum);
      sym->reg.data.offset = 0;
   }

   return true;
}

bool
NV50LoweringPreSSA::handleMEMBAR(Instruction *i)
{
   // For global memory, apparently doing a bunch of reads at different
   // addresses forces things to get sufficiently flushed.
   if (i->subOp & NV50_IR_SUBOP_MEMBAR_GL) {
      uint8_t b = prog->driver->io.auxCBSlot;
      Value *base =
         bld.mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32,
                                            prog->driver->io.membarOffset), NULL);
      Value *physid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), bld.mkSysVal(SV_PHYSID, 0));
      Value *off = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                              bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(),
                                         physid, bld.loadImm(NULL, 0x1f)),
                              bld.loadImm(NULL, 2));
      base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, off);
      Symbol *gmemMembar = bld.mkSymbol(FILE_MEMORY_GLOBAL, prog->driver->io.gmemMembar, TYPE_U32, 0);
      for (int i = 0; i < 8; i++) {
         if (i != 0) {
            base = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), base, bld.loadImm(NULL, 0x100));
         }
         bld.mkLoad(TYPE_U32, bld.getSSA(), gmemMembar, base)
            ->fixed = 1;
      }
   }

   // Both global and shared memory barriers also need a regular control bar
   // TODO: double-check this is the case
   i->op = OP_BAR;
   i->subOp = NV50_IR_SUBOP_BAR_SYNC;
   i->setSrc(0, bld.mkImm(0u));
   i->setSrc(1, bld.mkImm(0u));

   return true;
}

// The type that best represents how each component can be stored when packed.
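// For example, an RGBA16F image yields TYPE_F16 per component, while R32UI
// yields TYPE_U32.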
static DataType
getPackedType(const TexInstruction::ImgFormatDesc *t, int c)
{
   switch (t->type) {
   case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
   case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
   case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
   case UINT:
      return (t->bits[c] == 8 ? TYPE_U8 :
              (t->bits[c] <= 16 ? TYPE_U16 : TYPE_U32));
   case SINT:
      return (t->bits[c] == 8 ? TYPE_S8 :
              (t->bits[c] <= 16 ? TYPE_S16 : TYPE_S32));
   }
   return TYPE_NONE;
}

// The type that the rest of the shader expects to process this image type in.
static DataType
getShaderType(const ImgType type) {
   switch (type) {
   case FLOAT:
   case UNORM:
   case SNORM:
      return TYPE_F32;
   case UINT:
      return TYPE_U32;
   case SINT:
      return TYPE_S32;
   default:
      assert(!"Impossible type");
      return TYPE_NONE;
   }
}

// Reads the raw coordinates out of the input instruction, and returns a
// single-value coordinate which is what the hardware expects to receive in a
// ld/st op.
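// e.g. for an RGBA8 surface, bytes = 4, so shift = 2 and the x coordinate
// is converted to a byte offset by a left shift of 2.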
1663 Value *
processSurfaceCoords(TexInstruction * su)1664 NV50LoweringPreSSA::processSurfaceCoords(TexInstruction *su)
1665 {
1666 const int slot = su->tex.r;
1667 const int dim = su->tex.target.getDim();
1668 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1669
1670 const TexInstruction::ImgFormatDesc *format = su->tex.format;
1671 const uint16_t bytes = (format->bits[0] + format->bits[1] +
1672 format->bits[2] + format->bits[3]) / 8;
1673 uint16_t shift = ffs(bytes) - 1;
1674
1675 // Buffer sizes don't necessarily fit in 16-bit values
1676 if (su->tex.target == TEX_TARGET_BUFFER) {
1677 return bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1678 su->getSrc(0), bld.loadImm(NULL, (uint32_t)shift));
1679 }
1680
1681 // For buffers, we just need the byte offset. And for 2d buffers we want
1682 // the x coordinate in bytes as well.
1683 Value *coords[3] = {};
1684 for (int i = 0; i < arg; i++) {
1685 Value *src[2];
1686 bld.mkSplit(src, 2, su->getSrc(i));
1687 coords[i] = src[0];
1688 // For 1d-images, we want the y coord to be 0, which it will be here.
1689 if (i == 0)
1690 coords[1] = src[1];
1691 }
1692
1693 coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1694 coords[0], bld.loadImm(NULL, shift));
1695
1696 if (su->tex.target.isMS()) {
1697 Value *ms_x = loadSuInfo16(slot, NV50_SU_INFO_MS(0));
1698 Value *ms_y = loadSuInfo16(slot, NV50_SU_INFO_MS(1));
1699 coords[0] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[0], ms_x);
1700 coords[1] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), coords[1], ms_y);
1701 }
1702
1703 // If there are more dimensions, we just want the y-offset. But that needs
1704 // to be adjusted up by the y-stride for array images.
1705 if (su->tex.target.isArray() || su->tex.target.isCube()) {
1706 Value *index = coords[dim];
1707 Value *height = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
1708 Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4), index, height);
1709 mul->sType = TYPE_U16;
1710 Value *muls[2];
1711 bld.mkSplit(muls, 2, mul->getDef(0));
1712 if (dim > 1)
1713 coords[1] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), coords[1], muls[0]);
1714 else
1715 coords[1] = muls[0];
1716 }
1717
1718 // 3d is special-cased. Note that a single "slice" of a 3d image may
1719 // also be attached as 2d, so we have to do the same 3d processing for
1720 // 2d as well, just in case. In order to remap a 3d image onto a 2d
1721 // image, we have to retile it "by hand".
1722 if (su->tex.target == TEX_TARGET_3D || su->tex.target == TEX_TARGET_2D) {
1723 Value *z = loadSuInfo16(slot, NV50_SU_INFO_OFFSET_Z);
1724 Value *y_size_aligned = loadSuInfo16(slot, NV50_SU_INFO_STRIDE_Y);
1725 // Add the z coordinate for actual 3d-images
1726 if (dim > 2)
1727 coords[2] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), z, coords[2]);
1728 else
1729 coords[2] = z;
1730
1731 // Compute the surface parameters from tile shifts
1732 Value *tile_shift[3];
1733 Value *tile_size[3];
1734 Value *tile_mask[3];
1735 // We only ever use one kind of X-tiling.
1736 tile_shift[0] = bld.loadImm(NULL, (uint16_t)6);
1737 tile_size[0] = bld.loadImm(NULL, (uint16_t)64);
1738 tile_mask[0] = bld.loadImm(NULL, (uint16_t)63);
1739 // Fetch the "real" tiling parameters of the underlying surface
1740 for (int i = 1; i < 3; i++) {
1741 tile_shift[i] = loadSuInfo16(slot, NV50_SU_INFO_TILE_SHIFT(i));
1742 tile_size[i] = bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2), bld.loadImm(NULL, (uint16_t)1), tile_shift[i]);
1743 tile_mask[i] = bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2), tile_size[i], bld.loadImm(NULL, (uint16_t)-1));
1744 }
1745
1746 // Compute the location of given coordinate, both inside the tile as
1747 // well as which (linearly-laid out) tile it's in.
1748 Value *coord_in_tile[3];
1749 Value *tile[3];
1750 for (int i = 0; i < 3; i++) {
1751 coord_in_tile[i] = bld.mkOp2v(OP_AND, TYPE_U16, bld.getSSA(2), coords[i], tile_mask[i]);
1752 tile[i] = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), coords[i], tile_shift[i]);
1753 }
1754
1755 // Based on the "real" tiling parameters, compute x/y coordinates in the
1756 // larger surface with 2d tiling that was supplied to the hardware. This
1757 // was determined and verified with the help of the tiling pseudocode in
1758 // the envytools docs.
1759 //
1760 // adj_x = x_coord_in_tile + x_tile * x_tile_size * z_tile_size +
1761 // z_coord_in_tile * x_tile_size
1762 // adj_y = y_coord_in_tile + y_tile * y_tile_size +
1763 // z_tile * y_tile_size * y_tiles
1764 //
1765 // Note: STRIDE_Y = y_tile_size * y_tiles
1766
1767 coords[0] = bld.mkOp2v(
1768 OP_ADD, TYPE_U16, bld.getSSA(2),
1769 bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1770 coord_in_tile[0],
1771 bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1772 tile[0],
1773 bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1774 tile_shift[2], tile_shift[0]))),
1775 bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1776 coord_in_tile[2], tile_shift[0]));
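      // The multiplies from the adj_x formula above are folded into shifts:
      // tile[0] * x_tile_size * z_tile_size is
      // tile[0] << (tile_shift[2] + tile_shift[0]), and
      // coord_in_tile[2] * x_tile_size is coord_in_tile[2] << tile_shift[0].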
1777
1778 Instruction *mul = bld.mkOp2(OP_MUL, TYPE_U32, bld.getSSA(4),
1779 tile[2], y_size_aligned);
1780 mul->sType = TYPE_U16;
1781 Value *muls[2];
1782 bld.mkSplit(muls, 2, mul->getDef(0));
1783
1784 coords[1] = bld.mkOp2v(
1785 OP_ADD, TYPE_U16, bld.getSSA(2),
1786 muls[0],
1787 bld.mkOp2v(OP_ADD, TYPE_U16, bld.getSSA(2),
1788 coord_in_tile[1],
1789 bld.mkOp2v(OP_SHL, TYPE_U16, bld.getSSA(2),
1790 tile[1], tile_shift[1])));
1791 }
1792
1793 return bld.mkOp2v(OP_MERGE, TYPE_U32, bld.getSSA(), coords[0], coords[1]);
1794 }
1795
1796 // This is largely a copy of NVC0LoweringPass::convertSurfaceFormat, but
1797 // adjusted to make use of 16-bit math where possible.
1798 bool
1799 NV50LoweringPreSSA::handleSULDP(TexInstruction *su)
1800 {
1801 const int slot = su->tex.r;
1802 assert(!su->getIndirectR());
1803
1804 bld.setPosition(su, false);
1805
1806 const TexInstruction::ImgFormatDesc *format = su->tex.format;
1807 const int bytes = (su->tex.format->bits[0] +
1808 su->tex.format->bits[1] +
1809 su->tex.format->bits[2] +
1810 su->tex.format->bits[3]) / 8;
1811 DataType ty = typeOfSize(bytes);
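   // For example (formats assumed for illustration): an rgba8 surface has
   // bits = {8,8,8,8}, so bytes = 4 and ty = TYPE_U32, while rg32f has
   // bits = {32,32,0,0}, so bytes = 8 and ty = TYPE_U64. The load below
   // then fetches the entire packed texel at once.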
1812
1813 Value *coord = processSurfaceCoords(su);
1814
1815 Value *untypedDst[4] = {};
1816 Value *typedDst[4] = {};
1817 int i;
1818 for (i = 0; i < bytes / 4; i++)
1819 untypedDst[i] = bld.getSSA();
1820 if (bytes < 4)
1821 untypedDst[0] = bld.getSSA();
1822
1823 for (i = 0; i < 4; i++)
1824 typedDst[i] = su->getDef(i);
1825
1826 Instruction *load = bld.mkLoad(ty, NULL, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, ty, 0), coord);
1827 for (i = 0; i < 4 && untypedDst[i]; i++)
1828 load->setDef(i, untypedDst[i]);
1829
1830 // Unpack each component into the typed dsts
1831 int bits = 0;
1832 for (int i = 0; i < 4; bits += format->bits[i], i++) {
1833 if (!typedDst[i])
1834 continue;
1835
1836 if (i >= format->components) {
1837 if (format->type == FLOAT ||
1838 format->type == UNORM ||
1839 format->type == SNORM)
1840 bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
1841 else
1842 bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
1843 continue;
1844 }
1845
1846 // Get just that component's data into the relevant place
1847 if (format->bits[i] == 32)
1848 bld.mkMov(typedDst[i], untypedDst[i]);
1849 else if (format->bits[i] == 16) {
1850 // We can always convert directly from the appropriate half of the
1851 // loaded value into the typed result.
1852 Value *src[2];
1853 bld.mkSplit(src, 2, untypedDst[i / 2]);
1854 bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
1855 getPackedType(format, i), src[i & 1]);
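         // E.g. for an rgba16 format (assumed for illustration), component 2
         // lives in the low half of untypedDst[1]: i / 2 == 1 picks that
         // 32-bit word and i & 1 == 0 picks its low 16 bits.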
1856 }
1857 else if (format->bits[i] == 8) {
1858 // Same approach as for 16 bits, but we have to massage the value a
1859 // bit more, since we have to get the appropriate 8 bits from the
1860         // half-register. In all cases, we can CVT from an 8-bit source, so we
1861 // only have to shift when we want the upper 8 bits.
1862 Value *src[2], *shifted;
1863 bld.mkSplit(src, 2, untypedDst[0]);
1864 DataType packedType = getPackedType(format, i);
1865 if (i & 1)
1866 shifted = bld.mkOp2v(OP_SHR, TYPE_U16, bld.getSSA(2), src[!!(i & 2)], bld.loadImm(NULL, (uint16_t)8));
1867 else
1868 shifted = src[!!(i & 2)];
1869
1870 bld.mkCvt(OP_CVT, getShaderType(format->type), typedDst[i],
1871 packedType, shifted);
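         // E.g. for an rgba8 format (assumed for illustration), component 2
         // sits in bits 16..23: !!(i & 2) == 1 selects the high 16-bit half,
         // and since i & 1 == 0 no shift is needed; component 3 (bits 24..31)
         // additionally gets shifted down by 8.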
1872 }
1873 else {
1874 // The options are 10, 11, and 2. Get it into a 32-bit reg, then
1875         // shift/mask. That's where it'll have to end up anyway. For signed,
1876 // we have to make sure to get sign-extension, so we actually have to
1877 // shift *up* first, and then shift down. There's no advantage to
1878 // AND'ing, so we don't.
1879 DataType ty = TYPE_U32;
1880 if (format->type == SNORM || format->type == SINT) {
1881 ty = TYPE_S32;
1882 }
1883
1884 // Poor man's EXTBF
1885 bld.mkOp2(
1886 OP_SHR, ty, typedDst[i],
1887 bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), untypedDst[0], bld.loadImm(NULL, 32 - bits - format->bits[i])),
1888 bld.loadImm(NULL, 32 - format->bits[i]));
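         // Worked example (format assumed): for component 1 of a 10/10/10/2
         // format, bits == 10 and format->bits[i] == 10, so we SHL by
         // 32 - 10 - 10 = 12 (bits 10..19 move up to 22..31) and then SHR by
         // 32 - 10 = 22 to land them at bits 0..9. With ty == TYPE_S32 the
         // down-shift is arithmetic, giving sign extension for free.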
1889
1890 // If the stored data is already in the appropriate type, we don't
1891 // have to do anything. Convert to float for the *NORM formats.
1892 if (format->type == UNORM || format->type == SNORM)
1893            bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], ty, typedDst[i]);
1894 }
1895
1896 // Normalize / convert as necessary
1897 if (format->type == UNORM)
1898 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
1899 else if (format->type == SNORM)
1900 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
1901 else if (format->type == FLOAT && format->bits[i] < 16) {
1902 // We expect the value to be in the low bits of the register, so we
1903 // have to shift back up.
1904 bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
1905 Value *src[2];
1906 bld.mkSplit(src, 2, typedDst[i]);
1907 bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, src[0]);
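         // E.g. an 11-bit float (5 exponent + 6 mantissa bits, no sign)
         // shifted left by 15 - 11 = 4 lines its exponent up with the f16
         // exponent field at bits 10..14, so the f16->f32 CVT above yields
         // the correct value (the low mantissa bits are simply zero).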
1908 }
1909 }
1910
1911 if (format->bgra) {
1912 std::swap(typedDst[0], typedDst[2]);
1913 }
1914
1915 bld.getBB()->remove(su);
1916 return true;
1917 }
1918
1919 bool
1920 NV50LoweringPreSSA::handleSUREDP(TexInstruction *su)
1921 {
1922 const int slot = su->tex.r;
1923 const int dim = su->tex.target.getDim();
1924 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1925 assert(!su->getIndirectR());
1926
1927 bld.setPosition(su, false);
1928
1929 Value *coord = processSurfaceCoords(su);
1930
1931 // This is guaranteed to be a 32-bit format. So there's nothing to
1932 // pack/unpack.
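   // (GL restricts image atomics to r32i/r32ui, plus exchange on r32f, so
   // this should hold.)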
1933 Instruction *atom = bld.mkOp2(
1934 OP_ATOM, su->dType, su->getDef(0),
1935 bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), su->getSrc(arg));
1936 if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
1937 atom->setSrc(2, su->getSrc(arg + 1));
1938 atom->setIndirect(0, 0, coord);
1939 atom->subOp = su->subOp;
1940
1941 bld.getBB()->remove(su);
1942 return true;
1943 }
1944
1945 bool
1946 NV50LoweringPreSSA::handleSUSTP(TexInstruction *su)
1947 {
1948 const int slot = su->tex.r;
1949 const int dim = su->tex.target.getDim();
1950 const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
1951 assert(!su->getIndirectR());
1952
1953 bld.setPosition(su, false);
1954
1955 const TexInstruction::ImgFormatDesc *format = su->tex.format;
1956 const int bytes = (su->tex.format->bits[0] +
1957 su->tex.format->bits[1] +
1958 su->tex.format->bits[2] +
1959 su->tex.format->bits[3]) / 8;
1960 DataType ty = typeOfSize(bytes);
1961
1962 Value *coord = processSurfaceCoords(su);
1963
1964 // The packed values we will eventually store into memory
1965 Value *untypedDst[4] = {};
1966 // Each component's packed representation, in 16-bit registers (only used
1967 // where appropriate)
1968 Value *untypedDst16[4] = {};
1969 // The original values that are being packed
1970 Value *typedDst[4] = {};
1971 int i;
1972
1973 for (i = 0; i < bytes / 4; i++)
1974 untypedDst[i] = bld.getSSA();
1975 for (i = 0; i < format->components; i++)
1976 untypedDst16[i] = bld.getSSA(2);
1977 // Make sure we get at least one of each value allocated for the
1978 // super-narrow formats.
1979 if (bytes < 4)
1980 untypedDst[0] = bld.getSSA();
1981 if (bytes < 2)
1982 untypedDst16[0] = bld.getSSA(2);
1983
1984 for (i = 0; i < 4; i++) {
1985 typedDst[i] = bld.getSSA();
1986 bld.mkMov(typedDst[i], su->getSrc(arg + i));
1987 }
1988
1989 if (format->bgra) {
1990 std::swap(typedDst[0], typedDst[2]);
1991 }
1992
1993 // Pack each component into the untyped dsts.
1994 int bits = 0;
1995 for (int i = 0; i < format->components; bits += format->bits[i], i++) {
1996 // Un-normalize / convert as necessary
1997 if (format->type == UNORM)
1998 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << format->bits[i]) - 1)));
1999 else if (format->type == SNORM)
2000 bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f * ((1 << (format->bits[i] - 1)) - 1)));
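      // E.g. (illustration) an 8-bit UNORM component is scaled by
      // (1 << 8) - 1 = 255.0f and an 8-bit SNORM component by
      // (1 << 7) - 1 = 127.0f before being packed.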
2001
2002 // There is nothing to convert/pack for 32-bit values
2003 if (format->bits[i] == 32) {
2004 bld.mkMov(untypedDst[i], typedDst[i]);
2005 continue;
2006 }
2007
2008 // The remainder of the cases will naturally want to deal in 16-bit
2009 // registers. We will put these into untypedDst16 and then merge them
2010 // together later.
2011 if (format->type == FLOAT && format->bits[i] < 16) {
2012 bld.mkCvt(OP_CVT, TYPE_F16, untypedDst16[i], TYPE_F32, typedDst[i]);
2013 bld.mkOp2(OP_SHR, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(15 - format->bits[i])));
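         // E.g. for an 11-bit float (assumed for illustration): the F32->F16
         // CVT gives sign|exponent|mantissa in bits 15|14..10|9..0, and the
         // SHR by 15 - 11 = 4 drops the 4 extra mantissa bits, leaving the
         // f11 pattern in the low 11 bits (truncating rather than rounding).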
2014
2015 // For odd bit sizes, it's easier to pack it into the final
2016 // destination directly.
2017 Value *tmp = bld.getSSA();
2018 bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
2019 if (i == 0) {
2020 untypedDst[0] = tmp;
2021 } else {
2022 bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
2023 bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
2024 }
2025 } else if (format->bits[i] == 16) {
2026 // We can always convert the shader value into the packed value
2027 // directly here
2028 bld.mkCvt(OP_CVT, getPackedType(format, i), untypedDst16[i],
2029 getShaderType(format->type), typedDst[i]);
2030 } else if (format->bits[i] < 16) {
2031 DataType packedType = getPackedType(format, i);
2032 DataType shaderType = getShaderType(format->type);
2033 // We can't convert F32 to U8/S8 directly, so go to U16/S16 first.
2034 if (shaderType == TYPE_F32 && typeSizeof(packedType) == 1) {
2035 packedType = format->type == SNORM ? TYPE_S16 : TYPE_U16;
2036 }
2037 bld.mkCvt(OP_CVT, packedType, untypedDst16[i], shaderType, typedDst[i]);
2038 // TODO: clamp for 10- and 2-bit sizes. Also, due to the oddness of
2039 // the size, it's easier to dump them into a 32-bit value and OR
2040 // everything later.
2041 if (format->bits[i] != 8) {
2042            // Restrict the value to the appropriate bits (although perhaps we
2043            // are supposed to clamp instead?)
2044 bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)((1 << format->bits[i]) - 1)));
2045 // And merge into final packed value
2046 Value *tmp = bld.getSSA();
2047 bld.mkCvt(OP_CVT, TYPE_U32, tmp, TYPE_U16, untypedDst16[i]);
2048 if (i == 0) {
2049 untypedDst[0] = tmp;
2050 } else {
2051 bld.mkOp2(OP_SHL, TYPE_U32, tmp, tmp, bld.loadImm(NULL, bits));
2052 bld.mkOp2(OP_OR, TYPE_U32, untypedDst[0], untypedDst[0], tmp);
2053 }
2054 } else if (i & 1) {
2055 // Shift the 8-bit value up (so that it can be OR'd later)
2056 bld.mkOp2(OP_SHL, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)(bits % 16)));
2057 } else if (packedType != TYPE_U8) {
2058            // S8 (or the *16 types converted from float) may have high bits
2059            // set, so AND them out.
2060 bld.mkOp2(OP_AND, TYPE_U16, untypedDst16[i], untypedDst16[i], bld.loadImm(NULL, (uint16_t)0xff));
2061 }
2062 }
2063 }
2064
2065 // OR pairs of 8-bit values together (into the even value)
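   // E.g. (illustration) for rgba8 this leaves r | g << 8 in untypedDst16[0]
   // and b | a << 8 in untypedDst16[2]; the merge below combines the two
   // halves into the final 32-bit word.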
2066 if (format->bits[0] == 8) {
2067 for (i = 0; i < 2 && untypedDst16[2 * i] && untypedDst16[2 * i + 1]; i++)
2068 bld.mkOp2(OP_OR, TYPE_U16, untypedDst16[2 * i], untypedDst16[2 * i], untypedDst16[2 * i + 1]);
2069 }
2070
2071 // We'll always want to have at least a 32-bit source register for the store
2072 Instruction *merge = bld.mkOp(OP_MERGE, bytes < 4 ? TYPE_U32 : ty, bld.getSSA(bytes < 4 ? 4 : bytes));
2073 if (format->bits[0] == 32) {
2074 for (i = 0; i < 4 && untypedDst[i]; i++)
2075 merge->setSrc(i, untypedDst[i]);
2076 } else if (format->bits[0] == 16) {
2077 for (i = 0; i < 4 && untypedDst16[i]; i++)
2078 merge->setSrc(i, untypedDst16[i]);
2079 if (i == 1)
2080 merge->setSrc(i, bld.getSSA(2));
2081 } else if (format->bits[0] == 8) {
2082 for (i = 0; i < 2 && untypedDst16[2 * i]; i++)
2083 merge->setSrc(i, untypedDst16[2 * i]);
2084 if (i == 1)
2085 merge->setSrc(i, bld.getSSA(2));
2086 } else {
2087 merge->setSrc(0, untypedDst[0]);
2088 }
2089
2090 bld.mkStore(OP_STORE, ty, bld.mkSymbol(FILE_MEMORY_GLOBAL, slot, TYPE_U32, 0), coord, merge->getDef(0));
2091
2092 bld.getBB()->remove(su);
2093 return true;
2094 }
2095
2096 bool
2097 NV50LoweringPreSSA::handlePFETCH(Instruction *i)
2098 {
2099 assert(prog->getType() == Program::TYPE_GEOMETRY);
2100
2101   // NOTE: we cannot use getImmediate here because we are not in SSA form
2102   // yet; move this to a later phase if that assertion ever triggers:
2103
2104 ImmediateValue *imm = i->getSrc(0)->asImm();
2105 assert(imm);
2106
2107 assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
2108
2109 if (i->srcExists(1)) {
2110 // indirect addressing of vertex in primitive space
2111
2112 LValue *val = bld.getScratch();
2113 Value *ptr = bld.getSSA(2, FILE_ADDRESS);
2114 bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
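      // The shift by 2 presumably scales the vertex index to a byte offset
      // (assumed: one 32-bit entry per vertex, with the address register
      // holding byte offsets).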
2115 bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
2116
2117 // NOTE: PFETCH directly to an $aX only works with direct addressing
2118 i->op = OP_SHL;
2119 i->setSrc(0, val);
2120 i->setSrc(1, bld.mkImm(0));
2121 }
2122
2123 return true;
2124 }
2125
2126 // Set flags according to predicate and make the instruction read $cX.
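// Illustrative sketch (pseudo-IR, not exact syntax): a predicated
//   (p0) mov r0, r1
// becomes
//   set $c0, neu, 0, p0        // $c0 := (0 != p0)
//   ($c0.neu) mov r0, r1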
2127 void
2128 NV50LoweringPreSSA::checkPredicate(Instruction *insn)
2129 {
2130 Value *pred = insn->getPredicate();
2131 Value *cdst;
2132
2133 // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
2134 if (!pred ||
2135 pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
2136 return;
2137
2138 cdst = bld.getSSA(1, FILE_FLAGS);
2139
2140 bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
2141
2142 insn->setPredicate(insn->cc, cdst);
2143 }
2144
2145 //
2146 // - add quadop dance for texturing
2147 // - put FP outputs in GPRs
2148 // - convert instruction sequences
2149 //
2150 bool
2151 NV50LoweringPreSSA::visit(Instruction *i)
2152 {
2153 bld.setPosition(i, false);
2154
2155 if (i->cc != CC_ALWAYS)
2156 checkPredicate(i);
2157
2158 switch (i->op) {
2159 case OP_TEX:
2160 case OP_TXF:
2161 case OP_TXG:
2162 return handleTEX(i->asTex());
2163 case OP_TXB:
2164 return handleTXB(i->asTex());
2165 case OP_TXL:
2166 return handleTXL(i->asTex());
2167 case OP_TXD:
2168 return handleTXD(i->asTex());
2169 case OP_TXLQ:
2170 return handleTXLQ(i->asTex());
2171 case OP_TXQ:
2172 return handleTXQ(i->asTex());
2173 case OP_EX2:
2174 bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
2175 i->setSrc(0, i->getDef(0));
2176 break;
2177 case OP_SET:
2178 return handleSET(i);
2179 case OP_SLCT:
2180 return handleSLCT(i->asCmp());
2181 case OP_SELP:
2182 return handleSELP(i);
2183 case OP_DIV:
2184 return handleDIV(i);
2185 case OP_SQRT:
2186 return handleSQRT(i);
2187 case OP_EXPORT:
2188 return handleEXPORT(i);
2189 case OP_LOAD:
2190 return handleLOAD(i);
2191 case OP_MEMBAR:
2192 return handleMEMBAR(i);
2193 case OP_ATOM:
2194 case OP_STORE:
2195 return handleLDST(i);
2196 case OP_SULDP:
2197 return handleSULDP(i->asTex());
2198 case OP_SUSTP:
2199 return handleSUSTP(i->asTex());
2200 case OP_SUREDP:
2201 return handleSUREDP(i->asTex());
2202 case OP_SUQ:
2203 return handleSUQ(i->asTex());
2204 case OP_BUFQ:
2205 return handleBUFQ(i);
2206 case OP_RDSV:
2207 return handleRDSV(i);
2208 case OP_CALL:
2209 return handleCALL(i);
2210 case OP_PRECONT:
2211 return handlePRECONT(i);
2212 case OP_CONT:
2213 return handleCONT(i);
2214 case OP_PFETCH:
2215 return handlePFETCH(i);
2216 default:
2217 break;
2218 }
2219 return true;
2220 }
2221
2222 bool
2223 TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
2224 {
2225 bool ret = false;
2226
2227 if (stage == CG_STAGE_PRE_SSA) {
2228 NV50LoweringPreSSA pass(prog);
2229 ret = pass.run(prog, false, true);
2230 } else
2231 if (stage == CG_STAGE_SSA) {
2232 if (!prog->targetPriv)
2233 prog->targetPriv = new std::list<Instruction *>();
2234 NV50LegalizeSSA pass(prog);
2235 ret = pass.run(prog, false, true);
2236 } else
2237 if (stage == CG_STAGE_POST_RA) {
2238 NV50LegalizePostRA pass;
2239 ret = pass.run(prog, false, true);
2240 if (prog->targetPriv)
2241 delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
2242 }
2243 return ret;
2244 }
2245
2246 } // namespace nv50_ir
2247