1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 /**
25 * \file lower_instructions.cpp
26 *
27 * Many GPUs lack native instructions for certain expression operations, and
28 * must replace them with some other expression tree. This pass lowers some
29 * of the most common cases, allowing the lowering code to be implemented once
30 * rather than in each driver backend.
31 */
32
33 #include "program/prog_instruction.h" /* for swizzle */
34 #include "compiler/glsl_types.h"
35 #include "ir.h"
36 #include "ir_builder.h"
37 #include "ir_optimization.h"
38 #include "util/half_float.h"
39
40 #include <math.h>
41
/* Operations for lower_instructions() */
/* Each value is a distinct bit; lower_instructions() ORs them into the
 * bitfield handed to the visitor, and visit_leave() tests them with
 * lowering().
 */
/* Gates find_lsb_to_float_cast() for ir_unop_find_lsb. */
#define FIND_LSB_TO_FLOAT_CAST 0x20000
/* Gates find_msb_to_float_cast() for ir_unop_find_msb. */
#define FIND_MSB_TO_FLOAT_CAST 0x40000
/* Gates imul_high_to_mul() for ir_binop_imul_high. */
#define IMUL_HIGH_TO_MUL 0x80000
46
47 using namespace ir_builder;
48
49 namespace {
50
51 class lower_instructions_visitor : public ir_hierarchical_visitor {
52 public:
lower_instructions_visitor(unsigned lower)53 lower_instructions_visitor(unsigned lower)
54 : progress(false), lower(lower) { }
55
56 ir_visitor_status visit_leave(ir_expression *);
57
58 bool progress;
59
60 private:
61 unsigned lower; /** Bitfield of which operations to lower */
62
63 void double_dot_to_fma(ir_expression *);
64 void double_lrp(ir_expression *);
65 void find_lsb_to_float_cast(ir_expression *ir);
66 void find_msb_to_float_cast(ir_expression *ir);
67 void imul_high_to_mul(ir_expression *ir);
68
69 ir_expression *_carry(operand a, operand b);
70
71 static ir_constant *_imm_fp(void *mem_ctx,
72 const glsl_type *type,
73 double f,
74 unsigned vector_elements=1);
75 };
76
77 } /* anonymous namespace */
78
/**
 * Determine if a particular type of lowering should occur
 *
 * Evaluates to non-zero when any bit of \c x is set in the visitor's
 * \c lower bitfield.  Must be used inside a member function, since it reads
 * \c this->lower.  The argument is parenthesized so that a compound
 * expression such as (A | B) is masked as a whole.
 */
#define lowering(x) (this->lower & (x))
83
84 bool
lower_instructions(exec_list * instructions,bool have_gpu_shader5)85 lower_instructions(exec_list *instructions,bool have_gpu_shader5)
86 {
87 unsigned what_to_lower =
88 /* Assume that if ARB_gpu_shader5 is not supported then all of the
89 * extended integer functions need lowering. It may be necessary to add
90 * some caps for individual instructions.
91 */
92 (!have_gpu_shader5 ? FIND_LSB_TO_FLOAT_CAST |
93 FIND_MSB_TO_FLOAT_CAST |
94 IMUL_HIGH_TO_MUL : 0);
95
96 lower_instructions_visitor v(what_to_lower);
97
98 visit_list_elements(&v, instructions);
99 return v.progress;
100 }
101
102 void
double_dot_to_fma(ir_expression * ir)103 lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
104 {
105 ir_variable *temp = new(ir) ir_variable(glsl_get_base_glsl_type(ir->operands[0]->type), "dot_res",
106 ir_var_temporary);
107 this->base_ir->insert_before(temp);
108
109 int nc = glsl_get_components(ir->operands[0]->type);
110 for (int i = nc - 1; i >= 1; i--) {
111 ir_assignment *assig;
112 if (i == (nc - 1)) {
113 assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
114 swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
115 } else {
116 assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
117 swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
118 temp));
119 }
120 this->base_ir->insert_before(assig);
121 }
122
123 ir->operation = ir_triop_fma;
124 ir->init_num_operands();
125 ir->operands[0] = swizzle(ir->operands[0], 0, 1);
126 ir->operands[1] = swizzle(ir->operands[1], 0, 1);
127 ir->operands[2] = new(ir) ir_dereference_variable(temp);
128
129 this->progress = true;
130
131 }
132
133 void
double_lrp(ir_expression * ir)134 lower_instructions_visitor::double_lrp(ir_expression *ir)
135 {
136 int swizval;
137 ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
138 ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
139
140 switch (op2->type->vector_elements) {
141 case 1:
142 swizval = SWIZZLE_XXXX;
143 break;
144 default:
145 assert(op0->type->vector_elements == op2->type->vector_elements);
146 swizval = SWIZZLE_XYZW;
147 break;
148 }
149
150 ir->operation = ir_triop_fma;
151 ir->init_num_operands();
152 ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
153 ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
154
155 this->progress = true;
156 }
157
158 void
find_lsb_to_float_cast(ir_expression * ir)159 lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir)
160 {
161 /* For more details, see:
162 *
163 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
164 */
165 const unsigned elements = ir->operands[0]->type->vector_elements;
166 ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements);
167 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
168 ir_constant *c23 = new(ir) ir_constant(int(23), elements);
169 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
170 ir_variable *temp =
171 new(ir) ir_variable(glsl_ivec_type(elements), "temp", ir_var_temporary);
172 ir_variable *lsb_only =
173 new(ir) ir_variable(glsl_uvec_type(elements), "lsb_only", ir_var_temporary);
174 ir_variable *as_float =
175 new(ir) ir_variable(glsl_vec_type(elements), "as_float", ir_var_temporary);
176 ir_variable *lsb =
177 new(ir) ir_variable(glsl_ivec_type(elements), "lsb", ir_var_temporary);
178
179 ir_instruction &i = *base_ir;
180
181 i.insert_before(temp);
182
183 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
184 i.insert_before(assign(temp, ir->operands[0]));
185 } else {
186 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
187 i.insert_before(assign(temp, u2i(ir->operands[0])));
188 }
189
190 /* The int-to-float conversion is lossless because (value & -value) is
191 * either a power of two or zero. We don't use the result in the zero
192 * case. The uint() cast is necessary so that 0x80000000 does not
193 * generate a negative value.
194 *
195 * uint lsb_only = uint(value & -value);
196 * float as_float = float(lsb_only);
197 */
198 i.insert_before(lsb_only);
199 i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp)))));
200
201 i.insert_before(as_float);
202 i.insert_before(assign(as_float, u2f(lsb_only)));
203
204 /* This is basically an open-coded frexp. Implementations that have a
205 * native frexp instruction would be better served by that. This is
206 * optimized versus a full-featured open-coded implementation in two ways:
207 *
208 * - We don't care about a correct result from subnormal numbers (including
209 * 0.0), so the raw exponent can always be safely unbiased.
210 *
211 * - The value cannot be negative, so it does not need to be masked off to
212 * extract the exponent.
213 *
214 * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f;
215 */
216 i.insert_before(lsb);
217 i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
218
219 /* Use lsb_only in the comparison instead of temp so that the & (far above)
220 * can possibly generate the result without an explicit comparison.
221 *
222 * (lsb_only == 0) ? -1 : lsb;
223 *
224 * Since our input values are all integers, the unbiased exponent must not
225 * be negative. It will only be negative (-0x7f, in fact) if lsb_only is
226 * 0. Instead of using (lsb_only == 0), we could use (lsb >= 0). Which is
227 * better is likely GPU dependent. Either way, the difference should be
228 * small.
229 */
230 ir->operation = ir_triop_csel;
231 ir->init_num_operands();
232 ir->operands[0] = equal(lsb_only, c0);
233 ir->operands[1] = cminus1;
234 ir->operands[2] = new(ir) ir_dereference_variable(lsb);
235
236 this->progress = true;
237 }
238
239 void
find_msb_to_float_cast(ir_expression * ir)240 lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir)
241 {
242 /* For more details, see:
243 *
244 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
245 */
246 const unsigned elements = ir->operands[0]->type->vector_elements;
247 ir_constant *c0 = new(ir) ir_constant(int(0), elements);
248 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
249 ir_constant *c23 = new(ir) ir_constant(int(23), elements);
250 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
251 ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements);
252 ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements);
253 ir_variable *temp =
254 new(ir) ir_variable(glsl_uvec_type(elements), "temp", ir_var_temporary);
255 ir_variable *as_float =
256 new(ir) ir_variable(glsl_vec_type(elements), "as_float", ir_var_temporary);
257 ir_variable *msb =
258 new(ir) ir_variable(glsl_ivec_type(elements), "msb", ir_var_temporary);
259
260 ir_instruction &i = *base_ir;
261
262 i.insert_before(temp);
263
264 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
265 i.insert_before(assign(temp, ir->operands[0]));
266 } else {
267 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
268
269 /* findMSB(uint(abs(some_int))) almost always does the right thing.
270 * There are two problem values:
271 *
272 * * 0x80000000. Since abs(0x80000000) == 0x80000000, findMSB returns
273 * 31. However, findMSB(int(0x80000000)) == 30.
274 *
275 * * 0xffffffff. Since abs(0xffffffff) == 1, findMSB returns
276 * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
277 *
278 * For a value of zero or negative one, -1 will be returned.
279 *
280 * For all negative number cases, including 0x80000000 and 0xffffffff,
281 * the correct value is obtained from findMSB if instead of negating the
282 * (already negative) value the logical-not is used. A conditonal
283 * logical-not can be achieved in two instructions.
284 */
285 ir_variable *as_int =
286 new(ir) ir_variable(glsl_ivec_type(elements), "as_int", ir_var_temporary);
287 ir_constant *c31 = new(ir) ir_constant(int(31), elements);
288
289 i.insert_before(as_int);
290 i.insert_before(assign(as_int, ir->operands[0]));
291 i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor,
292 as_int,
293 rshift(as_int, c31)))));
294 }
295
296 /* The int-to-float conversion is lossless because bits are conditionally
297 * masked off the bottom of temp to ensure the value has at most 24 bits of
298 * data or is zero. We don't use the result in the zero case. The uint()
299 * cast is necessary so that 0x80000000 does not generate a negative value.
300 *
301 * float as_float = float(temp > 255 ? temp & ~255 : temp);
302 */
303 i.insert_before(as_float);
304 i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF),
305 bit_and(temp, cFFFFFF00),
306 temp))));
307
308 /* This is basically an open-coded frexp. Implementations that have a
309 * native frexp instruction would be better served by that. This is
310 * optimized versus a full-featured open-coded implementation in two ways:
311 *
312 * - We don't care about a correct result from subnormal numbers (including
313 * 0.0), so the raw exponent can always be safely unbiased.
314 *
315 * - The value cannot be negative, so it does not need to be masked off to
316 * extract the exponent.
317 *
318 * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f;
319 */
320 i.insert_before(msb);
321 i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
322
323 /* Use msb in the comparison instead of temp so that the subtract can
324 * possibly generate the result without an explicit comparison.
325 *
326 * (msb < 0) ? -1 : msb;
327 *
328 * Since our input values are all integers, the unbiased exponent must not
329 * be negative. It will only be negative (-0x7f, in fact) if temp is 0.
330 */
331 ir->operation = ir_triop_csel;
332 ir->init_num_operands();
333 ir->operands[0] = less(msb, c0);
334 ir->operands[1] = cminus1;
335 ir->operands[2] = new(ir) ir_dereference_variable(msb);
336
337 this->progress = true;
338 }
339
340 ir_expression *
_carry(operand a,operand b)341 lower_instructions_visitor::_carry(operand a, operand b)
342 {
343 return i2u(b2i(less(add(a, b),
344 a.val->clone(ralloc_parent(a.val), NULL))));
345 }
346
347 void
imul_high_to_mul(ir_expression * ir)348 lower_instructions_visitor::imul_high_to_mul(ir_expression *ir)
349 {
350 /* ABCD
351 * * EFGH
352 * ======
353 * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32
354 *
355 * In GLSL, (a * b) becomes
356 *
357 * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu);
358 * uint m2 = (a & 0x0000ffffu) * (b >> 16);
359 * uint m3 = (a >> 16) * (b & 0x0000ffffu);
360 * uint m4 = (a >> 16) * (b >> 16);
361 *
362 * uint c1;
363 * uint c2;
364 * uint lo_result;
365 * uint hi_result;
366 *
367 * lo_result = uaddCarry(m1, m2 << 16, c1);
368 * hi_result = m4 + c1;
369 * lo_result = uaddCarry(lo_result, m3 << 16, c2);
370 * hi_result = hi_result + c2;
371 * hi_result = hi_result + (m2 >> 16) + (m3 >> 16);
372 */
373 const unsigned elements = ir->operands[0]->type->vector_elements;
374 ir_variable *src1 =
375 new(ir) ir_variable(glsl_uvec_type(elements), "src1", ir_var_temporary);
376 ir_variable *src1h =
377 new(ir) ir_variable(glsl_uvec_type(elements), "src1h", ir_var_temporary);
378 ir_variable *src1l =
379 new(ir) ir_variable(glsl_uvec_type(elements), "src1l", ir_var_temporary);
380 ir_variable *src2 =
381 new(ir) ir_variable(glsl_uvec_type(elements), "src2", ir_var_temporary);
382 ir_variable *src2h =
383 new(ir) ir_variable(glsl_uvec_type(elements), "src2h", ir_var_temporary);
384 ir_variable *src2l =
385 new(ir) ir_variable(glsl_uvec_type(elements), "src2l", ir_var_temporary);
386 ir_variable *t1 =
387 new(ir) ir_variable(glsl_uvec_type(elements), "t1", ir_var_temporary);
388 ir_variable *t2 =
389 new(ir) ir_variable(glsl_uvec_type(elements), "t2", ir_var_temporary);
390 ir_variable *lo =
391 new(ir) ir_variable(glsl_uvec_type(elements), "lo", ir_var_temporary);
392 ir_variable *hi =
393 new(ir) ir_variable(glsl_uvec_type(elements), "hi", ir_var_temporary);
394 ir_variable *different_signs = NULL;
395 ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements);
396 ir_constant *c16 = new(ir) ir_constant(16u, elements);
397
398 ir_instruction &i = *base_ir;
399
400 i.insert_before(src1);
401 i.insert_before(src2);
402 i.insert_before(src1h);
403 i.insert_before(src2h);
404 i.insert_before(src1l);
405 i.insert_before(src2l);
406
407 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
408 i.insert_before(assign(src1, ir->operands[0]));
409 i.insert_before(assign(src2, ir->operands[1]));
410 } else {
411 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
412
413 ir_variable *itmp1 =
414 new(ir) ir_variable(glsl_ivec_type(elements), "itmp1", ir_var_temporary);
415 ir_variable *itmp2 =
416 new(ir) ir_variable(glsl_ivec_type(elements), "itmp2", ir_var_temporary);
417 ir_constant *c0 = new(ir) ir_constant(int(0), elements);
418
419 i.insert_before(itmp1);
420 i.insert_before(itmp2);
421 i.insert_before(assign(itmp1, ir->operands[0]));
422 i.insert_before(assign(itmp2, ir->operands[1]));
423
424 different_signs =
425 new(ir) ir_variable(glsl_bvec_type(elements), "different_signs",
426 ir_var_temporary);
427
428 i.insert_before(different_signs);
429 i.insert_before(assign(different_signs, expr(ir_binop_logic_xor,
430 less(itmp1, c0),
431 less(itmp2, c0->clone(ir, NULL)))));
432
433 i.insert_before(assign(src1, i2u(abs(itmp1))));
434 i.insert_before(assign(src2, i2u(abs(itmp2))));
435 }
436
437 i.insert_before(assign(src1l, bit_and(src1, c0000FFFF)));
438 i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL))));
439 i.insert_before(assign(src1h, rshift(src1, c16)));
440 i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL))));
441
442 i.insert_before(lo);
443 i.insert_before(hi);
444 i.insert_before(t1);
445 i.insert_before(t2);
446
447 i.insert_before(assign(lo, mul(src1l, src2l)));
448 i.insert_before(assign(t1, mul(src1l, src2h)));
449 i.insert_before(assign(t2, mul(src1h, src2l)));
450 i.insert_before(assign(hi, mul(src1h, src2h)));
451
452 i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL))))));
453 i.insert_before(assign(lo, add(lo, lshift(t1, c16->clone(ir, NULL)))));
454
455 i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL))))));
456 i.insert_before(assign(lo, add(lo, lshift(t2, c16->clone(ir, NULL)))));
457
458 if (different_signs == NULL) {
459 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
460
461 ir->operation = ir_binop_add;
462 ir->init_num_operands();
463 ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL)));
464 ir->operands[1] = rshift(t2, c16->clone(ir, NULL));
465 } else {
466 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
467
468 i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))),
469 rshift(t2, c16->clone(ir, NULL)))));
470
471 /* For channels where different_signs is set we have to perform a 64-bit
472 * negation. This is *not* the same as just negating the high 32-bits.
473 * Consider -3 * 2. The high 32-bits is 0, but the desired result is
474 * -1, not -0! Recall -x == ~x + 1.
475 */
476 ir_variable *neg_hi =
477 new(ir) ir_variable(glsl_ivec_type(elements), "neg_hi", ir_var_temporary);
478 ir_constant *c1 = new(ir) ir_constant(1u, elements);
479
480 i.insert_before(neg_hi);
481 i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)),
482 u2i(_carry(bit_not(lo), c1)))));
483
484 ir->operation = ir_triop_csel;
485 ir->init_num_operands();
486 ir->operands[0] = new(ir) ir_dereference_variable(different_signs);
487 ir->operands[1] = new(ir) ir_dereference_variable(neg_hi);
488 ir->operands[2] = u2i(hi);
489 }
490 }
491
492 ir_visitor_status
visit_leave(ir_expression * ir)493 lower_instructions_visitor::visit_leave(ir_expression *ir)
494 {
495 switch (ir->operation) {
496 case ir_binop_dot:
497 if (glsl_type_is_double(ir->operands[0]->type))
498 double_dot_to_fma(ir);
499 break;
500 case ir_triop_lrp:
501 if (glsl_type_is_double(ir->operands[0]->type))
502 double_lrp(ir);
503 break;
504
505 case ir_unop_find_lsb:
506 if (lowering(FIND_LSB_TO_FLOAT_CAST))
507 find_lsb_to_float_cast(ir);
508 break;
509
510 case ir_unop_find_msb:
511 if (lowering(FIND_MSB_TO_FLOAT_CAST))
512 find_msb_to_float_cast(ir);
513 break;
514
515 case ir_binop_imul_high:
516 if (lowering(IMUL_HIGH_TO_MUL))
517 imul_high_to_mul(ir);
518 break;
519
520 default:
521 return visit_continue;
522 }
523
524 return visit_continue;
525 }
526