/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "elk_vec4.h"
#include "elk_cfg.h"
#include "elk_eu.h"
#include "util/u_math.h"

namespace elk {

vec4_instruction::vec4_instruction(enum elk_opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->writes_accumulator = false;
   this->conditional_mod = ELK_CONDITIONAL_NONE;
   this->predicate = ELK_PREDICATE_NONE;
   this->predicate_inverse = false;
   this->target = 0;
   this->shadow_compare = false;
   this->eot = false;
   this->ir = NULL;
   this->urb_write_flags = ELK_URB_WRITE_NO_FLAGS;
   this->header_size = 0;
   this->flag_subreg = 0;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->exec_size = 8;
   this->group = 0;
   this->size_written = (dst.file == BAD_FILE ?
                         0 : this->exec_size * type_sz(dst.type));
   this->annotation = NULL;
}

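/**
 * Append @inst to the end of the instruction stream, tagging it with the
 * IR node and annotation currently being visited so the instruction can be
 * traced back to its source in debug output.
 */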
vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   inst->ir = this->base_ir;
   inst->annotation = this->current_annotation;

   this->instructions.push_tail(inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit_before(elk_bblock_t *block, vec4_instruction *inst,
                          vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(block, new_inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1, const src_reg &src2)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
}


vec4_instruction *
vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
}

vec4_instruction *
vec4_visitor::emit(enum elk_opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
}

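/* The ALU[123] macros stamp out the trivial one-, two- and three-source
 * builder methods used below.  ALU2_ACC additionally marks the instruction
 * as writing the accumulator, and ALU3 asserts Gfx6+ before constructing a
 * three-source instruction.
 */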
#define ALU1(op) \
   vec4_instruction * \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
   { \
      return new(mem_ctx) vec4_instruction(ELK_OPCODE_##op, dst, src0); \
   }

#define ALU2(op) \
   vec4_instruction * \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
                    const src_reg &src1) \
   { \
      return new(mem_ctx) vec4_instruction(ELK_OPCODE_##op, dst, \
                                           src0, src1); \
   }

#define ALU2_ACC(op) \
   vec4_instruction * \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
                    const src_reg &src1) \
   { \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
         ELK_OPCODE_##op, dst, src0, src1); \
      inst->writes_accumulator = true; \
      return inst; \
   }

#define ALU3(op) \
   vec4_instruction * \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
                    const src_reg &src1, const src_reg &src2) \
   { \
      assert(devinfo->ver >= 6); \
      return new(mem_ctx) vec4_instruction(ELK_OPCODE_##op, dst, \
                                           src0, src1, src2); \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU1(LZD)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
ALU1(DIM)

/** Gfx4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(enum elk_predicate predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(ELK_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gfx6 IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum elk_conditional_mod condition)
{
   assert(devinfo->ver == 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(ELK_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum elk_conditional_mod condition)
{
   vec4_instruction *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gfx4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(ELK_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

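/**
 * Builder helpers for the Gfx4-style scratch messages used for spilling.
 * A read sends a two-register message and a write a three-register one,
 * both based at the MRFs reserved for spill access.
 */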
vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(ELK_SHADER_OPCODE_GFX4_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver) + 1;
   inst->mlen = 2;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver);
   inst->mlen = 3;

   return inst;
}

src_reg
vec4_visitor::fix_3src_operand(const src_reg &src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   if (src.file == UNIFORM && elk_is_single_value_swizzle(src.swizzle))
      return src;

   dst_reg expanded = dst_reg(this, glsl_vec4_type());
   expanded.type = src.type;
   emit(ELK_VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
   return src_reg(expanded);
}

src_reg
vec4_visitor::fix_math_operand(const src_reg &src)
{
   if (devinfo->ver < 6 || src.file == BAD_FILE)
      return src;

   /* The gfx6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gfx6.
    *
    * For gfx7, keep the operand as-is, except if immediate, which gfx7 still
    * can't use.
    */

   if (devinfo->ver == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_vec4_type());
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

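/**
 * Emit a math instruction, legalizing its operands with fix_math_operand().
 * On Gfx6, MATH is align1-only, so a writemasked destination goes through a
 * temporary plus a final MOV; before Gfx6, math is a send-like operation
 * that needs base_mrf and mlen set up.
 */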
vec4_instruction *
vec4_visitor::emit_math(enum elk_opcode opcode,
                        const dst_reg &dst,
                        const src_reg &src0, const src_reg &src1)
{
   vec4_instruction *math =
      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));

   if (devinfo->ver == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* MATH on Gfx6 must be align1, so we can't do writemasks. */
      math->dst = dst_reg(this, glsl_vec4_type());
      math->dst.type = dst.type;
      math = emit(MOV(dst, src_reg(math->dst)));
   } else if (devinfo->ver < 6) {
      math->base_mrf = 1;
      math->mlen = src1.file == BAD_FILE ? 1 : 2;
   }

   return math;
}

void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->ver < 7) {
      unreachable("ir_unop_pack_half_2x16 should be lowered");
   }

   assert(dst.type == ELK_REGISTER_TYPE_UD);
   assert(src0.type == ELK_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride. We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests. However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gfx7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_uvec2_type());
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely. If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, elk_imm_ud(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *    w z     y          x      w z     y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = ELK_SWIZZLE_YYYY;
   emit(SHL(dst, tmp_src, elk_imm_ud(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = ELK_SWIZZLE_XXXX;
   emit(OR(dst, src_reg(dst), tmp_src));
}

void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->ver < 7) {
      unreachable("ir_unop_unpack_half_2x16 should be lowered");
   }

   assert(dst.type == ELK_REGISTER_TYPE_F);
   assert(src0.type == ELK_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gfx7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_uvec2_type());
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, elk_imm_ud(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, elk_imm_ud(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}

void
vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_uvec4_type());
   emit(MOV(shift, elk_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_uvec4_type());
   src0.swizzle = ELK_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = ELK_REGISTER_TYPE_UB;
   dst_reg f(this, glsl_vec4_type());
   emit(ELK_VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   emit(MUL(dst, src_reg(f), elk_imm_f(1.0f / 255.0f)));
}

void
vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_uvec4_type());
   emit(MOV(shift, elk_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_uvec4_type());
   src0.swizzle = ELK_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = ELK_REGISTER_TYPE_B;
   dst_reg f(this, glsl_vec4_type());
   emit(ELK_VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   dst_reg scaled(this, glsl_vec4_type());
   emit(MUL(scaled, src_reg(f), elk_imm_f(1.0f / 127.0f)));

   dst_reg max(this, glsl_vec4_type());
   emit_minmax(ELK_CONDITIONAL_GE, max, src_reg(scaled), elk_imm_f(-1.0f));
   emit_minmax(ELK_CONDITIONAL_L, dst, src_reg(max), elk_imm_f(1.0f));
}

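/**
 * Lower packUnorm4x8(): saturate to [0, 1], scale by 255, round to the
 * nearest even integer, convert to unsigned and pack the four bytes.
 */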
void
vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg saturated(this, glsl_vec4_type());
   vec4_instruction *inst = emit(MOV(saturated, src0));
   inst->saturate = true;

   dst_reg scaled(this, glsl_vec4_type());
   emit(MUL(scaled, src_reg(saturated), elk_imm_f(255.0f)));

   dst_reg rounded(this, glsl_vec4_type());
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg u(this, glsl_uvec4_type());
   emit(MOV(u, src_reg(rounded)));

   src_reg bytes(u);
   emit(ELK_VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

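/**
 * Lower packSnorm4x8(): clamp to [-1, 1], scale by 127, round to the
 * nearest even integer, convert to signed and pack the four bytes.
 */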
void
vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg max(this, glsl_vec4_type());
   emit_minmax(ELK_CONDITIONAL_GE, max, src0, elk_imm_f(-1.0f));

   dst_reg min(this, glsl_vec4_type());
   emit_minmax(ELK_CONDITIONAL_L, min, src_reg(max), elk_imm_f(1.0f));

   dst_reg scaled(this, glsl_vec4_type());
   emit(MUL(scaled, src_reg(min), elk_imm_f(127.0f)));

   dst_reg rounded(this, glsl_vec4_type());
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg i(this, glsl_ivec4_type());
   emit(MOV(i, src_reg(rounded)));

   src_reg bytes(i);
   emit(ELK_VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

/*
 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 * false) elements needed to pack a type.
 */
static int
elk_type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_FLOAT16:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_DOUBLE:
   case GLSL_TYPE_UINT16:
   case GLSL_TYPE_INT16:
   case GLSL_TYPE_UINT8:
   case GLSL_TYPE_INT8:
   case GLSL_TYPE_UINT64:
   case GLSL_TYPE_INT64:
      if (glsl_type_is_matrix(type)) {
         const glsl_type *col_type = glsl_get_column_type(type);
         unsigned col_slots =
            (as_vec4 && glsl_type_is_dual_slot(col_type)) ? 2 : 1;
         return type->matrix_columns * col_slots;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess. Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return (as_vec4 && glsl_type_is_dual_slot(type)) ? 2 : 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return elk_type_size_xvec4(type->fields.array, as_vec4, bindless) *
             type->length;
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_INTERFACE:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += elk_type_size_xvec4(type->fields.structure[i].type, as_vec4,
                                     bindless);
      }
      return size;
   case GLSL_TYPE_SUBROUTINE:
      return 1;

   case GLSL_TYPE_SAMPLER:
   case GLSL_TYPE_TEXTURE:
      /* Samplers and textures take up no register space, since they're baked
       * in at link time.
       */
      return bindless ? 1 : 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
      return bindless ? 1 : DIV_ROUND_UP(ISL_IMAGE_PARAM_SIZE, 4);
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_COOPERATIVE_MATRIX:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Returns the minimum number of vec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single vec4); for matrices, the
 * number of columns; for array and struct, the sum of the vec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 */
extern "C" int
elk_type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return elk_type_size_xvec4(type, true, bindless);
}

/**
 * Returns the minimum number of dvec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single dvec4); for matrices, the
 * number of columns; for array and struct, the sum of the dvec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 *
 * Measuring double-precision vertex inputs as dvec4 is required because
 * ARB_vertex_attrib_64bit states that they use the same number of locations
 * as the single-precision version. That is, two consecutive dvec4s would be
 * located in locations "x" and "x+1", not "x+2".
 *
 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
 * remap_vs_attrs() will take into account both the location and whether the
 * type fits in one or two vec4 slots.
 */
extern "C" int
elk_type_size_dvec4(const struct glsl_type *type, bool bindless)
{
   return elk_type_size_xvec4(type, false, bindless);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(elk_type_size_vec4(type, false));

   if (glsl_type_is_array(type) || glsl_type_is_struct(type)) {
      this->swizzle = ELK_SWIZZLE_NOOP;
   } else {
      this->swizzle = elk_swizzle_for_size(type->vector_elements);
   }

   this->type = elk_type_for_base_type(type);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
{
   assert(size > 0);

   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(elk_type_size_vec4(type, false) * size);

   this->swizzle = ELK_SWIZZLE_NOOP;

   this->type = elk_type_for_base_type(type);
}

dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(elk_type_size_vec4(type, false));

   if (glsl_type_is_array(type) || glsl_type_is_struct(type)) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = elk_type_for_base_type(type);
}

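/**
 * MIN and MAX are emitted as a SEL with the appropriate conditional mod
 * (ELK_CONDITIONAL_L or ELK_CONDITIONAL_GE).
 */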
vec4_instruction *
vec4_visitor::emit_minmax(enum elk_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(ELK_OPCODE_SEL, dst, src0, src1);
   inst->conditional_mod = conditionalmod;
   return inst;
}

/**
 * Emits the instructions needed to perform a pull constant load. before_block
 * and before_inst can be NULL in which case the instruction will be appended
 * to the end of the instruction list.
 */
void
vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                          src_reg surf_index,
                                          src_reg offset_reg,
                                          elk_bblock_t *before_block,
                                          vec4_instruction *before_inst)
{
   assert((before_inst == NULL && before_block == NULL) ||
          (before_inst && before_block));

   vec4_instruction *pull;

   if (devinfo->ver >= 7) {
      dst_reg grf_offset = dst_reg(this, glsl_uint_type());

      grf_offset.type = offset_reg.type;

      pull = MOV(grf_offset, offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7,
                                           dst,
                                           surf_index,
                                           src_reg(grf_offset));
      pull->mlen = 1;
   } else {
      pull = new(mem_ctx) vec4_instruction(ELK_VS_OPCODE_PULL_CONSTANT_LOAD,
                                           dst,
                                           surf_index,
                                           offset_reg);
      pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
      pull->mlen = 1;
   }

   if (before_inst)
      emit_before(before_block, before_inst, pull);
   else
      emit(pull);
}

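/**
 * Copy the value of the first live channel of @src into every channel of
 * the result, yielding a value that is uniform across the SIMD group.
 */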
src_reg
vec4_visitor::emit_uniformize(const src_reg &src)
{
   const src_reg chan_index(this, glsl_uint_type());
   const dst_reg dst = retype(dst_reg(this, glsl_uint_type()),
                              src.type);

   emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
      ->force_writemask_all = true;
   emit(ELK_SHADER_OPCODE_BROADCAST, dst, src, chan_index)
      ->force_writemask_all = true;

   return src_reg(dst);
}

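/* Geometry-shader hooks.  They are never reached from this base visitor;
 * a GS-specific subclass is expected to provide the real implementations.
 */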
void
vec4_visitor::gs_emit_vertex(int /* stream_id */)
{
   unreachable("not reached");
}

void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}

void
vec4_visitor::emit_ndc_computation()
{
   if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
      return;

   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_vec4_type());
   output_reg[ELK_VARYING_SLOT_NDC][0] = ndc;
   output_num_components[ELK_VARYING_SLOT_NDC][0] = 4;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_W, ELK_SWIZZLE_W, ELK_SWIZZLE_W, ELK_SWIZZLE_W);
   emit_math(ELK_SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}

void
vec4_visitor::emit_psiz_and_flags(dst_reg reg)
{
   if (devinfo->ver < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
        devinfo->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_uvec4_type());
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, elk_imm_ud(0u)));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, elk_imm_f((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), elk_imm_d(0x7ff << 8)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_uint_type());

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), elk_imm_f(0.0f), ELK_CONDITIONAL_L));
         emit(ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, elk_imm_d(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
         dst_reg flags1 = dst_reg(this, glsl_uint_type());
         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), elk_imm_f(0.0f), ELK_CONDITIONAL_L));
         emit(ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, elk_imm_d(0));
         emit(SHL(flags1, src_reg(flags1), elk_imm_d(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (devinfo->has_negative_rhw_bug &&
          output_reg[ELK_VARYING_SLOT_NDC][0].file != BAD_FILE) {
         src_reg ndc_w = src_reg(output_reg[ELK_VARYING_SLOT_NDC][0]);
         ndc_w.swizzle = ELK_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, elk_imm_f(0.0f), ELK_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), elk_imm_ud(1u << 6)));
         inst->predicate = ELK_PREDICATE_NORMAL;
         output_reg[ELK_VARYING_SLOT_NDC][0].type = ELK_REGISTER_TYPE_F;
         inst = emit(MOV(output_reg[ELK_VARYING_SLOT_NDC][0], elk_imm_f(0.0f)));
         inst->predicate = ELK_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, ELK_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (devinfo->ver < 6) {
      emit(MOV(retype(reg, ELK_REGISTER_TYPE_UD), elk_imm_ud(0u)));
   } else {
      emit(MOV(retype(reg, ELK_REGISTER_TYPE_D), elk_imm_d(0)));
      if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
         dst_reg reg_w = reg;
         reg_w.writemask = WRITEMASK_W;
         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
         reg_as_src.type = reg_w.type;
         reg_as_src.swizzle = elk_swizzle_for_size(1);
         emit(MOV(reg_w, reg_as_src));
      }
      if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
         dst_reg reg_y = reg;
         reg_y.writemask = WRITEMASK_Y;
         reg_y.type = ELK_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
      }
      if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
         dst_reg reg_z = reg;
         reg_z.writemask = WRITEMASK_Z;
         reg_z.type = ELK_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
      }
   }
}

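/**
 * Write one generic varying component group into the URB message register
 * @reg, or return NULL if that output was never written.
 */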
vec4_instruction *
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
{
   assert(varying < VARYING_SLOT_MAX);

   unsigned num_comps = output_num_components[varying][component];
   if (num_comps == 0)
      return NULL;

   assert(output_reg[varying][component].type == reg.type);
   current_annotation = output_reg_annotation[varying];
   if (output_reg[varying][component].file != BAD_FILE) {
      src_reg src = src_reg(output_reg[varying][component]);
      src.swizzle = ELK_SWZ_COMP_OUTPUT(component);
      reg.writemask =
         elk_writemask_for_component_packing(num_comps, component);
      return emit(MOV(reg, src));
   }
   return NULL;
}

void
vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
{
   reg.type = ELK_REGISTER_TYPE_F;
   output_reg[varying][0].type = reg.type;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
   {
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(reg);
      break;
   }
   case ELK_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      if (output_reg[ELK_VARYING_SLOT_NDC][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[ELK_VARYING_SLOT_NDC][0])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
      break;
   case ELK_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      for (int i = 0; i < 4; i++) {
         emit_generic_urb_slot(reg, varying, i);
      }
      break;
   }
}

static unsigned
align_interleaved_urb_mlen(const struct intel_device_info *devinfo,
                           unsigned mlen)
{
   if (devinfo->ver >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}


/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array. Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver);

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gfx6's
    * requirements for length alignment.
    */
   assert((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   if (devinfo->ver < 6) {
      emit_ndc_computation();
   }

   /* We may need to split this up into several URB writes, so do them in a
    * loop.
    */
   int slot = 0;
   bool complete = false;
   do {
      /* URB offset is in URB row increments, and each of our MRFs is half of
       * one of those, since we're doing interleaved writes.
       */
      int offset = slot / 2;

      mrf = base_mrf + 1;
      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         emit_urb_slot(dst_reg(MRF, mrf++),
                       prog_data->vue_map.slot_to_varying[slot]);

         /* If this was max_usable_mrf, we can't fit anything more into this
          * URB WRITE. Same thing if we reached the maximum length available.
          */
         if (mrf > max_usable_mrf ||
             align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > ELK_MAX_MSG_LENGTH) {
            slot++;
            break;
         }
      }

      complete = slot >= prog_data->vue_map.num_slots;
      current_annotation = "URB write";
      vec4_instruction *inst = emit_urb_write_opcode(complete);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
      inst->offset += offset;
   } while (!complete);
}


src_reg
vec4_visitor::get_scratch_offset(elk_bblock_t *block, vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gfx6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (devinfo->ver < 6)
      message_header_scale *= 16;

   if (reladdr) {
      /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
       * to multiply the reladdr by 2. Notice that the reg_offset part
       * is in units of 16 bytes and is used to select the low/high 16-byte
       * chunk of a full dvec4, so we don't want to multiply that part.
       */
      src_reg index = src_reg(this, glsl_int_type());
      if (type_sz(inst->dst.type) < 8) {
         emit_before(block, inst, ADD(dst_reg(index), *reladdr,
                                      elk_imm_d(reg_offset)));
         emit_before(block, inst, MUL(dst_reg(index), index,
                                      elk_imm_d(message_header_scale)));
      } else {
         emit_before(block, inst, MUL(dst_reg(index), *reladdr,
                                      elk_imm_d(message_header_scale * 2)));
         emit_before(block, inst, ADD(dst_reg(index), index,
                                      elk_imm_d(reg_offset * message_header_scale)));
      }
      return index;
   } else {
      return elk_imm_d(reg_offset * message_header_scale);
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_read(elk_bblock_t *block, vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   assert(orig_src.offset % REG_SIZE == 0);
   int reg_offset = base_offset + orig_src.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
                                      reg_offset);

   if (type_sz(orig_src.type) < 8) {
      emit_before(block, inst, SCRATCH_READ(temp, index));
   } else {
      dst_reg shuffled = dst_reg(this, glsl_dvec4_type());
      dst_reg shuffled_float = retype(shuffled, ELK_REGISTER_TYPE_F);
      emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
      index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
      vec4_instruction *last_read =
         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
      emit_before(block, inst, last_read);
      shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read);
   }
}

/**
 * Emits an instruction after @inst to store the value to be written
 * to @orig_dst to scratch space at @base_offset, from @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_write(elk_bblock_t *block, vec4_instruction *inst,
                                 int base_offset)
{
   assert(inst->dst.offset % REG_SIZE == 0);
   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                      reg_offset);

   /* Create a temporary register to store *inst's result in.
    *
    * We have to be careful in MOVing from our temporary result register in
    * the scratch write. If we swizzle from channels of the temporary that
    * weren't initialized, it will confuse live interval analysis, which will
    * make spilling fail to make progress.
    */
   bool is_64bit = type_sz(inst->dst.type) == 8;
   const glsl_type *alloc_type =
      is_64bit ? glsl_dvec4_type() : glsl_vec4_type();
   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
                                       inst->dst.type),
                                elk_swizzle_for_mask(inst->dst.writemask));

   if (!is_64bit) {
      dst_reg dst = dst_reg(elk_writemask(elk_vec8_grf(0, 0),
                                          inst->dst.writemask));
      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
      if (inst->opcode != ELK_OPCODE_SEL)
         write->predicate = inst->predicate;
      write->ir = inst->ir;
      write->annotation = inst->annotation;
      inst->insert_after(block, write);
   } else {
      dst_reg shuffled = dst_reg(this, alloc_type);
      vec4_instruction *last =
         shuffle_64bit_data(shuffled, temp, true, true, block, inst);
      src_reg shuffled_float = src_reg(retype(shuffled, ELK_REGISTER_TYPE_F));

      uint8_t mask = 0;
      if (inst->dst.writemask & WRITEMASK_X)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_Y)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(elk_writemask(elk_vec8_grf(0, 0), mask));

         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
         if (inst->opcode != ELK_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }

      mask = 0;
      if (inst->dst.writemask & WRITEMASK_Z)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_W)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(elk_writemask(elk_vec8_grf(0, 0), mask));

         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                            reg_offset + 1);
         vec4_instruction *write =
            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
         if (inst->opcode != ELK_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }
   }

   inst->dst.file = temp.file;
   inst->dst.nr = temp.nr;
   inst->dst.offset %= REG_SIZE;
   inst->dst.reladdr = NULL;
}

/**
 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
 * adds the scratch read(s) before \p inst. The function also checks for
 * recursive reladdr scratch accesses, issuing the corresponding scratch
 * loads and rewriting reladdr references accordingly.
 *
 * \return \p src if it did not require a scratch load, otherwise, the
 * register holding the result of the scratch load that the caller should
 * use to rewrite src.
 */
src_reg
vec4_visitor::emit_resolve_reladdr(int scratch_loc[], elk_bblock_t *block,
                                   vec4_instruction *inst, src_reg src)
{
   /* Resolve recursive reladdr scratch access by calling ourselves
    * with src.reladdr
    */
   if (src.reladdr)
      *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                          *src.reladdr);

   /* Now handle scratch access on src */
   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
      dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
                             glsl_dvec4_type() : glsl_vec4_type());
      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
      src.nr = temp.nr;
      src.offset %= REG_SIZE;
      src.reladdr = NULL;
   }

   return src;
}

/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers. So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->alloc.count];
   memset(scratch_loc, -1, sizeof(scratch_loc));

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->dst.reladdr) {
         if (scratch_loc[inst->dst.nr] == -1) {
            scratch_loc[inst->dst.nr] = last_scratch;
            last_scratch += this->alloc.sizes[inst->dst.nr];
         }

         for (src_reg *iter = inst->dst.reladdr;
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }

      for (int i = 0 ; i < 3; i++) {
         for (src_reg *iter = &inst->src[i];
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store. Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      /* First handle scratch access on the dst. Notice we have to handle
       * the case where the dst's reladdr also points to scratch space.
       */
      if (inst->dst.reladdr)
         *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                                   *inst->dst.reladdr);

      /* Now that we have handled any (possibly recursive) reladdr scratch
       * accesses for dst we can safely do the scratch write for dst itself
       */
      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);

      /* Now handle scratch access on any src. In this case, since inst->src[i]
       * already is a src_reg, we can just call emit_resolve_reladdr with
       * inst->src[i] and it will take care of handling scratch loads for
       * both src and src.reladdr (recursively).
       */
      for (int i = 0 ; i < 3; i++) {
         inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
                                             inst->src[i]);
      }
   }
}

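/**
 * If @reg is an unsigned (UD) source with the negate modifier set,
 * materialize the negated value in a temporary and rewrite @reg to use it.
 */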
void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != ELK_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   src_reg temp = src_reg(this, glsl_uvec4_type());
   emit(ELK_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}

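/**
 * Map the NIR float-controls execution mode to the corresponding hardware
 * rounding mode, or ELK_RND_MODE_UNSPECIFIED if no rounding mode is
 * requested.
 */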
static elk_rnd_mode
elk_rnd_mode_from_execution_mode(unsigned execution_mode)
{
   if (nir_has_any_rounding_mode_rtne(execution_mode))
      return ELK_RND_MODE_RTNE;
   if (nir_has_any_rounding_mode_rtz(execution_mode))
      return ELK_RND_MODE_RTZ;
   return ELK_RND_MODE_UNSPECIFIED;
}

void
vec4_visitor::emit_shader_float_controls_execution_mode()
{
   unsigned execution_mode = this->nir->info.float_controls_execution_mode;
   if (nir_has_any_rounding_mode_enabled(execution_mode)) {
      elk_rnd_mode rnd = elk_rnd_mode_from_execution_mode(execution_mode);
      const vec4_builder bld = vec4_builder(this).at_end();
      bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, dst_null_ud(), elk_imm_d(rnd));
   }
}

vec4_visitor::vec4_visitor(const struct elk_compiler *compiler,
                           const struct elk_compile_params *params,
                           const struct elk_sampler_prog_key_data *key_tex,
                           struct elk_vue_prog_data *prog_data,
                           const nir_shader *shader,
                           bool no_spills,
                           bool debug_enabled)
   : elk_backend_shader(compiler, params, shader, &prog_data->base, debug_enabled),
     key_tex(key_tex),
     prog_data(prog_data),
     fail_msg(NULL),
     first_non_payload_grf(0),
     ubo_push_start(),
     push_length(0),
     live_analysis(this), performance_analysis(this),
     no_spills(no_spills),
     last_scratch(0)
{
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   memset(this->output_num_components, 0, sizeof(this->output_num_components));

   this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : ELK_MAX_GRF;

   this->uniforms = 0;

   this->nir_ssa_values = NULL;
}


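/**
 * Mark the compile as failed, keeping only the first failure message and
 * printing it to stderr when shader debugging is enabled.
 */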
void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n",
                         _mesa_shader_stage_to_abbrev(stage), msg);

   this->fail_msg = msg;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s", msg);
   }
}

} /* namespace elk */