1 /*
2  * Copyright © 2011 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "elk_vec4.h"
25 #include "elk_cfg.h"
26 #include "elk_eu.h"
27 #include "util/u_math.h"
28 
29 namespace elk {
30 
31 vec4_instruction::vec4_instruction(enum elk_opcode opcode, const dst_reg &dst,
32                                    const src_reg &src0, const src_reg &src1,
33                                    const src_reg &src2)
34 {
35    this->opcode = opcode;
36    this->dst = dst;
37    this->src[0] = src0;
38    this->src[1] = src1;
39    this->src[2] = src2;
40    this->saturate = false;
41    this->force_writemask_all = false;
42    this->no_dd_clear = false;
43    this->no_dd_check = false;
44    this->writes_accumulator = false;
45    this->conditional_mod = ELK_CONDITIONAL_NONE;
46    this->predicate = ELK_PREDICATE_NONE;
47    this->predicate_inverse = false;
48    this->target = 0;
49    this->shadow_compare = false;
50    this->eot = false;
51    this->ir = NULL;
52    this->urb_write_flags = ELK_URB_WRITE_NO_FLAGS;
53    this->header_size = 0;
54    this->flag_subreg = 0;
55    this->mlen = 0;
56    this->base_mrf = 0;
57    this->offset = 0;
58    this->exec_size = 8;
59    this->group = 0;
60    this->size_written = (dst.file == BAD_FILE ?
61                          0 : this->exec_size * type_sz(dst.type));
62    this->annotation = NULL;
63 }
64 
65 vec4_instruction *
66 vec4_visitor::emit(vec4_instruction *inst)
67 {
68    inst->ir = this->base_ir;
69    inst->annotation = this->current_annotation;
70 
71    this->instructions.push_tail(inst);
72 
73    return inst;
74 }
75 
76 vec4_instruction *
77 vec4_visitor::emit_before(elk_bblock_t *block, vec4_instruction *inst,
78                           vec4_instruction *new_inst)
79 {
80    new_inst->ir = inst->ir;
81    new_inst->annotation = inst->annotation;
82 
83    inst->insert_before(block, new_inst);
84 
85    return inst;
86 }
87 
88 vec4_instruction *
89 vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
90                    const src_reg &src1, const src_reg &src2)
91 {
92    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
93 }
94 
95 
96 vec4_instruction *
97 vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
98                    const src_reg &src1)
99 {
100    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
101 }
102 
103 vec4_instruction *
104 vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0)
105 {
106    return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
107 }
108 
109 vec4_instruction *
110 vec4_visitor::emit(enum elk_opcode opcode, const dst_reg &dst)
111 {
112    return emit(new(mem_ctx) vec4_instruction(opcode, dst));
113 }
114 
115 vec4_instruction *
116 vec4_visitor::emit(enum elk_opcode opcode)
117 {
118    return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
119 }
120 
121 #define ALU1(op)							\
122    vec4_instruction *							\
123    vec4_visitor::op(const dst_reg &dst, const src_reg &src0)		\
124    {									\
125       return new(mem_ctx) vec4_instruction(ELK_OPCODE_##op, dst, src0); \
126    }
127 
128 #define ALU2(op)							\
129    vec4_instruction *							\
130    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
131                     const src_reg &src1)				\
132    {									\
133       return new(mem_ctx) vec4_instruction(ELK_OPCODE_##op, dst,        \
134                                            src0, src1);                 \
135    }
136 
137 #define ALU2_ACC(op)							\
138    vec4_instruction *							\
139    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
140                     const src_reg &src1)				\
141    {									\
142       vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
143                        ELK_OPCODE_##op, dst, src0, src1);		\
144       inst->writes_accumulator = true;                                  \
145       return inst;                                                      \
146    }
147 
148 #define ALU3(op)							\
149    vec4_instruction *							\
150    vec4_visitor::op(const dst_reg &dst, const src_reg &src0,		\
151                     const src_reg &src1, const src_reg &src2)		\
152    {									\
153       assert(devinfo->ver >= 6);						\
154       return new(mem_ctx) vec4_instruction(ELK_OPCODE_##op, dst,	\
155 					   src0, src1, src2);		\
156    }
157 
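/* For illustration: each invocation below expands to a small factory method
 * on vec4_visitor. ALU2(ADD), for example, expands to roughly:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(ELK_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * The returned instruction is not yet in the instruction list; callers hand
 * it to emit(), e.g. emit(ADD(dst, a, b)).
 */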
158 ALU1(NOT)
159 ALU1(MOV)
160 ALU1(FRC)
161 ALU1(RNDD)
162 ALU1(RNDE)
163 ALU1(RNDZ)
164 ALU1(F32TO16)
165 ALU1(F16TO32)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2_ACC(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(DP3)
173 ALU2(DP4)
174 ALU2(DPH)
175 ALU2(SHL)
176 ALU2(SHR)
177 ALU2(ASR)
178 ALU3(LRP)
179 ALU1(BFREV)
180 ALU3(BFE)
181 ALU2(BFI1)
182 ALU3(BFI2)
183 ALU1(FBH)
184 ALU1(FBL)
185 ALU1(CBIT)
186 ALU1(LZD)
187 ALU3(MAD)
188 ALU2_ACC(ADDC)
189 ALU2_ACC(SUBB)
190 ALU2(MAC)
191 ALU1(DIM)
192 
193 /** Gfx4 predicated IF. */
194 vec4_instruction *
195 vec4_visitor::IF(enum elk_predicate predicate)
196 {
197    vec4_instruction *inst;
198 
199    inst = new(mem_ctx) vec4_instruction(ELK_OPCODE_IF);
200    inst->predicate = predicate;
201 
202    return inst;
203 }
204 
205 /** Gfx6 IF with embedded comparison. */
206 vec4_instruction *
207 vec4_visitor::IF(src_reg src0, src_reg src1,
208                  enum elk_conditional_mod condition)
209 {
210    assert(devinfo->ver == 6);
211 
212    vec4_instruction *inst;
213 
214    resolve_ud_negate(&src0);
215    resolve_ud_negate(&src1);
216 
217    inst = new(mem_ctx) vec4_instruction(ELK_OPCODE_IF, dst_null_d(),
218 					src0, src1);
219    inst->conditional_mod = condition;
220 
221    return inst;
222 }
223 
224 /**
225  * CMP: Sets the low bit of the destination channels with the result
226  * of the comparison, while the upper bits are undefined, and updates
227  * the flag register with the packed 16 bits of the result.
228  */
229 vec4_instruction *
230 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
231                   enum elk_conditional_mod condition)
232 {
233    vec4_instruction *inst;
234 
235    /* Take the instruction:
236     *
237     * CMP null<d> src0<f> src1<f>
238     *
239     * Original gfx4 does type conversion to the destination type before
240     * comparison, producing garbage results for floating point comparisons.
241     *
242     * The destination type doesn't matter on newer generations, so we set the
243     * type to match src0 so we can compact the instruction.
244     */
245    dst.type = src0.type;
246 
247    resolve_ud_negate(&src0);
248    resolve_ud_negate(&src1);
249 
250    inst = new(mem_ctx) vec4_instruction(ELK_OPCODE_CMP, dst, src0, src1);
251    inst->conditional_mod = condition;
252 
253    return inst;
254 }
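/* Illustrative sketch: the packed flag result is typically consumed by
 * predicating a following instruction, mirroring the pattern used in
 * emit_psiz_and_flags() below:
 *
 *    emit(CMP(dst_null_f(), value, elk_imm_f(0.0f), ELK_CONDITIONAL_L));
 *    vec4_instruction *inst = emit(MOV(dst, src));
 *    inst->predicate = ELK_PREDICATE_NORMAL;
 */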
255 
256 vec4_instruction *
257 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
258 {
259    vec4_instruction *inst;
260 
261    inst = new(mem_ctx) vec4_instruction(ELK_SHADER_OPCODE_GFX4_SCRATCH_READ,
262 					dst, index);
263    inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver) + 1;
264    inst->mlen = 2;
265 
266    return inst;
267 }
268 
269 vec4_instruction *
270 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
271                             const src_reg &index)
272 {
273    vec4_instruction *inst;
274 
275    inst = new(mem_ctx) vec4_instruction(ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE,
276 					dst, src, index);
277    inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver);
278    inst->mlen = 3;
279 
280    return inst;
281 }
282 
283 src_reg
284 vec4_visitor::fix_3src_operand(const src_reg &src)
285 {
286    /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287     * able to use vertical stride of zero to replicate the vec4 uniform, like
288     *
289     *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290     *
291     * But you can't, since vertical stride is always four in three-source
292     * instructions. Instead, insert a MOV instruction to do the replication so
293     * that the three-source instruction can consume it.
294     */
295 
296    /* The MOV is only needed if the source is a uniform or immediate. */
297    if (src.file != UNIFORM && src.file != IMM)
298       return src;
299 
300    if (src.file == UNIFORM && elk_is_single_value_swizzle(src.swizzle))
301       return src;
302 
303    dst_reg expanded = dst_reg(this, glsl_vec4_type());
304    expanded.type = src.type;
305    emit(ELK_VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
306    return src_reg(expanded);
307 }
308 
309 src_reg
310 vec4_visitor::fix_math_operand(const src_reg &src)
311 {
312    if (devinfo->ver < 6 || src.file == BAD_FILE)
313       return src;
314 
315    /* The gfx6 math instruction ignores the source modifiers --
316     * swizzle, abs, negate, and at least some parts of the register
317     * region description.
318     *
319     * Rather than trying to enumerate all these cases, *always* expand the
320     * operand to a temp GRF for gfx6.
321     *
322     * For gfx7, keep the operand as-is, except if immediate, which gfx7 still
323     * can't use.
324     */
325 
326    if (devinfo->ver == 7 && src.file != IMM)
327       return src;
328 
329    dst_reg expanded = dst_reg(this, glsl_vec4_type());
330    expanded.type = src.type;
331    emit(MOV(expanded, src));
332    return src_reg(expanded);
333 }
334 
335 vec4_instruction *
336 vec4_visitor::emit_math(enum elk_opcode opcode,
337                         const dst_reg &dst,
338                         const src_reg &src0, const src_reg &src1)
339 {
340    vec4_instruction *math =
341       emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
342 
343    if (devinfo->ver == 6 && dst.writemask != WRITEMASK_XYZW) {
344       /* MATH on Gfx6 must be align1, so we can't do writemasks. */
345       math->dst = dst_reg(this, glsl_vec4_type());
346       math->dst.type = dst.type;
347       math = emit(MOV(dst, src_reg(math->dst)));
348    } else if (devinfo->ver < 6) {
349       math->base_mrf = 1;
350       math->mlen = src1.file == BAD_FILE ? 1 : 2;
351    }
352 
353    return math;
354 }
355 
356 void
357 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
358 {
359    if (devinfo->ver < 7) {
360       unreachable("ir_unop_pack_half_2x16 should be lowered");
361    }
362 
363    assert(dst.type == ELK_REGISTER_TYPE_UD);
364    assert(src0.type == ELK_REGISTER_TYPE_F);
365 
366    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
367     *
368     *   Because this instruction does not have a 16-bit floating-point type,
369     *   the destination data type must be Word (W).
370     *
371     *   The destination must be DWord-aligned and specify a horizontal stride
372     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
373     *   each destination channel and the upper word is not modified.
374     *
375     * The above restriction implies that the f32to16 instruction must use
376     * align1 mode, because only in align1 mode is it possible to specify
377     * horizontal stride.  We choose here to defy the hardware docs and emit
378     * align16 instructions.
379     *
380     * (I [chadv] did attempt to emit align1 instructions for VS f32to16
381     * instructions. I was partially successful in that the code passed all
382     * tests.  However, the code was dubiously correct and fragile, and the
383     * tests were not harsh enough to probe that frailty. Not trusting the
384     * code, I chose instead to remain in align16 mode in defiance of the hw
385     * docs).
386     *
387     * I've [chadv] experimentally confirmed that, on gfx7 hardware and the
388     * simulator, emitting a f32to16 in align16 mode with UD as destination
389     * data type is safe. The behavior differs from that specified in the PRM
390     * in that the upper word of each destination channel is cleared to 0.
391     */
392 
393    dst_reg tmp_dst(this, glsl_uvec2_type());
394    src_reg tmp_src(tmp_dst);
395 
396 #if 0
397    /* Verify the undocumented behavior on which the following instructions
398     * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
399     * then the result of the bit-or instruction below will be incorrect.
400     *
401     * You should inspect the disasm output in order to verify that the MOV is
402     * not optimized away.
403     */
404    emit(MOV(tmp_dst, elk_imm_ud(0x12345678u)));
405 #endif
406 
407    /* Give tmp the form below, where "." means untouched.
408     *
409     *     w z          y          x w z          y          x
410     *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
411     *
412     * That the upper word of each write-channel be 0 is required for the
413     * following bit-shift and bit-or instructions to work. Note that this
414     * relies on the undocumented hardware behavior mentioned above.
415     */
416    tmp_dst.writemask = WRITEMASK_XY;
417    emit(F32TO16(tmp_dst, src0));
418 
419    /* Give the write-channels of dst the form:
420     *   0xhhhh0000
421     */
422    tmp_src.swizzle = ELK_SWIZZLE_YYYY;
423    emit(SHL(dst, tmp_src, elk_imm_ud(16u)));
424 
425    /* Finally, give the write-channels of dst the form of packHalf2x16's
426     * output:
427     *   0xhhhhllll
428     */
429    tmp_src.swizzle = ELK_SWIZZLE_XXXX;
430    emit(OR(dst, src_reg(dst), tmp_src));
431 }
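/* Worked example (assuming IEEE half encodings): for src0 = vec2(1.0, 2.0),
 * f32to16 leaves tmp.x = 0x00003c00 and tmp.y = 0x00004000.  The SHL of
 * tmp.yyyy by 16 gives 0x40000000 in every write-channel of dst, and the
 * final OR with tmp.xxxx yields 0x40003c00, i.e. packHalf2x16(vec2(1.0, 2.0)).
 */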
432 
433 void
434 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
435 {
436    if (devinfo->ver < 7) {
437       unreachable("ir_unop_unpack_half_2x16 should be lowered");
438    }
439 
440    assert(dst.type == ELK_REGISTER_TYPE_F);
441    assert(src0.type == ELK_REGISTER_TYPE_UD);
442 
443    /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
444     *
445     *   Because this instruction does not have a 16-bit floating-point type,
446     *   the source data type must be Word (W). The destination type must be
447     *   F (Float).
448     *
449     * To use W as the source data type, we must adjust horizontal strides,
450     * which is only possible in align1 mode. All my [chadv] attempts at
451     * emitting align1 instructions for unpackHalf2x16 failed to pass the
452     * Piglit tests, so I gave up.
453     *
454     * I've verified that, on gfx7 hardware and the simulator, it is safe to
455     * emit f16to32 in align16 mode with UD as source data type.
456     */
457 
458    dst_reg tmp_dst(this, glsl_uvec2_type());
459    src_reg tmp_src(tmp_dst);
460 
461    tmp_dst.writemask = WRITEMASK_X;
462    emit(AND(tmp_dst, src0, elk_imm_ud(0xffffu)));
463 
464    tmp_dst.writemask = WRITEMASK_Y;
465    emit(SHR(tmp_dst, src0, elk_imm_ud(16u)));
466 
467    dst.writemask = WRITEMASK_XY;
468    emit(F16TO32(dst, tmp_src));
469 }
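/* Worked example (the reverse of the packing case above): for
 * src0 = 0x40003c00, the AND leaves tmp.x = 0x00003c00 and the SHR leaves
 * tmp.y = 0x00004000, so the final f16to32 produces dst.xy = (1.0, 2.0),
 * matching unpackHalf2x16().
 */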
470 
471 void
472 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
473 {
474    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
475     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
476     * is not suitable to generate the shift values, but we can use the packed
477     * vector float and a type-converting MOV.
478     */
479    dst_reg shift(this, glsl_uvec4_type());
480    emit(MOV(shift, elk_imm_vf4(0x00, 0x60, 0x70, 0x78)));
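   /* Note (illustrative): elk_imm_vf4() packs four 8-bit restricted floats
    * (1 sign, 3 exponent bits biased by 3, 4 mantissa bits), so 0x00, 0x60,
    * 0x70 and 0x78 decode to 0.0, 8.0, 16.0 and 24.0; the type-converting
    * MOV above turns them into the UD shift amounts <0, 8, 16, 24>.
    */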
481 
482    dst_reg shifted(this, glsl_uvec4_type());
483    src0.swizzle = ELK_SWIZZLE_XXXX;
484    emit(SHR(shifted, src0, src_reg(shift)));
485 
486    shifted.type = ELK_REGISTER_TYPE_UB;
487    dst_reg f(this, glsl_vec4_type());
488    emit(ELK_VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
489 
490    emit(MUL(dst, src_reg(f), elk_imm_f(1.0f / 255.0f)));
491 }
492 
493 void
494 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
495 {
496    /* Instead of splitting the 32-bit integer, shifting, and ORing it back
497     * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
498     * is not suitable to generate the shift values, but we can use the packed
499     * vector float and a type-converting MOV.
500     */
501    dst_reg shift(this, glsl_uvec4_type());
502    emit(MOV(shift, elk_imm_vf4(0x00, 0x60, 0x70, 0x78)));
503 
504    dst_reg shifted(this, glsl_uvec4_type());
505    src0.swizzle = ELK_SWIZZLE_XXXX;
506    emit(SHR(shifted, src0, src_reg(shift)));
507 
508    shifted.type = ELK_REGISTER_TYPE_B;
509    dst_reg f(this, glsl_vec4_type());
510    emit(ELK_VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
511 
512    dst_reg scaled(this, glsl_vec4_type());
513    emit(MUL(scaled, src_reg(f), elk_imm_f(1.0f / 127.0f)));
514 
515    dst_reg max(this, glsl_vec4_type());
516    emit_minmax(ELK_CONDITIONAL_GE, max, src_reg(scaled), elk_imm_f(-1.0f));
517    emit_minmax(ELK_CONDITIONAL_L, dst, src_reg(max), elk_imm_f(1.0f));
518 }
519 
520 void
521 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
522 {
523    dst_reg saturated(this, glsl_vec4_type());
524    vec4_instruction *inst = emit(MOV(saturated, src0));
525    inst->saturate = true;
526 
527    dst_reg scaled(this, glsl_vec4_type());
528    emit(MUL(scaled, src_reg(saturated), elk_imm_f(255.0f)));
529 
530    dst_reg rounded(this, glsl_vec4_type());
531    emit(RNDE(rounded, src_reg(scaled)));
532 
533    dst_reg u(this, glsl_uvec4_type());
534    emit(MOV(u, src_reg(rounded)));
535 
536    src_reg bytes(u);
537    emit(ELK_VEC4_OPCODE_PACK_BYTES, dst, bytes);
538 }
539 
540 void
541 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
542 {
543    dst_reg max(this, glsl_vec4_type());
544    emit_minmax(ELK_CONDITIONAL_GE, max, src0, elk_imm_f(-1.0f));
545 
546    dst_reg min(this, glsl_vec4_type());
547    emit_minmax(ELK_CONDITIONAL_L, min, src_reg(max), elk_imm_f(1.0f));
548 
549    dst_reg scaled(this, glsl_vec4_type());
550    emit(MUL(scaled, src_reg(min), elk_imm_f(127.0f)));
551 
552    dst_reg rounded(this, glsl_vec4_type());
553    emit(RNDE(rounded, src_reg(scaled)));
554 
555    dst_reg i(this, glsl_ivec4_type());
556    emit(MOV(i, src_reg(rounded)));
557 
558    src_reg bytes(i);
559    emit(ELK_VEC4_OPCODE_PACK_BYTES, dst, bytes);
560 }
561 
562 /*
563  * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
564  * false) elements needed to pack a type.
565  */
566 static int
567 elk_type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
568 {
569    unsigned int i;
570    int size;
571 
572    switch (type->base_type) {
573    case GLSL_TYPE_UINT:
574    case GLSL_TYPE_INT:
575    case GLSL_TYPE_FLOAT:
576    case GLSL_TYPE_FLOAT16:
577    case GLSL_TYPE_BOOL:
578    case GLSL_TYPE_DOUBLE:
579    case GLSL_TYPE_UINT16:
580    case GLSL_TYPE_INT16:
581    case GLSL_TYPE_UINT8:
582    case GLSL_TYPE_INT8:
583    case GLSL_TYPE_UINT64:
584    case GLSL_TYPE_INT64:
585       if (glsl_type_is_matrix(type)) {
586          const glsl_type *col_type = glsl_get_column_type(type);
587          unsigned col_slots =
588             (as_vec4 && glsl_type_is_dual_slot(col_type)) ? 2 : 1;
589          return type->matrix_columns * col_slots;
590       } else {
591          /* Regardless of size of vector, it gets a vec4. This is bad
592           * packing for things like floats, but otherwise arrays become a
593           * mess.  Hopefully a later pass over the code can pack scalars
594           * down if appropriate.
595           */
596          return (as_vec4 && glsl_type_is_dual_slot(type)) ? 2 : 1;
597       }
598    case GLSL_TYPE_ARRAY:
599       assert(type->length > 0);
600       return elk_type_size_xvec4(type->fields.array, as_vec4, bindless) *
601              type->length;
602    case GLSL_TYPE_STRUCT:
603    case GLSL_TYPE_INTERFACE:
604       size = 0;
605       for (i = 0; i < type->length; i++) {
606 	 size += elk_type_size_xvec4(type->fields.structure[i].type, as_vec4,
607                                  bindless);
608       }
609       return size;
610    case GLSL_TYPE_SUBROUTINE:
611       return 1;
612 
613    case GLSL_TYPE_SAMPLER:
614    case GLSL_TYPE_TEXTURE:
615       /* Samplers and textures take up no register space, since they're baked
616        * in at link time.
617        */
618       return bindless ? 1 : 0;
619    case GLSL_TYPE_ATOMIC_UINT:
620       return 0;
621    case GLSL_TYPE_IMAGE:
622       return bindless ? 1 : DIV_ROUND_UP(ISL_IMAGE_PARAM_SIZE, 4);
623    case GLSL_TYPE_VOID:
624    case GLSL_TYPE_ERROR:
625    case GLSL_TYPE_COOPERATIVE_MATRIX:
626       unreachable("not reached");
627    }
628 
629    return 0;
630 }
631 
632 /**
633  * Returns the minimum number of vec4 elements needed to pack a type.
634  *
635  * For simple types, it will return 1 (a single vec4); for matrices, the
636  * number of columns; for array and struct, the sum of the vec4_size of
637  * each of its elements; and for sampler and atomic, zero.
638  *
639  * This method is useful to calculate how much register space is needed to
640  * store a particular type.
641  */
642 extern "C" int
643 elk_type_size_vec4(const struct glsl_type *type, bool bindless)
644 {
645    return elk_type_size_xvec4(type, true, bindless);
646 }
647 
648 /**
649  * Returns the minimum number of dvec4 elements needed to pack a type.
650  *
651  * For simple types, it will return 1 (a single dvec4); for matrices, the
652  * number of columns; for array and struct, the sum of the dvec4_size of
653  * each of its elements; and for sampler and atomic, zero.
654  *
655  * This method is useful to calculate how much register space is needed to
656  * store a particular type.
657  *
658  * Measuring double-precision vertex inputs as dvec4 is required because
659  * ARB_vertex_attrib_64bit states that they use the same number of locations
660  * as the single-precision version. That is, two consecutive dvec4s would be
661  * located in location "x" and location "x+1", not "x+2".
662  *
663  * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
664  * remap_vs_attrs() will take into account both the location and whether the
665  * type fits in one or two vec4 slots.
666  */
667 extern "C" int
668 elk_type_size_dvec4(const struct glsl_type *type, bool bindless)
669 {
670    return elk_type_size_xvec4(type, false, bindless);
671 }
672 
673 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
674 {
675    init();
676 
677    this->file = VGRF;
678    this->nr = v->alloc.allocate(elk_type_size_vec4(type, false));
679 
680    if (glsl_type_is_array(type) || glsl_type_is_struct(type)) {
681       this->swizzle = ELK_SWIZZLE_NOOP;
682    } else {
683       this->swizzle = elk_swizzle_for_size(type->vector_elements);
684    }
685 
686    this->type = elk_type_for_base_type(type);
687 }
688 
689 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
690 {
691    assert(size > 0);
692 
693    init();
694 
695    this->file = VGRF;
696    this->nr = v->alloc.allocate(elk_type_size_vec4(type, false) * size);
697 
698    this->swizzle = ELK_SWIZZLE_NOOP;
699 
700    this->type = elk_type_for_base_type(type);
701 }
702 
703 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
704 {
705    init();
706 
707    this->file = VGRF;
708    this->nr = v->alloc.allocate(elk_type_size_vec4(type, false));
709 
710    if (glsl_type_is_array(type) || glsl_type_is_struct(type)) {
711       this->writemask = WRITEMASK_XYZW;
712    } else {
713       this->writemask = (1 << type->vector_elements) - 1;
714    }
715 
716    this->type = elk_type_for_base_type(type);
717 }
718 
719 vec4_instruction *
720 vec4_visitor::emit_minmax(enum elk_conditional_mod conditionalmod, dst_reg dst,
721                           src_reg src0, src_reg src1)
722 {
723    vec4_instruction *inst = emit(ELK_OPCODE_SEL, dst, src0, src1);
724    inst->conditional_mod = conditionalmod;
725    return inst;
726 }
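/* Note (illustrative): with a conditional modifier, SEL picks src0 when the
 * comparison src0 <cmod> src1 holds and src1 otherwise, so ELK_CONDITIONAL_L
 * yields min(src0, src1) and ELK_CONDITIONAL_GE yields max(src0, src1); the
 * pack/unpack snorm helpers above use this pair to clamp to [-1, 1].
 */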
727 
728 /**
729  * Emits the instructions needed to perform a pull constant load. before_block
730  * and before_inst can be NULL in which case the instruction will be appended
731  * to the end of the instruction list.
732  */
733 void
734 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
735                                           src_reg surf_index,
736                                           src_reg offset_reg,
737                                           elk_bblock_t *before_block,
738                                           vec4_instruction *before_inst)
739 {
740    assert((before_inst == NULL && before_block == NULL) ||
741           (before_inst && before_block));
742 
743    vec4_instruction *pull;
744 
745    if (devinfo->ver >= 7) {
746       dst_reg grf_offset = dst_reg(this, glsl_uint_type());
747 
748       grf_offset.type = offset_reg.type;
749 
750       pull = MOV(grf_offset, offset_reg);
751 
752       if (before_inst)
753          emit_before(before_block, before_inst, pull);
754       else
755          emit(pull);
756 
757       pull = new(mem_ctx) vec4_instruction(ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7,
758                                            dst,
759                                            surf_index,
760                                            src_reg(grf_offset));
761       pull->mlen = 1;
762    } else {
763       pull = new(mem_ctx) vec4_instruction(ELK_VS_OPCODE_PULL_CONSTANT_LOAD,
764                                            dst,
765                                            surf_index,
766                                            offset_reg);
767       pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
768       pull->mlen = 1;
769    }
770 
771    if (before_inst)
772       emit_before(before_block, before_inst, pull);
773    else
774       emit(pull);
775 }
776 
777 src_reg
778 vec4_visitor::emit_uniformize(const src_reg &src)
779 {
780    const src_reg chan_index(this, glsl_uint_type());
781    const dst_reg dst = retype(dst_reg(this, glsl_uint_type()),
782                               src.type);
783 
784    emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
785       ->force_writemask_all = true;
786    emit(ELK_SHADER_OPCODE_BROADCAST, dst, src, chan_index)
787       ->force_writemask_all = true;
788 
789    return src_reg(dst);
790 }
791 
792 void
793 vec4_visitor::gs_emit_vertex(int /* stream_id */)
794 {
795    unreachable("not reached");
796 }
797 
798 void
799 vec4_visitor::gs_end_primitive()
800 {
801    unreachable("not reached");
802 }
803 
804 void
805 vec4_visitor::emit_ndc_computation()
806 {
807    if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
808       return;
809 
810    /* Get the position */
811    src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
812 
813    /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
814    dst_reg ndc = dst_reg(this, glsl_vec4_type());
815    output_reg[ELK_VARYING_SLOT_NDC][0] = ndc;
816    output_num_components[ELK_VARYING_SLOT_NDC][0] = 4;
817 
818    current_annotation = "NDC";
819    dst_reg ndc_w = ndc;
820    ndc_w.writemask = WRITEMASK_W;
821    src_reg pos_w = pos;
822    pos_w.swizzle = ELK_SWIZZLE4(ELK_SWIZZLE_W, ELK_SWIZZLE_W, ELK_SWIZZLE_W, ELK_SWIZZLE_W);
823    emit_math(ELK_SHADER_OPCODE_RCP, ndc_w, pos_w);
824 
825    dst_reg ndc_xyz = ndc;
826    ndc_xyz.writemask = WRITEMASK_XYZ;
827 
828    emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
829 }
830 
831 void
832 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
833 {
834    if (devinfo->ver < 6 &&
835        ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
836         output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
837         devinfo->has_negative_rhw_bug)) {
838       dst_reg header1 = dst_reg(this, glsl_uvec4_type());
839       dst_reg header1_w = header1;
840       header1_w.writemask = WRITEMASK_W;
841 
842       emit(MOV(header1, elk_imm_ud(0u)));
843 
844       if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
845 	 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
846 
847 	 current_annotation = "Point size";
848 	 emit(MUL(header1_w, psiz, elk_imm_f((float)(1 << 11))));
849 	 emit(AND(header1_w, src_reg(header1_w), elk_imm_d(0x7ff << 8)));
850       }
851 
852       if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
853          current_annotation = "Clipping flags";
854          dst_reg flags0 = dst_reg(this, glsl_uint_type());
855 
856          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), elk_imm_f(0.0f), ELK_CONDITIONAL_L));
857          emit(ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, elk_imm_d(0));
858          emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
859       }
860 
861       if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
862          dst_reg flags1 = dst_reg(this, glsl_uint_type());
863          emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), elk_imm_f(0.0f), ELK_CONDITIONAL_L));
864          emit(ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, elk_imm_d(0));
865          emit(SHL(flags1, src_reg(flags1), elk_imm_d(4)));
866          emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
867       }
868 
869       /* i965 clipping workaround:
870        * 1) Test for -ve rhw
871        * 2) If set,
872        *      set ndc = (0,0,0,0)
873        *      set ucp[6] = 1
874        *
875        * Later, clipping will detect ucp[6] and ensure the primitive is
876        * clipped against all fixed planes.
877        */
878       if (devinfo->has_negative_rhw_bug &&
879           output_reg[ELK_VARYING_SLOT_NDC][0].file != BAD_FILE) {
880          src_reg ndc_w = src_reg(output_reg[ELK_VARYING_SLOT_NDC][0]);
881          ndc_w.swizzle = ELK_SWIZZLE_WWWW;
882          emit(CMP(dst_null_f(), ndc_w, elk_imm_f(0.0f), ELK_CONDITIONAL_L));
883          vec4_instruction *inst;
884          inst = emit(OR(header1_w, src_reg(header1_w), elk_imm_ud(1u << 6)));
885          inst->predicate = ELK_PREDICATE_NORMAL;
886          output_reg[ELK_VARYING_SLOT_NDC][0].type = ELK_REGISTER_TYPE_F;
887          inst = emit(MOV(output_reg[ELK_VARYING_SLOT_NDC][0], elk_imm_f(0.0f)));
888          inst->predicate = ELK_PREDICATE_NORMAL;
889       }
890 
891       emit(MOV(retype(reg, ELK_REGISTER_TYPE_UD), src_reg(header1)));
892    } else if (devinfo->ver < 6) {
893       emit(MOV(retype(reg, ELK_REGISTER_TYPE_UD), elk_imm_ud(0u)));
894    } else {
895       emit(MOV(retype(reg, ELK_REGISTER_TYPE_D), elk_imm_d(0)));
896       if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
897          dst_reg reg_w = reg;
898          reg_w.writemask = WRITEMASK_W;
899          src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
900          reg_as_src.type = reg_w.type;
901          reg_as_src.swizzle = elk_swizzle_for_size(1);
902          emit(MOV(reg_w, reg_as_src));
903       }
904       if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
905          dst_reg reg_y = reg;
906          reg_y.writemask = WRITEMASK_Y;
907          reg_y.type = ELK_REGISTER_TYPE_D;
908          output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
909          emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
910       }
911       if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
912          dst_reg reg_z = reg;
913          reg_z.writemask = WRITEMASK_Z;
914          reg_z.type = ELK_REGISTER_TYPE_D;
915          output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
916          emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
917       }
918    }
919 }
920 
921 vec4_instruction *
922 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
923 {
924    assert(varying < VARYING_SLOT_MAX);
925 
926    unsigned num_comps = output_num_components[varying][component];
927    if (num_comps == 0)
928       return NULL;
929 
930    assert(output_reg[varying][component].type == reg.type);
931    current_annotation = output_reg_annotation[varying];
932    if (output_reg[varying][component].file != BAD_FILE) {
933       src_reg src = src_reg(output_reg[varying][component]);
934       src.swizzle = ELK_SWZ_COMP_OUTPUT(component);
935       reg.writemask =
936          elk_writemask_for_component_packing(num_comps, component);
937       return emit(MOV(reg, src));
938    }
939    return NULL;
940 }
941 
942 void
943 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
944 {
945    reg.type = ELK_REGISTER_TYPE_F;
946    output_reg[varying][0].type = reg.type;
947 
948    switch (varying) {
949    case VARYING_SLOT_PSIZ:
950    {
951       /* PSIZ is always in slot 0, and is coupled with other flags. */
952       current_annotation = "indices, point width, clip flags";
953       emit_psiz_and_flags(reg);
954       break;
955    }
956    case ELK_VARYING_SLOT_NDC:
957       current_annotation = "NDC";
958       if (output_reg[ELK_VARYING_SLOT_NDC][0].file != BAD_FILE)
959          emit(MOV(reg, src_reg(output_reg[ELK_VARYING_SLOT_NDC][0])));
960       break;
961    case VARYING_SLOT_POS:
962       current_annotation = "gl_Position";
963       if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
964          emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
965       break;
966    case ELK_VARYING_SLOT_PAD:
967       /* No need to write to this slot */
968       break;
969    default:
970       for (int i = 0; i < 4; i++) {
971          emit_generic_urb_slot(reg, varying, i);
972       }
973       break;
974    }
975 }
976 
977 static unsigned
978 align_interleaved_urb_mlen(const struct intel_device_info *devinfo,
979                            unsigned mlen)
980 {
981    if (devinfo->ver >= 6) {
982       /* URB data written (does not include the message header reg) must
983        * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
984        * section 5.4.3.2.2: URB_INTERLEAVED.
985        *
986        * URB entries are allocated on a multiple of 1024 bits, so an
987        * extra 128 bits written here to make the end align to 256 is
988        * no problem.
989        */
990       if ((mlen % 2) != 1)
991 	 mlen++;
992    }
993 
994    return mlen;
995 }
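/* Example: mlen here counts the message header register plus the data
 * registers, so the data portion is even exactly when mlen is odd.  A header
 * plus three data MRFs (mlen = 4) is therefore padded to mlen = 5.
 */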
996 
997 
998 /**
999  * Generates the VUE payload plus the necessary URB write instructions to
1000  * output it.
1001  *
1002  * The VUE layout is documented in Volume 2a.
1003  */
1004 void
1005 vec4_visitor::emit_vertex()
1006 {
1007    /* MRF 0 is reserved for the debugger, so start with message header
1008     * in MRF 1.
1009     */
1010    int base_mrf = 1;
1011    int mrf = base_mrf;
1012    /* In the process of generating our URB write message contents, we
1013     * may need to unspill a register or load from an array.  Those
1014     * reads would use MRFs 14-15.
1015     */
1016    int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver);
1017 
1018    /* The following assertion verifies that max_usable_mrf causes an
1019     * even-numbered amount of URB write data, which will meet gfx6's
1020     * requirements for length alignment.
1021     */
1022    assert ((max_usable_mrf - base_mrf) % 2 == 0);
1023 
1024    /* First mrf is the g0-based message header containing URB handles and
1025     * such.
1026     */
1027    emit_urb_write_header(mrf++);
1028 
1029    if (devinfo->ver < 6) {
1030       emit_ndc_computation();
1031    }
1032 
1033    /* We may need to split this up into several URB writes, so do them in a
1034     * loop.
1035     */
1036    int slot = 0;
1037    bool complete = false;
1038    do {
1039       /* URB offset is in URB row increments, and each of our MRFs is half of
1040        * one of those, since we're doing interleaved writes.
1041        */
1042       int offset = slot / 2;
1043 
1044       mrf = base_mrf + 1;
1045       for (; slot < prog_data->vue_map.num_slots; ++slot) {
1046          emit_urb_slot(dst_reg(MRF, mrf++),
1047                        prog_data->vue_map.slot_to_varying[slot]);
1048 
1049          /* If this was max_usable_mrf, we can't fit anything more into this
1050           * URB WRITE. Same thing if we reached the maximum length available.
1051           */
1052          if (mrf > max_usable_mrf ||
1053              align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > ELK_MAX_MSG_LENGTH) {
1054             slot++;
1055             break;
1056          }
1057       }
1058 
1059       complete = slot >= prog_data->vue_map.num_slots;
1060       current_annotation = "URB write";
1061       vec4_instruction *inst = emit_urb_write_opcode(complete);
1062       inst->base_mrf = base_mrf;
1063       inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1064       inst->offset += offset;
1065    } while(!complete);
1066 }
1067 
1068 
1069 src_reg
1070 vec4_visitor::get_scratch_offset(elk_bblock_t *block, vec4_instruction *inst,
1071 				 src_reg *reladdr, int reg_offset)
1072 {
1073    /* Because we store the values to scratch interleaved like our
1074     * vertex data, we need to scale the vec4 index by 2.
1075     */
1076    int message_header_scale = 2;
1077 
1078    /* Pre-gfx6, the message header uses byte offsets instead of vec4
1079     * (16-byte) offset units.
1080     */
1081    if (devinfo->ver < 6)
1082       message_header_scale *= 16;
1083 
1084    if (reladdr) {
1085       /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1086        * to multiply the reladdr by 2. Notice that the reg_offset part
1087        * is in units of 16 bytes and is used to select the low/high 16-byte
1088        * chunk of a full dvec4, so we don't want to multiply that part.
1089        */
1090       src_reg index = src_reg(this, glsl_int_type());
1091       if (type_sz(inst->dst.type) < 8) {
1092          emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1093                                       elk_imm_d(reg_offset)));
1094          emit_before(block, inst, MUL(dst_reg(index), index,
1095                                       elk_imm_d(message_header_scale)));
1096       } else {
1097          emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1098                                       elk_imm_d(message_header_scale * 2)));
1099          emit_before(block, inst, ADD(dst_reg(index), index,
1100                                       elk_imm_d(reg_offset * message_header_scale)));
1101       }
1102       return index;
1103    } else {
1104       return elk_imm_d(reg_offset * message_header_scale);
1105    }
1106 }
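/* Example: with no reladdr, a vec4 at reg_offset 3 yields an immediate offset
 * of 6 on gfx6+ (interleaved rows hold two vec4s) and 96 on gfx4/5, where the
 * message header wants byte offsets (6 * 16 bytes).
 */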
1107 
1108 /**
1109  * Emits an instruction before @inst to load the value named by @orig_src
1110  * from scratch space at @base_offset to @temp.
1111  *
1112  * @base_offset is measured in 32-byte units (the size of a register).
1113  */
1114 void
1115 vec4_visitor::emit_scratch_read(elk_bblock_t *block, vec4_instruction *inst,
1116 				dst_reg temp, src_reg orig_src,
1117 				int base_offset)
1118 {
1119    assert(orig_src.offset % REG_SIZE == 0);
1120    int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1121    src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1122                                       reg_offset);
1123 
1124    if (type_sz(orig_src.type) < 8) {
1125       emit_before(block, inst, SCRATCH_READ(temp, index));
1126    } else {
1127       dst_reg shuffled = dst_reg(this, glsl_dvec4_type());
1128       dst_reg shuffled_float = retype(shuffled, ELK_REGISTER_TYPE_F);
1129       emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1130       index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1131       vec4_instruction *last_read =
1132          SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1133       emit_before(block, inst, last_read);
1134       shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read);
1135    }
1136 }
1137 
1138 /**
1139  * Emits an instruction after @inst to store the value to be written
1140  * to @orig_dst to scratch space at @base_offset, from @temp.
1141  *
1142  * @base_offset is measured in 32-byte units (the size of a register).
1143  */
1144 void
1145 vec4_visitor::emit_scratch_write(elk_bblock_t *block, vec4_instruction *inst,
1146                                  int base_offset)
1147 {
1148    assert(inst->dst.offset % REG_SIZE == 0);
1149    int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1150    src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1151                                       reg_offset);
1152 
1153    /* Create a temporary register to store *inst's result in.
1154     *
1155     * We have to be careful in MOVing from our temporary result register in
1156     * the scratch write.  If we swizzle from channels of the temporary that
1157     * weren't initialized, it will confuse live interval analysis, which will
1158     * make spilling fail to make progress.
1159     */
1160    bool is_64bit = type_sz(inst->dst.type) == 8;
1161    const glsl_type *alloc_type =
1162       is_64bit ? glsl_dvec4_type() : glsl_vec4_type();
1163    const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1164                                        inst->dst.type),
1165                                 elk_swizzle_for_mask(inst->dst.writemask));
1166 
1167    if (!is_64bit) {
1168       dst_reg dst = dst_reg(elk_writemask(elk_vec8_grf(0, 0),
1169 				          inst->dst.writemask));
1170       vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1171       if (inst->opcode != ELK_OPCODE_SEL)
1172          write->predicate = inst->predicate;
1173       write->ir = inst->ir;
1174       write->annotation = inst->annotation;
1175       inst->insert_after(block, write);
1176    } else {
1177       dst_reg shuffled = dst_reg(this, alloc_type);
1178       vec4_instruction *last =
1179          shuffle_64bit_data(shuffled, temp, true, true, block, inst);
1180       src_reg shuffled_float = src_reg(retype(shuffled, ELK_REGISTER_TYPE_F));
1181 
1182       uint8_t mask = 0;
1183       if (inst->dst.writemask & WRITEMASK_X)
1184          mask |= WRITEMASK_XY;
1185       if (inst->dst.writemask & WRITEMASK_Y)
1186          mask |= WRITEMASK_ZW;
1187       if (mask) {
1188          dst_reg dst = dst_reg(elk_writemask(elk_vec8_grf(0, 0), mask));
1189 
1190          vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1191          if (inst->opcode != ELK_OPCODE_SEL)
1192             write->predicate = inst->predicate;
1193          write->ir = inst->ir;
1194          write->annotation = inst->annotation;
1195          last->insert_after(block, write);
1196       }
1197 
1198       mask = 0;
1199       if (inst->dst.writemask & WRITEMASK_Z)
1200          mask |= WRITEMASK_XY;
1201       if (inst->dst.writemask & WRITEMASK_W)
1202          mask |= WRITEMASK_ZW;
1203       if (mask) {
1204          dst_reg dst = dst_reg(elk_writemask(elk_vec8_grf(0, 0), mask));
1205 
1206          src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1207                                             reg_offset + 1);
1208          vec4_instruction *write =
1209             SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1210          if (inst->opcode != ELK_OPCODE_SEL)
1211             write->predicate = inst->predicate;
1212          write->ir = inst->ir;
1213          write->annotation = inst->annotation;
1214          last->insert_after(block, write);
1215       }
1216    }
1217 
1218    inst->dst.file = temp.file;
1219    inst->dst.nr = temp.nr;
1220    inst->dst.offset %= REG_SIZE;
1221    inst->dst.reladdr = NULL;
1222 }
1223 
1224 /**
1225  * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1226  * adds the scratch read(s) before \p inst. The function also checks for
1227  * recursive reladdr scratch accesses, issuing the corresponding scratch
1228  * loads and rewriting reladdr references accordingly.
1229  *
1230  * \return \p src if it did not require a scratch load, otherwise, the
1231  * register holding the result of the scratch load that the caller should
1232  * use to rewrite src.
1233  */
1234 src_reg
1235 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], elk_bblock_t *block,
1236                                    vec4_instruction *inst, src_reg src)
1237 {
1238    /* Resolve recursive reladdr scratch access by calling ourselves
1239     * with src.reladdr
1240     */
1241    if (src.reladdr)
1242       *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1243                                           *src.reladdr);
1244 
1245    /* Now handle scratch access on src */
1246    if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1247       dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1248          glsl_dvec4_type() : glsl_vec4_type());
1249       emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1250       src.nr = temp.nr;
1251       src.offset %= REG_SIZE;
1252       src.reladdr = NULL;
1253    }
1254 
1255    return src;
1256 }
1257 
1258 /**
1259  * We can't generally support array access in GRF space, because a
1260  * single instruction's destination can only span 2 contiguous
1261  * registers.  So, we send all GRF arrays that get variable index
1262  * access to scratch space.
1263  */
1264 void
1265 vec4_visitor::move_grf_array_access_to_scratch()
1266 {
1267    int scratch_loc[this->alloc.count];
1268    memset(scratch_loc, -1, sizeof(scratch_loc));
1269 
1270    /* First, calculate the set of virtual GRFs that need to be punted
1271     * to scratch due to having any array access on them, and where in
1272     * scratch.
1273     */
1274    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1275       if (inst->dst.file == VGRF && inst->dst.reladdr) {
1276          if (scratch_loc[inst->dst.nr] == -1) {
1277             scratch_loc[inst->dst.nr] = last_scratch;
1278             last_scratch += this->alloc.sizes[inst->dst.nr];
1279          }
1280 
1281          for (src_reg *iter = inst->dst.reladdr;
1282               iter->reladdr;
1283               iter = iter->reladdr) {
1284             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1285                scratch_loc[iter->nr] = last_scratch;
1286                last_scratch += this->alloc.sizes[iter->nr];
1287             }
1288          }
1289       }
1290 
1291       for (int i = 0 ; i < 3; i++) {
1292          for (src_reg *iter = &inst->src[i];
1293               iter->reladdr;
1294               iter = iter->reladdr) {
1295             if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1296                scratch_loc[iter->nr] = last_scratch;
1297                last_scratch += this->alloc.sizes[iter->nr];
1298             }
1299          }
1300       }
1301    }
1302 
1303    /* Now, for anything that will be accessed through scratch, rewrite
1304     * it to load/store.  Note that this is a _safe list walk, because
1305     * we may generate a new scratch_write instruction after the one
1306     * we're processing.
1307     */
1308    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1309       /* Set up the annotation tracking for new generated instructions. */
1310       base_ir = inst->ir;
1311       current_annotation = inst->annotation;
1312 
1313       /* First handle scratch access on the dst. Notice we have to handle
1314        * the case where the dst's reladdr also points to scratch space.
1315        */
1316       if (inst->dst.reladdr)
1317          *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1318                                                    *inst->dst.reladdr);
1319 
1320       /* Now that we have handled any (possibly recursive) reladdr scratch
1321        * accesses for dst we can safely do the scratch write for dst itself
1322        */
1323       if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1324          emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1325 
1326       /* Now handle scratch access on any src. In this case, since inst->src[i]
1327        * already is a src_reg, we can just call emit_resolve_reladdr with
1328        * inst->src[i] and it will take care of handling scratch loads for
1329        * both src and src.reladdr (recursively).
1330        */
1331       for (int i = 0 ; i < 3; i++) {
1332          inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1333                                              inst->src[i]);
1334       }
1335    }
1336 }
1337 
1338 void
1339 vec4_visitor::resolve_ud_negate(src_reg *reg)
1340 {
1341    if (reg->type != ELK_REGISTER_TYPE_UD ||
1342        !reg->negate)
1343       return;
1344 
1345    src_reg temp = src_reg(this, glsl_uvec4_type());
1346    emit(ELK_OPCODE_MOV, dst_reg(temp), *reg);
1347    *reg = temp;
1348 }
1349 
1350 static elk_rnd_mode
1351 elk_rnd_mode_from_execution_mode(unsigned execution_mode)
1352 {
1353    if (nir_has_any_rounding_mode_rtne(execution_mode))
1354       return ELK_RND_MODE_RTNE;
1355    if (nir_has_any_rounding_mode_rtz(execution_mode))
1356       return ELK_RND_MODE_RTZ;
1357    return ELK_RND_MODE_UNSPECIFIED;
1358 }
1359 
1360 void
1361 vec4_visitor::emit_shader_float_controls_execution_mode()
1362 {
1363    unsigned execution_mode = this->nir->info.float_controls_execution_mode;
1364    if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1365       elk_rnd_mode rnd = elk_rnd_mode_from_execution_mode(execution_mode);
1366       const vec4_builder bld = vec4_builder(this).at_end();
1367       bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, dst_null_ud(), elk_imm_d(rnd));
1368    }
1369 }
1370 
1371 vec4_visitor::vec4_visitor(const struct elk_compiler *compiler,
1372                            const struct elk_compile_params *params,
1373                            const struct elk_sampler_prog_key_data *key_tex,
1374                            struct elk_vue_prog_data *prog_data,
1375                            const nir_shader *shader,
1376                            bool no_spills,
1377                            bool debug_enabled)
1378    : elk_backend_shader(compiler, params, shader, &prog_data->base, debug_enabled),
1379      key_tex(key_tex),
1380      prog_data(prog_data),
1381      fail_msg(NULL),
1382      first_non_payload_grf(0),
1383      ubo_push_start(),
1384      push_length(0),
1385      live_analysis(this), performance_analysis(this),
1386      no_spills(no_spills),
1387      last_scratch(0)
1388 {
1389    this->failed = false;
1390 
1391    this->base_ir = NULL;
1392    this->current_annotation = NULL;
1393    memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1394 
1395    memset(this->output_num_components, 0, sizeof(this->output_num_components));
1396 
1397    this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : ELK_MAX_GRF;
1398 
1399    this->uniforms = 0;
1400 
1401    this->nir_ssa_values = NULL;
1402 }
1403 
1404 
1405 void
1406 vec4_visitor::fail(const char *format, ...)
1407 {
1408    va_list va;
1409    char *msg;
1410 
1411    if (failed)
1412       return;
1413 
1414    failed = true;
1415 
1416    va_start(va, format);
1417    msg = ralloc_vasprintf(mem_ctx, format, va);
1418    va_end(va);
1419    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n",
1420                          _mesa_shader_stage_to_abbrev(stage), msg);
1421 
1422    this->fail_msg = msg;
1423 
1424    if (unlikely(debug_enabled)) {
1425       fprintf(stderr, "%s",  msg);
1426    }
1427 }
1428 
1429 } /* namespace elk */
1430