/*
 * Copyright © 2010, 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file elk_lower_logical_sends.cpp
 */
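
/* This file lowers logical SEND-like opcodes (URB reads/writes, framebuffer
 * writes, sampler messages, surface and atomic messages) into hardware SEND
 * instructions.  Each lower_*_logical_send() helper below builds the concrete
 * message payload with LOAD_PAYLOAD and rewrites the instruction in place
 * with the final SFID, descriptor, and message length.
 */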

#include "elk_eu.h"
#include "elk_fs.h"
#include "elk_fs_builder.h"

using namespace elk;

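/* Lower a logical URB read into a SEND to the URB shared function.  The
 * message payload is just the URB handle plus the optional per-slot offsets;
 * the read data comes back in the destination.
 */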
static void
lower_urb_read_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;

   assert(inst->size_written % REG_SIZE == 0);
   assert(inst->header_size == 0);

   elk_fs_reg payload_sources[2];
   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   elk_fs_reg payload = elk_fs_reg(VGRF, bld.shader->alloc.allocate(header_size),
                           ELK_REGISTER_TYPE_F);
   bld.LOAD_PAYLOAD(payload, payload_sources, header_size, header_size);

   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->header_size = header_size;

   inst->sfid = ELK_SFID_URB;
   inst->desc = elk_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_READ,
                             per_slot_present,
                             false,
                             inst->offset);

   inst->mlen = header_size;
   inst->send_is_volatile = true;

   inst->resize_sources(2);

   inst->src[0] = elk_imm_ud(0); /* desc */
   inst->src[1] = payload;
}

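/* Lower a logical URB write into a SEND.  The payload is assembled in
 * message order: URB handle, then the optional per-slot offsets and
 * channel mask, then the data components.
 */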
static void
lower_urb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
   const bool channel_mask_present =
      inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;

   assert(inst->header_size == 0);

   const unsigned length = 1 + per_slot_present + channel_mask_present +
                           inst->components_read(URB_LOGICAL_SRC_DATA);

   elk_fs_reg *payload_sources = new elk_fs_reg[length];
   elk_fs_reg payload = elk_fs_reg(VGRF, bld.shader->alloc.allocate(length),
                           ELK_REGISTER_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   if (channel_mask_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];

   for (unsigned i = header_size, j = 0; i < length; i++, j++)
      payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);

   bld.LOAD_PAYLOAD(payload, payload_sources, length, header_size);

   delete [] payload_sources;

   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->header_size = header_size;
   inst->dst = elk_null_reg();

   inst->sfid = ELK_SFID_URB;
   inst->desc = elk_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_WRITE,
                             per_slot_present,
                             channel_mask_present,
                             inst->offset);

   inst->mlen = length;
   inst->send_has_side_effects = true;

   inst->resize_sources(2);

   inst->src[0] = elk_imm_ud(0); /* desc */
   inst->src[1] = payload;
}

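/* Copy a color payload into the message sources, clamping each component to
 * [0, 1] with saturating MOVs when the key requires fragment color clamping.
 */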
static void
setup_color_payload(const fs_builder &bld, const elk_wm_prog_key *key,
                    elk_fs_reg *dst, elk_fs_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 4);
      assert(color.type == ELK_REGISTER_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}

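/* Lower a logical framebuffer write into a render target write message.
 * The payload is assembled in message order: optional two-register header,
 * then stencil/AA alpha, src0 alpha, oMask, color0, color1, source depth,
 * and destination depth, each present only when the corresponding logical
 * source is.
 */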
static void
lower_fb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst,
                            const struct elk_wm_prog_data *prog_data,
                            const elk_wm_prog_key *key,
                            const elk_fs_thread_payload &payload)
{
   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   const intel_device_info *devinfo = bld.shader->devinfo;
   const elk_fs_reg color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   const elk_fs_reg color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   const elk_fs_reg src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   const elk_fs_reg src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   const elk_fs_reg dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   elk_fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   const unsigned components =
      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;

   assert(inst->target != 0 || src0_alpha.file == BAD_FILE);

   /* We can potentially have a message length of up to 15, so we have to set
    * base_mrf to either 0 or 1 in order to fit in m0..m15.
    */
   elk_fs_reg sources[15];
   int header_size = 2, payload_header_size;
   unsigned length = 0;

   if (devinfo->ver < 6) {
      /* TODO: Support SIMD32 on gfx4-5 */
      assert(bld.group() < 16);

      /* For gfx4-5, we always have a header consisting of g0 and g1.  We have
       * an implied MOV from g0,g1 to the start of the message.  The MOV from
       * g0 is handled by the hardware and the MOV from g1 is provided by the
       * generator.  This is required because, on gfx4-5, the generator may
       * generate two write messages with different message lengths in order
       * to handle AA data properly.
       *
       * Also, since the pixel mask goes in the g0 portion of the message and
       * since render target writes are the last thing in the shader, we write
       * the pixel mask directly into g0 and it will get copied as part of the
       * implied write.
       */
      if (prog_data->uses_kill) {
         bld.exec_all().group(1, 0)
            .MOV(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UW),
                 elk_sample_mask_reg(bld));
      }

      assert(length == 0);
      length = 2;
   } else if ((devinfo->verx10 <= 70 && prog_data->uses_kill) ||
              color1.file != BAD_FILE ||
              key->nr_color_regions > 1) {
      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       *     "Dispatched Pixel Enables. One bit per pixel indicating
       *      which pixels were originally enabled when the thread was
       *      dispatched. This field is only required for the end-of-
       *      thread message and on all dual-source messages."
       */
      const fs_builder ubld = bld.exec_all().group(8, 0);

      elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD, 2);
      if (bld.group() < 16) {
         /* The header starts off as g0 and g1 for the first half */
         ubld.group(16, 0).MOV(header, retype(elk_vec8_grf(0, 0),
                                              ELK_REGISTER_TYPE_UD));
      } else {
         /* The header starts off as g0 and g2 for the second half */
         assert(bld.group() < 32);
         const elk_fs_reg header_sources[2] = {
            retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD),
            retype(elk_vec8_grf(2, 0), ELK_REGISTER_TYPE_UD),
         };
         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
      }

      uint32_t g00_bits = 0;

      /* Set "Source0 Alpha Present to RenderTarget" bit in message
       * header.
       */
      if (src0_alpha.file != BAD_FILE)
         g00_bits |= 1 << 11;

      /* Set computes stencil to render target */
      if (prog_data->computed_stencil)
         g00_bits |= 1 << 14;

      if (g00_bits) {
         /* OR extra bits into g0.0 */
         ubld.group(1, 0).OR(component(header, 0),
                             retype(elk_vec1_grf(0, 0),
                                    ELK_REGISTER_TYPE_UD),
                             elk_imm_ud(g00_bits));
      }

      /* Set the render target index for choosing BLEND_STATE. */
      if (inst->target > 0) {
         ubld.group(1, 0).MOV(component(header, 2), elk_imm_ud(inst->target));
      }

      if (prog_data->uses_kill) {
         ubld.group(1, 0).MOV(retype(component(header, 15),
                                     ELK_REGISTER_TYPE_UW),
                              elk_sample_mask_reg(bld));
      }

      assert(length == 0);
      sources[0] = header;
      sources[1] = horiz_offset(header, 8);
      length = 2;
   }
   assert(length == 0 || length == 2);
   header_size = length;

   if (payload.aa_dest_stencil_reg[0]) {
      assert(inst->group < 16);
      sources[length] = elk_fs_reg(VGRF, bld.shader->alloc.allocate(1));
      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
         .MOV(sources[length],
              elk_fs_reg(elk_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
      length++;
   }

   if (src0_alpha.file != BAD_FILE) {
      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
         const fs_builder &ubld = bld.exec_all().group(8, i)
                                    .annotate("FB write src0 alpha");
         const elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_F);
         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
         setup_color_payload(ubld, key, &sources[length], tmp, 1);
         length++;
      }
   }

   if (sample_mask.file != BAD_FILE) {
      const elk_fs_reg tmp(VGRF, bld.shader->alloc.allocate(reg_unit(devinfo)),
                       ELK_REGISTER_TYPE_UD);

      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
       * relevant.  Since it's unsigned single words one vgrf is always
       * 16-wide, but only the lower or higher 8 channels will be used by the
       * hardware when doing a SIMD8 write depending on whether we have
       * selected the subspans for the first or second half respectively.
       */
      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
      sample_mask.type = ELK_REGISTER_TYPE_UW;
      sample_mask.stride *= 2;

      bld.exec_all().annotate("FB write oMask")
         .MOV(horiz_offset(retype(tmp, ELK_REGISTER_TYPE_UW),
                           inst->group % (16 * reg_unit(devinfo))),
              sample_mask);

      for (unsigned i = 0; i < reg_unit(devinfo); i++)
         sources[length++] = byte_offset(tmp, REG_SIZE * i);
   }

   payload_header_size = length;

   setup_color_payload(bld, key, &sources[length], color0, components);
   length += 4;

   if (color1.file != BAD_FILE) {
      setup_color_payload(bld, key, &sources[length], color1, components);
      length += 4;
   }

   if (src_depth.file != BAD_FILE) {
      sources[length] = src_depth;
      length++;
   }

   if (dst_depth.file != BAD_FILE) {
      sources[length] = dst_depth;
      length++;
   }

   elk_fs_inst *load;
   if (devinfo->ver >= 7) {
      /* Send from the GRF */
      elk_fs_reg payload = elk_fs_reg(VGRF, -1, ELK_REGISTER_TYPE_F);
      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
      payload.nr = bld.shader->alloc.allocate(regs_written(load));
      load->dst = payload;

      uint32_t msg_ctl = elk_fb_write_msg_control(inst, prog_data);

      inst->desc =
         (inst->group / 16) << 11 | /* rt slot group */
         elk_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
                           0 /* coarse_rt_write */);

      inst->opcode = ELK_SHADER_OPCODE_SEND;
      inst->resize_sources(2);
      inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
      inst->src[0] = elk_imm_ud(0);
      inst->src[1] = payload;
      inst->mlen = regs_written(load);
      inst->header_size = header_size;
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   } else {
      /* Send from the MRF */
      load = bld.LOAD_PAYLOAD(elk_fs_reg(MRF, 1, ELK_REGISTER_TYPE_F),
                              sources, length, payload_header_size);

      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
       * will do this for us if we just give it a COMPR4 destination.
       */
      if (devinfo->ver < 6 && bld.dispatch_width() == 16)
         load->dst.nr |= ELK_MRF_COMPR4;

      if (devinfo->ver < 6) {
         /* Set up src[0] for the implied MOV from grf0-1 */
         inst->resize_sources(1);
         inst->src[0] = elk_vec8_grf(0, 0);
      } else {
         inst->resize_sources(0);
      }
      inst->base_mrf = 1;
      inst->opcode = ELK_FS_OPCODE_FB_WRITE;
      inst->mlen = regs_written(load);
      inst->header_size = header_size;
   }
}

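/* Lower a logical sampler message for gfx4.  The payload always lives in
 * MRFs and starts with a g0 header; operands are appended in the fixed
 * order the legacy sampler messages expect.
 */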
static void
lower_sampler_logical_send_gfx4(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
                                const elk_fs_reg &coordinate,
                                const elk_fs_reg &shadow_c,
                                const elk_fs_reg &lod, const elk_fs_reg &lod2,
                                const elk_fs_reg &surface,
                                const elk_fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const bool has_lod = (op == ELK_SHADER_OPCODE_TXL || op == ELK_FS_OPCODE_TXB ||
                         op == ELK_SHADER_OPCODE_TXF || op == ELK_SHADER_OPCODE_TXS);
   elk_fs_reg msg_begin(MRF, 1, ELK_REGISTER_TYPE_F);
   elk_fs_reg msg_end = msg_begin;

   /* g0 header. */
   msg_end = offset(msg_end, bld.group(8, 0), 1);

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   msg_end = offset(msg_end, bld, coord_components);

   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
    * require all three components to be present and zero if they are unused.
    */
   if (coord_components > 0 &&
       (has_lod || shadow_c.file != BAD_FILE ||
        (op == ELK_SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
      assert(coord_components <= 3);
      for (unsigned i = 0; i < 3 - coord_components; i++)
         bld.MOV(offset(msg_end, bld, i), elk_imm_f(0.0f));

      msg_end = offset(msg_end, bld, 3 - coord_components);
   }

   if (op == ELK_SHADER_OPCODE_TXD) {
      /* TXD unsupported in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* the slots for u and v are always present, but r is optional */
      if (coord_components < 2)
         msg_end = offset(msg_end, bld, 2 - coord_components);

      /*  P   = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * 1-arg: Does not exist.
       *
       * 2-arg: dudx   dvdx   dudy   dvdy
       *        dPdx.x dPdx.y dPdy.x dPdy.y
       *        m4     m5     m6     m7
       *
       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
       *        m5     m6     m7     m8     m9     m10
       */
      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));

      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
   }

   if (has_lod) {
      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
       */
      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
             bld.dispatch_width() == 16);

      const elk_reg_type type =
         (op == ELK_SHADER_OPCODE_TXF || op == ELK_SHADER_OPCODE_TXS ?
          ELK_REGISTER_TYPE_UD : ELK_REGISTER_TYPE_F);
      bld.MOV(retype(msg_end, type), lod);
      msg_end = offset(msg_end, bld, 1);
   }

   if (shadow_c.file != BAD_FILE) {
      if (op == ELK_SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         bld.MOV(msg_end, elk_imm_f(0.0f));
         msg_end = offset(msg_end, bld, 1);
      }

      bld.MOV(msg_end, shadow_c);
      msg_end = offset(msg_end, bld, 1);
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = msg_begin.nr;
   inst->mlen = msg_end.nr - msg_begin.nr;
   inst->header_size = 1;
}

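/* Lower a logical sampler message for gfx5-6.  These messages are headerless
 * unless a texel offset is present, and the LOD/derivative parameters follow
 * the gfx5 parameter ordering handled in the switch below.
 */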
static void
lower_sampler_logical_send_gfx5(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
                                const elk_fs_reg &coordinate,
                                const elk_fs_reg &shadow_c,
                                const elk_fs_reg &lod, const elk_fs_reg &lod2,
                                const elk_fs_reg &sample_index,
                                const elk_fs_reg &surface,
                                const elk_fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   elk_fs_reg message(MRF, 2, ELK_REGISTER_TYPE_F);
   elk_fs_reg msg_coords = message;
   unsigned header_size = 0;

   if (inst->offset != 0) {
      /* The offsets set up by the visitor are in the m1 header, so we can't
       * go headerless.
       */
      header_size = 1;
      message.nr--;
   }

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   elk_fs_reg msg_end = offset(msg_coords, bld, coord_components);
   elk_fs_reg msg_lod = offset(msg_coords, bld, 4);

   if (shadow_c.file != BAD_FILE) {
      elk_fs_reg msg_shadow = msg_lod;
      bld.MOV(msg_shadow, shadow_c);
      msg_lod = offset(msg_shadow, bld, 1);
      msg_end = msg_lod;
   }

   switch (op) {
   case ELK_SHADER_OPCODE_TXL:
   case ELK_FS_OPCODE_TXB:
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case ELK_SHADER_OPCODE_TXD:
      /**
       *  P   =  u,    v,    r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx   dudy   dvdx   dvdy   drdx   drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      msg_end = msg_lod;
      for (unsigned i = 0; i < grad_components; i++) {
         bld.MOV(msg_end, offset(lod, bld, i));
         msg_end = offset(msg_end, bld, 1);

         bld.MOV(msg_end, offset(lod2, bld, i));
         msg_end = offset(msg_end, bld, 1);
      }
      break;
   case ELK_SHADER_OPCODE_TXS:
      msg_lod = retype(msg_end, ELK_REGISTER_TYPE_UD);
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case ELK_SHADER_OPCODE_TXF:
      msg_lod = offset(msg_coords, bld, 3);
      bld.MOV(retype(msg_lod, ELK_REGISTER_TYPE_UD), lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case ELK_SHADER_OPCODE_TXF_CMS:
      msg_lod = offset(msg_coords, bld, 3);
      /* lod */
      bld.MOV(retype(msg_lod, ELK_REGISTER_TYPE_UD), elk_imm_ud(0u));
      /* sample index */
      bld.MOV(retype(offset(msg_lod, bld, 1), ELK_REGISTER_TYPE_UD), sample_index);
      msg_end = offset(msg_lod, bld, 2);
      break;
   default:
      break;
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = message.nr;
   inst->mlen = msg_end.nr - message.nr;
   inst->header_size = header_size;

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

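/* Sampler indices of 16 and up cannot be encoded in the 4-bit sampler field
 * of the descriptor; they require the Sampler State Pointer in the message
 * header to be offset instead, which the gfx7 lowering below takes care of.
 */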
static bool
is_high_sampler(const struct intel_device_info *devinfo, const elk_fs_reg &sampler)
{
   if (devinfo->verx10 <= 70)
      return false;

   return sampler.file != IMM || sampler.ud >= 16;
}

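/* Map a logical sampler opcode onto the message type field of the sampler
 * descriptor for gfx5+.
 */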
static unsigned
sampler_msg_type(const intel_device_info *devinfo,
                 elk_opcode opcode, bool shadow_compare, bool has_min_lod)
{
   assert(devinfo->ver >= 5);
   switch (opcode) {
   case ELK_SHADER_OPCODE_TEX:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE;
   case ELK_FS_OPCODE_TXB:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   case ELK_SHADER_OPCODE_TXL:
      assert(!has_min_lod);
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
   case ELK_SHADER_OPCODE_TXS:
   case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
   case ELK_SHADER_OPCODE_TXD:
      assert(!shadow_compare || devinfo->verx10 >= 75);
      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
   case ELK_SHADER_OPCODE_TXF:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case ELK_SHADER_OPCODE_TXF_CMS:
      assert(!has_min_lod);
      return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
                                 GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case ELK_SHADER_OPCODE_TXF_UMS:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
   case ELK_SHADER_OPCODE_TXF_MCS:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
   case ELK_SHADER_OPCODE_LOD:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_LOD;
   case ELK_SHADER_OPCODE_TG4:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
   case ELK_SHADER_OPCODE_TG4_OFFSET:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
   case ELK_SHADER_OPCODE_SAMPLEINFO:
      assert(!has_min_lod);
      return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
   default:
      unreachable("not reached");
   }
}

/**
 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
 * the given requested_alignment_sz.
 */
static elk_fs_inst *
emit_load_payload_with_padding(const fs_builder &bld, const elk_fs_reg &dst,
                               const elk_fs_reg *src, unsigned sources,
                               unsigned header_size,
                               unsigned requested_alignment_sz)
{
   unsigned length = 0;
   unsigned num_srcs =
      sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
   elk_fs_reg *src_comps = new elk_fs_reg[num_srcs];

   for (unsigned i = 0; i < header_size; i++)
      src_comps[length++] = src[i];

   for (unsigned i = header_size; i < sources; i++) {
      unsigned src_sz =
         retype(dst, src[i].type).component_size(bld.dispatch_width());
      const enum elk_reg_type padding_payload_type =
         elk_reg_type_from_bit_size(type_sz(src[i].type) * 8,
                                    ELK_REGISTER_TYPE_UD);

      src_comps[length++] = src[i];

      /* Pad out the real sources if a component of the requested payload
       * type is larger than the real source component.
       */
      if (src_sz < requested_alignment_sz) {
         for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
            src_comps[length++] = retype(elk_fs_reg(), padding_payload_type);
         }
      }
   }

   elk_fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
   delete[] src_comps;

   return inst;
}

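/* Lower a logical sampler message for gfx7+.  The payload is built in GRFs
 * and emitted as a SEND with an explicit descriptor; a message header is
 * added only when actually needed (texel offsets, TG4, SAMPLEINFO, EOT,
 * high sampler indices, bindless sampler handles, or residency queries).
 */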
static void
lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
                                const elk_fs_reg &coordinate,
                                const elk_fs_reg &shadow_c,
                                elk_fs_reg lod, const elk_fs_reg &lod2,
                                const elk_fs_reg &min_lod,
                                const elk_fs_reg &sample_index,
                                const elk_fs_reg &mcs,
                                const elk_fs_reg &surface,
                                const elk_fs_reg &sampler,
                                const elk_fs_reg &surface_handle,
                                const elk_fs_reg &sampler_handle,
                                const elk_fs_reg &tg4_offset,
                                unsigned payload_type_bit_size,
                                unsigned coord_components,
                                unsigned grad_components,
                                bool residency)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum elk_reg_type payload_type =
      elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_F);
   const enum elk_reg_type payload_unsigned_type =
      elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_UD);
   const enum elk_reg_type payload_signed_type =
      elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_D);
   unsigned reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   elk_fs_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(payload_type);

   /* We must have exactly one of surface/sampler and surface/sampler_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (op == ELK_SHADER_OPCODE_TG4 || op == ELK_SHADER_OPCODE_TG4_OFFSET ||
       inst->offset != 0 || inst->eot ||
       op == ELK_SHADER_OPCODE_SAMPLEINFO ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler) ||
       residency) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      elk_fs_reg header = retype(sources[0], ELK_REGISTER_TYPE_UD);
      for (header_size = 0; header_size < reg_unit(devinfo); header_size++)
         sources[length++] = byte_offset(header, REG_SIZE * header_size);

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      unsigned reg_count = regs_written(inst) - reg_unit(devinfo) * residency;
      if (!inst->eot && reg_count < 4 * reg_width) {
         assert(reg_count % reg_width == 0);
         unsigned mask = ~((1 << (reg_count / reg_width)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }

      if (residency)
         inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */

      /* Build the actual header */
      const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
      const fs_builder ubld1 = ubld.group(1, 0);
      ubld.MOV(header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
      if (inst->offset) {
         ubld1.MOV(component(header, 2), elk_imm_ud(inst->offset));
      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied. Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), elk_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         ubld1.MOV(component(header, 3), sampler_handle);
      } else if (is_high_sampler(devinfo, sampler)) {
         elk_fs_reg sampler_state_ptr =
            retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD);

         if (sampler.file == ELK_IMMEDIATE_VALUE) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

            ubld1.ADD(component(header, 3), sampler_state_ptr,
                      elk_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            elk_fs_reg tmp = ubld1.vgrf(ELK_REGISTER_TYPE_UD);
            ubld1.AND(tmp, sampler, elk_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, elk_imm_ud(4));
            ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
         }
      }
   }

   /* On Xe2 and newer platforms, min_lod is the first parameter specifically
    * so that a bunch of other, possibly unused, parameters don't need to also
    * be included.
    */
   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare,
                       min_lod.file != BAD_FILE);

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case ELK_FS_OPCODE_TXB:
   case ELK_SHADER_OPCODE_TXL:
      bld.MOV(sources[length], lod);
      length++;
      break;
   case ELK_SHADER_OPCODE_TXD:
      /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in
       * Xe2+).
       */
      assert(bld.dispatch_width() == (8 * reg_unit(devinfo)));

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case ELK_SHADER_OPCODE_TXS:
      sources[length] = retype(sources[length], payload_unsigned_type);
      bld.MOV(sources[length++], lod);
      break;
   case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      sources[length] = retype(sources[length], payload_unsigned_type);
      bld.MOV(sources[length++], elk_imm_ud(0));
      break;
   case ELK_SHADER_OPCODE_TXF:
   case ELK_SHADER_OPCODE_TXF_LZ:
      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
      sources[length] = retype(sources[length], payload_signed_type);
      bld.MOV(sources[length++], coordinate);

      if (op != ELK_SHADER_OPCODE_TXF_LZ) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], lod);
      }

      for (unsigned i = 1; i < coord_components; i++) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(coordinate, bld, i));
      }

      coordinate_done = true;
      break;

   case ELK_SHADER_OPCODE_TXF_CMS:
   case ELK_SHADER_OPCODE_TXF_CMS_W:
   case ELK_SHADER_OPCODE_TXF_UMS:
   case ELK_SHADER_OPCODE_TXF_MCS:
      if (op == ELK_SHADER_OPCODE_TXF_UMS ||
          op == ELK_SHADER_OPCODE_TXF_CMS ||
          op == ELK_SHADER_OPCODE_TXF_CMS_W) {
         sources[length] = retype(sources[length], payload_unsigned_type);
         bld.MOV(sources[length++], sample_index);
      }

      /* Data from the multisample control surface. */
      if (op == ELK_SHADER_OPCODE_TXF_CMS || op == ELK_SHADER_OPCODE_TXF_CMS_W) {
         unsigned num_mcs_components = 1;

         /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
          * Shared Functions - 3D Sampler - Messages - Message Format:
          *
          *    ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
          */
         if (op == ELK_SHADER_OPCODE_TXF_CMS_W)
            num_mcs_components = 2;

         for (unsigned i = 0; i < num_mcs_components; ++i) {
            /* The sampler always writes 4/8 registers' worth of data but for
             * ld_mcs only the first two registers hold valid data.  So with a
             * 16-bit payload we need to split the two 32-bit registers into
             * four 16-bit payload registers.
             */
            sources[length] = retype(sources[length], payload_unsigned_type);
            bld.MOV(sources[length++],
                    mcs.file == IMM ? mcs : offset(mcs, bld, i));
         }
      }

      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(coordinate, bld, i));
      }

      coordinate_done = true;
      break;
   case ELK_SHADER_OPCODE_TG4_OFFSET:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) { /* offu, offv */
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(tg4_offset, bld, i));
      }

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_type),
                 offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE) {
      /* Account for all of the missing coordinate sources */
      length += 4 - coord_components;
      if (op == ELK_SHADER_OPCODE_TXD)
         length += (3 - grad_components) * 2;

      bld.MOV(sources[length++], min_lod);
   }

   const elk_fs_reg src_payload =
      elk_fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
                 ELK_REGISTER_TYPE_F);
   /* In case of 16-bit payload each component takes one full register in
    * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
    * elements. In SIMD8H case hardware simply expects the components to be
    * padded (i.e., aligned on reg boundary).
    */
   elk_fs_inst *load_payload_inst =
      emit_load_payload_with_padding(bld, src_payload, sources, length,
                                     header_size, REG_SIZE * reg_unit(devinfo));
   unsigned mlen = load_payload_inst->size_written / REG_SIZE;
   assert(payload_type_bit_size != 16);
   unsigned simd_mode = inst->exec_size <= 8 ? ELK_SAMPLER_SIMD_MODE_SIMD8 :
                                               ELK_SAMPLER_SIMD_MODE_SIMD16;

   /* Generate the SEND. */
   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;

   assert(msg_type == sampler_msg_type(devinfo, op, inst->shadow_compare,
                                       min_lod.file != BAD_FILE));

   inst->sfid = ELK_SFID_SAMPLER;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      inst->desc = elk_sampler_desc(devinfo, surface.ud,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      inst->src[0] = elk_imm_ud(0);
   } else {
      assert(surface_handle.file == BAD_FILE);

      /* Immediate portion of the descriptor */
      inst->desc = elk_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      const fs_builder ubld = bld.group(1, 0).exec_all();
      elk_fs_reg desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
         ubld.MUL(desc, surface, elk_imm_ud(0x101));
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, elk_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, elk_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      ubld.AND(desc, desc, elk_imm_ud(0xfff));

      inst->src[0] = component(desc, 0);
   }

   inst->src[1] = src_payload;
   inst->resize_sources(2);

   if (inst->eot) {
      /* EOT sampler messages don't make sense to split because it would
       * involve ending half of the thread early.
       */
      assert(inst->group == 0);
      /* We need to use SENDC for EOT sampler messages */
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   }

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE * reg_unit(devinfo));
}

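/* Sampler message payloads must use a single source bit size (16 or 32);
 * derive it from the first present source and assert that the rest agree.
 */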
static unsigned
get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
                                      elk_opcode op, const elk_fs_reg *src)
{
   unsigned src_type_size = 0;

   /* All sources need to have the same size, therefore seek the first valid
    * and take the size from there.
    */
   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
      if (src[i].file != BAD_FILE) {
         src_type_size = elk_reg_type_to_size(src[i].type);
         break;
      }
   }

   assert(src_type_size == 2 || src_type_size == 4);

#ifndef NDEBUG
   /* Make sure all sources agree. */
   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
      assert(src[i].file == BAD_FILE ||
             elk_reg_type_to_size(src[i].type) == src_type_size);
   }
#endif

   return src_type_size * 8;
}

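/* Entry point for sampler message lowering: unpack the logical sources and
 * dispatch to the generation-specific helper above.
 */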
static void
lower_sampler_logical_send(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const elk_fs_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
   const elk_fs_reg shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
   const elk_fs_reg lod = inst->src[TEX_LOGICAL_SRC_LOD];
   const elk_fs_reg lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
   const elk_fs_reg min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
   const elk_fs_reg sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
   const elk_fs_reg mcs = inst->src[TEX_LOGICAL_SRC_MCS];
   const elk_fs_reg surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
   const elk_fs_reg sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
   const elk_fs_reg surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
   const elk_fs_reg sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
   const elk_fs_reg tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
   const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
   /* residency is only supported on Gfx8+ */
   assert(!residency || devinfo->ver >= 8);

   if (devinfo->ver >= 7) {
      const unsigned msg_payload_type_bit_size =
         get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);

      /* 16-bit payloads are available only on gfx11+ */
      assert(msg_payload_type_bit_size != 16);

      lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, min_lod,
                                      sample_index,
                                      mcs, surface, sampler,
                                      surface_handle, sampler_handle,
                                      tg4_offset,
                                      msg_payload_type_bit_size,
                                      coord_components, grad_components,
                                      residency);
   } else if (devinfo->ver >= 5) {
      lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, sample_index,
                                      surface, sampler,
                                      coord_components, grad_components);
   } else {
      lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2,
                                      surface, sampler,
                                      coord_components, grad_components);
   }
}

/**
 * Predicate the specified instruction on the vector mask.
 */
static void
emit_predicate_on_vector_mask(const fs_builder &bld, elk_fs_inst *inst)
{
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
          bld.group() == inst->group &&
          bld.dispatch_width() == inst->exec_size);

   const fs_builder ubld = bld.exec_all().group(1, 0);

   const elk_fs_visitor &s = *bld.shader;
   const elk_fs_reg vector_mask = ubld.vgrf(ELK_REGISTER_TYPE_UW);
   ubld.UNDEF(vector_mask);
   ubld.emit(ELK_SHADER_OPCODE_READ_SR_REG, vector_mask, elk_imm_ud(3));
   const unsigned subreg = sample_mask_flag_subreg(s);

   ubld.MOV(elk_flag_subreg(subreg + inst->group / 16), vector_mask);

   if (inst->predicate) {
      assert(inst->predicate == ELK_PREDICATE_NORMAL);
      assert(!inst->predicate_inverse);
      assert(inst->flag_subreg == 0);
      /* Combine the vector mask with the existing predicate by using a
       * vertical predication mode.
       */
      inst->predicate = ELK_PREDICATE_ALIGN1_ALLV;
   } else {
      inst->flag_subreg = subreg;
      inst->predicate = ELK_PREDICATE_NORMAL;
      inst->predicate_inverse = false;
   }
}

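/* Fill out the descriptor and src[0] of a surface SEND: an immediate surface
 * source folds its binding table index directly into the descriptor, while a
 * dynamic one is masked and supplied through the indirect descriptor source.
 */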
static void
setup_surface_descriptors(const fs_builder &bld, elk_fs_inst *inst, uint32_t desc,
                          const elk_fs_reg &surface, const elk_fs_reg &surface_handle)
{
   /* We must have exactly one of surface and surface_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));

   if (surface.file == IMM) {
      inst->desc = desc | (surface.ud & 0xff);
      inst->src[0] = elk_imm_ud(0);
   } else {
      assert(surface_handle.file == BAD_FILE);

      inst->desc = desc;
      const fs_builder ubld = bld.exec_all().group(1, 0);
      elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
      ubld.AND(tmp, surface, elk_imm_ud(0xff));
      inst->src[0] = component(tmp, 0);
   }
}

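/* Lower logical untyped/typed surface, scattered, and atomic messages into
 * data-port SENDs: build the optional header plus address/data payload,
 * then pick the SFID and descriptor for the specific message type.
 */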
static void
lower_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED elk_fs_reg dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   const elk_fs_reg allow_sample_mask =
      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
   assert(arg.file == IMM);
   assert(allow_sample_mask.file == IMM);

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);

   const bool is_typed_access =
      inst->opcode == ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;

   const bool is_surface_access = is_typed_access ||
      inst->opcode == ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;

   const bool is_stateless =
      surface.file == IMM && (surface.ud == ELK_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   elk_fs_reg sample_mask = allow_sample_mask.ud ? elk_sample_mask_reg(bld) :
                                                elk_fs_reg(elk_imm_ud(0xffffffff));

   /* From the BDW PRM Volume 7, page 147:
    *
    *  "For the Data Cache Data Port*, the header must be present for the
    *   following message types: [...] Typed read/write/atomics"
    *
    * Earlier generations have a similar wording.  Because of this restriction
    * we don't attempt to implement sample masks via predication for such
    * messages prior to Gfx9, since we have to provide a header anyway.  On
    * Gfx11+ the header has been removed so we can only use predication.
    *
    * For all stateless A32 messages, we also need a header
    */
   elk_fs_reg header;
   if (is_typed_access || is_stateless) {
      fs_builder ubld = bld.exec_all().group(8, 0);
      header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
      if (is_stateless) {
         assert(!is_surface_access);
         ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER, header);
      } else {
         ubld.MOV(header, elk_imm_d(0));
         if (is_surface_access)
            ubld.group(1, 0).MOV(component(header, 7), sample_mask);
      }
   }
   const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;

   elk_fs_reg payload, payload2;
   unsigned mlen;

   /* Allocate space for the payload. */
   const unsigned sz = header_sz + addr_sz + src_sz;
   payload = bld.vgrf(ELK_REGISTER_TYPE_UD, sz);
   elk_fs_reg *const components = new elk_fs_reg[sz];
   unsigned n = 0;

   /* Construct the payload. */
   if (header.file != BAD_FILE)
      components[n++] = header;

   for (unsigned i = 0; i < addr_sz; i++)
      components[n++] = offset(addr, bld, i);

   for (unsigned i = 0; i < src_sz; i++)
      components[n++] = offset(src, bld, i);

   bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
   mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;

   delete[] components;

   /* Predicate the instruction on the sample mask if no header is
    * provided.
    */
   if ((header.file == BAD_FILE || !is_surface_access) &&
       sample_mask.file != BAD_FILE && sample_mask.file != IMM)
      elk_emit_predicate_on_sample_mask(bld, inst);

   uint32_t sfid;
   switch (inst->opcode) {
   case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      /* Byte scattered opcodes go through the normal data cache */
      sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
      break;

   case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
   case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      sfid = devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
             devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
                                 ELK_DATAPORT_READ_TARGET_RENDER_CACHE;
      break;

   case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      /* Untyped Surface messages go through the data cache but the SFID value
       * changed on Haswell.
       */
      sfid = (devinfo->verx10 >= 75 ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GFX7_SFID_DATAPORT_DATA_CACHE);
      break;

   case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      /* Typed surface messages go through the render cache on IVB and the
       * data cache on HSW+.
       */
      sfid = (devinfo->verx10 >= 75 ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GFX6_SFID_DATAPORT_RENDER_CACHE);
      break;

   default:
      unreachable("Unsupported surface opcode");
   }

   uint32_t desc;
   switch (inst->opcode) {
   case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      desc = elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            false   /* write */);
      break;

   case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      desc = elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            true    /* write */);
      break;

   case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      desc = elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           false   /* write */);
      break;

   case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           true    /* write */);
      break;

   case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = elk_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            false  /* write */);
      break;

   case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = elk_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            true   /* write */);
      break;

   case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      assert(!elk_lsc_opcode_is_atomic_float((enum elk_lsc_opcode) arg.ud));
      desc = elk_dp_untyped_atomic_desc(devinfo, inst->exec_size,
                                        lsc_op_to_legacy_atomic(arg.ud),
1359                                         !inst->dst.is_null());
1360       break;
1361 
1362    case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
1363       desc = elk_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
1364                                           arg.ud, /* num_channels */
1365                                           false   /* write */);
1366       break;
1367 
1368    case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
1369       desc = elk_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
1370                                           arg.ud, /* num_channels */
1371                                           true    /* write */);
1372       break;
1373 
1374    case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
1375       desc = elk_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
1376                                       lsc_op_to_legacy_atomic(arg.ud),
1377                                       !inst->dst.is_null());
1378       break;
1379 
1380    default:
1381       unreachable("Unknown surface logical instruction");
1382    }
1383 
1384    /* Update the original instruction. */
1385    inst->opcode = ELK_SHADER_OPCODE_SEND;
1386    inst->mlen = mlen;
1387    inst->header_size = header_sz;
1388    inst->send_has_side_effects = has_side_effects;
1389    inst->send_is_volatile = !has_side_effects;
1390 
1391    /* Set up SFID and descriptors */
1392    inst->sfid = sfid;
1393    setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1394 
1395    inst->resize_sources(2);
1396 
1397    /* Finally, the payload */
1398    inst->src[1] = payload;
1399 }
1400 
1401 static void
1402 emit_fragment_mask(const fs_builder &bld, elk_fs_inst *inst)
1403 {
1404    assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
1405    const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;
1406 
1407    /* If we're a fragment shader, we have to predicate with the sample mask
1408     * to avoid helper invocations in instructions with side effects, unless
1409     * they are explicitly required.
1410     *
1411     * There are also special cases when we actually want to run on helpers
1412     * (ray queries).
1413     */
1414    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
1415    if (enable_helpers)
1416       emit_predicate_on_vector_mask(bld, inst);
1417    else if (inst->has_side_effects())
1418       elk_emit_predicate_on_sample_mask(bld, inst);
1419 }
1420 
1421 static void
1422 lower_a64_logical_send(const fs_builder &bld, elk_fs_inst *inst)
1423 {
1424    const intel_device_info *devinfo = bld.shader->devinfo;
1425 
1426    const elk_fs_reg addr = inst->src[A64_LOGICAL_ADDRESS];
1427    const elk_fs_reg src = inst->src[A64_LOGICAL_SRC];
1428    const unsigned src_comps = inst->components_read(1);
1429    assert(inst->src[A64_LOGICAL_ARG].file == IMM);
1430    const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
1431    const bool has_side_effects = inst->has_side_effects();
1432 
1433    elk_fs_reg payload, payload2;
1434    unsigned mlen, header_size = 0;
1435 
1436    /* Add two because the address is 64-bit */
1437    const unsigned dwords = 2 + src_comps;
1438    mlen = dwords * (inst->exec_size / 8);
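   /* e.g. a SIMD8 store of a vec4 through a 64-bit address occupies
    * (2 + 4) * (8 / 8) = 6 GRFs of payload.
    */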
1439 
1440    elk_fs_reg sources[5];
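   /* One source for the address plus at most four data components. */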
1441 
1442    sources[0] = addr;
1443 
1444    for (unsigned i = 0; i < src_comps; i++)
1445       sources[1 + i] = offset(src, bld, i);
1446 
1447    payload = bld.vgrf(ELK_REGISTER_TYPE_UD, dwords);
1448    bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
1449 
1450    uint32_t desc;
1451    switch (inst->opcode) {
1452    case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
1453       desc = elk_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
1454                                                 arg,   /* num_channels */
1455                                                 false  /* write */);
1456       break;
1457 
1458    case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
1459       desc = elk_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
1460                                                 arg,   /* num_channels */
1461                                                 true   /* write */);
1462       break;
1463 
1464    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
1465       desc = elk_dp_a64_oword_block_rw_desc(devinfo,
1466                                             true,    /* align_16B */
1467                                             arg,     /* num_dwords */
1468                                             false    /* write */);
1469       break;
1470 
1471    case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
1472       desc = elk_dp_a64_oword_block_rw_desc(devinfo,
1473                                             false,   /* align_16B */
1474                                             arg,     /* num_dwords */
1475                                             false    /* write */);
1476       break;
1477 
1478    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
1479       desc = elk_dp_a64_oword_block_rw_desc(devinfo,
1480                                             true,    /* align_16B */
1481                                             arg,     /* num_dwords */
1482                                             true     /* write */);
1483       break;
1484 
1485    case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
1486       desc = elk_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
1487                                                arg,   /* bit_size */
1488                                                false  /* write */);
1489       break;
1490 
1491    case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
1492       desc = elk_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
1493                                                arg,   /* bit_size */
1494                                                true   /* write */);
1495       break;
1496 
1497    case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
1498       assert(!elk_lsc_opcode_is_atomic_float((enum elk_lsc_opcode) arg));
1499       desc = elk_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size,
1500                                             type_sz(inst->dst.type) * 8,
1501                                             lsc_op_to_legacy_atomic(arg),
1502                                             !inst->dst.is_null());
1503       break;
1504 
1505    default:
1506       unreachable("Unknown A64 logical instruction");
1507    }
1508 
1509    if (bld.shader->stage == MESA_SHADER_FRAGMENT)
1510       emit_fragment_mask(bld, inst);
1511 
1512    /* Update the original instruction. */
1513    inst->opcode = ELK_SHADER_OPCODE_SEND;
1514    inst->mlen = mlen;
1515    inst->header_size = header_size;
1516    inst->send_has_side_effects = has_side_effects;
1517    inst->send_is_volatile = !has_side_effects;
1518 
1519    /* Set up SFID and descriptors */
1520    inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
1521    inst->desc = desc;
1522    inst->resize_sources(2);
1523    inst->src[0] = elk_imm_ud(0); /* desc */
1524    inst->src[1] = payload;
1525 }
1526 
1527 static void
1528 lower_varying_pull_constant_logical_send(const fs_builder &bld, elk_fs_inst *inst)
1529 {
1530    const intel_device_info *devinfo = bld.shader->devinfo;
1531    const elk_compiler *compiler = bld.shader->compiler;
1532 
1533    if (devinfo->ver >= 7) {
1534       elk_fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
1535       elk_fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
1536       elk_fs_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
1537 
1538       /* We are switching the instruction from an ALU-like instruction to a
1539        * send-from-grf instruction.  Since sends can't handle strides or
1540        * source modifiers, we have to make a copy of the offset source.
1541        */
1542       elk_fs_reg ubo_offset = bld.vgrf(ELK_REGISTER_TYPE_UD);
1543       bld.MOV(ubo_offset, offset_B);
1544 
1545       assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == ELK_IMMEDIATE_VALUE);
1546       unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud;
1547 
1548       inst->opcode = ELK_SHADER_OPCODE_SEND;
1549       inst->mlen = inst->exec_size / 8;
1550       inst->resize_sources(3);
1551 
1552       /* src[0] is filled by setup_surface_descriptors() */
1553       inst->src[1] = ubo_offset; /* payload */
1554 
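      /* Three possible paths: the sampler's LD message, an untyped surface
       * read when the offset is known to be dword-aligned, or four
       * byte-scattered dword reads as the unaligned fallback.
       */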
1555       if (compiler->indirect_ubos_use_sampler) {
1556          const unsigned simd_mode =
1557             inst->exec_size <= 8 ? ELK_SAMPLER_SIMD_MODE_SIMD8 :
1558                                    ELK_SAMPLER_SIMD_MODE_SIMD16;
1559          const uint32_t desc = elk_sampler_desc(devinfo, 0, 0,
1560                                                 GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
1561                                                 simd_mode, 0);
1562 
1563          inst->sfid = ELK_SFID_SAMPLER;
1564          setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1565       } else if (alignment >= 4) {
1566          const uint32_t desc =
1567             elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1568                                            4, /* num_channels */
1569                                            false   /* write */);
1570 
1571          inst->sfid = (devinfo->verx10 >= 75 ?
1572                        HSW_SFID_DATAPORT_DATA_CACHE_1 :
1573                        GFX7_SFID_DATAPORT_DATA_CACHE);
1574          setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1575       } else {
1576          const uint32_t desc =
1577             elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1578                                           32,     /* bit_size */
1579                                           false   /* write */);
1580 
1581          inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
1582          setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1583 
1584          /* The byte scattered messages can only read one dword at a time so
1585           * we have to duplicate the message 4 times to read the full vec4.
1586           * Hopefully, dead code will clean up the mess if some of them aren't
1587           * needed.
1588           */
1589          assert(inst->size_written == 16 * inst->exec_size);
1590          inst->size_written /= 4;
1591          for (unsigned c = 1; c < 4; c++) {
1592             /* Emit a copy of the instruction because we're about to modify
1593              * it.  Because this loop starts at 1, we will emit copies for the
1594              * first 3 and the final one will be the modified instruction.
1595              */
1596             bld.emit(*inst);
1597 
1598             /* Offset the source */
1599             inst->src[1] = bld.vgrf(ELK_REGISTER_TYPE_UD);
1600             bld.ADD(inst->src[1], ubo_offset, elk_imm_ud(c * 4));
1601 
1602             /* Offset the destination */
1603             inst->dst = offset(inst->dst, bld, 1);
1604          }
1605       }
1606    } else {
1607       elk_fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
1608       elk_fs_reg offset = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
1609       assert(inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE].file == BAD_FILE);
1610 
1611       const elk_fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver),
1612                            ELK_REGISTER_TYPE_UD);
1613 
1614       bld.MOV(byte_offset(payload, REG_SIZE), offset);
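      /* The offsets land in the second MRF; the first MRF of the payload is
       * left for the message header (header_size = 1 below).
       */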
1615 
1616       inst->opcode = ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4;
1617       inst->base_mrf = payload.nr;
1618       inst->header_size = 1;
1619       inst->mlen = 1 + inst->exec_size / 8;
1620 
1621       inst->resize_sources(1);
1622       inst->src[0] = surface;
1623    }
1624 }
1625 
1626 static void
1627 lower_math_logical_send(const fs_builder &bld, elk_fs_inst *inst)
1628 {
1629    assert(bld.shader->devinfo->ver < 6);
1630 
1631    inst->base_mrf = 2;
1632    inst->mlen = inst->sources * inst->exec_size / 8;
1633 
1634    if (inst->sources > 1) {
1635       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1636        * "Message Payload":
1637        *
1638        * "Operand0[7].  For the INT DIV functions, this operand is the
1639        *  denominator."
1640        *  ...
1641        * "Operand1[7].  For the INT DIV functions, this operand is the
1642        *  numerator."
1643        */
1644       const bool is_int_div = inst->opcode != ELK_SHADER_OPCODE_POW;
1645       const elk_fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
1646       const elk_fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
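      /* So the message takes (denominator, numerator) for the INT DIV
       * opcodes while the IR sources are (numerator, denominator), hence
       * the swap above; POW keeps its operands in IR order.
       */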
1647 
1648       inst->resize_sources(1);
1649       inst->src[0] = src0;
1650 
1651       assert(inst->exec_size == 8);
1652       bld.MOV(elk_fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
1653    }
1654 }
1655 
1656 static void
1657 lower_interpolator_logical_send(const fs_builder &bld, elk_fs_inst *inst,
1658                                 const struct elk_wm_prog_key *wm_prog_key,
1659                                 const struct elk_wm_prog_data *wm_prog_data)
1660 {
1661    const intel_device_info *devinfo = bld.shader->devinfo;
1662 
1663    /* We have to send something */
1664    elk_fs_reg payload = elk_vec8_grf(0, 0);
1665    unsigned mlen = 1;
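   /* For the SAMPLE and SHARED_OFFSET modes the payload contents are
    * presumably ignored (the sample index or shared offset travels in the
    * message descriptor), so a single GRF of g0 merely satisfies the
    * message length requirement.
    */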
1666 
1667    unsigned mode;
1668    switch (inst->opcode) {
1669    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1670       assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
1671       mode = GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE;
1672       break;
1673 
1674    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1675       assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
1676       mode = GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET;
1677       break;
1678 
1679    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1680       payload = inst->src[INTERP_SRC_OFFSET];
1681       mlen = 2 * inst->exec_size / 8;
1682       mode = GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET;
1683       break;
1684 
1685    default:
1686       unreachable("Invalid interpolator instruction");
1687    }
1688 
1689    const bool dynamic_mode =
1690       inst->src[INTERP_SRC_DYNAMIC_MODE].file != BAD_FILE;
1691 
1692    elk_fs_reg desc = inst->src[INTERP_SRC_MSG_DESC];
1693    uint32_t desc_imm =
1694       elk_pixel_interp_desc(devinfo,
1695                             /* Leave the mode at 0 if persample_dispatch is
1696                              * dynamic, it will be ORed in below.
1697                              */
1698                             dynamic_mode ? 0 : mode,
1699                             inst->pi_noperspective,
1700                             false /* coarse_pixel_rate */,
1701                             inst->exec_size, inst->group);
1702 
1703    /* If persample_dispatch is dynamic, select the interpolation mode
1704     * dynamically and OR into the descriptor to complete the static part
1705     * generated by elk_pixel_interp_desc().
1706     *
1707     * Why does this work? If you look at the SKL PRMs, Volume 7:
1708     * 3D-Media-GPGPU, Shared Functions Pixel Interpolater, you'll see that
1709     *
1710     *   - "Per Message Offset" Message Descriptor
1711     *   - "Sample Position Offset" Message Descriptor
1712     *
1713     * have different formats. Fortunately, a fragment shader dispatched at
1714     * pixel rate will have gl_SampleID = 0 & gl_NumSamples = 1. So the value
1715     * we pack in "Sample Position Offset" will be a 0 and will cover the X/Y
1716     * components of "Per Message Offset", which will give us the pixel offset 0x0.
1717     */
1718    if (dynamic_mode) {
1719       elk_fs_reg orig_desc = desc;
1720       const fs_builder &ubld = bld.exec_all().group(8, 0);
1721       desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1722 
1723       /* The predicate should have been built in elk_fs_nir.cpp when emitting
1724        * NIR code. This guarantees that we do not have incorrect interactions
1725        * with the flag register holding the predication result.
1726        */
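      /* The interpolation mode lives in the message-type field of the
       * pixel interpolator descriptor, hence the << 12 when ORing it in
       * below.
       */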
1727       if (orig_desc.file == IMM) {
1728          /* Not using SEL here because we would generate an instruction with 2
1729           * immediate sources which is not supported by HW.
1730           */
1731          set_predicate_inv(ELK_PREDICATE_NORMAL, false,
1732                            ubld.MOV(desc, elk_imm_ud(orig_desc.ud |
1733                                                      GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
1734          set_predicate_inv(ELK_PREDICATE_NORMAL, true,
1735                            ubld.MOV(desc, elk_imm_ud(orig_desc.ud |
1736                                                      GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
1737       } else {
1738          set_predicate_inv(ELK_PREDICATE_NORMAL, false,
1739                            ubld.OR(desc, orig_desc,
1740                                    elk_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
1741          set_predicate_inv(ELK_PREDICATE_NORMAL, true,
1742                            ubld.OR(desc, orig_desc,
1743                                    elk_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
1744       }
1745    }
1746 
1747    assert(bld.shader->devinfo->ver >= 7);
1748    inst->opcode = ELK_SHADER_OPCODE_SEND;
1749    inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR;
1750    inst->desc = desc_imm;
1751    inst->mlen = mlen;
1752    inst->send_has_side_effects = false;
1753    inst->send_is_volatile = false;
1754 
1755    inst->resize_sources(2);
1756    inst->src[0] = component(desc, 0);
1757    inst->src[1] = payload;
1758 }
1759 
1760 static void
1761 lower_get_buffer_size(const fs_builder &bld, elk_fs_inst *inst)
1762 {
1763    const intel_device_info *devinfo = bld.shader->devinfo;
1764    assert(devinfo->ver >= 7);
1765    /* Since we can only execute this instruction on uniform bti/surface
1766     * handles, elk_fs_nir.cpp should already have limited this to SIMD8.
1767     */
1768    assert(inst->exec_size == 8);
1769 
1770    elk_fs_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
1771    elk_fs_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
1772    elk_fs_reg lod = inst->src[GET_BUFFER_SIZE_SRC_LOD];
1773 
1774    inst->opcode = ELK_SHADER_OPCODE_SEND;
1775    inst->mlen = inst->exec_size / 8;
1776    inst->resize_sources(2);
1777 
1778    /* src[0] is filled by setup_surface_descriptors() */
1779    inst->src[1] = lod;
1780 
1781    const uint32_t return_format = devinfo->ver >= 8 ?
1782       GFX8_SAMPLER_RETURN_FORMAT_32BITS : ELK_SAMPLER_RETURN_FORMAT_SINT32;
1783 
1784    const uint32_t desc = elk_sampler_desc(devinfo, 0, 0,
1785                                           GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
1786                                           ELK_SAMPLER_SIMD_MODE_SIMD8,
1787                                           return_format);
1788 
1789    inst->dst = retype(inst->dst, ELK_REGISTER_TYPE_UW);
1790    inst->sfid = ELK_SFID_SAMPLER;
1791    setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1792 }
1793 
1794 bool
1795 elk_fs_visitor::lower_logical_sends()
1796 {
1797    bool progress = false;
1798 
1799    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
1800       const fs_builder ibld(this, block, inst);
1801 
1802       switch (inst->opcode) {
1803       case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
1804          assert(stage == MESA_SHADER_FRAGMENT);
1805          lower_fb_write_logical_send(ibld, inst,
1806                                      elk_wm_prog_data(prog_data),
1807                                      (const elk_wm_prog_key *)key,
1808                                      fs_payload());
1809          break;
1810 
1811       case ELK_SHADER_OPCODE_TEX_LOGICAL:
1812          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TEX);
1813          break;
1814 
1815       case ELK_SHADER_OPCODE_TXD_LOGICAL:
1816          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXD);
1817          break;
1818 
1819       case ELK_SHADER_OPCODE_TXF_LOGICAL:
1820          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF);
1821          break;
1822 
1823       case ELK_SHADER_OPCODE_TXL_LOGICAL:
1824          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXL);
1825          break;
1826 
1827       case ELK_SHADER_OPCODE_TXS_LOGICAL:
1828          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXS);
1829          break;
1830 
1831       case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
1832          lower_sampler_logical_send(ibld, inst,
1833                                     ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
1834          break;
1835 
1836       case ELK_FS_OPCODE_TXB_LOGICAL:
1837          lower_sampler_logical_send(ibld, inst, ELK_FS_OPCODE_TXB);
1838          break;
1839 
1840       case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
1841          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_CMS);
1842          break;
1843 
1844       case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
1845       case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
1846          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_CMS_W);
1847          break;
1848 
1849       case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
1850          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_UMS);
1851          break;
1852 
1853       case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
1854          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_MCS);
1855          break;
1856 
1857       case ELK_SHADER_OPCODE_LOD_LOGICAL:
1858          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_LOD);
1859          break;
1860 
1861       case ELK_SHADER_OPCODE_TG4_LOGICAL:
1862          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TG4);
1863          break;
1864 
1865       case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1866          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TG4_OFFSET);
1867          break;
1868 
1869       case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
1870          lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_SAMPLEINFO);
1871          break;
1872 
1873       case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
1874          lower_get_buffer_size(ibld, inst);
1875          break;
1876 
1877       case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
1878       case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
1879       case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
1880       case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
1881       case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
1882       case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
1883       case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
1884       case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
1885       case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
1886       case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
1887          lower_surface_logical_send(ibld, inst);
1888          break;
1889 
1890       case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
1891       case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
1892       case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
1893       case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
1894       case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
1895       case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
1896       case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
1897       case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
1898          lower_a64_logical_send(ibld, inst);
1899          break;
1900 
1901       case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
1902          lower_varying_pull_constant_logical_send(ibld, inst);
1903          break;
1904 
1905       case ELK_SHADER_OPCODE_RCP:
1906       case ELK_SHADER_OPCODE_RSQ:
1907       case ELK_SHADER_OPCODE_SQRT:
1908       case ELK_SHADER_OPCODE_EXP2:
1909       case ELK_SHADER_OPCODE_LOG2:
1910       case ELK_SHADER_OPCODE_SIN:
1911       case ELK_SHADER_OPCODE_COS:
1912       case ELK_SHADER_OPCODE_POW:
1913       case ELK_SHADER_OPCODE_INT_QUOTIENT:
1914       case ELK_SHADER_OPCODE_INT_REMAINDER:
1915          /* The math opcodes are overloaded for the send-like and
1916           * expression-like instructions which seems kind of icky.  Gfx6+ has
1917           * a native (but rather quirky) MATH instruction so we don't need to
1918           * do anything here.  On Gfx4-5 we'll have to lower the Gfx6-like
1919           * logical instructions (which we can easily recognize because they
1920           * have mlen = 0) into send-like virtual instructions.
1921           */
1922          if (devinfo->ver < 6 && inst->mlen == 0) {
1923             lower_math_logical_send(ibld, inst);
1924             break;
1925 
1926          } else {
1927             continue;
1928          }
1929 
1930       case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1931       case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1932       case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1933          lower_interpolator_logical_send(ibld, inst,
1934                                          (const elk_wm_prog_key *)key,
1935                                          elk_wm_prog_data(prog_data));
1936          break;
1937 
1938       case ELK_SHADER_OPCODE_URB_READ_LOGICAL:
1939          lower_urb_read_logical_send(ibld, inst);
1940          break;
1941 
1942       case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
1943          lower_urb_write_logical_send(ibld, inst);
1944          break;
1945 
1946       default:
1947          continue;
1948       }
1949 
1950       progress = true;
1951    }
1952 
1953    if (progress)
1954       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
1955 
1956    return progress;
1957 }
1958 
1959 /**
1960  * Turns the generic expression-style uniform pull constant load instruction
1961  * into a hardware-specific series of instructions for loading a pull
1962  * constant.
1963  *
1964  * The expression style allows the CSE pass before this to optimize out
1965  * repeated loads from the same offset, and gives the pre-register-allocation
1966  * scheduling full flexibility, while the conversion to native instructions
1967  * allows the post-register-allocation scheduler the best information
1968  * possible.
1969  *
1970  * Note that execution masking for setting up pull constant loads is special:
1971  * the channels that need to be written are unrelated to the current execution
1972  * mask, since a later instruction will use one of the result channels as a
1973  * source operand for all 8 or 16 of its channels.
1974  */
1975 bool
1976 elk_fs_visitor::lower_uniform_pull_constant_loads()
1977 {
1978    bool progress = false;
1979 
1980    foreach_block_and_inst (block, elk_fs_inst, inst, cfg) {
1981       if (inst->opcode != ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
1982          continue;
1983 
1984       const elk_fs_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
1985       const elk_fs_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE];
1986       const elk_fs_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
1987       const elk_fs_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
1988       assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE);
1989       assert(offset_B.file == IMM);
1990       assert(size_B.file == IMM);
1991 
1992       if (devinfo->ver >= 7) {
1993          const fs_builder ubld = fs_builder(this, block, inst).exec_all();
1994          elk_fs_reg header = fs_builder(this, 8).exec_all().vgrf(ELK_REGISTER_TYPE_UD);
1995 
1996          ubld.group(8, 0).MOV(header,
1997                               retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
1998          ubld.group(1, 0).MOV(component(header, 2),
1999                               elk_imm_ud(offset_B.ud / 16));
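         /* DW2 of the header holds the global offset in owords, hence the
          * byte offset divided by 16.
          */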
2000 
2001          inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
2002          inst->opcode = ELK_SHADER_OPCODE_SEND;
2003          inst->header_size = 1;
2004          inst->mlen = 1;
2005 
2006          uint32_t desc =
2007             elk_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
2008                                        size_B.ud / 4, false /* write */);
2009 
2010          inst->resize_sources(2);
2011 
2012          setup_surface_descriptors(ubld, inst, desc, surface, surface_handle);
2013 
2014          inst->src[1] = header;
2015 
2016          invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2017       } else {
2018          assert(surface_handle.file == BAD_FILE);
2019          /* Before register allocation, we didn't tell the scheduler about the
2020           * MRF we use.  We know it's safe to use this MRF because nothing
2021           * else does except for register spill/unspill, which generates and
2022           * uses its MRF within a single IR instruction.
2023           */
2024          inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
2025          inst->mlen = 1;
2026       }
2027 
2028       progress = true;
2029    }
2030 
2031    return progress;
2032 }
2033