/*
 * Copyright © 2010, 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file elk_lower_logical_sends.cpp
 *
 * Lowering of virtual "logical" send-like opcodes (URB reads/writes,
 * framebuffer writes, sampler, surface, A64 and pull-constant messages)
 * into the actual hardware messages the EU understands.
 */

#include "elk_eu.h"
#include "elk_fs.h"
#include "elk_fs_builder.h"

using namespace elk;

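/* Lower a logical URB read into a SIMD8 URB read SEND.  The payload is just
 * the header: the URB handle plus the optional per-slot offsets.
 */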
static void
lower_urb_read_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;

   assert(inst->size_written % REG_SIZE == 0);
   assert(inst->header_size == 0);

   elk_fs_reg payload_sources[2];
   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   elk_fs_reg payload = elk_fs_reg(VGRF, bld.shader->alloc.allocate(header_size),
                                   ELK_REGISTER_TYPE_F);
   bld.LOAD_PAYLOAD(payload, payload_sources, header_size, header_size);

   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->header_size = header_size;

   inst->sfid = ELK_SFID_URB;
   inst->desc = elk_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_READ,
                             per_slot_present,
                             false,
                             inst->offset);

   inst->mlen = header_size;
   inst->send_is_volatile = true;

   inst->resize_sources(2);

   inst->src[0] = elk_imm_ud(0); /* desc */
   inst->src[1] = payload;
}

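/* Lower a logical URB write into a SIMD8 URB write SEND.  The payload holds
 * the URB handle, the optional per-slot offsets and channel mask, followed
 * by the data components to be written.
 */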
static void
lower_urb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
   const bool channel_mask_present =
      inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;

   assert(inst->header_size == 0);

   const unsigned length = 1 + per_slot_present + channel_mask_present +
                           inst->components_read(URB_LOGICAL_SRC_DATA);

   elk_fs_reg *payload_sources = new elk_fs_reg[length];
   elk_fs_reg payload = elk_fs_reg(VGRF, bld.shader->alloc.allocate(length),
                                   ELK_REGISTER_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   if (channel_mask_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];

   for (unsigned i = header_size, j = 0; i < length; i++, j++)
      payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);

   bld.LOAD_PAYLOAD(payload, payload_sources, length, header_size);

   delete [] payload_sources;

   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->header_size = header_size;
   inst->dst = elk_null_reg();

   inst->sfid = ELK_SFID_URB;
   inst->desc = elk_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_WRITE,
                             per_slot_present,
                             channel_mask_present,
                             inst->offset);

   inst->mlen = length;
   inst->send_has_side_effects = true;

   inst->resize_sources(2);

   inst->src[0] = elk_imm_ud(0); /* desc */
   inst->src[1] = payload;
}

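/* Copy the color components into the destination payload array, clamping
 * them to [0, 1] first when the key requests fragment color clamping.
 */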
static void
setup_color_payload(const fs_builder &bld, const elk_wm_prog_key *key,
                    elk_fs_reg *dst, elk_fs_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 4);
      assert(color.type == ELK_REGISTER_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}

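/* Lower a logical framebuffer write into either a render-cache SEND (gfx7+)
 * or an MRF-based FB_WRITE (gfx4-6), assembling the header, stencil/AA alpha,
 * src0 alpha, oMask, colors and depth into a single message payload.
 */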
static void
lower_fb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst,
                            const struct elk_wm_prog_data *prog_data,
                            const elk_wm_prog_key *key,
                            const elk_fs_thread_payload &payload)
{
   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   const intel_device_info *devinfo = bld.shader->devinfo;
   const elk_fs_reg color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   const elk_fs_reg color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   const elk_fs_reg src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   const elk_fs_reg src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   const elk_fs_reg dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   elk_fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   const unsigned components =
      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;

   assert(inst->target != 0 || src0_alpha.file == BAD_FILE);

   /* We can potentially have a message length of up to 15, so we have to set
    * base_mrf to either 0 or 1 in order to fit in m0..m15.
    */
   elk_fs_reg sources[15];
   int header_size = 2, payload_header_size;
   unsigned length = 0;

   if (devinfo->ver < 6) {
      /* TODO: Support SIMD32 on gfx4-5 */
      assert(bld.group() < 16);

      /* For gfx4-5, we always have a header consisting of g0 and g1.  We have
       * an implied MOV from g0,g1 to the start of the message.  The MOV from
       * g0 is handled by the hardware and the MOV from g1 is provided by the
       * generator.  This is required because, on gfx4-5, the generator may
       * generate two write messages with different message lengths in order
       * to handle AA data properly.
       *
       * Also, since the pixel mask goes in the g0 portion of the message and
       * since render target writes are the last thing in the shader, we write
       * the pixel mask directly into g0 and it will get copied as part of the
       * implied write.
       */
      if (prog_data->uses_kill) {
         bld.exec_all().group(1, 0)
            .MOV(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UW),
                 elk_sample_mask_reg(bld));
      }

      assert(length == 0);
      length = 2;
   } else if ((devinfo->verx10 <= 70 && prog_data->uses_kill) ||
              color1.file != BAD_FILE ||
              key->nr_color_regions > 1) {
      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       *     "Dispatched Pixel Enables. One bit per pixel indicating
       *      which pixels were originally enabled when the thread was
       *      dispatched. This field is only required for the end-of-
       *      thread message and on all dual-source messages."
       */
      const fs_builder ubld = bld.exec_all().group(8, 0);

      elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD, 2);
      if (bld.group() < 16) {
         /* The header starts off as g0 and g1 for the first half */
         ubld.group(16, 0).MOV(header, retype(elk_vec8_grf(0, 0),
                                              ELK_REGISTER_TYPE_UD));
      } else {
         /* The header starts off as g0 and g2 for the second half */
         assert(bld.group() < 32);
         const elk_fs_reg header_sources[2] = {
            retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD),
            retype(elk_vec8_grf(2, 0), ELK_REGISTER_TYPE_UD),
         };
         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
      }

      uint32_t g00_bits = 0;

      /* Set "Source0 Alpha Present to RenderTarget" bit in message
       * header.
       */
      if (src0_alpha.file != BAD_FILE)
         g00_bits |= 1 << 11;

      /* Set the "computes stencil to render target" bit in the message header. */
      if (prog_data->computed_stencil)
         g00_bits |= 1 << 14;

      if (g00_bits) {
         /* OR extra bits into g0.0 */
         ubld.group(1, 0).OR(component(header, 0),
                             retype(elk_vec1_grf(0, 0),
                                    ELK_REGISTER_TYPE_UD),
                             elk_imm_ud(g00_bits));
      }

      /* Set the render target index for choosing BLEND_STATE. */
      if (inst->target > 0) {
         ubld.group(1, 0).MOV(component(header, 2), elk_imm_ud(inst->target));
      }

      if (prog_data->uses_kill) {
         ubld.group(1, 0).MOV(retype(component(header, 15),
                                     ELK_REGISTER_TYPE_UW),
                              elk_sample_mask_reg(bld));
      }

      assert(length == 0);
      sources[0] = header;
      sources[1] = horiz_offset(header, 8);
      length = 2;
   }
   assert(length == 0 || length == 2);
   header_size = length;

   if (payload.aa_dest_stencil_reg[0]) {
      assert(inst->group < 16);
      sources[length] = elk_fs_reg(VGRF, bld.shader->alloc.allocate(1));
      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
         .MOV(sources[length],
              elk_fs_reg(elk_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
      length++;
   }

   if (src0_alpha.file != BAD_FILE) {
      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
         const fs_builder &ubld = bld.exec_all().group(8, i)
                                     .annotate("FB write src0 alpha");
         const elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_F);
         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
         setup_color_payload(ubld, key, &sources[length], tmp, 1);
         length++;
      }
   }

   if (sample_mask.file != BAD_FILE) {
      const elk_fs_reg tmp(VGRF, bld.shader->alloc.allocate(reg_unit(devinfo)),
                           ELK_REGISTER_TYPE_UD);

      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
       * relevant.  Since it's unsigned single words one vgrf is always
       * 16-wide, but only the lower or higher 8 channels will be used by the
       * hardware when doing a SIMD8 write depending on whether we have
       * selected the subspans for the first or second half respectively.
       */
      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
      sample_mask.type = ELK_REGISTER_TYPE_UW;
      sample_mask.stride *= 2;

      bld.exec_all().annotate("FB write oMask")
         .MOV(horiz_offset(retype(tmp, ELK_REGISTER_TYPE_UW),
                           inst->group % (16 * reg_unit(devinfo))),
              sample_mask);

      for (unsigned i = 0; i < reg_unit(devinfo); i++)
         sources[length++] = byte_offset(tmp, REG_SIZE * i);
   }

   payload_header_size = length;

   setup_color_payload(bld, key, &sources[length], color0, components);
   length += 4;

   if (color1.file != BAD_FILE) {
      setup_color_payload(bld, key, &sources[length], color1, components);
      length += 4;
   }

   if (src_depth.file != BAD_FILE) {
      sources[length] = src_depth;
      length++;
   }

   if (dst_depth.file != BAD_FILE) {
      sources[length] = dst_depth;
      length++;
   }

   elk_fs_inst *load;
   if (devinfo->ver >= 7) {
      /* Send from the GRF */
      elk_fs_reg payload = elk_fs_reg(VGRF, -1, ELK_REGISTER_TYPE_F);
      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
      payload.nr = bld.shader->alloc.allocate(regs_written(load));
      load->dst = payload;

      uint32_t msg_ctl = elk_fb_write_msg_control(inst, prog_data);

      inst->desc =
         (inst->group / 16) << 11 | /* rt slot group */
         elk_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
                           0 /* coarse_rt_write */);

      inst->opcode = ELK_SHADER_OPCODE_SEND;
      inst->resize_sources(2);
      inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
      inst->src[0] = elk_imm_ud(0);
      inst->src[1] = payload;
      inst->mlen = regs_written(load);
      inst->header_size = header_size;
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   } else {
      /* Send from the MRF */
      load = bld.LOAD_PAYLOAD(elk_fs_reg(MRF, 1, ELK_REGISTER_TYPE_F),
                              sources, length, payload_header_size);

      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
       * will do this for us if we just give it a COMPR4 destination.
       */
      if (devinfo->ver < 6 && bld.dispatch_width() == 16)
         load->dst.nr |= ELK_MRF_COMPR4;

      if (devinfo->ver < 6) {
         /* Set up src[0] for the implied MOV from grf0-1 */
         inst->resize_sources(1);
         inst->src[0] = elk_vec8_grf(0, 0);
      } else {
         inst->resize_sources(0);
      }
      inst->base_mrf = 1;
      inst->opcode = ELK_FS_OPCODE_FB_WRITE;
      inst->mlen = regs_written(load);
      inst->header_size = header_size;
   }
}

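/* Lower a logical texturing instruction to the MRF-based sampler message
 * layout used on gfx4, packing the coordinate, gradients, LOD and shadow
 * comparator into consecutive MRFs.
 */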
static void
lower_sampler_logical_send_gfx4(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
                                const elk_fs_reg &coordinate,
                                const elk_fs_reg &shadow_c,
                                const elk_fs_reg &lod, const elk_fs_reg &lod2,
                                const elk_fs_reg &surface,
                                const elk_fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const bool has_lod = (op == ELK_SHADER_OPCODE_TXL || op == ELK_FS_OPCODE_TXB ||
                         op == ELK_SHADER_OPCODE_TXF || op == ELK_SHADER_OPCODE_TXS);
   elk_fs_reg msg_begin(MRF, 1, ELK_REGISTER_TYPE_F);
   elk_fs_reg msg_end = msg_begin;

   /* g0 header. */
   msg_end = offset(msg_end, bld.group(8, 0), 1);

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   msg_end = offset(msg_end, bld, coord_components);

   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
    * require all three components to be present and zero if they are unused.
    */
   if (coord_components > 0 &&
       (has_lod || shadow_c.file != BAD_FILE ||
        (op == ELK_SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
      assert(coord_components <= 3);
      for (unsigned i = 0; i < 3 - coord_components; i++)
         bld.MOV(offset(msg_end, bld, i), elk_imm_f(0.0f));

      msg_end = offset(msg_end, bld, 3 - coord_components);
   }

   if (op == ELK_SHADER_OPCODE_TXD) {
      /* TXD unsupported in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* the slots for u and v are always present, but r is optional */
      if (coord_components < 2)
         msg_end = offset(msg_end, bld, 2 - coord_components);

      /*  P   = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * 1-arg: Does not exist.
       *
       * 2-arg: dudx   dvdx   dudy   dvdy
       *        dPdx.x dPdx.y dPdy.x dPdy.y
       *        m4     m5     m6     m7
       *
       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
       *        m5     m6     m7     m8     m9     m10
       */
      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));

      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
   }

   if (has_lod) {
      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
       */
      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
             bld.dispatch_width() == 16);

      const elk_reg_type type =
         (op == ELK_SHADER_OPCODE_TXF || op == ELK_SHADER_OPCODE_TXS ?
          ELK_REGISTER_TYPE_UD : ELK_REGISTER_TYPE_F);
      bld.MOV(retype(msg_end, type), lod);
      msg_end = offset(msg_end, bld, 1);
   }

   if (shadow_c.file != BAD_FILE) {
      if (op == ELK_SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         bld.MOV(msg_end, elk_imm_f(0.0f));
         msg_end = offset(msg_end, bld, 1);
      }

      bld.MOV(msg_end, shadow_c);
      msg_end = offset(msg_end, bld, 1);
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = msg_begin.nr;
   inst->mlen = msg_end.nr - msg_begin.nr;
   inst->header_size = 1;
}

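/* Lower a logical texturing instruction to the MRF-based sampler message
 * layout used on gfx5-6, packing the coordinate into the first message slots
 * with the shadow comparator, LOD/bias, gradients or sample index appended
 * according to the opcode.
 */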
static void
lower_sampler_logical_send_gfx5(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
                                const elk_fs_reg &coordinate,
                                const elk_fs_reg &shadow_c,
                                const elk_fs_reg &lod, const elk_fs_reg &lod2,
                                const elk_fs_reg &sample_index,
                                const elk_fs_reg &surface,
                                const elk_fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   elk_fs_reg message(MRF, 2, ELK_REGISTER_TYPE_F);
   elk_fs_reg msg_coords = message;
   unsigned header_size = 0;

   if (inst->offset != 0) {
      /* The offsets set up by the visitor are in the m1 header, so we can't
       * go headerless.
       */
      header_size = 1;
      message.nr--;
   }

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   elk_fs_reg msg_end = offset(msg_coords, bld, coord_components);
   elk_fs_reg msg_lod = offset(msg_coords, bld, 4);

   if (shadow_c.file != BAD_FILE) {
      elk_fs_reg msg_shadow = msg_lod;
      bld.MOV(msg_shadow, shadow_c);
      msg_lod = offset(msg_shadow, bld, 1);
      msg_end = msg_lod;
   }

   switch (op) {
   case ELK_SHADER_OPCODE_TXL:
   case ELK_FS_OPCODE_TXB:
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case ELK_SHADER_OPCODE_TXD:
      /**
       *  P   = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx   dudy   dvdx   dvdy   drdx   drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      msg_end = msg_lod;
      for (unsigned i = 0; i < grad_components; i++) {
         bld.MOV(msg_end, offset(lod, bld, i));
         msg_end = offset(msg_end, bld, 1);

         bld.MOV(msg_end, offset(lod2, bld, i));
         msg_end = offset(msg_end, bld, 1);
      }
      break;
   case ELK_SHADER_OPCODE_TXS:
      msg_lod = retype(msg_end, ELK_REGISTER_TYPE_UD);
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case ELK_SHADER_OPCODE_TXF:
      msg_lod = offset(msg_coords, bld, 3);
      bld.MOV(retype(msg_lod, ELK_REGISTER_TYPE_UD), lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case ELK_SHADER_OPCODE_TXF_CMS:
      msg_lod = offset(msg_coords, bld, 3);
      /* lod */
      bld.MOV(retype(msg_lod, ELK_REGISTER_TYPE_UD), elk_imm_ud(0u));
      /* sample index */
      bld.MOV(retype(offset(msg_lod, bld, 1), ELK_REGISTER_TYPE_UD), sample_index);
      msg_end = offset(msg_lod, bld, 2);
      break;
   default:
      break;
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = message.nr;
   inst->mlen = msg_end.nr - message.nr;
   inst->header_size = header_size;

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

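/* Return true when the sampler index may not fit in the 4-bit descriptor
 * field and the sampler state pointer in the message header must be offset
 * instead.
 */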
static bool
is_high_sampler(const struct intel_device_info *devinfo, const elk_fs_reg &sampler)
{
   if (devinfo->verx10 <= 70)
      return false;

   return sampler.file != IMM || sampler.ud >= 16;
}

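/* Map a logical texturing opcode onto the gfx5+ shared-function sampler
 * message type, taking the shadow comparator into account.
 */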
static unsigned
sampler_msg_type(const intel_device_info *devinfo,
                 elk_opcode opcode, bool shadow_compare, bool has_min_lod)
{
   assert(devinfo->ver >= 5);
   switch (opcode) {
   case ELK_SHADER_OPCODE_TEX:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE;
   case ELK_FS_OPCODE_TXB:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   case ELK_SHADER_OPCODE_TXL:
      assert(!has_min_lod);
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
   case ELK_SHADER_OPCODE_TXS:
   case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
   case ELK_SHADER_OPCODE_TXD:
      assert(!shadow_compare || devinfo->verx10 >= 75);
      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
   case ELK_SHADER_OPCODE_TXF:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case ELK_SHADER_OPCODE_TXF_CMS:
      assert(!has_min_lod);
      return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
                                 GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case ELK_SHADER_OPCODE_TXF_UMS:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
   case ELK_SHADER_OPCODE_TXF_MCS:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
   case ELK_SHADER_OPCODE_LOD:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_LOD;
   case ELK_SHADER_OPCODE_TG4:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
      break;
   case ELK_SHADER_OPCODE_TG4_OFFSET:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
   case ELK_SHADER_OPCODE_SAMPLEINFO:
      assert(!has_min_lod);
      return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
   default:
      unreachable("not reached");
   }
}

/**
 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
 * the given requested_alignment_sz.
 */
static elk_fs_inst *
emit_load_payload_with_padding(const fs_builder &bld, const elk_fs_reg &dst,
                               const elk_fs_reg *src, unsigned sources,
                               unsigned header_size,
                               unsigned requested_alignment_sz)
{
   unsigned length = 0;
   unsigned num_srcs =
      sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
   elk_fs_reg *src_comps = new elk_fs_reg[num_srcs];

   for (unsigned i = 0; i < header_size; i++)
      src_comps[length++] = src[i];

   for (unsigned i = header_size; i < sources; i++) {
      unsigned src_sz =
         retype(dst, src[i].type).component_size(bld.dispatch_width());
      const enum elk_reg_type padding_payload_type =
         elk_reg_type_from_bit_size(type_sz(src[i].type) * 8,
                                    ELK_REGISTER_TYPE_UD);

      src_comps[length++] = src[i];

      /* Expand the real sources if component of requested payload type is
       * larger than real source component.
       */
      if (src_sz < requested_alignment_sz) {
         for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
            src_comps[length++] = retype(elk_fs_reg(), padding_payload_type);
         }
      }
   }

   elk_fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
   delete[] src_comps;

   return inst;
}

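/* Lower a logical texturing instruction to a gfx7+ sampler SEND, building an
 * optional message header (texel offsets, channel select, high sampler
 * indices, bindless handles) followed by the per-opcode parameter payload.
 */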
static void
lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
                                const elk_fs_reg &coordinate,
                                const elk_fs_reg &shadow_c,
                                elk_fs_reg lod, const elk_fs_reg &lod2,
                                const elk_fs_reg &min_lod,
                                const elk_fs_reg &sample_index,
                                const elk_fs_reg &mcs,
                                const elk_fs_reg &surface,
                                const elk_fs_reg &sampler,
                                const elk_fs_reg &surface_handle,
                                const elk_fs_reg &sampler_handle,
                                const elk_fs_reg &tg4_offset,
                                unsigned payload_type_bit_size,
                                unsigned coord_components,
                                unsigned grad_components,
                                bool residency)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum elk_reg_type payload_type =
      elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_F);
   const enum elk_reg_type payload_unsigned_type =
      elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_UD);
   const enum elk_reg_type payload_signed_type =
      elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_D);
   unsigned reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   elk_fs_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(payload_type);

   /* We must have exactly one of surface/sampler and surface/sampler_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (op == ELK_SHADER_OPCODE_TG4 || op == ELK_SHADER_OPCODE_TG4_OFFSET ||
       inst->offset != 0 || inst->eot ||
       op == ELK_SHADER_OPCODE_SAMPLEINFO ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler) ||
       residency) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle. The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      elk_fs_reg header = retype(sources[0], ELK_REGISTER_TYPE_UD);
      for (header_size = 0; header_size < reg_unit(devinfo); header_size++)
         sources[length++] = byte_offset(header, REG_SIZE * header_size);

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      unsigned reg_count = regs_written(inst) - reg_unit(devinfo) * residency;
      if (!inst->eot && reg_count < 4 * reg_width) {
         assert(reg_count % reg_width == 0);
         unsigned mask = ~((1 << (reg_count / reg_width)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }

      if (residency)
         inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */

      /* Build the actual header */
      const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
      const fs_builder ubld1 = ubld.group(1, 0);
      ubld.MOV(header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
      if (inst->offset) {
         ubld1.MOV(component(header, 2), elk_imm_ud(inst->offset));
      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied. Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), elk_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned. In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned. This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         ubld1.MOV(component(header, 3), sampler_handle);
      } else if (is_high_sampler(devinfo, sampler)) {
         elk_fs_reg sampler_state_ptr =
            retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD);

         if (sampler.file == ELK_IMMEDIATE_VALUE) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

            ubld1.ADD(component(header, 3), sampler_state_ptr,
                      elk_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            elk_fs_reg tmp = ubld1.vgrf(ELK_REGISTER_TYPE_UD);
            ubld1.AND(tmp, sampler, elk_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, elk_imm_ud(4));
            ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
         }
      }
   }

   /* On Xe2 and newer platforms, min_lod is the first parameter specifically
    * so that a bunch of other, possibly unused, parameters don't need to also
    * be included.
    */
   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare,
                       min_lod.file != BAD_FILE);

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case ELK_FS_OPCODE_TXB:
   case ELK_SHADER_OPCODE_TXL:
      bld.MOV(sources[length], lod);
      length++;
      break;
   case ELK_SHADER_OPCODE_TXD:
      /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in
       * Xe2+).
       */
      assert(bld.dispatch_width() == (8 * reg_unit(devinfo)));

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case ELK_SHADER_OPCODE_TXS:
      sources[length] = retype(sources[length], payload_unsigned_type);
      bld.MOV(sources[length++], lod);
      break;
   case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      sources[length] = retype(sources[length], payload_unsigned_type);
      bld.MOV(sources[length++], elk_imm_ud(0));
      break;
   case ELK_SHADER_OPCODE_TXF:
   case ELK_SHADER_OPCODE_TXF_LZ:
      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
      sources[length] = retype(sources[length], payload_signed_type);
      bld.MOV(sources[length++], coordinate);

      if (op != ELK_SHADER_OPCODE_TXF_LZ) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], lod);
      }

      for (unsigned i = 1; i < coord_components; i++) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(coordinate, bld, i));
      }

      coordinate_done = true;
      break;

   case ELK_SHADER_OPCODE_TXF_CMS:
   case ELK_SHADER_OPCODE_TXF_CMS_W:
   case ELK_SHADER_OPCODE_TXF_UMS:
   case ELK_SHADER_OPCODE_TXF_MCS:
      if (op == ELK_SHADER_OPCODE_TXF_UMS ||
          op == ELK_SHADER_OPCODE_TXF_CMS ||
          op == ELK_SHADER_OPCODE_TXF_CMS_W) {
         sources[length] = retype(sources[length], payload_unsigned_type);
         bld.MOV(sources[length++], sample_index);
      }

      /* Data from the multisample control surface. */
      if (op == ELK_SHADER_OPCODE_TXF_CMS || op == ELK_SHADER_OPCODE_TXF_CMS_W) {
         unsigned num_mcs_components = 1;

         /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
          * Shared Functions - 3D Sampler - Messages - Message Format:
          *
          *    ld2dms_w   si  mcs0  mcs1  mcs2  mcs3  u  v  r
          */
         if (op == ELK_SHADER_OPCODE_TXF_CMS_W)
            num_mcs_components = 2;

         for (unsigned i = 0; i < num_mcs_components; ++i) {
            /* The sampler always writes 4/8 registers' worth of data but for
             * ld_mcs only the first two registers contain valid data.  So with
             * a 16-bit payload, we need to split the two 32-bit registers into
             * four 16-bit payload components.
             */
            sources[length] = retype(sources[length], payload_unsigned_type);
            bld.MOV(sources[length++],
                    mcs.file == IMM ? mcs : offset(mcs, bld, i));
         }
      }

      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(coordinate, bld, i));
      }

      coordinate_done = true;
      break;
   case ELK_SHADER_OPCODE_TG4_OFFSET:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) { /* offu, offv */
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(tg4_offset, bld, i));
      }

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_type),
                 offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE) {
      /* Account for all of the missing coordinate sources */
      length += 4 - coord_components;
      if (op == ELK_SHADER_OPCODE_TXD)
         length += (3 - grad_components) * 2;

      bld.MOV(sources[length++], min_lod);
   }

   const elk_fs_reg src_payload =
      elk_fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
                 ELK_REGISTER_TYPE_F);
   /* In case of 16-bit payload each component takes one full register in
    * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
    * elements. In SIMD8H case hardware simply expects the components to be
    * padded (i.e., aligned on reg boundary).
    */
   elk_fs_inst *load_payload_inst =
      emit_load_payload_with_padding(bld, src_payload, sources, length,
                                     header_size, REG_SIZE * reg_unit(devinfo));
   unsigned mlen = load_payload_inst->size_written / REG_SIZE;
   assert(payload_type_bit_size != 16);
   unsigned simd_mode = inst->exec_size <= 8 ? ELK_SAMPLER_SIMD_MODE_SIMD8 :
                                               ELK_SAMPLER_SIMD_MODE_SIMD16;

   /* Generate the SEND. */
   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;

   assert(msg_type == sampler_msg_type(devinfo, op, inst->shadow_compare,
                                       min_lod.file != BAD_FILE));

   inst->sfid = ELK_SFID_SAMPLER;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      inst->desc = elk_sampler_desc(devinfo, surface.ud,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      inst->src[0] = elk_imm_ud(0);
   } else {
      assert(surface_handle.file == BAD_FILE);

      /* Immediate portion of the descriptor */
      inst->desc = elk_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      const fs_builder ubld = bld.group(1, 0).exec_all();
      elk_fs_reg desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
         ubld.MUL(desc, surface, elk_imm_ud(0x101));
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, elk_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, elk_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      ubld.AND(desc, desc, elk_imm_ud(0xfff));

      inst->src[0] = component(desc, 0);
   }

   inst->src[1] = src_payload;
   inst->resize_sources(2);

   if (inst->eot) {
      /* EOT sampler messages don't make sense to split because it would
       * involve ending half of the thread early.
       */
      assert(inst->group == 0);
      /* We need to use SENDC for EOT sampler messages */
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   }

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE * reg_unit(devinfo));
}

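/* Determine the bit size (16 or 32) shared by all the texturing sources;
 * this decides the register type used for the sampler message payload.
 */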
static unsigned
get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
                                      elk_opcode op, const elk_fs_reg *src)
{
   unsigned src_type_size = 0;

   /* All sources need to have the same size, therefore seek the first valid
    * and take the size from there.
    */
   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
      if (src[i].file != BAD_FILE) {
         src_type_size = elk_reg_type_to_size(src[i].type);
         break;
      }
   }

   assert(src_type_size == 2 || src_type_size == 4);

#ifndef NDEBUG
   /* Make sure all sources agree. */
   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
      assert(src[i].file == BAD_FILE ||
             elk_reg_type_to_size(src[i].type) == src_type_size);
   }
#endif

   return src_type_size * 8;
}

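/* Lower a logical texturing instruction by unpacking its sources and
 * dispatching to the appropriate generation-specific message builder.
 */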
static void
lower_sampler_logical_send(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const elk_fs_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
   const elk_fs_reg shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
   const elk_fs_reg lod = inst->src[TEX_LOGICAL_SRC_LOD];
   const elk_fs_reg lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
   const elk_fs_reg min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
   const elk_fs_reg sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
   const elk_fs_reg mcs = inst->src[TEX_LOGICAL_SRC_MCS];
   const elk_fs_reg surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
   const elk_fs_reg sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
   const elk_fs_reg surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
   const elk_fs_reg sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
   const elk_fs_reg tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
   const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
   /* residency is only supported on Gfx8+ */
   assert(!residency || devinfo->ver >= 8);

   if (devinfo->ver >= 7) {
      const unsigned msg_payload_type_bit_size =
         get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);

      /* 16-bit payloads are available only on gfx11+ */
      assert(msg_payload_type_bit_size != 16);

      lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, min_lod,
                                      sample_index,
                                      mcs, surface, sampler,
                                      surface_handle, sampler_handle,
                                      tg4_offset,
                                      msg_payload_type_bit_size,
                                      coord_components, grad_components,
                                      residency);
   } else if (devinfo->ver >= 5) {
      lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, sample_index,
                                      surface, sampler,
                                      coord_components, grad_components);
   } else {
      lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2,
                                      surface, sampler,
                                      coord_components, grad_components);
   }
}

/**
 * Predicate the specified instruction on the vector mask.
 */
static void
emit_predicate_on_vector_mask(const fs_builder &bld, elk_fs_inst *inst)
{
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
          bld.group() == inst->group &&
          bld.dispatch_width() == inst->exec_size);

   const fs_builder ubld = bld.exec_all().group(1, 0);

   const elk_fs_visitor &s = *bld.shader;
   const elk_fs_reg vector_mask = ubld.vgrf(ELK_REGISTER_TYPE_UW);
   ubld.UNDEF(vector_mask);
   ubld.emit(ELK_SHADER_OPCODE_READ_SR_REG, vector_mask, elk_imm_ud(3));
   const unsigned subreg = sample_mask_flag_subreg(s);

   ubld.MOV(elk_flag_subreg(subreg + inst->group / 16), vector_mask);

   if (inst->predicate) {
      assert(inst->predicate == ELK_PREDICATE_NORMAL);
      assert(!inst->predicate_inverse);
      assert(inst->flag_subreg == 0);
      /* Combine the vector mask with the existing predicate by using a
       * vertical predication mode.
       */
      inst->predicate = ELK_PREDICATE_ALIGN1_ALLV;
   } else {
      inst->flag_subreg = subreg;
      inst->predicate = ELK_PREDICATE_NORMAL;
      inst->predicate_inverse = false;
   }
}

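/* Fill out the immediate descriptor and descriptor source (src[0]) of a
 * surface SEND, using either an immediate binding table index or one
 * computed from a dynamic surface register.
 */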
static void
setup_surface_descriptors(const fs_builder &bld, elk_fs_inst *inst, uint32_t desc,
                          const elk_fs_reg &surface, const elk_fs_reg &surface_handle)
{
   /* We must have exactly one of surface and surface_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));

   if (surface.file == IMM) {
      inst->desc = desc | (surface.ud & 0xff);
      inst->src[0] = elk_imm_ud(0);
   } else {
      assert(surface_handle.file == BAD_FILE);

      inst->desc = desc;
      const fs_builder ubld = bld.exec_all().group(1, 0);
      elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
      ubld.AND(tmp, surface, elk_imm_ud(0xff));
      inst->src[0] = component(tmp, 0);
   }
}

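/* Lower a logical typed/untyped/scattered surface access into a data-port
 * SEND: build the optional header and address/data payload, pick the SFID
 * and descriptor for the opcode, and predicate on the sample mask when the
 * header can't carry it.
 */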
static void
lower_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED elk_fs_reg dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   const elk_fs_reg allow_sample_mask =
      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
   assert(arg.file == IMM);
   assert(allow_sample_mask.file == IMM);

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);

   const bool is_typed_access =
      inst->opcode == ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;

   const bool is_surface_access = is_typed_access ||
      inst->opcode == ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;

   const bool is_stateless =
      surface.file == IMM && (surface.ud == ELK_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   elk_fs_reg sample_mask = allow_sample_mask.ud ? elk_sample_mask_reg(bld) :
                                                   elk_fs_reg(elk_imm_ud(0xffffffff));

   /* From the BDW PRM Volume 7, page 147:
    *
    *    "For the Data Cache Data Port*, the header must be present for the
    *     following message types: [...] Typed read/write/atomics"
    *
    * Earlier generations have a similar wording. Because of this restriction
    * we don't attempt to implement sample masks via predication for such
    * messages prior to Gfx9, since we have to provide a header anyway. On
    * Gfx11+ the header has been removed so we can only use predication.
    *
    * For all stateless A32 messages, we also need a header
    */
   elk_fs_reg header;
   if (is_typed_access || is_stateless) {
      fs_builder ubld = bld.exec_all().group(8, 0);
      header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
      if (is_stateless) {
         assert(!is_surface_access);
         ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER, header);
      } else {
         ubld.MOV(header, elk_imm_d(0));
         if (is_surface_access)
            ubld.group(1, 0).MOV(component(header, 7), sample_mask);
      }
   }
   const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;

   elk_fs_reg payload, payload2;
   unsigned mlen;

   /* Allocate space for the payload. */
   const unsigned sz = header_sz + addr_sz + src_sz;
   payload = bld.vgrf(ELK_REGISTER_TYPE_UD, sz);
   elk_fs_reg *const components = new elk_fs_reg[sz];
   unsigned n = 0;

   /* Construct the payload. */
   if (header.file != BAD_FILE)
      components[n++] = header;

   for (unsigned i = 0; i < addr_sz; i++)
      components[n++] = offset(addr, bld, i);

   for (unsigned i = 0; i < src_sz; i++)
      components[n++] = offset(src, bld, i);

   bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
   mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;

   delete[] components;

   /* Predicate the instruction on the sample mask if no header is
    * provided.
    */
   if ((header.file == BAD_FILE || !is_surface_access) &&
       sample_mask.file != BAD_FILE && sample_mask.file != IMM)
      elk_emit_predicate_on_sample_mask(bld, inst);

   uint32_t sfid;
   switch (inst->opcode) {
   case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      /* Byte scattered opcodes go through the normal data cache */
      sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
      break;

   case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
   case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      sfid = devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
             devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
                                 ELK_DATAPORT_READ_TARGET_RENDER_CACHE;
      break;

   case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      /* Untyped Surface messages go through the data cache but the SFID value
       * changed on Haswell.
       */
      sfid = (devinfo->verx10 >= 75 ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GFX7_SFID_DATAPORT_DATA_CACHE);
      break;

   case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      /* Typed surface messages go through the render cache on IVB and the
       * data cache on HSW+.
       */
      sfid = (devinfo->verx10 >= 75 ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GFX6_SFID_DATAPORT_RENDER_CACHE);
      break;

   default:
      unreachable("Unsupported surface opcode");
   }

   uint32_t desc;
   switch (inst->opcode) {
   case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      desc = elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            false   /* write */);
      break;

   case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      desc = elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            true    /* write */);
      break;

   case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      desc = elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           false   /* write */);
      break;

   case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           true    /* write */);
      break;

   case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = elk_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            false /* write */);
      break;

   case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = elk_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            true /* write */);
      break;

   case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      assert(!elk_lsc_opcode_is_atomic_float((enum elk_lsc_opcode) arg.ud));
      desc = elk_dp_untyped_atomic_desc(devinfo, inst->exec_size,
                                        lsc_op_to_legacy_atomic(arg.ud),
                                        !inst->dst.is_null());
      break;

   case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      desc = elk_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          false   /* write */);
      break;

   case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      desc = elk_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          true    /* write */);
      break;

   case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      desc = elk_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
                                      lsc_op_to_legacy_atomic(arg.ud),
                                      !inst->dst.is_null());
      break;

   default:
      unreachable("Unknown surface logical instruction");
   }

   /* Update the original instruction. */
   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_sz;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = sfid;
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

   inst->resize_sources(2);

   /* Finally, the payload */
   inst->src[1] = payload;
}

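/* In a fragment shader, make sure an instruction with side effects only runs
 * for the intended channels: predicate on the vector mask when helper
 * invocations are explicitly required, or on the sample mask otherwise.
 */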
static void
emit_fragment_mask(const fs_builder &bld, elk_fs_inst *inst)
{
   assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
   const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;

   /* If we're a fragment shader, we have to predicate with the sample mask to
    * avoid helper invocations in instructions with side effects, unless they
    * are explicitly required.
    *
    * There are also special cases when we actually want to run on helpers
    * (ray queries).
    */
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
   if (enable_helpers)
      emit_predicate_on_vector_mask(bld, inst);
   else if (inst->has_side_effects())
      elk_emit_predicate_on_sample_mask(bld, inst);
}

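/* Lower a logical A64 (stateless, 64-bit address) access into a data-port
 * SEND: the payload is the address followed by the data components, and the
 * descriptor is chosen per opcode.
 */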
static void
lower_a64_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   const elk_fs_reg addr = inst->src[A64_LOGICAL_ADDRESS];
   const elk_fs_reg src = inst->src[A64_LOGICAL_SRC];
   const unsigned src_comps = inst->components_read(1);
   assert(inst->src[A64_LOGICAL_ARG].file == IMM);
   const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
   const bool has_side_effects = inst->has_side_effects();

   elk_fs_reg payload, payload2;
   unsigned mlen, header_size = 0;

   /* Add two because the address is 64-bit */
   const unsigned dwords = 2 + src_comps;
   mlen = dwords * (inst->exec_size / 8);

   elk_fs_reg sources[5];

   sources[0] = addr;

   for (unsigned i = 0; i < src_comps; i++)
      sources[1 + i] = offset(src, bld, i);

   payload = bld.vgrf(ELK_REGISTER_TYPE_UD, dwords);
   bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);

   uint32_t desc;
   switch (inst->opcode) {
   case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      desc = elk_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                false  /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      desc = elk_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                true   /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      desc = elk_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      desc = elk_dp_a64_oword_block_rw_desc(devinfo,
                                            false,   /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      desc = elk_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            true     /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      desc = elk_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               false  /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = elk_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               true   /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      assert(!elk_lsc_opcode_is_atomic_float((enum elk_lsc_opcode) arg));
      desc = elk_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size,
                                            type_sz(inst->dst.type) * 8,
                                            lsc_op_to_legacy_atomic(arg),
                                            !inst->dst.is_null());
      break;

   default:
      unreachable("Unknown A64 logical instruction");
   }

   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
      emit_fragment_mask(bld, inst);

   /* Update the original instruction. */
   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
   inst->desc = desc;
   inst->resize_sources(2);
   inst->src[0] = elk_imm_ud(0); /* desc */
   inst->src[1] = payload;
}

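/* Lower a logical varying-offset pull constant load.  On gfx7+ this becomes
 * a SEND through the sampler or data cache depending on the driver
 * configuration and alignment; on older generations it falls back to the
 * MRF-based VARYING_PULL_CONSTANT_LOAD_GFX4 opcode.
 */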
1527 static void
lower_varying_pull_constant_logical_send(const fs_builder & bld,elk_fs_inst * inst)1528 lower_varying_pull_constant_logical_send(const fs_builder &bld, elk_fs_inst *inst)
1529 {
1530 const intel_device_info *devinfo = bld.shader->devinfo;
1531 const elk_compiler *compiler = bld.shader->compiler;
1532
1533 if (devinfo->ver >= 7) {
1534 elk_fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
1535 elk_fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
1536 elk_fs_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
1537
1538 /* We are switching the instruction from an ALU-like instruction to a
1539 * send-from-grf instruction. Since sends can't handle strides or
1540 * source modifiers, we have to make a copy of the offset source.
1541 */
1542 elk_fs_reg ubo_offset = bld.vgrf(ELK_REGISTER_TYPE_UD);
1543 bld.MOV(ubo_offset, offset_B);
1544
1545 assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == ELK_IMMEDIATE_VALUE);
1546 unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud;
1547
1548 inst->opcode = ELK_SHADER_OPCODE_SEND;
1549 inst->mlen = inst->exec_size / 8;
1550 inst->resize_sources(3);
1551
1552 /* src[0] is filled by setup_surface_descriptors() */
1553 inst->src[1] = ubo_offset; /* payload */
1554
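/* Three ways to issue the load: through the sampler's LD message, through an
 * untyped surface read when the offset is at least dword-aligned, or through
 * four byte-scattered dword reads otherwise.
 */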
1555 if (compiler->indirect_ubos_use_sampler) {
1556 const unsigned simd_mode =
1557 inst->exec_size <= 8 ? ELK_SAMPLER_SIMD_MODE_SIMD8 :
1558 ELK_SAMPLER_SIMD_MODE_SIMD16;
1559 const uint32_t desc = elk_sampler_desc(devinfo, 0, 0,
1560 GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
1561 simd_mode, 0);
1562
1563 inst->sfid = ELK_SFID_SAMPLER;
1564 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1565 } else if (alignment >= 4) {
1566 const uint32_t desc =
1567 elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1568 4, /* num_channels */
1569 false /* write */);
1570
1571 inst->sfid = (devinfo->verx10 >= 75 ?
1572 HSW_SFID_DATAPORT_DATA_CACHE_1 :
1573 GFX7_SFID_DATAPORT_DATA_CACHE);
1574 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1575 } else {
1576 const uint32_t desc =
1577 elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1578 32, /* bit_size */
1579 false /* write */);
1580
1581 inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
1582 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1583
1584 /* The byte scattered messages can only read one dword at a time so
1585 * we have to duplicate the message 4 times to read the full vec4.
1586 * Hopefully, dead code elimination will clean up the mess if some of
1587 * them aren't needed.
1588 */
1589 assert(inst->size_written == 16 * inst->exec_size);
1590 inst->size_written /= 4;
1591 for (unsigned c = 1; c < 4; c++) {
1592 /* Emit a copy of the instruction because we're about to modify
1593 * it. Because this loop starts at 1, we will emit copies for the
1594 * first 3 and the final one will be the modified instruction.
1595 */
1596 bld.emit(*inst);
1597
1598 /* Offset the source */
1599 inst->src[1] = bld.vgrf(ELK_REGISTER_TYPE_UD);
1600 bld.ADD(inst->src[1], ubo_offset, elk_imm_ud(c * 4));
1601
1602 /* Offset the destination */
1603 inst->dst = offset(inst->dst, bld, 1);
1604 }
1605 }
1606 } else {
1607 elk_fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
1608 elk_fs_reg offset = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
1609 assert(inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE].file == BAD_FILE);
1610
1611 const elk_fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver),
1612 ELK_REGISTER_TYPE_UD);
1613
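/* The per-channel offsets land in the MRF right after the one-register
 * header; mlen below counts the header plus one offset register per SIMD8
 * group.
 */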
1614 bld.MOV(byte_offset(payload, REG_SIZE), offset);
1615
1616 inst->opcode = ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4;
1617 inst->base_mrf = payload.nr;
1618 inst->header_size = 1;
1619 inst->mlen = 1 + inst->exec_size / 8;
1620
1621 inst->resize_sources(1);
1622 inst->src[0] = surface;
1623 }
1624 }
1625
1626 static void
1627 lower_math_logical_send(const fs_builder &bld, elk_fs_inst *inst)
1628 {
1629 assert(bld.shader->devinfo->ver < 6);
1630
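/* Pre-Gfx6 extended math is a send to the shared math function with its
 * operands placed in MRFs starting at m2.
 */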
1631 inst->base_mrf = 2;
1632 inst->mlen = inst->sources * inst->exec_size / 8;
1633
1634 if (inst->sources > 1) {
1635 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1636 * "Message Payload":
1637 *
1638 * "Operand0[7]. For the INT DIV functions, this operand is the
1639 * denominator."
1640 * ...
1641 * "Operand1[7]. For the INT DIV functions, this operand is the
1642 * numerator."
1643 */
1644 const bool is_int_div = inst->opcode != ELK_SHADER_OPCODE_POW;
1645 const elk_fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
1646 const elk_fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
1647
1648 inst->resize_sources(1);
1649 inst->src[0] = src0;
1650
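/* Write the second operand directly into the following MRF; the instruction
 * is then emitted as if it were a single-source math message.
 */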
1651 assert(inst->exec_size == 8);
1652 bld.MOV(elk_fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
1653 }
1654 }
1655
1656 static void
1657 lower_interpolator_logical_send(const fs_builder &bld, elk_fs_inst *inst,
1658 const struct elk_wm_prog_key *wm_prog_key,
1659 const struct elk_wm_prog_data *wm_prog_data)
1660 {
1661 const intel_device_info *devinfo = bld.shader->devinfo;
1662
1663 /* We have to send something */
1664 elk_fs_reg payload = elk_vec8_grf(0, 0);
1665 unsigned mlen = 1;
1666
1667 unsigned mode;
1668 switch (inst->opcode) {
1669 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1670 assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
1671 mode = GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE;
1672 break;
1673
1674 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1675 assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
1676 mode = GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET;
1677 break;
1678
1679 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1680 payload = inst->src[INTERP_SRC_OFFSET];
1681 mlen = 2 * inst->exec_size / 8;
1682 mode = GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET;
1683 break;
1684
1685 default:
1686 unreachable("Invalid interpolator instruction");
1687 }
1688
1689 const bool dynamic_mode =
1690 inst->src[INTERP_SRC_DYNAMIC_MODE].file != BAD_FILE;
1691
1692 elk_fs_reg desc = inst->src[INTERP_SRC_MSG_DESC];
1693 uint32_t desc_imm =
1694 elk_pixel_interp_desc(devinfo,
1695 /* Leave the mode at 0 if persample_dispatch is
1696 * dynamic; it will be ORed in below.
1697 */
1698 dynamic_mode ? 0 : mode,
1699 inst->pi_noperspective,
1700 false /* coarse_pixel_rate */,
1701 inst->exec_size, inst->group);
1702
1703 /* If persample_dispatch is dynamic, select the interpolation mode
1704 * dynamically and OR into the descriptor to complete the static part
1705 * generated by elk_pixel_interp_desc().
1706 *
1707 * Why does this work? If you look at the SKL PRMs, Volume 7:
1708 * 3D-Media-GPGPU, Shared Functions Pixel Interpolater, you'll see that
1709 *
1710 * - "Per Message Offset” Message Descriptor
1711 * - “Sample Position Offset” Message Descriptor
1712 *
1713 * have different formats. Fortunately, a fragment shader dispatched at
1714 * pixel rate will have gl_SampleID = 0 & gl_NumSamples = 1. So the value
1715 * we pack in "Sample Position Offset" will be a 0 and will cover the X/Y
1716 * components of "Per Message Offset", which will give us the pixel offset 0x0.
1717 */
1718 if (dynamic_mode) {
1719 elk_fs_reg orig_desc = desc;
1720 const fs_builder &ubld = bld.exec_all().group(8, 0);
1721 desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1722
1723 /* The predicate should have been built in elk_fs_nir.cpp when emitting
1724 * NIR code. This guarantees that we do not have incorrect interactions
1725 * with the flag register holding the predication result.
1726 */
1727 if (orig_desc.file == IMM) {
1728 /* Not using SEL here because we would generate an instruction with 2
1729 * immediate sources, which is not supported by HW.
1730 */
1731 set_predicate_inv(ELK_PREDICATE_NORMAL, false,
1732 ubld.MOV(desc, elk_imm_ud(orig_desc.ud |
1733 GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
1734 set_predicate_inv(ELK_PREDICATE_NORMAL, true,
1735 ubld.MOV(desc, elk_imm_ud(orig_desc.ud |
1736 GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
1737 } else {
1738 set_predicate_inv(ELK_PREDICATE_NORMAL, false,
1739 ubld.OR(desc, orig_desc,
1740 elk_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
1741 set_predicate_inv(ELK_PREDICATE_NORMAL, true,
1742 ubld.OR(desc, orig_desc,
1743 elk_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
1744 }
1745 }
1746
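/* Convert to a SEND to the pixel interpolator shared function; src[0]
 * supplies the part of the descriptor that is not known at compile time.
 */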
1747 assert(bld.shader->devinfo->ver >= 7);
1748 inst->opcode = ELK_SHADER_OPCODE_SEND;
1749 inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR;
1750 inst->desc = desc_imm;
1751 inst->mlen = mlen;
1752 inst->send_has_side_effects = false;
1753 inst->send_is_volatile = false;
1754
1755 inst->resize_sources(2);
1756 inst->src[0] = component(desc, 0);
1757 inst->src[1] = payload;
1758 }
1759
1760 static void
1761 lower_get_buffer_size(const fs_builder &bld, elk_fs_inst *inst)
1762 {
1763 const intel_device_info *devinfo = bld.shader->devinfo;
1764 assert(devinfo->ver >= 7);
1765 /* Since we can only execute this instruction on uniform bti/surface
1766 * handles, elk_fs_nir.cpp should already have limited this to SIMD8.
1767 */
1768 assert(inst->exec_size == 8);
1769
1770 elk_fs_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
1771 elk_fs_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
1772 elk_fs_reg lod = inst->src[GET_BUFFER_SIZE_SRC_LOD];
1773
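/* Buffer sizes are queried through the sampler's resinfo message; the LOD
 * source becomes the message payload.
 */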
1774 inst->opcode = ELK_SHADER_OPCODE_SEND;
1775 inst->mlen = inst->exec_size / 8;
1776 inst->resize_sources(2);
1777
1778 /* src[0] is filled by setup_surface_descriptors() */
1779 inst->src[1] = lod;
1780
1781 const uint32_t return_format = devinfo->ver >= 8 ?
1782 GFX8_SAMPLER_RETURN_FORMAT_32BITS : ELK_SAMPLER_RETURN_FORMAT_SINT32;
1783
1784 const uint32_t desc = elk_sampler_desc(devinfo, 0, 0,
1785 GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
1786 ELK_SAMPLER_SIMD_MODE_SIMD8,
1787 return_format);
1788
1789 inst->dst = retype(inst->dst, ELK_REGISTER_TYPE_UW);
1790 inst->sfid = ELK_SFID_SAMPLER;
1791 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1792 }
1793
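/**
 * Lower the logical send-like opcodes into either ELK_SHADER_OPCODE_SEND
 * instructions or, on older platforms, MRF-based virtual sends that the
 * generator knows how to emit.
 */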
1794 bool
1795 elk_fs_visitor::lower_logical_sends()
1796 {
1797 bool progress = false;
1798
1799 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
1800 const fs_builder ibld(this, block, inst);
1801
1802 switch (inst->opcode) {
1803 case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
1804 assert(stage == MESA_SHADER_FRAGMENT);
1805 lower_fb_write_logical_send(ibld, inst,
1806 elk_wm_prog_data(prog_data),
1807 (const elk_wm_prog_key *)key,
1808 fs_payload());
1809 break;
1810
1811 case ELK_SHADER_OPCODE_TEX_LOGICAL:
1812 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TEX);
1813 break;
1814
1815 case ELK_SHADER_OPCODE_TXD_LOGICAL:
1816 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXD);
1817 break;
1818
1819 case ELK_SHADER_OPCODE_TXF_LOGICAL:
1820 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF);
1821 break;
1822
1823 case ELK_SHADER_OPCODE_TXL_LOGICAL:
1824 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXL);
1825 break;
1826
1827 case ELK_SHADER_OPCODE_TXS_LOGICAL:
1828 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXS);
1829 break;
1830
1831 case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
1832 lower_sampler_logical_send(ibld, inst,
1833 ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
1834 break;
1835
1836 case ELK_FS_OPCODE_TXB_LOGICAL:
1837 lower_sampler_logical_send(ibld, inst, ELK_FS_OPCODE_TXB);
1838 break;
1839
1840 case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
1841 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_CMS);
1842 break;
1843
1844 case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
1845 case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
1846 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_CMS_W);
1847 break;
1848
1849 case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
1850 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_UMS);
1851 break;
1852
1853 case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
1854 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_MCS);
1855 break;
1856
1857 case ELK_SHADER_OPCODE_LOD_LOGICAL:
1858 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_LOD);
1859 break;
1860
1861 case ELK_SHADER_OPCODE_TG4_LOGICAL:
1862 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TG4);
1863 break;
1864
1865 case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1866 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TG4_OFFSET);
1867 break;
1868
1869 case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
1870 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_SAMPLEINFO);
1871 break;
1872
1873 case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
1874 lower_get_buffer_size(ibld, inst);
1875 break;
1876
1877 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
1878 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
1879 case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
1880 case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
1881 case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
1882 case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
1883 case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
1884 case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
1885 case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
1886 case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
1887 lower_surface_logical_send(ibld, inst);
1888 break;
1889
1890 case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
1891 case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
1892 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
1893 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
1894 case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
1895 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
1896 case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
1897 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
1898 lower_a64_logical_send(ibld, inst);
1899 break;
1900
1901 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
1902 lower_varying_pull_constant_logical_send(ibld, inst);
1903 break;
1904
1905 case ELK_SHADER_OPCODE_RCP:
1906 case ELK_SHADER_OPCODE_RSQ:
1907 case ELK_SHADER_OPCODE_SQRT:
1908 case ELK_SHADER_OPCODE_EXP2:
1909 case ELK_SHADER_OPCODE_LOG2:
1910 case ELK_SHADER_OPCODE_SIN:
1911 case ELK_SHADER_OPCODE_COS:
1912 case ELK_SHADER_OPCODE_POW:
1913 case ELK_SHADER_OPCODE_INT_QUOTIENT:
1914 case ELK_SHADER_OPCODE_INT_REMAINDER:
1915 /* The math opcodes are overloaded for the send-like and
1916 * expression-like instructions, which seems kind of icky. Gfx6+ has
1917 * a native (but rather quirky) MATH instruction so we don't need to
1918 * do anything here. On Gfx4-5 we'll have to lower the Gfx6-like
1919 * logical instructions (which we can easily recognize because they
1920 * have mlen = 0) into send-like virtual instructions.
1921 */
1922 if (devinfo->ver < 6 && inst->mlen == 0) {
1923 lower_math_logical_send(ibld, inst);
1924 break;
1925
1926 } else {
1927 continue;
1928 }
1929
1930 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1931 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1932 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1933 lower_interpolator_logical_send(ibld, inst,
1934 (const elk_wm_prog_key *)key,
1935 elk_wm_prog_data(prog_data));
1936 break;
1937
1938 case ELK_SHADER_OPCODE_URB_READ_LOGICAL:
1939 lower_urb_read_logical_send(ibld, inst);
1940 break;
1941
1942 case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
1943 lower_urb_write_logical_send(ibld, inst);
1944 break;
1945
1946 default:
1947 continue;
1948 }
1949
1950 progress = true;
1951 }
1952
1953 if (progress)
1954 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
1955
1956 return progress;
1957 }
1958
1959 /**
1960 * Turns the generic expression-style uniform pull constant load instruction
1961 * into a hardware-specific series of instructions for loading a pull
1962 * constant.
1963 *
1964 * The expression style allows the CSE pass before this to optimize out
1965 * repeated loads from the same offset, and gives the pre-register-allocation
1966 * scheduling full flexibility, while the conversion to native instructions
1967 * allows the post-register-allocation scheduler the best information
1968 * possible.
1969 *
1970 * Note that execution masking for setting up pull constant loads is special:
1971 * the channels that need to be written are unrelated to the current execution
1972 * mask, since a later instruction will use one of the result channels as a
1973 * source operand for all 8 or 16 of its channels.
1974 */
1975 bool
1976 elk_fs_visitor::lower_uniform_pull_constant_loads()
1977 {
1978 bool progress = false;
1979
1980 foreach_block_and_inst (block, elk_fs_inst, inst, cfg) {
1981 if (inst->opcode != ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
1982 continue;
1983
1984 const elk_fs_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
1985 const elk_fs_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE];
1986 const elk_fs_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
1987 const elk_fs_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
1988 assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE);
1989 assert(offset_B.file == IMM);
1990 assert(size_B.file == IMM);
1991
1992 if (devinfo->ver >= 7) {
1993 const fs_builder ubld = fs_builder(this, block, inst).exec_all();
1994 elk_fs_reg header = fs_builder(this, 8).exec_all().vgrf(ELK_REGISTER_TYPE_UD);
1995
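/* Build a single-register header: a copy of g0 with DWord 2 replaced by the
 * global offset in owords (16-byte units).
 */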
1996 ubld.group(8, 0).MOV(header,
1997 retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
1998 ubld.group(1, 0).MOV(component(header, 2),
1999 elk_imm_ud(offset_B.ud / 16));
2000
2001 inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
2002 inst->opcode = ELK_SHADER_OPCODE_SEND;
2003 inst->header_size = 1;
2004 inst->mlen = 1;
2005
2006 uint32_t desc =
2007 elk_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
2008 size_B.ud / 4, false /* write */);
2009
2010 inst->resize_sources(2);
2011
2012 setup_surface_descriptors(ubld, inst, desc, surface, surface_handle);
2013
2014 inst->src[1] = header;
2015
2016 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2017 } else {
2018 assert(surface_handle.file == BAD_FILE);
2019 /* Before register allocation, we didn't tell the scheduler about the
2020 * MRF we use. We know it's safe to use this MRF because nothing
2021 * else does except for register spill/unspill, which generates and
2022 * uses its MRF within a single IR instruction.
2023 */
2024 inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
2025 inst->mlen = 1;
2026 }
2027
2028 progress = true;
2029 }
2030
2031 return progress;
2032 }
2033