/*
 * Copyright © 2015-2018 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "ir3_context.h"
#include "ir3_compiler.h"
#include "ir3_image.h"
#include "ir3_nir.h"
#include "ir3_shader.h"
#include "nir.h"
#include "nir_intrinsics_indices.h"
#include "util/u_math.h"

struct ir3_context *
ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader,
                 struct ir3_shader_variant *so)
{
   MESA_TRACE_FUNC();

   struct ir3_context *ctx = rzalloc(NULL, struct ir3_context);

   if (compiler->gen == 4) {
      if (so->type == MESA_SHADER_VERTEX) {
         ctx->astc_srgb = so->key.vastc_srgb;
         memcpy(ctx->sampler_swizzles, so->key.vsampler_swizzles, sizeof(ctx->sampler_swizzles));
      } else if (so->type == MESA_SHADER_FRAGMENT ||
            so->type == MESA_SHADER_COMPUTE) {
         ctx->astc_srgb = so->key.fastc_srgb;
         memcpy(ctx->sampler_swizzles, so->key.fsampler_swizzles, sizeof(ctx->sampler_swizzles));
      }
   } else if (compiler->gen == 3) {
      if (so->type == MESA_SHADER_VERTEX) {
         ctx->samples = so->key.vsamples;
      } else if (so->type == MESA_SHADER_FRAGMENT) {
         ctx->samples = so->key.fsamples;
      }
   }

   if (compiler->gen >= 6) {
      ctx->funcs = &ir3_a6xx_funcs;
   } else if (compiler->gen >= 4) {
      ctx->funcs = &ir3_a4xx_funcs;
   }

   ctx->compiler = compiler;
   ctx->so = so;
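   /* Lookup tables used while emitting the shader: def_ht maps a nir_def to
    * the array of ir3 values built for it (see ir3_get_dst_ssa()), block_ht
    * and continue_block_ht map NIR blocks to their ir3 blocks, and the two
    * conversion tables cache already-converted condition values (see
    * ir3_get_predicate()) so they aren't re-emitted for every use.
    */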
   ctx->def_ht =
      _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal);
   ctx->block_ht =
      _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal);
   ctx->continue_block_ht =
      _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal);
   ctx->sel_cond_conversions =
      _mesa_hash_table_create(ctx, _mesa_hash_pointer, _mesa_key_pointer_equal);
   ctx->predicate_conversions = _mesa_pointer_hash_table_create(ctx);

   /* TODO: maybe generate some sort of bitmask of what key
    * lowers vs what shader has (ie. no need to lower
    * texture clamp lowering if no texture sample instrs)..
    * although should be done further up the stack to avoid
    * creating duplicate variants..
    */

   ctx->s = nir_shader_clone(ctx, shader->nir);
   ir3_nir_lower_variant(so, ctx->s);

   bool progress = false;
   bool needs_late_alg = false;

   /* We want to lower nir_op_imul as late as possible, to catch also
    * those generated by earlier passes (e.g.,
    * nir_lower_locals_to_regs).  However, we want a final swing of a
    * few passes to have a chance at optimizing the result.
    */
   NIR_PASS(progress, ctx->s, ir3_nir_lower_imul);
   while (progress) {
      progress = false;
      NIR_PASS(progress, ctx->s, nir_opt_algebraic);
      NIR_PASS(progress, ctx->s, nir_opt_copy_prop_vars);
      NIR_PASS(progress, ctx->s, nir_opt_dead_write_vars);
      NIR_PASS(progress, ctx->s, nir_opt_dce);
      NIR_PASS(progress, ctx->s, nir_opt_constant_folding);
      needs_late_alg = true;
   }

   /* nir_opt_algebraic() above would have unfused our ffmas; re-fuse them. */
   if (needs_late_alg) {
      NIR_PASS(progress, ctx->s, nir_opt_algebraic_late);
      NIR_PASS(progress, ctx->s, nir_opt_dce);
   }

   /* This must run after the last nir_opt_algebraic or it gets undone. */
   if (compiler->has_branch_and_or)
      NIR_PASS_V(ctx->s, ir3_nir_opt_branch_and_or_not);

   /* Enable the texture pre-fetch feature only from a4xx onwards, and only
    * on generations where it has been tested:
    */
   if ((so->type == MESA_SHADER_FRAGMENT) && compiler->has_fs_tex_prefetch)
      NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch);

   bool vectorized = false;
   NIR_PASS(vectorized, ctx->s, nir_opt_vectorize, ir3_nir_vectorize_filter,
            NULL);

   if (vectorized) {
      NIR_PASS_V(ctx->s, nir_opt_undef);
      NIR_PASS_V(ctx->s, nir_copy_prop);
      NIR_PASS_V(ctx->s, nir_opt_dce);
   }

   NIR_PASS(progress, ctx->s, nir_convert_to_lcssa, true, true);

   /* This has to go at the absolute end to make sure that all SSA defs are
    * correctly marked.
    */
   NIR_PASS_V(ctx->s, nir_divergence_analysis);

   /* Super crude heuristic to limit # of tex prefetch in small
    * shaders.  This completely ignores loops.. but that's really
    * not the worst of its problems.  (A frag shader that has
    * loops is probably going to be big enough to not trigger a
    * lower threshold.)
    *
    *   1) probably want to do this in terms of ir3 instructions
    *   2) probably really want to decide this after scheduling
    *      (or at least pre-RA sched) so we have a rough idea about
    *      nops, and don't count things that get cp'd away
    *   3) blob seems to use higher thresholds with a mix of more
    *      SFU instructions.  Which partly makes sense, more SFU
    *      instructions probably means you want to get the real
    *      shader started sooner, but that considers where in the
    *      shader the SFU instructions are, which blob doesn't seem
    *      to do.
    *
    * This uses more conservative thresholds assuming a more ALU-
    * than SFU-heavy instruction mix.
    */
   if (so->type == MESA_SHADER_FRAGMENT) {
      nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);

      unsigned instruction_count = 0;
      nir_foreach_block (block, fxn) {
         nir_foreach_instr (instr, block) {
            /* Vectorized ALU instructions expand to one scalar instruction per
             * component.
             */
            if (instr->type == nir_instr_type_alu)
               instruction_count += nir_instr_as_alu(instr)->def.num_components;
            else
               instruction_count++;
         }
      }

      if (instruction_count < 50) {
         ctx->prefetch_limit = 2;
      } else if (instruction_count < 70) {
         ctx->prefetch_limit = 3;
      } else {
         ctx->prefetch_limit = IR3_MAX_SAMPLER_PREFETCH;
      }
   }

   if (shader_debug_enabled(so->type, ctx->s->info.internal)) {
      mesa_logi("NIR (final form) for %s shader %s:", ir3_shader_stage(so),
                so->name);
      nir_log_shaderi(ctx->s);
   }

   ir3_ibo_mapping_init(&so->image_mapping, ctx->s->info.num_textures);

   /* Implement the "dual_color_blend_by_location" workaround for Unigine Heaven
    * and Unigine Valley, by remapping FRAG_RESULT_DATA1 to be the 2nd color
    * channel of FRAG_RESULT_DATA0.
    */
   if ((so->type == MESA_SHADER_FRAGMENT) && so->key.force_dual_color_blend) {
      nir_variable *var = nir_find_variable_with_location(
         ctx->s, nir_var_shader_out, FRAG_RESULT_DATA1);
      if (var) {
         var->data.location = FRAG_RESULT_DATA0;
         var->data.index = 1;
         nir_shader_gather_info(ctx->s, nir_shader_get_entrypoint(ctx->s));
         so->dual_src_blend = true;
      }
   }

   return ctx;
}

void
ir3_context_free(struct ir3_context *ctx)
{
   ralloc_free(ctx);
}

/*
 * Misc helpers
 */

/* allocate an n-element value array (to be populated by caller) and
 * insert in def_ht
 */
struct ir3_instruction **
ir3_get_dst_ssa(struct ir3_context *ctx, nir_def *dst, unsigned n)
{
   struct ir3_instruction **value =
      ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
   _mesa_hash_table_insert(ctx->def_ht, dst, value);
   return value;
}

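/* Like ir3_get_dst_ssa(), but also remembers the array as last_dst so that
 * ir3_put_def() can post-process it (e.g. fix up 16-bit types) once the
 * instruction(s) writing the def have been emitted.
 */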
struct ir3_instruction **
ir3_get_def(struct ir3_context *ctx, nir_def *def, unsigned n)
{
   struct ir3_instruction **value = ir3_get_dst_ssa(ctx, def, n);

   compile_assert(ctx, !ctx->last_dst);
   ctx->last_dst = value;
   ctx->last_dst_n = n;

   return value;
}

struct ir3_instruction *const *
ir3_get_src_maybe_shared(struct ir3_context *ctx, nir_src *src)
{
   struct hash_entry *entry;
   entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
   compile_assert(ctx, entry);
   return entry->data;
}

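/* Insert a mov to move a value into or out of the shared register file when
 * its current IR3_REG_SHARED flag doesn't match what the caller asked for.
 */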
static struct ir3_instruction *
get_shared(struct ir3_block *block, struct ir3_instruction *src, bool shared)
{
   if (!!(src->dsts[0]->flags & IR3_REG_SHARED) != shared) {
      struct ir3_instruction *mov =
         ir3_MOV(block, src, (src->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32);
      mov->dsts[0]->flags &= ~IR3_REG_SHARED;
      mov->dsts[0]->flags |= COND(shared, IR3_REG_SHARED);
      return mov;
   }

   return src;
}

struct ir3_instruction *const *
ir3_get_src_shared(struct ir3_context *ctx, nir_src *src, bool shared)
{
   unsigned num_components = nir_src_num_components(*src);
   struct ir3_instruction *const *value = ir3_get_src_maybe_shared(ctx, src);
   bool mismatch = false;
   for (unsigned i = 0; i < nir_src_num_components(*src); i++) {
      if (!!(value[i]->dsts[0]->flags & IR3_REG_SHARED) != shared) {
         mismatch = true;
         break;
      }
   }

   if (!mismatch)
      return value;

   struct ir3_instruction **new_value =
      ralloc_array(ctx, struct ir3_instruction *, num_components);
   for (unsigned i = 0; i < num_components; i++)
      new_value[i] = get_shared(ctx->block, value[i], shared);

   return new_value;
}

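/* Finish off a def started with ir3_get_def(): for 16-bit (or smaller) defs,
 * retype the destination(s) (and the sources of any split meta instructions
 * feeding them) to half precision, then clear last_dst.
 */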
void
ir3_put_def(struct ir3_context *ctx, nir_def *def)
{
   unsigned bit_size = ir3_bitsize(ctx, def->bit_size);

   if (bit_size <= 16) {
      for (unsigned i = 0; i < ctx->last_dst_n; i++) {
         struct ir3_instruction *dst = ctx->last_dst[i];
         ir3_set_dst_type(dst, true);
         ir3_fixup_src_type(dst);
         if (dst->opc == OPC_META_SPLIT) {
            ir3_set_dst_type(ssa(dst->srcs[0]), true);
            ir3_fixup_src_type(ssa(dst->srcs[0]));
            dst->srcs[0]->flags |= IR3_REG_HALF;
         }
      }
   }

   ctx->last_dst = NULL;
   ctx->last_dst_n = 0;
}

static unsigned
dest_flags(struct ir3_instruction *instr)
{
   return instr->dsts[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED);
}

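/* Gather arrsz scalar values into one contiguous vector value via a collect
 * meta instruction (roughly the inverse of ir3_split_dest() below).
 */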
struct ir3_instruction *
ir3_create_collect(struct ir3_block *block, struct ir3_instruction *const *arr,
                   unsigned arrsz)
{
   struct ir3_instruction *collect;

   if (arrsz == 0)
      return NULL;

   if (arrsz == 1)
      return arr[0];

   unsigned flags = dest_flags(arr[0]);

   collect = ir3_instr_create(block, OPC_META_COLLECT, 1, arrsz);
   __ssa_dst(collect)->flags |= flags;
   for (unsigned i = 0; i < arrsz; i++) {
      struct ir3_instruction *elem = arr[i];

      /* Since arrays are pre-colored in RA, we can't assume that
       * things will end up in the right place.  (Ie. if a collect
       * joins elements from two different arrays.)  So insert an
       * extra mov.
       *
       * We could possibly skip this if all the collected elements
       * are contiguous elements in a single array.. not sure how
       * likely that is to happen.
       *
       * Fixes a problem with glamor shaders, that in effect do
       * something like:
       *
       *   if (foo)
       *     texcoord = ..
       *   else
       *     texcoord = ..
       *   color = texture2D(tex, texcoord);
       *
       * In this case, texcoord will end up as nir registers (which
       * translate to ir3 arrays of length 1), and we can't assume
       * the two (or more) arrays will get allocated in consecutive
       * scalar registers.
       */
      if (elem->dsts[0]->flags & IR3_REG_ARRAY) {
         type_t type = (flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
         elem = ir3_MOV(block, elem, type);
      }

      assert(dest_flags(elem) == flags);
      __ssa_src(collect, elem, flags);
   }

   collect->dsts[0]->wrmask = MASK(arrsz);

   return collect;
}

/* helper for instructions that produce multiple consecutive scalar
 * outputs which need to have a split meta instruction inserted
 */
void
ir3_split_dest(struct ir3_block *block, struct ir3_instruction **dst,
               struct ir3_instruction *src, unsigned base, unsigned n)
{
   if ((n == 1) && (src->dsts[0]->wrmask == 0x1) &&
       /* setup_input needs ir3_split_dest to generate a SPLIT instruction */
       src->opc != OPC_META_INPUT) {
      dst[0] = src;
      return;
   }

   if (src->opc == OPC_META_COLLECT) {
      assert((base + n) <= src->srcs_count);

      for (int i = 0; i < n; i++) {
         dst[i] = ssa(src->srcs[i + base]);
      }

      return;
   }

   unsigned flags = dest_flags(src);

   for (int i = 0, j = 0; i < n; i++) {
      struct ir3_instruction *split =
         ir3_instr_create(block, OPC_META_SPLIT, 1, 1);
      __ssa_dst(split)->flags |= flags;
      __ssa_src(split, src, flags);
      split->split.off = i + base;

      if (src->dsts[0]->wrmask & (1 << (i + base)))
         dst[j++] = split;
   }
}

NORETURN void
ir3_context_error(struct ir3_context *ctx, const char *format, ...)
{
   struct hash_table *errors = NULL;
   va_list ap;
   va_start(ap, format);
   if (ctx->cur_instr) {
      errors = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                       _mesa_key_pointer_equal);
      char *msg = ralloc_vasprintf(errors, format, ap);
      _mesa_hash_table_insert(errors, ctx->cur_instr, msg);
   } else {
      mesa_loge_v(format, ap);
   }
   va_end(ap);
   nir_log_shader_annotated(ctx->s, errors);
   ralloc_free(errors);
   ctx->error = true;
   unreachable("");
}

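/* Build the mova sequence that loads an index into a0.x: convert the index
 * to 16-bit, scale it by the per-element size (align), and mov the result
 * into a0.x.
 */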
static struct ir3_instruction *
create_addr0(struct ir3_block *block, struct ir3_instruction *src, int align)
{
   struct ir3_instruction *instr, *immed;

   instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
   bool shared = (src->dsts[0]->flags & IR3_REG_SHARED);

   switch (align) {
   case 1:
      /* src *= 1: */
      break;
   case 2:
      /* src *= 2 => src <<= 1: */
      immed = create_immed_typed_shared(block, 1, TYPE_S16, shared);
      instr = ir3_SHL_B(block, instr, 0, immed, 0);
      break;
   case 3:
      /* src *= 3: */
      immed = create_immed_typed_shared(block, 3, TYPE_S16, shared);
      instr = ir3_MULL_U(block, instr, 0, immed, 0);
      break;
   case 4:
      /* src *= 4 => src <<= 2: */
      immed = create_immed_typed_shared(block, 2, TYPE_S16, shared);
      instr = ir3_SHL_B(block, instr, 0, immed, 0);
      break;
   default:
      unreachable("bad align");
      return NULL;
   }

   instr->dsts[0]->flags |= IR3_REG_HALF;

   instr = ir3_MOV(block, instr, TYPE_S16);
   instr->dsts[0]->num = regid(REG_A0, 0);
   instr->dsts[0]->flags &= ~IR3_REG_SHARED;

   return instr;
}

static struct ir3_instruction *
create_addr1(struct ir3_block *block, unsigned const_val)
{
   struct ir3_instruction *immed =
      create_immed_typed(block, const_val, TYPE_U16);
   struct ir3_instruction *instr = ir3_MOV(block, immed, TYPE_U16);
   instr->dsts[0]->num = regid(REG_A0, 1);
   return instr;
}

/* caches addr values to avoid generating multiple cov/shl/mova
 * sequences for each use of a given NIR level src as address
 */
struct ir3_instruction *
ir3_get_addr0(struct ir3_context *ctx, struct ir3_instruction *src, int align)
{
   struct ir3_instruction *addr;
   unsigned idx = align - 1;

   compile_assert(ctx, idx < ARRAY_SIZE(ctx->addr0_ht));

   if (!ctx->addr0_ht[idx]) {
      ctx->addr0_ht[idx] = _mesa_hash_table_create(ctx, _mesa_hash_pointer,
                                                   _mesa_key_pointer_equal);
   } else {
      struct hash_entry *entry;
      entry = _mesa_hash_table_search(ctx->addr0_ht[idx], src);
      if (entry)
         return entry->data;
   }

   addr = create_addr0(ctx->block, src, align);
   _mesa_hash_table_insert(ctx->addr0_ht[idx], src, addr);

   return addr;
}

/* Similar to ir3_get_addr0, but for a1.x. */
struct ir3_instruction *
ir3_get_addr1(struct ir3_context *ctx, unsigned const_val)
{
   struct ir3_instruction *addr;

   if (!ctx->addr1_ht) {
      ctx->addr1_ht = _mesa_hash_table_u64_create(ctx);
   } else {
      addr = _mesa_hash_table_u64_search(ctx->addr1_ht, const_val);
      if (addr)
         return addr;
   }

   addr = create_addr1(ctx->block, const_val);
   _mesa_hash_table_u64_insert(ctx->addr1_ht, const_val, addr);

   return addr;
}

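/* Get (or reuse a cached copy of) src converted into a predicate register
 * value, suitable for use as a branch condition.
 */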
struct ir3_instruction *
ir3_get_predicate(struct ir3_context *ctx, struct ir3_instruction *src)
{
   src = ir3_get_cond_for_nonzero_compare(src);

   struct hash_entry *src_entry =
      _mesa_hash_table_search(ctx->predicate_conversions, src);
   if (src_entry)
      return src_entry->data;

   struct ir3_block *b = src->block;
   struct ir3_instruction *cond;

   /* NOTE: we use cmps.s.ne x, 0 to move x into a predicate register */
   struct ir3_instruction *zero =
         create_immed_typed_shared(b, 0, is_half(src) ? TYPE_U16 : TYPE_U32,
                                   src->dsts[0]->flags & IR3_REG_SHARED);
   cond = ir3_CMPS_S(b, src, 0, zero, 0);
   cond->cat2.condition = IR3_COND_NE;

   /* condition always goes in predicate register: */
   cond->dsts[0]->flags |= IR3_REG_PREDICATE;
   cond->dsts[0]->flags &= ~IR3_REG_SHARED;

   /* phi's should stay first in a block */
   if (src->opc == OPC_META_PHI)
      ir3_instr_move_after(zero, ir3_block_get_last_phi(src->block));
   else
      ir3_instr_move_after(zero, src);

   ir3_instr_move_after(cond, zero);

   _mesa_hash_table_insert(ctx->predicate_conversions, src, cond);
   return cond;
}

/*
 * Array helpers
 */

void
ir3_declare_array(struct ir3_context *ctx, nir_intrinsic_instr *decl)
{
   struct ir3_array *arr = rzalloc(ctx, struct ir3_array);
   arr->id = ++ctx->num_arrays;
   /* NOTE: sometimes we get non array regs, for example for arrays of
    * length 1.  See fs-const-array-of-struct-of-array.shader_test.  So
    * treat a non-array as if it was an array of length 1.
    *
    * It would be nice if there was a nir pass to convert arrays of
    * length 1 to ssa.
    */
   arr->length = nir_intrinsic_num_components(decl) *
                 MAX2(1, nir_intrinsic_num_array_elems(decl));

   compile_assert(ctx, arr->length > 0);
   arr->r = &decl->def;
   arr->half = ir3_bitsize(ctx, nir_intrinsic_bit_size(decl)) <= 16;
   list_addtail(&arr->node, &ctx->ir->array_list);
}

struct ir3_array *
ir3_get_array(struct ir3_context *ctx, nir_def *reg)
{
   foreach_array (arr, &ctx->ir->array_list) {
      if (arr->r == reg)
         return arr;
   }
   ir3_context_error(ctx, "bogus reg: r%d\n", reg->index);
   return NULL;
}

/* relative (indirect) if address!=NULL */
struct ir3_instruction *
ir3_create_array_load(struct ir3_context *ctx, struct ir3_array *arr, int n,
                      struct ir3_instruction *address)
{
   struct ir3_block *block = ctx->block;
   struct ir3_instruction *mov;
   struct ir3_register *src;
   unsigned flags = 0;

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   if (arr->half) {
      mov->cat1.src_type = TYPE_U16;
      mov->cat1.dst_type = TYPE_U16;
      flags |= IR3_REG_HALF;
   } else {
      mov->cat1.src_type = TYPE_U32;
      mov->cat1.dst_type = TYPE_U32;
   }

   mov->barrier_class = IR3_BARRIER_ARRAY_R;
   mov->barrier_conflict = IR3_BARRIER_ARRAY_W;
   __ssa_dst(mov)->flags |= flags;
   src = ir3_src_create(mov, 0,
                        IR3_REG_ARRAY | COND(address, IR3_REG_RELATIV) | flags);
   src->def = (arr->last_write && arr->last_write->instr->block == block)
                 ? arr->last_write
                 : NULL;
   src->size = arr->length;
   src->array.id = arr->id;
   src->array.offset = n;
   src->array.base = INVALID_REG;

   if (address)
      ir3_instr_set_address(mov, address);

   return mov;
}

/* relative (indirect) if address!=NULL */
void
ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
                       struct ir3_instruction *src,
                       struct ir3_instruction *address)
{
   struct ir3_block *block = ctx->block;
   struct ir3_instruction *mov;
   struct ir3_register *dst;
   unsigned flags = 0;

   mov = ir3_instr_create(block, OPC_MOV, 1, 1);
   if (arr->half) {
      mov->cat1.src_type = TYPE_U16;
      mov->cat1.dst_type = TYPE_U16;
      flags |= IR3_REG_HALF;
   } else {
      mov->cat1.src_type = TYPE_U32;
      mov->cat1.dst_type = TYPE_U32;
   }
   mov->barrier_class = IR3_BARRIER_ARRAY_W;
   mov->barrier_conflict = IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W;
   dst = ir3_dst_create(
      mov, INVALID_REG,
      IR3_REG_SSA | IR3_REG_ARRAY | flags | COND(address, IR3_REG_RELATIV));
   dst->instr = mov;
   dst->size = arr->length;
   dst->array.id = arr->id;
   dst->array.offset = n;
   dst->array.base = INVALID_REG;
   ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | flags |
                  (src->dsts[0]->flags & IR3_REG_SHARED))->def = src->dsts[0];

   if (arr->last_write && arr->last_write->instr->block == block)
      ir3_reg_set_last_array(mov, dst, arr->last_write);

   if (address)
      ir3_instr_set_address(mov, address);

   arr->last_write = dst;

   /* the array store may only matter to something in an earlier
    * block (ie. loops), but since arrays are not in SSA, depth
    * pass won't know this.. so keep all array stores:
    */
   array_insert(block, block->keeps, mov);
}

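/* Split an intrinsic's base and offset source into a register offset plus an
 * immediate offset that fits in imm_offset_bits.  For example (illustrative
 * values), with imm_offset_bits == 8, base == 4 and a constant offset of 300,
 * full_offset is 304, so *offset becomes an immed of 256 and *imm_offset
 * becomes 48.
 */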
void
ir3_lower_imm_offset(struct ir3_context *ctx, nir_intrinsic_instr *intr,
                     nir_src *offset_src, unsigned imm_offset_bits,
                     struct ir3_instruction **offset, unsigned *imm_offset)
{
   nir_const_value *nir_const_offset = nir_src_as_const_value(*offset_src);
   int base = nir_intrinsic_base(intr);
   unsigned imm_offset_bound = (1 << imm_offset_bits);
   assert(base >= 0 && base < imm_offset_bound);

   if (nir_const_offset) {
      /* If both the offset and the base (immed offset) are constants, lower the
       * offset to a multiple of the bound and the immed offset to the
       * remainder. This ensures that the offset register can often be reused
       * among multiple contiguous accesses.
       */
      uint32_t full_offset = base + nir_const_offset->u32;
      *offset =
         create_immed(ctx->block, ROUND_DOWN_TO(full_offset, imm_offset_bound));
      *imm_offset = full_offset % imm_offset_bound;
   } else {
      *offset = ir3_get_src(ctx, offset_src)[0];
      *imm_offset = base;
   }
}
701