/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "elk_nir.h"
#include "compiler/nir/nir.h"
#include "util/u_dynarray.h"

/**
 * \file elk_nir_analyze_ubo_ranges.c
 *
 * This pass decides which portions of UBOs to upload as push constants,
 * so shaders can access them as part of the thread payload, rather than
 * having to issue expensive memory reads to pull the data.
 *
 * The 3DSTATE_CONSTANT_* mechanism can push data from up to 4 different
 * buffers, in GRF (256-bit/32-byte) units.
 *
 * To do this, we examine NIR load_ubo intrinsics, recording the number of
 * loads at each offset.  We track offsets at a 32-byte granularity, so even
 * fields with a bit of padding between them tend to fall into contiguous
 * ranges.  We build a list of these ranges, tracking their "cost" (number
 * of registers required) and "benefit" (number of pull loads eliminated
 * by pushing the range).  We then sort the list to obtain the four best
 * ranges (most benefit for the least cost).
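 *
 * For illustration: 32-bit vec4 loads at byte offsets 0, 16, and 68 of a
 * single block would set bits 0 and 2 of that block's 64-bit chunk mask,
 * yielding two candidate ranges of one register each.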
 */

struct ubo_range_entry
{
   struct elk_ubo_range range;
   int benefit;
};

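/**
 * Score a range: pull loads eliminated (benefit) weighed against the
 * registers it occupies (length).  For example, a 2-register range that
 * eliminates 3 pull loads scores 2 * 3 - 2 = 4.
 */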
static int
score(const struct ubo_range_entry *entry)
{
   return 2 * entry->benefit - entry->range.length;
}

/**
 * Compares score for two UBO range entries.
 *
 * For a descending qsort().
 */
static int
cmp_ubo_range_entry(const void *va, const void *vb)
{
   const struct ubo_range_entry *a = va;
   const struct ubo_range_entry *b = vb;

   /* Rank based on scores, descending order */
   int delta = score(b) - score(a);

   /* Then use the UBO block index as a tie-breaker, descending order */
   if (delta == 0)
      delta = b->range.block - a->range.block;

   /* Finally use the start offset as a second tie-breaker, ascending order */
   if (delta == 0)
      delta = a->range.start - b->range.start;

   return delta;
}

struct ubo_block_info
{
   /* Each bit in the offsets bitfield represents a 32-byte section of data.
    * If it's set to one, there is interesting UBO data at that offset.  If
    * not, there's a "hole" - padding between data - or just nothing at all.
    */
   uint64_t offsets;

   /* Number of load_ubo intrinsics whose constant offset starts in each
    * 32-byte section; used to compute a range's benefit.
    */
   uint8_t uses[64];
};

struct ubo_analysis_state
{
   struct hash_table *blocks;
   bool uses_regular_uniforms;
};

static struct ubo_block_info *
get_block_info(struct ubo_analysis_state *state, int block)
{
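   /* Bias the block index by one when forming the key: _mesa_hash_table
    * cannot store a NULL key, and block 0 would otherwise produce one.
    */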
   uint32_t hash = block + 1;
   void *key = (void *) (uintptr_t) hash;

   struct hash_entry *entry =
      _mesa_hash_table_search_pre_hashed(state->blocks, hash, key);

   if (entry)
      return (struct ubo_block_info *) entry->data;

   struct ubo_block_info *info =
      rzalloc(state->blocks, struct ubo_block_info);
   _mesa_hash_table_insert_pre_hashed(state->blocks, hash, key, info);

   return info;
}

static void
analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block)
{
   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_image_deref_load:
      case nir_intrinsic_image_deref_store:
      case nir_intrinsic_image_deref_atomic:
      case nir_intrinsic_image_deref_atomic_swap:
      case nir_intrinsic_image_deref_size:
         state->uses_regular_uniforms = true;
         continue;

      case nir_intrinsic_load_ubo:
         break; /* Fall through to the analysis below */

      default:
         continue; /* Not a uniform or UBO intrinsic */
      }

      if (elk_nir_ubo_surface_index_is_pushable(intrin->src[0]) &&
          nir_src_is_const(intrin->src[1])) {
         const int block = elk_nir_ubo_surface_index_get_push_block(intrin->src[0]);
         const unsigned byte_offset = nir_src_as_uint(intrin->src[1]);
         const int offset = byte_offset / 32;

         /* Avoid shifting by larger than the width of our bitfield, as this
          * is undefined in C.  Even if we require multiple bits to represent
          * the entire value, it's OK to record a partial value - the backend
          * is capable of falling back to pull loads for later components of
          * vectors, as it has to shrink ranges for other reasons anyway.
          */
         if (offset >= 64)
            continue;

         /* The value might span multiple 32-byte chunks. */
         const int bytes = nir_intrinsic_dest_components(intrin) *
                           (intrin->def.bit_size / 8);
         const int start = ROUND_DOWN_TO(byte_offset, 32);
         const int end = ALIGN(byte_offset + bytes, 32);
         const int chunks = (end - start) / 32;
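         /* For example, a 16-byte load at byte_offset 24 covers bytes
          * [24, 40): start = 0, end = 64, and chunks = 2, so both chunk 0
          * and chunk 1 get marked as interesting below.
          */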

         /* TODO: should we count uses in loops as higher benefit? */

         struct ubo_block_info *info = get_block_info(state, block);
         info->offsets |= ((1ull << chunks) - 1) << offset;
         info->uses[offset]++;
      }
   }
}

static void
print_ubo_entry(FILE *file,
                const struct ubo_range_entry *entry,
                struct ubo_analysis_state *state)
{
   struct ubo_block_info *info = get_block_info(state, entry->range.block);

   fprintf(file,
           "block %2d, start %2d, length %2d, bits = %"PRIx64", "
           "benefit %2d, cost %2d, score = %2d\n",
           entry->range.block, entry->range.start, entry->range.length,
           info->offsets, entry->benefit, entry->range.length, score(entry));
}

void
elk_nir_analyze_ubo_ranges(const struct elk_compiler *compiler,
                           nir_shader *nir,
                           struct elk_ubo_range out_ranges[4])
{
   void *mem_ctx = ralloc_context(NULL);

   struct ubo_analysis_state state = {
      .uses_regular_uniforms = false,
      .blocks =
         _mesa_hash_table_create(mem_ctx, NULL, _mesa_key_pointer_equal),
   };

   /* Compute shaders use push constants to get the subgroup ID, so it's
    * best to just assume some system values are pushed.
    */
   if (nir->info.stage == MESA_SHADER_COMPUTE)
      state.uses_regular_uniforms = true;

   /* Walk the IR, recording how many times each UBO block/offset is used. */
   nir_foreach_function_impl(impl, nir) {
      nir_foreach_block(block, impl) {
         analyze_ubos_block(&state, block);
      }
   }

   /* Find ranges: a block, starting 32-byte offset, and length. */
   struct util_dynarray ranges;
   util_dynarray_init(&ranges, mem_ctx);

   hash_table_foreach(state.blocks, entry) {
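      /* get_block_info() biased the key by one; undo that to recover the
       * UBO block index.
       */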
      const int b = entry->hash - 1;
      const struct ubo_block_info *info = entry->data;
      uint64_t offsets = info->offsets;

      /* Walk through the offsets bitfield, finding contiguous regions of
       * set bits:
       *
       *   0000000001111111111111000000000000111111111111110000000011111100
       *            ^^^^^^^^^^^^^            ^^^^^^^^^^^^^^        ^^^^^^
       *
       * Each of these will become a UBO range.
       */
      while (offsets != 0) {
         /* Find the first 1 in the offsets bitfield.  This represents the
          * start of a range of interesting UBO data.  Make it zero-indexed.
          */
         int first_bit = ffsll(offsets) - 1;

         /* Find the first 0 bit in offsets beyond first_bit.  To find the
          * first zero bit, we find the first 1 bit in the complement.  In
          * order to ignore bits before first_bit, we mask off those bits.
          */
         int first_hole = ffsll(~offsets & ~((1ull << first_bit) - 1)) - 1;

         if (first_hole == -1) {
            /* If we didn't find a hole, then set it to the end of the
             * bitfield.  There are no more ranges to process.
             */
            first_hole = 64;
            offsets = 0;
         } else {
            /* We've processed all bits before first_hole.  Mask them off. */
            offsets &= ~((1ull << first_hole) - 1);
         }

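         /* Worked example: offsets = 0b111100 gives first_bit = 2 and
          * first_hole = 6, so the entry below covers chunks [2, 6) with
          * length 4, and the mask above clears offsets to 0.
          */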
         struct ubo_range_entry *entry =
            util_dynarray_grow(&ranges, struct ubo_range_entry, 1);

         entry->range.block = b;
         entry->range.start = first_bit;
         /* first_hole is one beyond the end, so we don't need to add 1 */
         entry->range.length = first_hole - first_bit;
         entry->benefit = 0;

         for (int i = 0; i < entry->range.length; i++)
            entry->benefit += info->uses[first_bit + i];
      }
   }

   int nr_entries = ranges.size / sizeof(struct ubo_range_entry);

   /* Debugging aid: flip this to 1 to dump the candidate ranges. */
   if (0) {
      util_dynarray_foreach(&ranges, struct ubo_range_entry, entry) {
         print_ubo_entry(stderr, entry, &state);
      }
   }

   /* TODO: Consider combining ranges.
    *
    * We can only push 3-4 ranges via 3DSTATE_CONSTANT_XS.  If there are
    * more ranges, and two are close by with only a small hole, it may be
    * worth combining them.  The holes will waste register space, but the
    * benefit of removing pulls may outweigh that cost.
    */

   /* Sort the list so the most beneficial ranges are at the front. */
   if (nr_entries > 0) {
      qsort(ranges.data, nr_entries, sizeof(struct ubo_range_entry),
            cmp_ubo_range_entry);
   }

   struct ubo_range_entry *entries = ranges.data;

   /* Return the top 4 or so.  We drop by one if regular uniforms are in
    * use, assuming one push buffer will be dedicated to those.  We may
    * also only get 3 on Haswell if we can't write INSTPM.
    *
    * The backend may need to shrink these ranges to ensure that they
    * don't exceed the maximum push constant limits.  It can simply drop
    * the tail of the list, as that's the least valuable portion.  We
    * unfortunately can't truncate it here, because we don't know what
    * the backend is planning to do with regular uniforms.
    */
   const int max_ubos = (compiler->constant_buffer_0_is_relative ? 3 : 4) -
                        state.uses_regular_uniforms;
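   /* For example, with a relative constant buffer 0 and regular uniforms
    * in use, this leaves 3 - 1 = 2 slots for pushed UBO ranges.
    */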
   nr_entries = MIN2(nr_entries, max_ubos);

   for (int i = 0; i < nr_entries; i++) {
      out_ranges[i] = entries[i].range;
   }
   for (int i = nr_entries; i < 4; i++) {
      out_ranges[i].block = 0;
      out_ranges[i].start = 0;
      out_ranges[i].length = 0;
   }

   ralloc_free(ranges.mem_ctx);
}