/*
 * Copyright © 2023 Collabora, Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/u_math.h"
#include "nir.h"
#include "nir_builder.h"

/**
 * \file nir_lower_subgroups.c
 */

static nir_intrinsic_instr *
lower_subgroups_64bit_split_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin,
                                      unsigned int component)
{
   nir_def *comp;
   if (component == 0)
      comp = nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa);
   else
      comp = nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa);

   nir_intrinsic_instr *intr = nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
   nir_def_init(&intr->instr, &intr->def, 1, 32);
   intr->const_index[0] = intrin->const_index[0];
   intr->const_index[1] = intrin->const_index[1];
   intr->src[0] = nir_src_for_ssa(comp);
   if (nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2)
      intr->src[1] = nir_src_for_ssa(intrin->src[1].ssa);

   intr->num_components = 1;
   nir_builder_instr_insert(b, &intr->instr);
   return intr;
}

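/* Split a 64-bit subgroup operation into two 32-bit halves: run the intrinsic
 * on the low and high dwords separately and repack the results into a single
 * 64-bit value.
 */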
static nir_def *
lower_subgroup_op_to_32bit(nir_builder *b, nir_intrinsic_instr *intrin)
{
   assert(intrin->src[0].ssa->bit_size == 64);
   nir_intrinsic_instr *intr_x = lower_subgroups_64bit_split_intrinsic(b, intrin, 0);
   nir_intrinsic_instr *intr_y = lower_subgroups_64bit_split_intrinsic(b, intrin, 1);
   return nir_pack_64_2x32_split(b, &intr_x->def, &intr_y->def);
}

static nir_def *
ballot_type_to_uint(nir_builder *b, nir_def *value,
                    const nir_lower_subgroups_options *options)
{
   /* Allow internally generated ballots to pass through */
   if (value->num_components == options->ballot_components &&
       value->bit_size == options->ballot_bit_size)
      return value;

   /* Only the new-style SPIR-V subgroup instructions take a ballot result as
    * an argument, so we only use this on uvec4 types.
    */
   assert(value->num_components == 4 && value->bit_size == 32);

   return nir_extract_bits(b, &value, 1, 0, options->ballot_components,
                           options->ballot_bit_size);
}

static nir_def *
uint_to_ballot_type(nir_builder *b, nir_def *value,
                    unsigned num_components, unsigned bit_size)
{
   assert(util_is_power_of_two_nonzero(num_components));
   assert(util_is_power_of_two_nonzero(value->num_components));

   unsigned total_bits = bit_size * num_components;

   /* If the source doesn't have enough bits, zero-pad */
   if (total_bits > value->bit_size * value->num_components)
      value = nir_pad_vector_imm_int(b, value, 0, total_bits / value->bit_size);

   value = nir_bitcast_vector(b, value, bit_size);

   /* If the source has too many components, truncate.  This can happen if,
    * for instance, we're implementing GL_ARB_shader_ballot or
    * VK_EXT_shader_subgroup_ballot which have 64-bit ballot values on an
    * architecture with a native 128-bit uvec4 ballot.  This comes up in Zink
    * for OpenGL on Vulkan.  It's the job of the driver calling this lowering
    * pass to ensure that it has restricted the subgroup size sufficiently
    * that we have enough ballot bits.
    */
   if (value->num_components > num_components)
      value = nir_trim_vector(b, value, num_components);

   return value;
}

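/* Lower a vector subgroup operation by emitting one scalar copy of the
 * intrinsic per component and recombining the per-channel results into a
 * vector.
 */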
static nir_def *
lower_subgroup_op_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
{
   /* This is safe to call on scalar things but it would be silly */
   assert(intrin->def.num_components > 1);

   nir_def *value = intrin->src[0].ssa;
   nir_def *reads[NIR_MAX_VEC_COMPONENTS];

   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_intrinsic_instr *chan_intrin =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      nir_def_init(&chan_intrin->instr, &chan_intrin->def, 1,
                   intrin->def.bit_size);
      chan_intrin->num_components = 1;

      /* value */
      chan_intrin->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
      /* invocation */
      if (nir_intrinsic_infos[intrin->intrinsic].num_srcs > 1) {
         assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs == 2);
         chan_intrin->src[1] = nir_src_for_ssa(intrin->src[1].ssa);
      }

      chan_intrin->const_index[0] = intrin->const_index[0];
      chan_intrin->const_index[1] = intrin->const_index[1];

      nir_builder_instr_insert(b, &chan_intrin->instr);
      reads[i] = &chan_intrin->def;
   }

   return nir_vec(b, reads, intrin->num_components);
}

static nir_def *
lower_vote_eq_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
{
   nir_def *value = intrin->src[0].ssa;

   nir_def *result = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_def *chan = nir_channel(b, value, i);

      if (intrin->intrinsic == nir_intrinsic_vote_feq) {
         chan = nir_vote_feq(b, intrin->def.bit_size, chan);
      } else {
         chan = nir_vote_ieq(b, intrin->def.bit_size, chan);
      }

      if (result) {
         result = nir_iand(b, result, chan);
      } else {
         result = chan;
      }
   }

   return result;
}

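/* Lower vote_feq/vote_ieq on plain values: compare every channel against the
 * value read from the first active invocation and vote_all on the combined
 * comparison result.
 */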
static nir_def *
lower_vote_eq(nir_builder *b, nir_intrinsic_instr *intrin)
{
   nir_def *value = intrin->src[0].ssa;

   /* We have to implicitly lower to scalar */
   nir_def *all_eq = NULL;
   for (unsigned i = 0; i < intrin->num_components; i++) {
      nir_def *rfi = nir_read_first_invocation(b, nir_channel(b, value, i));

      nir_def *is_eq;
      if (intrin->intrinsic == nir_intrinsic_vote_feq) {
         is_eq = nir_feq(b, rfi, nir_channel(b, value, i));
      } else {
         is_eq = nir_ieq(b, rfi, nir_channel(b, value, i));
      }

      if (all_eq == NULL) {
         all_eq = is_eq;
      } else {
         all_eq = nir_iand(b, all_eq, is_eq);
      }
   }

   return nir_vote_all(b, 1, all_eq);
}

static nir_def *
lower_shuffle_to_swizzle(nir_builder *b, nir_intrinsic_instr *intrin)
{
   unsigned mask = nir_src_as_uint(intrin->src[1]);

   if (mask >= 32)
      return NULL;

   return nir_masked_swizzle_amd(b, intrin->src[0].ssa,
                                 .swizzle_mask = (mask << 10) | 0x1f,
                                 .fetch_inactive = true);
}

/* Lowers "specialized" shuffles to a generic nir_intrinsic_shuffle. */

static nir_def *
lower_to_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
                 const nir_lower_subgroups_options *options)
{
   if (intrin->intrinsic == nir_intrinsic_shuffle_xor &&
       options->lower_shuffle_to_swizzle_amd &&
       nir_src_is_const(intrin->src[1])) {

      nir_def *result = lower_shuffle_to_swizzle(b, intrin);
      if (result)
         return result;
   }

   nir_def *index = nir_load_subgroup_invocation(b);
   switch (intrin->intrinsic) {
   case nir_intrinsic_shuffle_xor:
      index = nir_ixor(b, index, intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_up:
      index = nir_isub(b, index, intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_down:
      index = nir_iadd(b, index, intrin->src[1].ssa);
      break;
   case nir_intrinsic_quad_broadcast:
      index = nir_ior(b, nir_iand_imm(b, index, ~0x3),
                      intrin->src[1].ssa);
      break;
   case nir_intrinsic_quad_swap_horizontal:
      /* For Quad operations, subgroups are divided into quads where
       * (invocation % 4) is the index to a square arranged as follows:
       *
       *    +---+---+
       *    | 0 | 1 |
       *    +---+---+
       *    | 2 | 3 |
       *    +---+---+
       */
      index = nir_ixor(b, index, nir_imm_int(b, 0x1));
      break;
   case nir_intrinsic_quad_swap_vertical:
      index = nir_ixor(b, index, nir_imm_int(b, 0x2));
      break;
   case nir_intrinsic_quad_swap_diagonal:
      index = nir_ixor(b, index, nir_imm_int(b, 0x3));
      break;
   case nir_intrinsic_rotate: {
      nir_def *delta = intrin->src[1].ssa;
      nir_def *local_id = nir_load_subgroup_invocation(b);
      const unsigned cluster_size = nir_intrinsic_cluster_size(intrin);

      nir_def *rotation_group_mask =
         cluster_size > 0 ? nir_imm_int(b, (int)(cluster_size - 1))
                          : nir_iadd_imm(b, nir_load_subgroup_size(b), -1);

      index = nir_iand(b, nir_iadd(b, local_id, delta),
                       rotation_group_mask);
      if (cluster_size > 0) {
         index = nir_iadd(b, index,
                          nir_iand(b, local_id, nir_inot(b, rotation_group_mask)));
      }
      break;
   }
   default:
      unreachable("Invalid intrinsic");
   }

   return nir_shuffle(b, intrin->src[0].ssa, index);
}

static const struct glsl_type *
glsl_type_for_ssa(nir_def *def)
{
   const struct glsl_type *comp_type = def->bit_size == 1 ? glsl_bool_type()
                                                          : glsl_uintN_t_type(def->bit_size);
   return glsl_replace_vector_type(comp_type, def->num_components);
}

/* Lower nir_intrinsic_shuffle to a waterfall loop + nir_read_invocation.
 */
static nir_def *
lower_shuffle(nir_builder *b, nir_intrinsic_instr *intrin)
{
   nir_def *val = intrin->src[0].ssa;
   nir_def *id = intrin->src[1].ssa;

   /* The loop is something like:
    *
    * while (true) {
    *    first_id = readFirstInvocation(gl_SubgroupInvocationID);
    *    first_val = readFirstInvocation(val);
    *    first_result = readInvocation(val, readFirstInvocation(id));
    *    if (id == first_id)
    *       result = first_val;
    *    if (elect()) {
    *       if (id > gl_SubgroupInvocationID) {
    *          result = first_result;
    *       }
    *       break;
    *    }
    * }
    *
    * The idea is to guarantee, on each iteration of the loop, that anything
    * reading from first_id gets the correct value, so that we can then kill
    * it off by breaking out of the loop. Before doing that we also have to
    * ensure that the first_id invocation gets the correct value. It won't
    * already have the correct value only if the invocation it's reading from
    * hasn't been killed off yet, that is, if that invocation is later than
    * its own ID. Invocations where id <= gl_SubgroupInvocationID will be
    * assigned their result in the first if, and invocations where id >
    * gl_SubgroupInvocationID will be assigned their result in the second if.
    *
    * We do this more complicated loop rather than looping over all id's
    * explicitly because at this point we don't know the "actual" subgroup
    * size and at the moment there's no way to get at it, which means we may
    * loop over always-inactive invocations.
    */

   nir_def *subgroup_id = nir_load_subgroup_invocation(b);

   nir_variable *result =
      nir_local_variable_create(b->impl, glsl_type_for_ssa(val), "result");

   nir_loop *loop = nir_push_loop(b);
   {
      nir_def *first_id = nir_read_first_invocation(b, subgroup_id);
      nir_def *first_val = nir_read_first_invocation(b, val);
      nir_def *first_result =
         nir_read_invocation(b, val, nir_read_first_invocation(b, id));

      nir_if *nif = nir_push_if(b, nir_ieq(b, id, first_id));
      {
         nir_store_var(b, result, first_val, BITFIELD_MASK(val->num_components));
      }
      nir_pop_if(b, nif);

      nir_if *nif2 = nir_push_if(b, nir_elect(b, 1));
      {
         nir_if *nif3 = nir_push_if(b, nir_ult(b, subgroup_id, id));
         {
            nir_store_var(b, result, first_result, BITFIELD_MASK(val->num_components));
         }
         nir_pop_if(b, nif3);

         nir_jump(b, nir_jump_break);
      }
      nir_pop_if(b, nif2);
   }
   nir_pop_loop(b, loop);

   return nir_load_var(b, result);
}

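/* Shuffles of 1-bit booleans: take a relaxed ballot of the value so every
 * invocation holds the whole subgroup's bits, then either shift/rotate the
 * ballot (constant or uniform deltas) and use inverse_ballot, or extract the
 * bit at the computed source index when that index may be divergent.
 */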
static nir_def *
lower_boolean_shuffle(nir_builder *b, nir_intrinsic_instr *intrin,
                      const nir_lower_subgroups_options *options)
{
   assert(options->ballot_components == 1 && options->subgroup_size);
   nir_def *ballot = nir_ballot_relaxed(b, 1, options->ballot_bit_size, intrin->src[0].ssa);

   nir_def *index = NULL;

   /* If the shuffle amount isn't constant, it might be divergent but
    * inverse_ballot requires a uniform source, so take a different path.
    * rotate allows us to assume the delta is uniform unlike shuffle_up/down.
    */
   switch (intrin->intrinsic) {
   case nir_intrinsic_shuffle_up:
      if (nir_src_is_const(intrin->src[1]))
         ballot = nir_ishl(b, ballot, intrin->src[1].ssa);
      else
         index = nir_isub(b, nir_load_subgroup_invocation(b), intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_down:
      if (nir_src_is_const(intrin->src[1]))
         ballot = nir_ushr(b, ballot, intrin->src[1].ssa);
      else
         index = nir_iadd(b, nir_load_subgroup_invocation(b), intrin->src[1].ssa);
      break;
   case nir_intrinsic_shuffle_xor:
      index = nir_ixor(b, nir_load_subgroup_invocation(b), intrin->src[1].ssa);
      break;
   case nir_intrinsic_rotate: {
      nir_def *delta = nir_as_uniform(b, intrin->src[1].ssa);
      uint32_t cluster_size = nir_intrinsic_cluster_size(intrin);
      cluster_size = cluster_size ? cluster_size : options->subgroup_size;
      cluster_size = MIN2(cluster_size, options->subgroup_size);
      if (cluster_size == 1) {
         return intrin->src[0].ssa;
      } else if (cluster_size == 2) {
         delta = nir_iand_imm(b, delta, cluster_size - 1);
         nir_def *lo = nir_iand_imm(b, nir_ushr_imm(b, ballot, 1), 0x5555555555555555ull);
         nir_def *hi = nir_iand_imm(b, nir_ishl_imm(b, ballot, 1), 0xaaaaaaaaaaaaaaaaull);
         ballot = nir_bcsel(b, nir_ine_imm(b, delta, 0), nir_ior(b, hi, lo), ballot);
      } else if (cluster_size == ballot->bit_size) {
         ballot = nir_uror(b, ballot, delta);
      } else if (cluster_size == 32) {
         nir_def *unpacked = nir_unpack_64_2x32(b, ballot);
         unpacked = nir_uror(b, unpacked, delta);
         ballot = nir_pack_64_2x32(b, unpacked);
      } else {
         delta = nir_iand_imm(b, delta, cluster_size - 1);
         nir_def *delta_rev = nir_isub_imm(b, cluster_size, delta);
         nir_def *mask = nir_mask(b, delta_rev, ballot->bit_size);
         for (uint32_t i = cluster_size; i < ballot->bit_size; i *= 2) {
            mask = nir_ior(b, nir_ishl_imm(b, mask, i), mask);
         }
         nir_def *lo = nir_iand(b, nir_ushr(b, ballot, delta), mask);
         nir_def *hi = nir_iand(b, nir_ishl(b, ballot, delta_rev), nir_inot(b, mask));
         ballot = nir_ior(b, lo, hi);
      }
      break;
   }
   case nir_intrinsic_shuffle:
      index = intrin->src[1].ssa;
      break;
   case nir_intrinsic_read_invocation:
      index = nir_as_uniform(b, intrin->src[1].ssa);
      break;
   default:
      unreachable("not a boolean shuffle");
   }

   if (index) {
      nir_def *mask = nir_ishl(b, nir_imm_intN_t(b, 1, ballot->bit_size), index);
      return nir_ine_imm(b, nir_iand(b, ballot, mask), 0);
   } else {
      return nir_inverse_ballot(b, 1, ballot);
   }
}

static nir_def *
vec_bit_count(nir_builder *b, nir_def *value)
{
   nir_def *vec_result = nir_bit_count(b, value);
   nir_def *result = nir_channel(b, vec_result, 0);
   for (unsigned i = 1; i < value->num_components; i++)
      result = nir_iadd(b, result, nir_channel(b, vec_result, i));
   return result;
}

/* produce a bitmask of 111...000...111... alternating between "size"
 * 1's and "size" 0's (the LSB is 1).
 */
static uint64_t
reduce_mask(unsigned size, unsigned ballot_bit_size)
{
   uint64_t mask = 0;
   for (unsigned i = 0; i < ballot_bit_size; i += 2 * size) {
      mask |= ((1ull << size) - 1) << i;
   }

   return mask;
}

/* operate on a uniform per-thread bitmask provided by ballot() to perform the
 * desired Boolean reduction. Assumes that the identity of the operation is
 * false (so, no iand).
 */
static nir_def *
lower_boolean_reduce_internal(nir_builder *b, nir_def *src,
                              unsigned cluster_size, nir_op op,
                              const nir_lower_subgroups_options *options)
{
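   /* Log-step tree combine over the ballot bits: at each step, combine every
    * bit with its neighbor "size" bits above, keep only the low half of each
    * 2*size-bit group, then copy that half back into the upper half so every
    * bit of the group ends up holding the group's result.
    */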
   for (unsigned size = 1; size < cluster_size; size *= 2) {
      nir_def *shifted = nir_ushr_imm(b, src, size);
      src = nir_build_alu2(b, op, shifted, src);
      uint64_t mask = reduce_mask(size, options->ballot_bit_size);
      src = nir_iand_imm(b, src, mask);
      shifted = nir_ishl_imm(b, src, size);
      src = nir_ior(b, src, shifted);
   }

   return src;
}

/* operate on a uniform per-thread bitmask provided by ballot() to perform the
 * desired Boolean inclusive scan. Assumes that the identity of the operation is
 * false (so, no iand).
 */
static nir_def *
lower_boolean_scan_internal(nir_builder *b, nir_def *src,
                            nir_op op,
                            const nir_lower_subgroups_options *options)
{
   if (op == nir_op_ior) {
      /* We want to return a bitmask with all 1's starting at the first 1 in
       * src. -src is equivalent to ~src + 1. While src | ~src returns all
       * 1's, src | (~src + 1) returns all 1's except for the bits changed by
       * the increment. Any 1's before the least significant 0 of ~src are
       * turned into 0 (zeroing those bits after or'ing) and the least
       * significant 0 of ~src is turned into 1 (not doing anything). So the
       * final output is what we want.
       */
      return nir_ior(b, src, nir_ineg(b, src));
   } else {
      assert(op == nir_op_ixor);
      for (unsigned shift = 1; shift < options->ballot_bit_size; shift *= 2) {
         src = nir_ixor(b, src, nir_ishl_imm(b, src, shift));
      }
      return src;
   }
}

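/* Boolean reductions and scans are lowered to bit operations on a single
 * uniform ballot mask; iand is handled via DeMorgan's law (invert the input,
 * do an ior reduction/scan, invert the result).
 */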
static nir_def *
lower_boolean_reduce(nir_builder *b, nir_intrinsic_instr *intrin,
                     const nir_lower_subgroups_options *options)
{
   assert(intrin->num_components == 1);
   assert(options->ballot_components == 1);

   unsigned cluster_size =
      intrin->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(intrin) : 0;
   nir_op op = nir_intrinsic_reduction_op(intrin);

   /* For certain cluster sizes, reductions of iand and ior can be implemented
    * more efficiently.
    */
   if (intrin->intrinsic == nir_intrinsic_reduce) {
      if (cluster_size == 0) {
         if (op == nir_op_iand)
            return nir_vote_all(b, 1, intrin->src[0].ssa);
         else if (op == nir_op_ior)
            return nir_vote_any(b, 1, intrin->src[0].ssa);
         else if (op == nir_op_ixor)
            return nir_i2b(b, nir_iand_imm(b, vec_bit_count(b, nir_ballot(b,
                                                                          options->ballot_components,
                                                                          options->ballot_bit_size,
                                                                          intrin->src[0].ssa)),
                                           1));
         else
            unreachable("bad boolean reduction op");
      }

      if (cluster_size == 4) {
         if (op == nir_op_iand)
            return nir_quad_vote_all(b, 1, intrin->src[0].ssa);
         else if (op == nir_op_ior)
            return nir_quad_vote_any(b, 1, intrin->src[0].ssa);
      }
   }

   nir_def *src = intrin->src[0].ssa;

   /* Apply DeMorgan's law to implement "and" reductions, since all the
    * lower_boolean_*_internal() functions assume an identity of 0 to make the
    * generated code shorter.
    */
   nir_op new_op = (op == nir_op_iand) ? nir_op_ior : op;
   if (op == nir_op_iand) {
      src = nir_inot(b, src);
   }

   nir_def *val = nir_ballot(b, options->ballot_components, options->ballot_bit_size, src);

   switch (intrin->intrinsic) {
   case nir_intrinsic_reduce:
      val = lower_boolean_reduce_internal(b, val, cluster_size, new_op, options);
      break;
   case nir_intrinsic_inclusive_scan:
      val = lower_boolean_scan_internal(b, val, new_op, options);
      break;
   case nir_intrinsic_exclusive_scan:
      val = lower_boolean_scan_internal(b, val, new_op, options);
      val = nir_ishl_imm(b, val, 1);
      break;
   default:
      unreachable("bad intrinsic");
   }

   if (op == nir_op_iand) {
      val = nir_inot(b, val);
   }

   return nir_inverse_ballot(b, 1, val);
}

static nir_def *
build_identity(nir_builder *b, unsigned bit_size, nir_op op)
{
   nir_const_value ident_const = nir_alu_binop_identity(op, bit_size);
   return nir_build_imm(b, 1, bit_size, &ident_const);
}

/* Implementation of scan/reduce that assumes a full subgroup */
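/* The inclusive scan is a log2(cluster_size)-step Hillis-Steele scan built on
 * shuffle_up; the exclusive variant shifts once more and fills lane 0 with the
 * operation's identity, and the reduce variant is a butterfly on shuffle_xor.
 */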
static nir_def *
build_scan_full(nir_builder *b, nir_intrinsic_op op, nir_op red_op,
                nir_def *data, unsigned cluster_size)
{
   switch (op) {
   case nir_intrinsic_exclusive_scan:
   case nir_intrinsic_inclusive_scan: {
      for (unsigned i = 1; i < cluster_size; i *= 2) {
         nir_def *idx = nir_load_subgroup_invocation(b);
         nir_def *has_buddy = nir_ige_imm(b, idx, i);

         nir_def *buddy_data = nir_shuffle_up(b, data, nir_imm_int(b, i));
         nir_def *accum = nir_build_alu2(b, red_op, data, buddy_data);
         data = nir_bcsel(b, has_buddy, accum, data);
      }

      if (op == nir_intrinsic_exclusive_scan) {
         /* For exclusive scans, we need to shift one more time and fill in the
          * bottom channel with identity.
          */
         nir_def *idx = nir_load_subgroup_invocation(b);
         nir_def *has_buddy = nir_ige_imm(b, idx, 1);

         nir_def *buddy_data = nir_shuffle_up(b, data, nir_imm_int(b, 1));
         nir_def *identity = build_identity(b, data->bit_size, red_op);
         data = nir_bcsel(b, has_buddy, buddy_data, identity);
      }

      return data;
   }

   case nir_intrinsic_reduce: {
      for (unsigned i = 1; i < cluster_size; i *= 2) {
         nir_def *buddy_data = nir_shuffle_xor(b, data, nir_imm_int(b, i));
         data = nir_build_alu2(b, red_op, data, buddy_data);
      }
      return data;
   }

   default:
      unreachable("Unsupported scan/reduce op");
   }
}

/* Fully generic implementation of scan/reduce that takes a mask */
static nir_def *
build_scan_reduce(nir_builder *b, nir_intrinsic_op op, nir_op red_op,
                  nir_def *data, nir_def *mask, unsigned max_mask_bits,
                  unsigned subgroup_size)
{
   nir_def *lt_mask = nir_load_subgroup_lt_mask(b, 1, subgroup_size);

   /* Mask of all channels whose values we need to accumulate.  Our own value
    * is already in accum, if inclusive, thanks to the initialization above.
    * We only need to consider lower indexed invocations.
    */
   nir_def *remaining = nir_iand(b, mask, lt_mask);

   for (unsigned i = 1; i < max_mask_bits; i *= 2) {
      /* At each step, our buddy channel is the first channel we have yet to
       * take into account in the accumulator.
       */
      nir_def *has_buddy = nir_ine_imm(b, remaining, 0);
      nir_def *buddy = nir_ufind_msb(b, remaining);

      /* Accumulate with our buddy channel, if any */
      nir_def *buddy_data = nir_shuffle(b, data, buddy);
      nir_def *accum = nir_build_alu2(b, red_op, data, buddy_data);
      data = nir_bcsel(b, has_buddy, accum, data);

      /* We just took into account everything in our buddy's accumulator from
       * the previous step.  The only things remaining are whatever channels
       * were remaining for our buddy.
       */
      nir_def *buddy_remaining = nir_shuffle(b, remaining, buddy);
      remaining = nir_bcsel(b, has_buddy, buddy_remaining, nir_imm_int(b, 0));
   }

   switch (op) {
   case nir_intrinsic_exclusive_scan: {
      /* For exclusive scans, we need to shift one more time and fill in the
       * bottom channel with identity.
       *
       * Some of this will get CSE'd with the first step but that's okay. The
       * code is cleaner this way.
       */
      nir_def *lower = nir_iand(b, mask, lt_mask);
      nir_def *has_buddy = nir_ine_imm(b, lower, 0);
      nir_def *buddy = nir_ufind_msb(b, lower);

      nir_def *buddy_data = nir_shuffle(b, data, buddy);
      nir_def *identity = build_identity(b, data->bit_size, red_op);
      return nir_bcsel(b, has_buddy, buddy_data, identity);
   }

   case nir_intrinsic_inclusive_scan:
      return data;

   case nir_intrinsic_reduce: {
      /* For reductions, we need to take the top value of the scan */
      nir_def *idx = nir_ufind_msb(b, mask);
      return nir_shuffle(b, data, idx);
   }

   default:
      unreachable("Unsupported scan/reduce op");
   }
}

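/* Take the fast full-subgroup path when ballot(true) shows every invocation
 * is active; otherwise restrict the ballot to the invocation's cluster and
 * use the generic masked scan/reduce. The two results are joined with a phi.
 */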
static nir_def *
lower_scan_reduce(nir_builder *b, nir_intrinsic_instr *intrin,
                  unsigned subgroup_size)
{
   const nir_op red_op = nir_intrinsic_reduction_op(intrin);

   /* Grab the cluster size */
   unsigned cluster_size = subgroup_size;
   if (nir_intrinsic_has_cluster_size(intrin)) {
      cluster_size = nir_intrinsic_cluster_size(intrin);
      if (cluster_size == 0 || cluster_size > subgroup_size)
         cluster_size = subgroup_size;
   }

   /* Check if all invocations are active. If so, we use the fast path. */
   nir_def *mask = nir_ballot(b, 1, subgroup_size, nir_imm_true(b));

   nir_def *full, *partial;
   nir_push_if(b, nir_ieq_imm(b, mask, -1));
   {
      full = build_scan_full(b, intrin->intrinsic, red_op,
                             intrin->src[0].ssa, cluster_size);
   }
   nir_push_else(b, NULL);
   {
      /* Mask according to the cluster size */
      if (cluster_size < subgroup_size) {
         nir_def *idx = nir_load_subgroup_invocation(b);
         nir_def *cluster = nir_iand_imm(b, idx, ~(uint64_t)(cluster_size - 1));

         nir_def *cluster_mask = nir_imm_int(b, BITFIELD_MASK(cluster_size));
         cluster_mask = nir_ishl(b, cluster_mask, cluster);

         mask = nir_iand(b, mask, cluster_mask);
      }

      partial = build_scan_reduce(b, intrin->intrinsic, red_op,
                                  intrin->src[0].ssa, mask, cluster_size,
                                  subgroup_size);
   }
   nir_pop_if(b, NULL);
   return nir_if_phi(b, full, partial);
}

static bool
lower_subgroups_filter(const nir_instr *instr, const void *_options)
{
   return instr->type == nir_instr_type_intrinsic;
}

/* Return a ballot-mask-sized value which represents "val" sign-extended and
 * then shifted left by "shift". Only particular values for "val" are
 * supported, see below.
 */
static nir_def *
build_ballot_imm_ishl(nir_builder *b, int64_t val, nir_def *shift,
                      const nir_lower_subgroups_options *options)
{
   /* This only works if all the high bits are the same as bit 1. */
   assert((val >> 2) == (val & 0x2 ? -1 : 0));

   /* First compute the result assuming one ballot component. */
   nir_def *result =
      nir_ishl(b, nir_imm_intN_t(b, val, options->ballot_bit_size), shift);

   if (options->ballot_components == 1)
      return result;

   /* Fix up the result when there is > 1 component. The idea is that nir_ishl
    * masks out the high bits of the shift value already, so in case there's
    * more than one component the component which 1 would be shifted into
    * already has the right value and all we have to do is fix up the other
    * components. Components below it should always be 0, and components above
    * it must be either 0 or ~0 because of the assert above. For example, if
    * the target ballot size is 2 x uint32, and we're shifting 1 by 33, then
    * we'll feed 33 into ishl, which will mask it off to get 1, so we'll
    * compute a single-component result of 2, which is correct for the second
    * component, but the first component needs to be 0, which we get by
    * comparing the high bits of the shift with 0 and selecting the original
    * answer or 0 for the first component (and something similar with the
    * second component). This idea is generalized here for any component count.
    */
   nir_const_value min_shift[4];
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_shift[i] = nir_const_value_for_int(i * options->ballot_bit_size, 32);
   nir_def *min_shift_val = nir_build_imm(b, options->ballot_components, 32, min_shift);

   nir_const_value max_shift[4];
   for (unsigned i = 0; i < options->ballot_components; i++)
      max_shift[i] = nir_const_value_for_int((i + 1) * options->ballot_bit_size, 32);
   nir_def *max_shift_val = nir_build_imm(b, options->ballot_components, 32, max_shift);

   return nir_bcsel(b, nir_ult(b, shift, max_shift_val),
                    nir_bcsel(b, nir_ult(b, shift, min_shift_val),
                              nir_imm_intN_t(b, val >> 63, result->bit_size),
                              result),
                    nir_imm_intN_t(b, 0, result->bit_size));
}

static nir_def *
build_subgroup_eq_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, 1, subgroup_idx, options);
}

static nir_def *
build_subgroup_ge_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~0ull, subgroup_idx, options);
}

static nir_def *
build_subgroup_gt_mask(nir_builder *b,
                       const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_idx = nir_load_subgroup_invocation(b);

   return build_ballot_imm_ishl(b, ~1ull, subgroup_idx, options);
}

/* Return a mask which is 1 for threads up to the run-time subgroup size, i.e.
 * 1 for the entire subgroup. SPIR-V requires us to return 0 for indices at or
 * above the subgroup size for the masks, but gt_mask and ge_mask make them 1
 * so we have to "and" with this mask.
 */
static nir_def *
build_subgroup_mask(nir_builder *b,
                    const nir_lower_subgroups_options *options)
{
   nir_def *subgroup_size = nir_load_subgroup_size(b);

   /* First compute the result assuming one ballot component. */
   nir_def *result =
      nir_ushr(b, nir_imm_intN_t(b, ~0ull, options->ballot_bit_size),
               nir_isub_imm(b, options->ballot_bit_size,
                            subgroup_size));

   /* Since the subgroup size and ballot bitsize are both powers of two, there
    * are two possible cases to consider:
    *
    * (1) The subgroup size is less than the ballot bitsize. We need to return
    * "result" in the first component and 0 in every other component.
    * (2) The subgroup size is a multiple of the ballot bitsize. We need to
    * return ~0 if the subgroup size divided by the ballot bitsize is less
    * than or equal to the index in the vector and 0 otherwise. For example,
    * with a target ballot type of 4 x uint32 and subgroup_size = 64 we'd need
    * to return { ~0, ~0, 0, 0 }.
    *
    * In case (2) it turns out that "result" will be ~0, because
    * "ballot_bit_size - subgroup_size" is also a multiple of
    * "ballot_bit_size" and since nir_ushr masks the shift value it will be
    * shifted by 0. This means that the first component can just be "result"
    * in all cases.  The other components will also get the correct value in
    * case (1) if we just use the rule in case (2), so we'll get the correct
    * result if we just follow (2) and then replace the first component with
    * "result".
    */
   nir_const_value min_idx[4];
   for (unsigned i = 0; i < options->ballot_components; i++)
      min_idx[i] = nir_const_value_for_int(i * options->ballot_bit_size, 32);
   nir_def *min_idx_val = nir_build_imm(b, options->ballot_components, 32, min_idx);

   nir_def *result_extended =
      nir_pad_vector_imm_int(b, result, ~0ull, options->ballot_components);

   return nir_bcsel(b, nir_ult(b, min_idx_val, subgroup_size),
                    result_extended, nir_imm_intN_t(b, 0, options->ballot_bit_size));
}

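/* find_lsb/ufind_msb across a multi-component ballot: scan the per-component
 * results so that the lowest component with a set bit wins for the LSB search
 * and the highest component wins for the MSB search, offsetting each
 * component's answer by its bit position within the whole ballot.
 */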
static nir_def *
vec_find_lsb(nir_builder *b, nir_def *value)
{
   nir_def *vec_result = nir_find_lsb(b, value);
   nir_def *result = nir_imm_int(b, -1);
   for (int i = value->num_components - 1; i >= 0; i--) {
      nir_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige_imm(b, channel, 0),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}

static nir_def *
vec_find_msb(nir_builder *b, nir_def *value)
{
   nir_def *vec_result = nir_ufind_msb(b, value);
   nir_def *result = nir_imm_int(b, -1);
   for (unsigned i = 0; i < value->num_components; i++) {
      nir_def *channel = nir_channel(b, vec_result, i);
      /* result = channel >= 0 ? (i * bitsize + channel) : result */
      result = nir_bcsel(b, nir_ige_imm(b, channel, 0),
                         nir_iadd_imm(b, channel, i * value->bit_size),
                         result);
   }
   return result;
}

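/* When the backend only supports quad broadcasts with a constant index, emit
 * all four constant broadcasts and select between them with the dynamic
 * index; otherwise fall back to a generic shuffle.
 */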
static nir_def *
lower_dynamic_quad_broadcast(nir_builder *b, nir_intrinsic_instr *intrin,
                             const nir_lower_subgroups_options *options)
{
   if (!options->lower_quad_broadcast_dynamic_to_const)
      return lower_to_shuffle(b, intrin, options);

   nir_def *dst = NULL;

   for (unsigned i = 0; i < 4; ++i) {
      nir_def *qbcst = nir_quad_broadcast(b, intrin->src[0].ssa,
                                          nir_imm_int(b, i));

      if (i)
         dst = nir_bcsel(b, nir_ieq_imm(b, intrin->src[1].ssa, i),
                         qbcst, dst);
      else
         dst = qbcst;
   }

   return dst;
}

static nir_def *
lower_first_invocation_to_ballot(nir_builder *b, nir_intrinsic_instr *intrin,
                                 const nir_lower_subgroups_options *options)
{
   return nir_ballot_find_lsb(b, 32, nir_ballot(b, 4, 32, nir_imm_true(b)));
}

static nir_def *
lower_read_first_invocation(nir_builder *b, nir_intrinsic_instr *intrin)
{
   return nir_read_invocation(b, intrin->src[0].ssa, nir_first_invocation(b));
}

static nir_def *
lower_read_invocation_to_cond(nir_builder *b, nir_intrinsic_instr *intrin)
{
   return nir_read_invocation_cond_ir3(b, intrin->def.bit_size,
                                       intrin->src[0].ssa,
                                       nir_ieq(b, intrin->src[1].ssa,
                                               nir_load_subgroup_invocation(b)));
}

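/* Per-instruction lowering callback: dispatch on the subgroup intrinsic and
 * rewrite it according to the driver's nir_lower_subgroups_options. Returning
 * NULL leaves the instruction untouched.
 */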
static nir_def *
lower_subgroups_instr(nir_builder *b, nir_instr *instr, void *_options)
{
   const nir_lower_subgroups_options *options = _options;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_vote_any:
   case nir_intrinsic_vote_all:
      if (options->lower_vote_trivial)
         return intrin->src[0].ssa;
      break;

   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq:
      if (options->lower_vote_trivial)
         return nir_imm_true(b);

      if (nir_src_bit_size(intrin->src[0]) == 1) {
         if (options->lower_vote_bool_eq)
            return lower_vote_eq(b, intrin);
      } else {
         if (options->lower_vote_eq)
            return lower_vote_eq(b, intrin);
      }

      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_vote_eq_to_scalar(b, intrin);
      break;

   case nir_intrinsic_load_subgroup_size:
      if (options->subgroup_size)
         return nir_imm_int(b, options->subgroup_size);
      break;

   case nir_intrinsic_first_invocation:
      if (options->subgroup_size == 1)
         return nir_imm_int(b, 0);

      if (options->lower_first_invocation_to_ballot)
         return lower_first_invocation_to_ballot(b, intrin, options);

      break;

   case nir_intrinsic_read_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);

      if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);

      if (options->lower_read_invocation_to_cond)
         return lower_read_invocation_to_cond(b, intrin);

      break;

   case nir_intrinsic_read_first_invocation:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);

      if (options->lower_read_first_invocation)
         return lower_read_first_invocation(b, intrin);
      break;

   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_lt_mask: {
      if (!options->lower_subgroup_masks)
         return NULL;

      nir_def *val;
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_subgroup_eq_mask:
         val = build_subgroup_eq_mask(b, options);
         break;
      case nir_intrinsic_load_subgroup_ge_mask:
         val = nir_iand(b, build_subgroup_ge_mask(b, options),
                        build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_gt_mask:
         val = nir_iand(b, build_subgroup_gt_mask(b, options),
                        build_subgroup_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_le_mask:
         val = nir_inot(b, build_subgroup_gt_mask(b, options));
         break;
      case nir_intrinsic_load_subgroup_lt_mask:
         val = nir_inot(b, build_subgroup_ge_mask(b, options));
         break;
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }

      return uint_to_ballot_type(b, val,
                                 intrin->def.num_components,
                                 intrin->def.bit_size);
   }

   case nir_intrinsic_ballot: {
      if (intrin->def.num_components == options->ballot_components &&
          intrin->def.bit_size == options->ballot_bit_size)
         return NULL;

      nir_def *ballot =
         nir_ballot(b, options->ballot_components, options->ballot_bit_size,
                    intrin->src[0].ssa);

      return uint_to_ballot_type(b, ballot,
                                 intrin->def.num_components,
                                 intrin->def.bit_size);
   }

   case nir_intrinsic_inverse_ballot:
      if (options->lower_inverse_ballot) {
         return nir_ballot_bitfield_extract(b, 1, intrin->src[0].ssa,
                                            nir_load_subgroup_invocation(b));
      } else if (intrin->src[0].ssa->num_components != options->ballot_components ||
                 intrin->src[0].ssa->bit_size != options->ballot_bit_size) {
         return nir_inverse_ballot(b, 1, ballot_type_to_uint(b, intrin->src[0].ssa, options));
      }
      break;

   case nir_intrinsic_ballot_bitfield_extract:
   case nir_intrinsic_ballot_bit_count_reduce:
   case nir_intrinsic_ballot_find_lsb:
   case nir_intrinsic_ballot_find_msb: {
      nir_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                             options);

      if (intrin->intrinsic != nir_intrinsic_ballot_bitfield_extract &&
          intrin->intrinsic != nir_intrinsic_ballot_find_lsb) {
         /* For OpGroupNonUniformBallotFindMSB, the SPIR-V Spec says:
          *
          *    "Find the most significant bit set to 1 in Value, considering
          *    only the bits in Value required to represent all bits of the
          *    group’s invocations.  If none of the considered bits is set to
          *    1, the result is undefined."
          *
          * It has similar text for the other three.  This means that, in case
          * the subgroup size is less than 32, we have to mask off the unused
          * bits.  If the subgroup size is fixed and greater than or equal to
          * 32, the mask will be 0xffffffff and nir_opt_algebraic will delete
          * the iand.
          *
          * We only have to worry about this for BitCount and FindMSB because
          * FindLSB counts from the bottom and BitfieldExtract selects
          * individual bits.  In either case, if run outside the range of
          * valid bits, we hit the undefined results case and we can return
          * anything we want.
          */
         int_val = nir_iand(b, int_val, build_subgroup_mask(b, options));
      }

      switch (intrin->intrinsic) {
      case nir_intrinsic_ballot_bitfield_extract: {
         nir_def *idx = intrin->src[1].ssa;
         if (int_val->num_components > 1) {
            /* idx will be truncated by nir_ushr, so we just need to select
             * the right component using the bits of idx that are truncated in
             * the shift.
             */
            int_val =
               nir_vector_extract(b, int_val,
                                  nir_udiv_imm(b, idx, int_val->bit_size));
         }

         return nir_test_mask(b, nir_ushr(b, int_val, idx), 1);
      }
      case nir_intrinsic_ballot_bit_count_reduce:
         return vec_bit_count(b, int_val);
      case nir_intrinsic_ballot_find_lsb:
         return vec_find_lsb(b, int_val);
      case nir_intrinsic_ballot_find_msb:
         return vec_find_msb(b, int_val);
      default:
         unreachable("you seriously can't tell this is unreachable?");
      }
   }

   case nir_intrinsic_ballot_bit_count_exclusive:
   case nir_intrinsic_ballot_bit_count_inclusive: {
      nir_def *int_val = ballot_type_to_uint(b, intrin->src[0].ssa,
                                             options);
      if (options->lower_ballot_bit_count_to_mbcnt_amd) {
         nir_def *acc;
         if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_exclusive) {
            acc = nir_imm_int(b, 0);
         } else {
            acc = nir_iand_imm(b, nir_u2u32(b, int_val), 0x1);
            int_val = nir_ushr_imm(b, int_val, 1);
         }
         return nir_mbcnt_amd(b, int_val, acc);
      }

      nir_def *mask;
      if (intrin->intrinsic == nir_intrinsic_ballot_bit_count_inclusive) {
         mask = nir_inot(b, build_subgroup_gt_mask(b, options));
      } else {
         mask = nir_inot(b, build_subgroup_ge_mask(b, options));
      }

      return vec_bit_count(b, nir_iand(b, int_val, mask));
   }

   case nir_intrinsic_elect: {
      if (!options->lower_elect)
         return NULL;

      return nir_ieq(b, nir_load_subgroup_invocation(b), nir_first_invocation(b));
   }

   case nir_intrinsic_shuffle:
      if (options->lower_shuffle &&
          (!options->lower_boolean_shuffle || intrin->src[0].ssa->bit_size != 1))
         return lower_shuffle(b, intrin);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      else if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;
   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
      if (options->lower_relative_shuffle &&
          (!options->lower_boolean_shuffle || intrin->src[0].ssa->bit_size != 1))
         return lower_to_shuffle(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      else if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;

   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
      if (options->lower_quad ||
          (options->lower_quad_broadcast_dynamic &&
           intrin->intrinsic == nir_intrinsic_quad_broadcast &&
           !nir_src_is_const(intrin->src[1])))
         return lower_dynamic_quad_broadcast(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      break;

   case nir_intrinsic_reduce: {
      nir_def *ret = NULL;
      /* A cluster size greater than the subgroup size is implementation-defined */
      if (options->subgroup_size &&
          nir_intrinsic_cluster_size(intrin) >= options->subgroup_size) {
         nir_intrinsic_set_cluster_size(intrin, 0);
         ret = NIR_LOWER_INSTR_PROGRESS;
      }
      if (nir_intrinsic_cluster_size(intrin) == 1)
         return intrin->src[0].ssa;
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      if (intrin->def.bit_size == 1 &&
          (options->lower_boolean_reduce || options->lower_reduce))
         return lower_boolean_reduce(b, intrin, options);
      if (options->lower_reduce)
         return lower_scan_reduce(b, intrin, options->subgroup_size);
      return ret;
   }
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      if (intrin->def.bit_size == 1 &&
          (options->lower_boolean_reduce || options->lower_reduce))
         return lower_boolean_reduce(b, intrin, options);
      if (options->lower_reduce)
         return lower_scan_reduce(b, intrin, options->subgroup_size);
      break;

   case nir_intrinsic_rotate:
      if (options->lower_rotate_to_shuffle &&
          (!options->lower_boolean_shuffle || intrin->src[0].ssa->bit_size != 1))
         return lower_to_shuffle(b, intrin, options);
      else if (options->lower_to_scalar && intrin->num_components > 1)
         return lower_subgroup_op_to_scalar(b, intrin);
      else if (options->lower_boolean_shuffle && intrin->src[0].ssa->bit_size == 1)
         return lower_boolean_shuffle(b, intrin, options);
      else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64)
         return lower_subgroup_op_to_32bit(b, intrin);
      break;
   case nir_intrinsic_masked_swizzle_amd:
      if (options->lower_to_scalar && intrin->num_components > 1) {
         return lower_subgroup_op_to_scalar(b, intrin);
      } else if (options->lower_shuffle_to_32bit && intrin->src[0].ssa->bit_size == 64) {
         return lower_subgroup_op_to_32bit(b, intrin);
      }
      break;

   default:
      break;
   }

   return NULL;
}

bool
nir_lower_subgroups(nir_shader *shader,
                    const nir_lower_subgroups_options *options)
{
   void *filter = options->filter ? options->filter : lower_subgroups_filter;
   return nir_shader_lower_instructions(shader, filter,
                                        lower_subgroups_instr,
                                        (void *)options);
}