/*
 * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"

/**
 * The V3D TMU can only do general vector access with 32-bit components, so
 * for anything else we need to split vector load/store instructions into
 * scalars.
 *
 * Note that a vectorization pass run after this lowering may be able to
 * re-vectorize some of these using 32-bit load/store instructions instead,
 * which we do support.
 */

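/*
 * Creates a single-component copy of the vector intrinsic for the given
 * component. The per-component byte offset (component * bit_size / 8) is
 * folded into the new intrinsic's base index; *scalar_offset receives the
 * offset source to use for the new instruction.
 */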
static nir_intrinsic_instr *
init_scalar_intrinsic(nir_builder *b,
                      nir_intrinsic_instr *intr,
                      uint32_t component,
                      nir_def *offset,
                      uint32_t bit_size,
                      nir_def **scalar_offset)
{
        nir_intrinsic_instr *new_intr =
                nir_intrinsic_instr_create(b->shader, intr->intrinsic);

        nir_intrinsic_copy_const_indices(new_intr, intr);

        const int offset_units = bit_size / 8;
        assert(offset_units >= 1);
        assert(!nir_intrinsic_has_align_mul(intr));
        assert(nir_intrinsic_has_base(intr));

        *scalar_offset = offset;
        unsigned offset_adj = offset_units * component;
        nir_intrinsic_set_base(new_intr, nir_intrinsic_base(intr) + offset_adj);
        new_intr->num_components = 1;

        return new_intr;
}

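/*
 * Splits a non-32-bit vector load into one scalar load per component and
 * rebuilds the original destination from the scalar results.
 */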
static bool
lower_load_bitsize(nir_builder *b,
                   nir_intrinsic_instr *intr)
{
        uint32_t bit_size = intr->def.bit_size;
        if (bit_size == 32)
                return false;

        /* No need to split if it is already scalar */
        int num_comp = nir_intrinsic_dest_components(intr);
        if (num_comp <= 1)
                return false;

        b->cursor = nir_before_instr(&intr->instr);

        unsigned offset_idx = nir_get_io_offset_src_number(intr);
        nir_def *offset = intr->src[offset_idx].ssa;

        /* Split the vector load into multiple scalar loads */
        nir_def *dest_components[16] = { NULL };
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        for (int component = 0; component < num_comp; component++) {
                nir_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              bit_size, &scalar_offset);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == offset_idx) {
                                new_intr->src[i] = nir_src_for_ssa(scalar_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_def_init(&new_intr->instr, &new_intr->def, 1,
                             bit_size);
                dest_components[component] = &new_intr->def;

                nir_builder_instr_insert(b, &new_intr->instr);
        }

        nir_def *new_dst = nir_vec(b, dest_components, num_comp);
        nir_def_replace(&intr->def, new_dst);
        return true;
}

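/*
 * Intrinsics pass callback: only uniform loads are split here; the other
 * memory access modes are handled by nir_lower_mem_access_bit_sizes in
 * v3d_nir_lower_load_store_bitsize below.
 */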
static bool
lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr,
                         void *data)
{
        switch (intr->intrinsic) {
        case nir_intrinsic_load_uniform:
                return lower_load_bitsize(b, intr);

        default:
                return false;
        }
}

/*
 * The idea here is to lower bit sizes until they match the alignment of the
 * data, so that we do not have to use atomics. We also keep load/stores that
 * we can handle with a 32-bit bit size vectorized, up to at most 4
 * components.
 */
static nir_mem_access_size_align
v3d_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
                  uint8_t input_bit_size, uint32_t align,
                  uint32_t align_offset, bool offset_is_const,
                  const void *cb_data)
{
        /* we only support single component 32-bit load/stores on scratch */
        if (intrin == nir_intrinsic_load_scratch ||
            intrin == nir_intrinsic_store_scratch) {
                return (nir_mem_access_size_align){
                        .num_components = 1,
                        .bit_size = 32,
                        .align = 4,
                };
        }

        align = nir_combined_align(align, align_offset);
        assert(util_is_power_of_two_nonzero(align));

        /* TODO: we could update the bit_size to 32 if possible, but that might
         * cause suboptimal pack/unpack operations.
         */
        unsigned bit_size = MIN2(32, input_bit_size);

        /* But if we're only aligned to 1 byte, use 8-bit loads. If we're only
         * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads
         * due to the size.
         */
        if (align == 1)
                bit_size = 8;
        else if (align == 2)
                bit_size = MIN2(bit_size, 16);

        /* But we only support single component loads for anything below 32
         * bit, and only up to 4 components for 32 bit.
         */
        unsigned num_components;
        if (bit_size == 32) {
                num_components = MIN2(bytes / 4, 4);

                /* Now we have to reduce the num_components even further for
                 * unaligned vector load/stores
                 */
                num_components = MIN2(align / 4, num_components);
        } else {
                num_components = 1;
        }

        return (nir_mem_access_size_align){
                .num_components = num_components,
                .bit_size = bit_size,
                .align = (bit_size / 8) * (num_components == 3 ? 4 : num_components),
        };
}

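/*
 * Maps a 2x32 global access intrinsic to its regular counterpart, which
 * takes a single 32-bit address. Ops that are not 2x32 global accesses are
 * returned unchanged.
 */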
static nir_intrinsic_op
convert_global_2x32_to_scalar(nir_intrinsic_op op)
{
        switch (op) {
        case nir_intrinsic_global_atomic_2x32:
                return nir_intrinsic_global_atomic;
        case nir_intrinsic_global_atomic_swap_2x32:
                return nir_intrinsic_global_atomic_swap;
        case nir_intrinsic_load_global_2x32:
                return nir_intrinsic_load_global;
        case nir_intrinsic_store_global_2x32:
                return nir_intrinsic_store_global;
        default:
                return op;
        }
}

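/*
 * Rewrites a 2x32 global access to its scalar-address opcode, keeping only
 * the low 32 bits of the address vector.
 */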
static bool
lower_global_2x32(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
        nir_intrinsic_op op = convert_global_2x32_to_scalar(intr->intrinsic);
        if (op == intr->intrinsic)
                return false;

        b->cursor = nir_before_instr(&intr->instr);
        nir_src *addr_src = nir_get_io_offset_src(intr);
        nir_src_rewrite(addr_src, nir_channel(b, addr_src->ssa, 0));
        intr->intrinsic = op;

        return true;
}

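/*
 * Entry point: splits non-32-bit vector uniform loads into scalars and then
 * runs nir_lower_mem_access_bit_sizes with v3d_size_align_cb for the
 * remaining memory modes.
 */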
bool
v3d_nir_lower_load_store_bitsize(nir_shader *s)
{
        nir_lower_mem_access_bit_sizes_options lower_options = {
                .modes = nir_var_mem_global | nir_var_mem_ssbo |
                         nir_var_mem_ubo | nir_var_mem_constant |
                         nir_var_mem_shared | nir_var_function_temp,
                .callback = v3d_size_align_cb,
        };

        bool res = nir_shader_intrinsics_pass(s, lower_load_store_bitsize,
                                              nir_metadata_control_flow,
                                              NULL);
        res |= nir_lower_mem_access_bit_sizes(s, &lower_options);
        return res;
}

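/*
 * Entry point: lowers 2x32 global memory accesses to their scalar-address
 * variants.
 */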
bool
v3d_nir_lower_global_2x32(nir_shader *s)
{
        return nir_shader_intrinsics_pass(s, lower_global_2x32,
                                          nir_metadata_control_flow,
                                          NULL);
}
239