/*
 * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/v3d_compiler.h"
#include "compiler/nir/nir_builder.h"
/**
 * The V3D TMU can only do 32-bit general vector access, so for anything else
 * we need to split vector load/store instructions into scalar ones.
 *
 * Note that a vectorization pass run after this lowering may be able to
 * re-vectorize some of these using 32-bit load/store instructions instead,
 * which we do support.
 */
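/*
 * For example (illustrative only; not exact NIR print syntax), a 16-bit vec2
 * load_uniform with base B:
 *
 *    vec2 16 ssa_1 = load_uniform (ssa_0) (base=B)
 *
 * is rewritten as two 16-bit scalar loads whose results are recombined:
 *
 *    vec1 16 ssa_2 = load_uniform (ssa_0) (base=B)
 *    vec1 16 ssa_3 = load_uniform (ssa_0) (base=B + 2)
 *    vec2 16 ssa_4 = vec2 ssa_2, ssa_3
 *
 * The base is advanced by bit_size / 8 bytes per component.
 */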

static nir_intrinsic_instr *
init_scalar_intrinsic(nir_builder *b,
                      nir_intrinsic_instr *intr,
                      uint32_t component,
                      nir_def *offset,
                      uint32_t bit_size,
                      nir_def **scalar_offset)
{

        nir_intrinsic_instr *new_intr =
                nir_intrinsic_instr_create(b->shader, intr->intrinsic);

        nir_intrinsic_copy_const_indices(new_intr, intr);

        const int offset_units = bit_size / 8;
        assert(offset_units >= 1);
        assert(!nir_intrinsic_has_align_mul(intr));
        assert(nir_intrinsic_has_base(intr));

        *scalar_offset = offset;
        unsigned offset_adj = offset_units * component;
        nir_intrinsic_set_base(new_intr, nir_intrinsic_base(intr) + offset_adj);
        new_intr->num_components = 1;

        return new_intr;
}

static bool
lower_load_bitsize(nir_builder *b,
                   nir_intrinsic_instr *intr)
{
        uint32_t bit_size = intr->def.bit_size;
        if (bit_size == 32)
                return false;

        /* No need to split if it is already scalar */
        int num_comp = nir_intrinsic_dest_components(intr);
        if (num_comp <= 1)
                return false;

        b->cursor = nir_before_instr(&intr->instr);

        unsigned offset_idx = nir_get_io_offset_src_number(intr);
        nir_def *offset = intr->src[offset_idx].ssa;

        /* Split the vector load into multiple scalar loads */
        nir_def *dest_components[16] = { NULL };
        const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
        for (int component = 0; component < num_comp; component++) {
                nir_def *scalar_offset;
                nir_intrinsic_instr *new_intr =
                        init_scalar_intrinsic(b, intr, component, offset,
                                              bit_size, &scalar_offset);

                for (unsigned i = 0; i < info->num_srcs; i++) {
                        if (i == offset_idx) {
                                new_intr->src[i] = nir_src_for_ssa(scalar_offset);
                        } else {
                                new_intr->src[i] = intr->src[i];
                        }
                }

                nir_def_init(&new_intr->instr, &new_intr->def, 1, bit_size);
                dest_components[component] = &new_intr->def;

                nir_builder_instr_insert(b, &new_intr->instr);
        }

        nir_def *new_dst = nir_vec(b, dest_components, num_comp);
        nir_def_replace(&intr->def, new_dst);
        return true;
}

static bool
lower_load_store_bitsize(nir_builder *b, nir_intrinsic_instr *intr,
                         void *data)
{
        switch (intr->intrinsic) {
        case nir_intrinsic_load_uniform:
                return lower_load_bitsize(b, intr);

        default:
                return false;
        }
}

/*
 * The idea here is to lower the bit_size of each access until it matches the
 * alignment of the data, so that we do not have to use atomics. We also keep
 * load/stores that we can operate on with a bit_size of 32 vectorized, with
 * at most 4 components.
 */
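/*
 * For example, with the callback below a 12-byte, 16-bit access that is only
 * 2-byte aligned is reported back as single 16-bit components, so
 * nir_lower_mem_access_bit_sizes() splits it into 16-bit scalar accesses,
 * while a 16-byte, 32-bit access with 16-byte alignment is kept as a single
 * vec4 access.
 */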
static nir_mem_access_size_align
v3d_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
                  uint8_t input_bit_size, uint32_t align,
                  uint32_t align_offset, bool offset_is_const,
                  const void *cb_data)
{
        /* We only support single-component 32-bit load/stores on scratch */
        if (intrin == nir_intrinsic_load_scratch ||
            intrin == nir_intrinsic_store_scratch) {
                return (nir_mem_access_size_align){
                        .num_components = 1,
                        .bit_size = 32,
                        .align = 4,
                };
        }

        align = nir_combined_align(align, align_offset);
        assert(util_is_power_of_two_nonzero(align));

        /* TODO: we could update the bit_size to 32 if possible, but that might
         * cause suboptimal pack/unpack operations.
         */
        unsigned bit_size = MIN2(32, input_bit_size);

        /* But if we're only aligned to 1 byte, use 8-bit loads. If we're only
         * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads
         * due to the size.
         */
        if (align == 1)
                bit_size = 8;
        else if (align == 2)
                bit_size = MIN2(bit_size, 16);

        /* We only support single-component loads for anything below 32-bit,
         * and at most 4 components at 32-bit.
         */
        unsigned num_components;
        if (bit_size == 32) {
                num_components = MIN2(bytes / 4, 4);

                /* Now we have to reduce num_components even further for
                 * unaligned vector load/stores.
                 */
                num_components = MIN2(align / 4, num_components);
        } else {
                num_components = 1;
        }

        return (nir_mem_access_size_align){
                .num_components = num_components,
                .bit_size = bit_size,
                .align = (bit_size / 8) * (num_components == 3 ? 4 : num_components),
        };
}

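/*
 * V3D uses 32-bit global addresses, so the *_global_2x32 intrinsics, which
 * carry the address as a two-component 32-bit vector, are rewritten to their
 * regular variants keeping only the first component of the address.
 */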
static nir_intrinsic_op
convert_global_2x32_to_scalar(nir_intrinsic_op op)
{
        switch (op) {
        case nir_intrinsic_global_atomic_2x32:
                return nir_intrinsic_global_atomic;
        case nir_intrinsic_global_atomic_swap_2x32:
                return nir_intrinsic_global_atomic_swap;
        case nir_intrinsic_load_global_2x32:
                return nir_intrinsic_load_global;
        case nir_intrinsic_store_global_2x32:
                return nir_intrinsic_store_global;
        default:
                return op;
        }
}

static bool
lower_global_2x32(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
        nir_intrinsic_op op = convert_global_2x32_to_scalar(intr->intrinsic);
        if (op == intr->intrinsic)
                return false;

        b->cursor = nir_before_instr(&intr->instr);
        nir_src *addr_src = nir_get_io_offset_src(intr);
        nir_src_rewrite(addr_src, nir_channel(b, addr_src->ssa, 0));
        intr->intrinsic = op;

        return true;
}

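/*
 * Scalarizes non-32-bit vector load_uniform intrinsics and then runs
 * nir_lower_mem_access_bit_sizes() with the callback above, so that the
 * remaining memory accesses end up with sizes and component counts the
 * hardware can handle.
 */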
bool
v3d_nir_lower_load_store_bitsize(nir_shader *s)
{
        nir_lower_mem_access_bit_sizes_options lower_options = {
                .modes = nir_var_mem_global | nir_var_mem_ssbo |
                         nir_var_mem_ubo | nir_var_mem_constant |
                         nir_var_mem_shared | nir_var_function_temp,
                .callback = v3d_size_align_cb,
        };

        bool res = nir_shader_intrinsics_pass(s, lower_load_store_bitsize,
                                              nir_metadata_control_flow,
                                              NULL);
        res |= nir_lower_mem_access_bit_sizes(s, &lower_options);
        return res;
}

bool
v3d_nir_lower_global_2x32(nir_shader *s)
{
        return nir_shader_intrinsics_pass(s, lower_global_2x32,
                                          nir_metadata_control_flow,
                                          NULL);
}