/*
 * Copyright © 2018 Intel Corporation
 * Copyright © 2018 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3d_compiler.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_format_convert.h"

/** @file v3d_nir_lower_image_load_store.c
 *
 * Performs any necessary lowering of GL_ARB_shader_image_load_store
 * operations.
 *
 * On V3D 4.x, we just need to do format conversion for stores such that the
 * GPU can effectively memcpy the arguments (in increments of 32-bit words)
 * into the texel.  Loads are the same as texturing, where we may need to
 * unpack from 16-bit ints or floats.
 *
 * On V3D 3.x, to implement image load store we would need to do manual tiling
 * calculations and load/store using the TMU general memory access path.
 */
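/* For example, a store to an RGBA8 unorm image is lowered to a
 * float-to-unorm8 conversion of the vec4 color followed by packing of the
 * four channels into a single 32-bit word that the TMU can write directly.
 */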

static const unsigned bits_8[4] = {8, 8, 8, 8};
static const unsigned bits_16[4] = {16, 16, 16, 16};
static const unsigned bits_1010102[4] = {10, 10, 10, 2};

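/* Whether the format's TMU return value is 32-bit per channel rather than
 * 16-bit. E.g. R32F and RGBA16 (normalized) return 32-bit values, while
 * R16F and RGBA8 return 16-bit values.
 */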
bool
v3d_gl_format_is_return_32(enum pipe_format format)
{
        /* We can get a NONE format in Vulkan because we support the
         * shaderStorageImageReadWithoutFormat feature. We consider these to
         * always use 32-bit precision.
         */
        if (format == PIPE_FORMAT_NONE)
                return true;

        const struct util_format_description *desc =
                util_format_description(format);
        const struct util_format_channel_description *chan = &desc->channel[0];

        return chan->size > 16 || (chan->size == 16 && chan->normalized);
}

/* Packs a vector of 32-bit colors, each in the range [0, (1 << bits[i]) - 1],
 * into a 32-bit SSA vector with as many channels as necessary to store all
 * the bits.
 *
 * This is the generic helper, using only common NIR operations.
 */
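/* E.g. with bits_8 and 4 components the channels land at bit offsets
 * 0/8/16/24 of a single 32-bit word, while bits_16 with 4 components fills
 * two 32-bit words.
 */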
static nir_def *
pack_bits(nir_builder *b, nir_def *color, const unsigned *bits,
          int num_components, bool mask)
{
        nir_def *results[4];
        int offset = 0;
        for (int i = 0; i < num_components; i++) {
                nir_def *chan = nir_channel(b, color, i);

                /* Channels being stored shouldn't cross a 32-bit boundary. */
                assert((offset & ~31) == ((offset + bits[i] - 1) & ~31));

                if (mask) {
                        chan = nir_iand(b, chan,
                                        nir_imm_int(b, (1 << bits[i]) - 1));
                }

                if (offset % 32 == 0) {
                        results[offset / 32] = chan;
                } else {
                        results[offset / 32] =
                                nir_ior(b, results[offset / 32],
                                        nir_ishl(b, chan,
                                                 nir_imm_int(b, offset % 32)));
                }
                offset += bits[i];
        }

        return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
}

/* Utility wrapper: half_2x16_split is mapped to vfpack, and sometimes it is
 * just easier to read vfpack in the code, especially when using the PRM as
 * a reference.
 */
static inline nir_def *
nir_vfpack(nir_builder *b, nir_def *p1, nir_def *p2)
{
        return nir_pack_half_2x16_split(b, p1, p2);
}

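/* Packs a 3-channel float color to r11g11b10f: vfpack pairs the channels
 * into f16 halves, and nir_pack_32_to_r11g11b10_v3d produces the final
 * packed word.
 */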
static inline nir_def *
pack_11f11f10f(nir_builder *b, nir_def *color)
{
        nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
                                 nir_channel(b, color, 1));
        nir_def *undef = nir_undef(b, 1, color->bit_size);
        nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);

        return nir_pack_32_to_r11g11b10_v3d(b, p1, p2);
}

static inline nir_def *
pack_r10g10b10a2_uint(nir_builder *b, nir_def *color)
{
        nir_def *p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
                                                nir_channel(b, color, 1));
        nir_def *p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
                                                nir_channel(b, color, 3));

        return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
}

static inline nir_def *
pack_r10g10b10a2_unorm(nir_builder *b, nir_def *color)
{
        nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
                                 nir_channel(b, color, 1));
        p1 = nir_pack_2x16_to_unorm_2x10_v3d(b, p1);

        nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
                                 nir_channel(b, color, 3));
        p2 = nir_pack_2x16_to_unorm_10_2_v3d(b, p2);

        return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
}

enum hw_conversion {
        NONE,
        TO_SNORM,
        TO_UNORM
};

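/* Packs up to 4 32-bit channels into 8-bit fields using the custom V3D pack
 * ops, optionally converting to unorm8/snorm8 on the way. E.g. a 4-component
 * snorm color becomes two vfpacks, two 2x16-to-snorm-2x8 packs and a final
 * 4x16-to-4x8 pack.
 */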
static inline nir_def *
pack_8bit(nir_builder *b, nir_def *color,
          unsigned num_components,
          enum hw_conversion conversion)
{
        /* Note that usually you should not use this method (which relies on
         * custom packing) for 1 component if we are not doing any
         * conversion. But we also support that case, and let the caller
         * decide which method to use.
         */
        nir_def *p1;
        nir_def *p2;

        if (conversion == NONE) {
                p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
                                               nir_channel(b, color, num_components == 1 ? 0 : 1));
        } else {
                p1 = nir_vfpack(b, nir_channel(b, color, 0),
                                nir_channel(b, color, num_components == 1 ? 0 : 1));
                p1 = (conversion == TO_UNORM) ?
                   nir_pack_2x16_to_unorm_2x8_v3d(b, p1) :
                   nir_pack_2x16_to_snorm_2x8_v3d(b, p1);
        }
        if (num_components == 4) {
                if (conversion == NONE) {
                        p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
                                                       nir_channel(b, color, 3));
                } else {
                        p2 = nir_vfpack(b, nir_channel(b, color, 2),
                                        nir_channel(b, color, 3));
                        p2 = (conversion == TO_UNORM) ?
                           nir_pack_2x16_to_unorm_2x8_v3d(b, p2) :
                           nir_pack_2x16_to_snorm_2x8_v3d(b, p2);
                }
        } else {
                /* Using an undef here would be more correct, but in this
                 * case we get worse shader-db results with some CTS tests,
                 * so we just reuse the first packing.
                 */
                p2 = p1;
        }

        return nir_pack_4x16_to_4x8_v3d(b, p1, p2);
}

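/* Packs 1 to 4 32-bit channels into 16-bit fields, optionally converting to
 * snorm16/unorm16 first. E.g. 4 components pack into a 2-channel vector via
 * two 2x32-to-2x16 packs.
 */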
static inline nir_def *
pack_16bit(nir_builder *b, nir_def *color,
           unsigned num_components,
           enum hw_conversion conversion)
{
        nir_def *results[2] = {0};
        nir_def *channels[4] = {0};

        for (unsigned i = 0; i < num_components; i++) {
                channels[i] = nir_channel(b, color, i);
                switch (conversion) {
                case TO_SNORM:
                        channels[i] = nir_f2snorm_16_v3d(b, channels[i]);
                        break;
                case TO_UNORM:
                        channels[i] = nir_f2unorm_16_v3d(b, channels[i]);
                        break;
                default:
                        /* Note that usually you should not use this method
                         * (which relies on custom packing) if we are not
                         * doing any conversion. But we also support that
                         * case, and let the caller decide which method to
                         * use.
                         */
                        break;
                }
        }

        switch (num_components) {
        case 1:
                results[0] = channels[0];
                break;
        case 4:
                results[1] = nir_pack_2x32_to_2x16_v3d(b, channels[2], channels[3]);
                FALLTHROUGH;
        case 2:
                results[0] = nir_pack_2x32_to_2x16_v3d(b, channels[0], channels[1]);
                break;
        default:
                unreachable("Invalid number of components");
        }

        return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
}

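/* Picks between the generic pack_bits() path and the custom hw packing paths
 * above, based on the channel size and on whether a normalized conversion or
 * sign masking is needed.
 */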
static inline nir_def *
pack_xbit(nir_builder *b, nir_def *color,
          unsigned num_components,
          const struct util_format_channel_description *r_chan)
{
        bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED);
        enum hw_conversion conversion = NONE;
        if (r_chan->normalized) {
                conversion =
                        (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM;
        }

        switch (r_chan->size) {
        case 8:
                if (conversion == NONE && num_components < 2)
                        return pack_bits(b, color, bits_8, num_components, pack_mask);
                else
                        return pack_8bit(b, color, num_components, conversion);
                break;
        case 16:
                /* pack_mask implies that the generic packing method would
                 * need extra operations to handle negative values, so in that
                 * case, even without a conversion, it is better to use the
                 * packing based on custom hw operations.
                 */
                if (conversion == NONE && !pack_mask)
                        return pack_bits(b, color, bits_16, num_components, pack_mask);
                else
                        return pack_16bit(b, color, num_components, conversion);
                break;
        default:
                unreachable("unrecognized bits");
        }
}

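/* On V3D 4.x, stores are lowered with the generic NIR format helpers:
 * convert the color to the format's representation, then pack it with
 * pack_bits().
 */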
static bool
v3d42_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
{
        enum pipe_format format = nir_intrinsic_format(instr);
        assert(format != PIPE_FORMAT_NONE);
        const struct util_format_description *desc =
                util_format_description(format);
        const struct util_format_channel_description *r_chan = &desc->channel[0];
        unsigned num_components = util_format_get_nr_components(format);

        b->cursor = nir_before_instr(&instr->instr);

        nir_def *color = nir_trim_vector(b,
                                         instr->src[3].ssa,
                                         num_components);
        nir_def *formatted = NULL;

        if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
                formatted = nir_format_pack_11f11f10f(b, color);
        } else if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
                formatted = nir_format_pack_r9g9b9e5(b, color);
        } else if (r_chan->size == 32) {
                /* For 32-bit formats, we just have to move the vector
                 * across (possibly reducing the number of channels).
                 */
                formatted = color;
        } else {
                const unsigned *bits;

                switch (r_chan->size) {
                case 8:
                        bits = bits_8;
                        break;
                case 10:
                        bits = bits_1010102;
                        break;
                case 16:
                        bits = bits_16;
                        break;
                default:
                        unreachable("unrecognized bits");
                }

                bool pack_mask = false;
                if (r_chan->pure_integer &&
                    r_chan->type == UTIL_FORMAT_TYPE_SIGNED) {
                        /* We don't need to do any conversion or clamping in this case */
                        formatted = color;
                        pack_mask = true;
                } else if (r_chan->pure_integer &&
                           r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) {
                        /* We don't need to do any conversion or clamping in this case */
                        formatted = color;
                } else if (r_chan->normalized &&
                           r_chan->type == UTIL_FORMAT_TYPE_SIGNED) {
                        formatted = nir_format_float_to_snorm(b, color, bits);
                        pack_mask = true;
                } else if (r_chan->normalized &&
                           r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) {
                        formatted = nir_format_float_to_unorm(b, color, bits);
                } else {
                        assert(r_chan->size == 16);
                        assert(r_chan->type == UTIL_FORMAT_TYPE_FLOAT);
                        formatted = nir_format_float_to_half(b, color);
                }

                formatted = pack_bits(b, formatted, bits, num_components,
                                      pack_mask);
        }

        nir_src_rewrite(&instr->src[3], formatted);
        instr->num_components = formatted->num_components;

        return true;
}

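/* On V3D 7.1 we can instead use the custom hw pack ops (pack_xbit() and
 * friends) for most 8-bit, 16-bit and 10/10/10/2 formats.
 */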
static bool
v3d71_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
{
        enum pipe_format format = nir_intrinsic_format(instr);
        assert(format != PIPE_FORMAT_NONE);
        const struct util_format_description *desc =
                util_format_description(format);
        const struct util_format_channel_description *r_chan = &desc->channel[0];
        unsigned num_components = util_format_get_nr_components(format);
        b->cursor = nir_before_instr(&instr->instr);

        nir_def *color =
           nir_trim_vector(b, instr->src[3].ssa, num_components);
        nir_def *formatted = NULL;
        if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
                formatted = nir_format_pack_r9g9b9e5(b, color);
        } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
                formatted = pack_11f11f10f(b, color);
        } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) {
                formatted = pack_r10g10b10a2_uint(b, color);
        } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) {
                formatted = pack_r10g10b10a2_unorm(b, color);
        } else if (r_chan->size == 32) {
                /* For 32-bit formats, we just have to move the vector
                 * across (possibly reducing the number of channels).
                 */
                formatted = color;
        } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) {
                assert(r_chan->size == 16);
                formatted = nir_format_float_to_half(b, color);
                formatted = pack_bits(b, formatted, bits_16, num_components,
                                      false);
        } else {
                assert(r_chan->size == 8 || r_chan->size == 16);
                formatted = pack_xbit(b, color, num_components, r_chan);
        }

        nir_src_rewrite(&instr->src[3], formatted);
        instr->num_components = formatted->num_components;

        return true;
}

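/* For formats that return 16-bit values from the TMU, a load comes back as
 * up to four 16-bit values packed into two 32-bit words, so unpack them into
 * a 32-bit vec4 here.
 */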
static bool
v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr)
{
        static const unsigned bits16[] = {16, 16, 16, 16};
        enum pipe_format format = nir_intrinsic_format(instr);

        if (v3d_gl_format_is_return_32(format))
                return false;

        b->cursor = nir_after_instr(&instr->instr);

        nir_def *result = &instr->def;
        if (util_format_is_pure_uint(format)) {
                result = nir_format_unpack_uint(b, result, bits16, 4);
        } else if (util_format_is_pure_sint(format)) {
                result = nir_format_unpack_sint(b, result, bits16, 4);
        } else {
                nir_def *rg = nir_channel(b, result, 0);
                nir_def *ba = nir_channel(b, result, 1);
                result = nir_vec4(b,
                                  nir_unpack_half_2x16_split_x(b, rg),
                                  nir_unpack_half_2x16_split_y(b, rg),
                                  nir_unpack_half_2x16_split_x(b, ba),
                                  nir_unpack_half_2x16_split_y(b, ba));
        }

        nir_def_rewrite_uses_after(&instr->def, result,
                                   result->parent_instr);

        return true;
}

static bool
v3d_nir_lower_image_load_store_cb(nir_builder *b,
                                  nir_intrinsic_instr *intr,
                                  void *_state)
{
        struct v3d_compile *c = (struct v3d_compile *) _state;

        switch (intr->intrinsic) {
        case nir_intrinsic_image_load:
                return v3d_nir_lower_image_load(b, intr);
        case nir_intrinsic_image_store:
                if (c->devinfo->ver >= 71)
                        return v3d71_nir_lower_image_store(b, intr);
                else
                        return v3d42_nir_lower_image_store(b, intr);
                break;
        default:
                return false;
        }

        return false;
}

bool
v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c)
{
        return nir_shader_intrinsics_pass(s,
                                          v3d_nir_lower_image_load_store_cb,
                                          nir_metadata_control_flow, c);
}
458