/*
 * Copyright © 2018 Intel Corporation
 * Copyright © 2018 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3d_compiler.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_format_convert.h"

/** @file v3d_nir_lower_image_load_store.c
 *
 * Performs any necessary lowering of GL_ARB_shader_image_load_store
 * operations.
 *
 * On V3D 4.x, we just need to do format conversion for stores such that the
 * GPU can effectively memcpy the arguments (in increments of 32-bit words)
 * into the texel. Loads are the same as texturing, where we may need to
 * unpack from 16-bit ints or floats.
 *
 * On V3D 3.x, to implement image load store we would need to do manual tiling
 * calculations and load/store using the TMU general memory access path.
 */

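/* Per-channel bit widths consumed by the generic pack_bits() path below:
 * plain 8-bit and 16-bit layouts, plus the 10:10:10:2 layout.
 */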
static const unsigned bits_8[4] = {8, 8, 8, 8};
static const unsigned bits_16[4] = {16, 16, 16, 16};
static const unsigned bits_1010102[4] = {10, 10, 10, 2};

bool
v3d_gl_format_is_return_32(enum pipe_format format)
{
        /* We can get a NONE format in Vulkan because we support the
         * shaderStorageImageReadWithoutFormat feature. We consider these to
         * always use 32-bit precision.
         */
        if (format == PIPE_FORMAT_NONE)
                return true;

        const struct util_format_description *desc =
                util_format_description(format);
        const struct util_format_channel_description *chan = &desc->channel[0];

        return chan->size > 16 || (chan->size == 16 && chan->normalized);
}

/* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] into a
 * 32-bit SSA value, with as many channels as necessary to store all the bits.
 *
 * This is the generic helper, using only common NIR operations.
 */
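/* For example, with bits = {8, 8, 8, 8} all four channels land in
 * results[0], while with bits = {16, 16, 16, 16} channels 0-1 go to
 * results[0] and channels 2-3 to results[1].
 */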
static nir_def *
pack_bits(nir_builder *b, nir_def *color, const unsigned *bits,
          int num_components, bool mask)
{
        nir_def *results[4];
        int offset = 0;
        for (int i = 0; i < num_components; i++) {
                nir_def *chan = nir_channel(b, color, i);

                /* Channels being stored shouldn't cross a 32-bit boundary. */
                assert((offset & ~31) == ((offset + bits[i] - 1) & ~31));

                if (mask) {
                        chan = nir_iand(b, chan,
                                        nir_imm_int(b, (1 << bits[i]) - 1));
                }

                if (offset % 32 == 0) {
                        results[offset / 32] = chan;
                } else {
                        results[offset / 32] =
                                nir_ior(b, results[offset / 32],
                                        nir_ishl(b, chan,
                                                 nir_imm_int(b, offset % 32)));
                }
                offset += bits[i];
        }

        return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
}

/* Utility wrapper: half_2x16_split is mapped to vfpack, and sometimes it is
 * just easier to read vfpack in the code, especially when using the PRM as
 * a reference.
 */
static inline nir_def *
nir_vfpack(nir_builder *b, nir_def *p1, nir_def *p2)
{
        return nir_pack_half_2x16_split(b, p1, p2);
}

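/* Packs a vec3 of floats into the R11G11B10F layout: vfpack first reduces
 * the channels to f16 pairs (r|g and b|undef), and the V3D-specific opcode
 * then assembles the 11:11:10 float word.
 */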
static inline nir_def *
pack_11f11f10f(nir_builder *b, nir_def *color)
{
        nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
                                 nir_channel(b, color, 1));
        nir_def *undef = nir_undef(b, 1, color->bit_size);
        nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);

        return nir_pack_32_to_r11g11b10_v3d(b, p1, p2);
}

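/* Packs a uint vec4 into the R10G10B10A2_UINT layout: pairs of 32-bit
 * channels are first narrowed to 2x16, and the V3D-specific opcode then
 * packs them into the 10:10:10:2 word.
 */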
static inline nir_def *
pack_r10g10b10a2_uint(nir_builder *b, nir_def *color)
{
        nir_def *p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
                                                nir_channel(b, color, 1));
        nir_def *p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
                                                nir_channel(b, color, 3));

        return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
}

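/* Same layout as above, but for unorm data: channels are converted to f16
 * pairs with vfpack and then to 10-bit (and 2-bit alpha) unorm values
 * before the final 10:10:10:2 pack.
 */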
static inline nir_def *
pack_r10g10b10a2_unorm(nir_builder *b, nir_def *color)
{
        nir_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
                                 nir_channel(b, color, 1));
        p1 = nir_pack_2x16_to_unorm_2x10_v3d(b, p1);

        nir_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
                                 nir_channel(b, color, 3));
        p2 = nir_pack_2x16_to_unorm_10_2_v3d(b, p2);

        return nir_pack_uint_32_to_r10g10b10a2_v3d(b, p1, p2);
}

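/* Conversion that the V3D-specific packing opcodes can apply on top of the
 * plain bit packing: none, float-to-snorm or float-to-unorm.
 */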
enum hw_conversion {
        NONE,
        TO_SNORM,
        TO_UNORM
};

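/* Packs up to four 32-bit channels into the 8-bit components of a single
 * 32-bit word using the V3D-specific pack opcodes, optionally converting
 * to unorm8/snorm8 on the way.
 */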
static inline nir_def *
pack_8bit(nir_builder *b, nir_def *color,
          unsigned num_components,
          enum hw_conversion conversion)
{
        /* Note that usually you should not use this method (which relies on
         * custom packing) for 1 component if we are not doing any
         * conversion. But we also support that case, and let the caller
         * decide which method to use.
         */
        nir_def *p1;
        nir_def *p2;

        if (conversion == NONE) {
                p1 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 0),
                                               nir_channel(b, color, num_components == 1 ? 0 : 1));
        } else {
                p1 = nir_vfpack(b, nir_channel(b, color, 0),
                                nir_channel(b, color, num_components == 1 ? 0 : 1));
                p1 = (conversion == TO_UNORM) ?
                        nir_pack_2x16_to_unorm_2x8_v3d(b, p1) :
                        nir_pack_2x16_to_snorm_2x8_v3d(b, p1);
        }
        if (num_components == 4) {
                if (conversion == NONE) {
                        p2 = nir_pack_2x32_to_2x16_v3d(b, nir_channel(b, color, 2),
                                                       nir_channel(b, color, 3));
                } else {
                        p2 = nir_vfpack(b, nir_channel(b, color, 2),
                                        nir_channel(b, color, 3));
                        p2 = (conversion == TO_UNORM) ?
                                nir_pack_2x16_to_unorm_2x8_v3d(b, p2) :
                                nir_pack_2x16_to_snorm_2x8_v3d(b, p2);
                }
        } else {
                /* Using an undef here would be more correct. But for this
                 * case we are getting worse shader-db values with some CTS
                 * tests, so we just reuse the first packing.
                 */
                p2 = p1;
        }

        return nir_pack_4x16_to_4x8_v3d(b, p1, p2);
}

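/* Packs up to four 32-bit channels into 16-bit components (one or two
 * 32-bit words), optionally converting floats to snorm16/unorm16 with the
 * V3D-specific opcodes.
 */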
static inline nir_def *
pack_16bit(nir_builder *b, nir_def *color,
           unsigned num_components,
           enum hw_conversion conversion)
{
        nir_def *results[2] = {0};
        nir_def *channels[4] = {0};

        for (unsigned i = 0; i < num_components; i++) {
                channels[i] = nir_channel(b, color, i);
                switch (conversion) {
                case TO_SNORM:
                        channels[i] = nir_f2snorm_16_v3d(b, channels[i]);
                        break;
                case TO_UNORM:
                        channels[i] = nir_f2unorm_16_v3d(b, channels[i]);
                        break;
                default:
                        /* Note that usually you should not use this method
                         * (which relies on custom packing) if we are not
                         * doing any conversion. But we also support that
                         * case, and let the caller decide which method to
                         * use.
                         */
                        break;
                }
        }

        switch (num_components) {
        case 1:
                results[0] = channels[0];
                break;
        case 4:
                results[1] = nir_pack_2x32_to_2x16_v3d(b, channels[2], channels[3]);
                FALLTHROUGH;
        case 2:
                results[0] = nir_pack_2x32_to_2x16_v3d(b, channels[0], channels[1]);
                break;
        default:
                unreachable("Invalid number of components");
        }

        return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
}

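/* Picks between the generic pack_bits() path and the V3D-specific packing
 * helpers above, based on the channel size and on whether a snorm/unorm
 * conversion is needed.
 */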
static inline nir_def *
pack_xbit(nir_builder *b, nir_def *color,
          unsigned num_components,
          const struct util_format_channel_description *r_chan)
{
        bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED);
        enum hw_conversion conversion = NONE;
        if (r_chan->normalized) {
                conversion =
                        (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM;
        }

        switch (r_chan->size) {
        case 8:
                if (conversion == NONE && num_components < 2)
                        return pack_bits(b, color, bits_8, num_components, pack_mask);
                else
                        return pack_8bit(b, color, num_components, conversion);
                break;
        case 16:
                /* pack_mask implies that the generic packing method would
                 * need to include extra operations to handle negative values,
                 * so in that case, even without a conversion, it is better to
                 * use the packing based on custom hw operations.
                 */
                if (conversion == NONE && !pack_mask)
                        return pack_bits(b, color, bits_16, num_components, pack_mask);
                else
                        return pack_16bit(b, color, num_components, conversion);
                break;
        default:
                unreachable("unrecognized bits");
        }
}

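/* Store lowering for V3D 4.x: reformats the value in src[3] (the data
 * operand of the image store intrinsic) into the packed texel layout the
 * TMU expects, using the generic NIR format helpers.
 */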
static bool
v3d42_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
{
        enum pipe_format format = nir_intrinsic_format(instr);
        assert(format != PIPE_FORMAT_NONE);
        const struct util_format_description *desc =
                util_format_description(format);
        const struct util_format_channel_description *r_chan = &desc->channel[0];
        unsigned num_components = util_format_get_nr_components(format);

        b->cursor = nir_before_instr(&instr->instr);

        nir_def *color = nir_trim_vector(b,
                                         instr->src[3].ssa,
                                         num_components);
        nir_def *formatted = NULL;

        if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
                formatted = nir_format_pack_11f11f10f(b, color);
        } else if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
                formatted = nir_format_pack_r9g9b9e5(b, color);
        } else if (r_chan->size == 32) {
                /* For 32-bit formats, we just have to move the vector
                 * across (possibly reducing the number of channels).
                 */
                formatted = color;
        } else {
                const unsigned *bits;

                switch (r_chan->size) {
                case 8:
                        bits = bits_8;
                        break;
                case 10:
                        bits = bits_1010102;
                        break;
                case 16:
                        bits = bits_16;
                        break;
                default:
                        unreachable("unrecognized bits");
                }

                bool pack_mask = false;
                if (r_chan->pure_integer &&
                    r_chan->type == UTIL_FORMAT_TYPE_SIGNED) {
                        /* We don't need to do any conversion or clamping in this case */
                        formatted = color;
                        pack_mask = true;
                } else if (r_chan->pure_integer &&
                           r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) {
                        /* We don't need to do any conversion or clamping in this case */
                        formatted = color;
                } else if (r_chan->normalized &&
                           r_chan->type == UTIL_FORMAT_TYPE_SIGNED) {
                        formatted = nir_format_float_to_snorm(b, color, bits);
                        pack_mask = true;
                } else if (r_chan->normalized &&
                           r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) {
                        formatted = nir_format_float_to_unorm(b, color, bits);
                } else {
                        assert(r_chan->size == 16);
                        assert(r_chan->type == UTIL_FORMAT_TYPE_FLOAT);
                        formatted = nir_format_float_to_half(b, color);
                }

                formatted = pack_bits(b, formatted, bits, num_components,
                                      pack_mask);
        }

        nir_src_rewrite(&instr->src[3], formatted);
        instr->num_components = formatted->num_components;

        return true;
}

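/* Store lowering for V3D 7.1+: same idea as the 4.x path, but prefers the
 * V3D-specific packing opcodes over the generic NIR format helpers where
 * they are available.
 */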
static bool
v3d71_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
{
        enum pipe_format format = nir_intrinsic_format(instr);
        assert(format != PIPE_FORMAT_NONE);
        const struct util_format_description *desc =
                util_format_description(format);
        const struct util_format_channel_description *r_chan = &desc->channel[0];
        unsigned num_components = util_format_get_nr_components(format);
        b->cursor = nir_before_instr(&instr->instr);

        nir_def *color =
                nir_trim_vector(b, instr->src[3].ssa, num_components);
        nir_def *formatted = NULL;
        if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
                formatted = nir_format_pack_r9g9b9e5(b, color);
        } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
                formatted = pack_11f11f10f(b, color);
        } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) {
                formatted = pack_r10g10b10a2_uint(b, color);
        } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) {
                formatted = pack_r10g10b10a2_unorm(b, color);
        } else if (r_chan->size == 32) {
                /* For 32-bit formats, we just have to move the vector
                 * across (possibly reducing the number of channels).
                 */
                formatted = color;
        } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) {
                assert(r_chan->size == 16);
                formatted = nir_format_float_to_half(b, color);
                formatted = pack_bits(b, formatted, bits_16, num_components,
                                      false);
        } else {
                assert(r_chan->size == 8 || r_chan->size == 16);
                formatted = pack_xbit(b, color, num_components, r_chan);
        }

        nir_src_rewrite(&instr->src[3], formatted);
        instr->num_components = formatted->num_components;

        return true;
}

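/* For formats that return 16-bit data from the TMU, the loaded value holds
 * packed pairs of 16-bit ints or half floats, so expand it back into a
 * 32-bit-per-channel vec4 after the load.
 */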
static bool
v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr)
{
        static const unsigned bits16[] = {16, 16, 16, 16};
        enum pipe_format format = nir_intrinsic_format(instr);

        if (v3d_gl_format_is_return_32(format))
                return false;

        b->cursor = nir_after_instr(&instr->instr);

        nir_def *result = &instr->def;
        if (util_format_is_pure_uint(format)) {
                result = nir_format_unpack_uint(b, result, bits16, 4);
        } else if (util_format_is_pure_sint(format)) {
                result = nir_format_unpack_sint(b, result, bits16, 4);
        } else {
                nir_def *rg = nir_channel(b, result, 0);
                nir_def *ba = nir_channel(b, result, 1);
                result = nir_vec4(b,
                                  nir_unpack_half_2x16_split_x(b, rg),
                                  nir_unpack_half_2x16_split_y(b, rg),
                                  nir_unpack_half_2x16_split_x(b, ba),
                                  nir_unpack_half_2x16_split_y(b, ba));
        }

        nir_def_rewrite_uses_after(&instr->def, result,
                                   result->parent_instr);

        return true;
}

static bool
v3d_nir_lower_image_load_store_cb(nir_builder *b,
                                  nir_intrinsic_instr *intr,
                                  void *_state)
{
        struct v3d_compile *c = (struct v3d_compile *) _state;

        switch (intr->intrinsic) {
        case nir_intrinsic_image_load:
                return v3d_nir_lower_image_load(b, intr);
        case nir_intrinsic_image_store:
                if (c->devinfo->ver >= 71)
                        return v3d71_nir_lower_image_store(b, intr);
                else
                        return v3d42_nir_lower_image_store(b, intr);
                break;
        default:
                return false;
        }

        return false;
}

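/* Entry point: walks all intrinsics in the shader, lowering image loads and
 * stores with the callback above while preserving control-flow metadata.
 */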
bool
v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c)
{
        return nir_shader_intrinsics_pass(s,
                                          v3d_nir_lower_image_load_store_cb,
                                          nir_metadata_control_flow, c);
}