xref: /aosp_15_r20/external/mesa3d/src/mesa/state_tracker/st_pbo_compute.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2021 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Mike Blumenkrantz <[email protected]>
25  */
26 
27 #include <stdbool.h>
28 #include "main/image.h"
29 #include "main/pbo.h"
30 
31 #include "nir/pipe_nir.h"
32 #include "state_tracker/st_nir.h"
33 #include "state_tracker/st_format.h"
34 #include "state_tracker/st_pbo.h"
35 #include "state_tracker/st_program.h"
36 #include "state_tracker/st_texture.h"
37 #include "compiler/nir/nir_builder.h"
38 #include "compiler/nir/nir_format_convert.h"
39 #include "compiler/glsl/gl_nir.h"
40 #include "compiler/glsl/gl_nir_linker.h"
41 #include "util/u_sampler.h"
42 #include "util/streaming-load-memcpy.h"
43 
44 #define SPEC_USES_THRESHOLD 5
45 
/* State for one specialized (constant-folded) variant of the download shader,
 * compiled asynchronously once a variant has been used often enough. */
struct pbo_spec_async_data {
   uint32_t data[4]; //must be first
   bool created;                  /* async compile has been kicked off */
   unsigned uses;                 /* hit count; presumably compared against SPEC_USES_THRESHOLD — confirm in full file */
   struct util_queue_fence fence; /* signaled when the async compile completes */
   nir_shader *nir;               /* specialized shader IR */
   struct pipe_shader_state *cs;  /* finished compute shader state, once created */
};
54 
/* Per-(target, num_components) async compile state for the generic
 * (non-specialized) PBO download compute shader. */
struct pbo_async_data {
   struct st_context *st;
   enum pipe_texture_target target;   /* sampler target the shader was built for */
   unsigned num_components;
   struct util_queue_fence fence;     /* signaled when the base shader is compiled */
   nir_shader *nir;                   /* base shader IR */
   nir_shader *copy; //immutable
   struct pipe_shader_state *cs;      /* compiled base compute shader */
   struct set specialized;            /* set of pbo_spec_async_data variants */
};
65 
/* Table builders for get_convert_format(); each expands to a pair of arrays
 * indexed first by _mesa_is_type_unsigned(type): {signed, unsigned}. */

/* BGR_FORMAT: inner index 0 = norm, 1 = pure-int variant.
 * NOTE(review): BGR_FORMAT is never expanded in the visible code before the
 * #undef below — looks dead here; confirm against the full file. */
#define BGR_FORMAT(NAME) \
    {{ \
     [0] = PIPE_FORMAT_##NAME##_SNORM, \
     [1] = PIPE_FORMAT_##NAME##_SINT, \
    }, \
    { \
     [0] = PIPE_FORMAT_##NAME##_UNORM, \
     [1] = PIPE_FORMAT_##NAME##_UINT, \
    }}

/* FORMAT: inner index is bytes-per-channel (1/2/4), selecting the
 * 8-, 16-, or 32-bit (S|U)NORM format with the given channel layout. */
#define FORMAT(NAME, NAME16, NAME32) \
   {{ \
    [1] = PIPE_FORMAT_##NAME##_SNORM, \
    [2] = PIPE_FORMAT_##NAME16##_SNORM, \
    [4] = PIPE_FORMAT_##NAME32##_SNORM, \
   }, \
   { \
    [1] = PIPE_FORMAT_##NAME##_UNORM, \
    [2] = PIPE_FORMAT_##NAME16##_UNORM, \
    [4] = PIPE_FORMAT_##NAME32##_UNORM, \
   }}
87 
/* don't try these at home */
/* Pick a pipe_format whose memory layout matches the GL format/type pair the
 * application wants packed into the PBO, for use as the conversion shader's
 * destination description.  Recurses for BGR(A) by resolving the RGB(A)
 * equivalent and flagging *need_bgra_swizzle so the caller can swizzle. */
static enum pipe_format
get_convert_format(struct gl_context *ctx,
                enum pipe_format src_format,
                GLenum format, GLenum type,
                bool *need_bgra_swizzle)
{
   struct st_context *st = st_context(ctx);
   GLint bpp = _mesa_bytes_per_pixel(format, type);
   /* single-channel-ish downloads (depth/stencil/single color int channel):
    * any same-sized R format works */
   if (_mesa_is_depth_format(format) ||
       format == GL_STENCIL_INDEX ||
       format == GL_GREEN_INTEGER ||
       format == GL_BLUE_INTEGER) {
      switch (bpp) {
      case 1:
         return _mesa_is_type_unsigned(type) ? PIPE_FORMAT_R8_UINT : PIPE_FORMAT_R8_SINT;
      case 2:
         return _mesa_is_type_unsigned(type) ? PIPE_FORMAT_R16_UINT : PIPE_FORMAT_R16_SINT;
      case 4:
         return _mesa_is_type_unsigned(type) ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R32_SINT;
      }
   }
   /* preferred path: a direct mesa-format equivalent exists */
   mesa_format mformat = _mesa_tex_format_from_format_and_type(ctx, format, type);
   enum pipe_format pformat = st_mesa_format_to_pipe_format(st, mformat);
   if (!pformat) {
      GLint dst_components = _mesa_components_in_format(format);
      bpp /= dst_components; /* from here on bpp is bytes per channel */
      if (format == GL_BGR || format == GL_BGRA) {
         /* resolve via the RGB(A) twin; caller must swizzle afterwards */
         pformat = st_pbo_get_dst_format(ctx, PIPE_TEXTURE_2D, src_format, false, format == GL_BGR ? GL_RGB : GL_RGBA, type, 0);
         if (!pformat)
            pformat = get_convert_format(ctx, src_format, format == GL_BGR ? GL_RGB : GL_RGBA, type, need_bgra_swizzle);
         assert(pformat);
         *need_bgra_swizzle = true;
      } else if (format == GL_BGR_INTEGER || format == GL_BGRA_INTEGER) {
         pformat = st_pbo_get_dst_format(ctx, PIPE_TEXTURE_2D, src_format, false, format == GL_BGR_INTEGER ? GL_RGB_INTEGER : GL_RGBA_INTEGER, type, 0);
         if (!pformat)
            pformat = get_convert_format(ctx, src_format, format == GL_BGR_INTEGER ? GL_RGB_INTEGER : GL_RGBA_INTEGER, type, need_bgra_swizzle);
         assert(pformat);
         *need_bgra_swizzle = true;
      } else {
         /* [signed,unsigned][bpp] */
         enum pipe_format rgb[5][2][5] = {
            [1] = FORMAT(R8, R16, R32),
            [2] = FORMAT(R8G8, R16G16, R32G32),
            [3] = FORMAT(R8G8B8, R16G16B16, R32G32B32),
            [4] = FORMAT(R8G8B8A8, R16G16B16A16, R32G32B32A32),
         };
         pformat = rgb[dst_components][_mesa_is_type_unsigned(type)][bpp];
      }
      assert(util_format_get_nr_components(pformat) == dst_components);
   }
   assert(pformat);
   return pformat;
}
142 #undef BGR_FORMAT
143 #undef FORMAT
144 
145 
/* SSA handles for the conversion parameters after init_pbo_shader_data()
 * has unpacked struct pbo_data inside the shader; one nir_def per field. */
struct pbo_shader_data {
   nir_def *offset;            /* vec2: copy region origin */
   nir_def *range;             /* vec3: copy region extent */
   nir_def *invert;            /* bool: negate row stride (y-flip) */
   nir_def *blocksize;         /* bytes per pixel (decoded as stored+1) */
   nir_def *alignment;         /* row alignment in bytes (decoded as 1<<stored) */
   nir_def *dst_bit_size;      /* per-component dest size (decoded as 8<<stored) */
   nir_def *channels;          /* dest channel count (decoded as stored+1) */
   nir_def *normalized;        /* bool: dest is (u|s)norm */
   nir_def *integer;           /* bool: dest is pure int */
   nir_def *clamp_uint;        /* bool: clamp to unsigned range */
   nir_def *r11g11b10_or_sint; /* bool: R11G11B10_FLOAT dest, or pure sint */
   nir_def *r9g9b9e5;          /* bool: R9G9B9E5_FLOAT dest */
   nir_def *bits1;             /* bit size of channel 0 */
   nir_def *bits2;             /* bit size of channel 1 */
   nir_def *bits3;             /* bit size of channel 2 */
   nir_def *bits4;             /* bit size of channel 3 */
   nir_def *swap;              /* bool: byteswap output */
   nir_def *bits; //vec4
};
166 
167 
/* must be under 16bytes / sizeof(vec4) / 128 bits) */
/* Packed conversion parameters, uploaded as a single uvec4 uniform and
 * unpacked bitwise in the shader by init_pbo_shader_data().  The shader
 * reads fixed byte offsets (80/88/96/104/112/120 bits), so the layout here
 * must be stable: keep every bitfield's declared type uint8_t — mixing in a
 * wider type (the previous `uint16_t alignment : 2`) can open a new 16-bit
 * storage unit on ABIs that allocate bitfields by declared type (e.g. MSVC),
 * shifting every following field and overflowing the 16-byte budget. */
struct pbo_data {
   union {
       struct {
          struct {
             uint16_t x, y;           /* copy region origin */
          };
          struct {
             uint16_t width, height;  /* copy region extent */
          };
          struct {
             uint16_t depth;
             uint8_t invert : 1;      /* y-flip rows */
             uint8_t blocksize : 7;   /* bytes per pixel minus 1 (1..128) */

             uint8_t clamp_uint : 1;
             uint8_t r11g11b10_or_sint : 1;
             uint8_t r9g9b9e5 : 1;
             uint8_t swap : 1;        /* byteswap output */
             uint8_t alignment : 2;   /* log2 of row alignment (1/2/4/8) */
             uint8_t dst_bit_size : 2; //8, 16, 32, 64
          };

          struct {
             uint8_t channels : 2;    /* dest channel count minus 1 */
             uint8_t bits1 : 6;       /* per-channel bit sizes (<= 32) */
             uint8_t normalized : 1;
             uint8_t integer : 1;
             uint8_t bits2 : 6;
             uint8_t bits3 : 6;
             uint8_t pad1 : 2;
             uint8_t bits4 : 6;
             uint8_t pad2 : 2;
          };
      };
      float vec[4];                   /* raw view for uploading as a vec4 */
   };
};
206 
207 
/* Bit offset of a pbo_data member, for nir_extract_bits. */
#define STRUCT_OFFSET(name) (offsetof(struct pbo_data, name) * 8)

/* Load one byte-aligned 8-bit group of pbo_data (at bit offset `offset`)
 * into a 32-bit value named block<offset>, then run the member extractors
 * in __VA_ARGS__ against it. */
#define STRUCT_BLOCK(offset, ...) \
   do { \
      assert(offset % 8 == 0); \
      nir_def *block##offset = nir_u2u32(b, nir_extract_bits(b, &ubo_load, 1, (offset), 1, 8)); \
      __VA_ARGS__ \
   } while (0)
/* Extract `size` bits at bit `offset` within block<blockoffset>, post-process
 * with `op` (which sees the raw field as `val`), optionally clamp, and store
 * into sd->name. */
#define STRUCT_MEMBER(blockoffset, name, offset, size, op, clamp) \
   do { \
      assert(offset + size <= 8); \
      nir_def *val = nir_iand_imm(b, block##blockoffset, u_bit_consecutive(offset, size)); \
      if (offset) \
         val = nir_ushr_imm(b, val, offset); \
      sd->name = op; \
      if (clamp) \
         sd->name = nir_umin(b, sd->name, nir_imm_int(b, clamp)); \
   } while (0)
/* 2-bit field decoded as (shift << val), clamped — used for the power-of-two
 * encodings (alignment, dst_bit_size). */
#define STRUCT_MEMBER_SHIFTED_2BIT(blockoffset, name, offset, shift, clamp) \
   STRUCT_MEMBER(blockoffset, name, offset, 2, nir_ishl(b, nir_imm_int(b, shift), val), clamp)

/* 1-bit field decoded as a boolean. */
#define STRUCT_MEMBER_BOOL(blockoffset, name, offset) \
   STRUCT_MEMBER(blockoffset, name, offset, 1, nir_ieq_imm(b, val, 1), 0)
231 
/* this function extracts the conversion data from pbo_data using the
 * size annotations for each grouping. data is compacted into bitfields,
 * so bitwise operations must be used to "unpack" everything
 */
/* Emit the NIR that unpacks struct pbo_data (uploaded as one uvec4 uniform)
 * into individual SSA values in *sd.  The bit offsets below (80, 88, 96,
 * 104, 112, 120) must match the pbo_data bitfield layout exactly. */
static void
init_pbo_shader_data(nir_builder *b, struct pbo_shader_data *sd, unsigned coord_components)
{
   nir_variable *ubo = nir_variable_create(b->shader, nir_var_uniform, glsl_uvec4_type(), "offset");
   nir_def *ubo_load = nir_load_var(b, ubo);

   /* x/y origin: two 16-bit values widened to 32 */
   sd->offset = nir_u2u32(b, nir_extract_bits(b, &ubo_load, 1, STRUCT_OFFSET(x), 2, 16));
   if (coord_components == 1)
      sd->offset = nir_vector_insert_imm(b, sd->offset, nir_imm_int(b, 0), 1);
   /* width/height/depth: three 16-bit values; pad unused dims with 1 */
   sd->range = nir_u2u32(b, nir_extract_bits(b, &ubo_load, 1, STRUCT_OFFSET(width), 3, 16));
   if (coord_components < 3) {
      sd->range = nir_vector_insert_imm(b, sd->range, nir_imm_int(b, 1), 2);
      if (coord_components == 1)
         sd->range = nir_vector_insert_imm(b, sd->range, nir_imm_int(b, 1), 1);
   }

   /* byte 10: invert flag + (blocksize - 1) */
   STRUCT_BLOCK(80,
      STRUCT_MEMBER_BOOL(80, invert, 0);
      STRUCT_MEMBER(80, blocksize, 1, 7, nir_iadd_imm(b, val, 1), 128);
   );

   /* byte 11: format flags + power-of-two alignment/dst_bit_size */
   STRUCT_BLOCK(88,
      STRUCT_MEMBER_BOOL(88, clamp_uint, 0);
      STRUCT_MEMBER_BOOL(88, r11g11b10_or_sint, 1);
      STRUCT_MEMBER_BOOL(88, r9g9b9e5, 2);
      STRUCT_MEMBER_BOOL(88, swap, 3);
      STRUCT_MEMBER_SHIFTED_2BIT(88, alignment, 4, 1, 8);
      STRUCT_MEMBER_SHIFTED_2BIT(88, dst_bit_size, 6, 8, 64);
   );

   /* byte 12: (channels - 1) + channel-0 bit size */
   STRUCT_BLOCK(96,
      STRUCT_MEMBER(96, channels, 0, 2, nir_iadd_imm(b, val, 1), 4);
      STRUCT_MEMBER(96, bits1, 2, 6, val, 32);
   );

   /* byte 13: normalized/integer flags + channel-1 bit size */
   STRUCT_BLOCK(104,
      STRUCT_MEMBER_BOOL(104, normalized, 0);
      STRUCT_MEMBER_BOOL(104, integer, 1);
      STRUCT_MEMBER(104, bits2, 2, 6, val, 32);
   );


   /* bytes 14-15: channel-2/3 bit sizes */
   STRUCT_BLOCK(112,
      STRUCT_MEMBER(112, bits3, 0, 6, val, 32);
   );

   STRUCT_BLOCK(120,
      STRUCT_MEMBER(120, bits4, 0, 6, val, 32);
   );
   sd->bits = nir_vec4(b, sd->bits1, sd->bits2, sd->bits3, sd->bits4);

   /* clamp swap in the shader to enable better optimizing */
   /* TODO?
   sd->swap = nir_bcsel(b, nir_ior(b,
                                   nir_ieq_imm(b, sd->blocksize, 8),
                                   nir_bcsel(b,
                                             nir_ieq_imm(b, sd->bits1, 8),
                                             nir_bcsel(b,
                                                       nir_uge_imm(b, sd->channels, 2),
                                                       nir_bcsel(b,
                                                                 nir_uge_imm(b, sd->channels, 3),
                                                                 nir_bcsel(b,
                                                                           nir_ieq_imm(b, sd->channels, 4),
                                                                           nir_ball(b, nir_ieq_imm(b, sd->bits, 8)),
                                                                           nir_ball(b, nir_ieq_imm(b, nir_channels(b, sd->bits, 7), 8))),
                                                                 nir_ball(b, nir_ieq_imm(b, nir_channels(b, sd->bits, 3), 8))),
                                                       nir_imm_false(b)),
                                             nir_imm_false(b))),
                           nir_imm_false(b),
                           sd->swap);
     */
}
308 
309 static unsigned
fill_pbo_data(struct pbo_data * pd,enum pipe_format src_format,enum pipe_format dst_format,bool swap)310 fill_pbo_data(struct pbo_data *pd, enum pipe_format src_format, enum pipe_format dst_format, bool swap)
311 {
312    unsigned bits[4] = {0};
313    bool weird_packed = false;
314    const struct util_format_description *dst_desc = util_format_description(dst_format);
315    bool is_8bit = true;
316 
317    for (unsigned c = 0; c < 4; c++) {
318       bits[c] = dst_desc->channel[c].size;
319       if (c < dst_desc->nr_channels) {
320          weird_packed |= bits[c] != bits[0] || bits[c] % 8 != 0;
321          if (bits[c] != 8)
322             is_8bit = false;
323       }
324    }
325 
326    if (is_8bit || dst_desc->block.bits == 8)
327       swap = false;
328 
329    unsigned dst_bit_size = 0;
330    if (weird_packed) {
331       dst_bit_size = dst_desc->block.bits;
332    } else {
333       dst_bit_size = dst_desc->block.bits / dst_desc->nr_channels;
334    }
335    assert(dst_bit_size);
336    assert(dst_bit_size <= 64);
337 
338    pd->dst_bit_size = dst_bit_size >> 4;
339    pd->channels = dst_desc->nr_channels - 1;
340    pd->normalized = dst_desc->is_unorm || dst_desc->is_snorm;
341    pd->clamp_uint = dst_desc->is_unorm ||
342                     (util_format_is_pure_sint(dst_format) &&
343                      !util_format_is_pure_sint(src_format) &&
344                      !util_format_is_snorm(src_format)) ||
345                     util_format_is_pure_uint(dst_format);
346    pd->integer = util_format_is_pure_uint(dst_format) || util_format_is_pure_sint(dst_format);
347    pd->r11g11b10_or_sint = dst_format == PIPE_FORMAT_R11G11B10_FLOAT || util_format_is_pure_sint(dst_format);
348    pd->r9g9b9e5 = dst_format == PIPE_FORMAT_R9G9B9E5_FLOAT;
349    pd->bits1 = bits[0];
350    pd->bits2 = bits[1];
351    pd->bits3 = bits[2];
352    pd->bits4 = bits[3];
353    pd->swap = swap;
354 
355    return weird_packed ? 1 : dst_desc->nr_channels;
356 }
357 
/* Compute the byte offset into the destination buffer for the pixel at
 * `coord`, honoring pack row alignment and y-inversion. */
static nir_def *
get_buffer_offset(nir_builder *b, nir_def *coord, struct pbo_shader_data *sd)
{
/* from _mesa_image_offset():
      offset = topOfImage
               + (skippixels + column) * bytes_per_pixel
               + (skiprows + row) * bytes_per_row
               + (skipimages + img) * bytes_per_image;
 */
   nir_def *bytes_per_row = nir_imul(b, nir_channel(b, sd->range, 0), sd->blocksize);
   /* align rows up to sd->alignment (a power of two); alignment < 2 needs no rounding */
   bytes_per_row = nir_bcsel(b, nir_ult_imm(b, sd->alignment, 2),
                             bytes_per_row,
                             nir_iand(b,
                                      nir_iadd_imm(b, nir_iadd(b, bytes_per_row, sd->alignment), -1),
                                      nir_inot(b, nir_iadd_imm(b, sd->alignment, -1))));
   nir_def *bytes_per_image = nir_imul(b, bytes_per_row, nir_channel(b, sd->range, 1));
   /* y-flip: walk rows backwards */
   bytes_per_row = nir_bcsel(b, sd->invert,
                             nir_ineg(b, bytes_per_row),
                             bytes_per_row);
   return nir_iadd(b,
                   nir_imul(b, nir_channel(b, coord, 0), sd->blocksize),
                   nir_iadd(b,
                            nir_imul(b, nir_channel(b, coord, 1), bytes_per_row),
                            nir_imul(b, nir_channel(b, coord, 2), bytes_per_image)));
}
383 
384 static inline void
write_ssbo(nir_builder * b,nir_def * pixel,nir_def * buffer_offset)385 write_ssbo(nir_builder *b, nir_def *pixel, nir_def *buffer_offset)
386 {
387    nir_store_ssbo(b, pixel, nir_imm_zero(b, 1, 32), buffer_offset,
388                   .align_mul = pixel->bit_size / 8,
389                   .write_mask = (1 << pixel->num_components) - 1);
390 }
391 
/* Narrow `pixel` to the destination bit size (decoded 8/16/32+, see
 * pbo_shader_data) and store it; the push_if nesting selects u2u8/u2u16
 * for sub-32-bit destinations. */
static void
write_conversion(nir_builder *b, nir_def *pixel, nir_def *buffer_offset, struct pbo_shader_data *sd)
{
   nir_push_if(b, nir_ilt_imm(b, sd->dst_bit_size, 32));
      nir_push_if(b, nir_ieq_imm(b, sd->dst_bit_size, 16));
         write_ssbo(b, nir_u2u16(b, pixel), buffer_offset);
      nir_push_else(b, NULL);
         write_ssbo(b, nir_u2u8(b, pixel), buffer_offset);
      nir_pop_if(b, NULL);
   nir_push_else(b, NULL);
      write_ssbo(b, pixel, buffer_offset);
   nir_pop_if(b, NULL);
}
405 
406 static nir_def *
swap2(nir_builder * b,nir_def * src)407 swap2(nir_builder *b, nir_def *src)
408 {
409    /* dst[i] = (src[i] >> 8) | ((src[i] << 8) & 0xff00); */
410    return nir_ior(b,
411                   nir_ushr_imm(b, src, 8),
412                   nir_iand_imm(b, nir_ishl_imm(b, src, 8), 0xff00));
413 }
414 
415 static nir_def *
swap4(nir_builder * b,nir_def * src)416 swap4(nir_builder *b, nir_def *src)
417 {
418    /* a = (b >> 24) | ((b >> 8) & 0xff00) | ((b << 8) & 0xff0000) | ((b << 24) & 0xff000000); */
419    return nir_ior(b,
420                   /* (b >> 24) */
421                   nir_ushr_imm(b, src, 24),
422                   nir_ior(b,
423                           /* ((b >> 8) & 0xff00) */
424                           nir_iand_imm(b, nir_ushr_imm(b, src, 8), 0xff00),
425                           nir_ior(b,
426                                   /* ((b << 8) & 0xff0000) */
427                                   nir_iand_imm(b, nir_ishl_imm(b, src, 8), 0xff0000),
428                                   /* ((b << 24) & 0xff000000) */
429                                   nir_iand_imm(b, nir_ishl_imm(b, src, 24), 0xff000000))));
430 }
431 
/* explode the cf to handle channel counts in the shader */
/* Trim `pixel` to the runtime channel count (sd->channels) and write it.
 * For weird-packed formats the value is 1 or 2 packed dwords instead,
 * selected by whether channel 0 is a full 32 bits. */
static void
grab_components(nir_builder *b, nir_def *pixel, nir_def *buffer_offset, struct pbo_shader_data *sd, bool weird_packed)
{
   if (weird_packed) {
      /* bits1 == 32 means two packed dwords (e.g. Z32_FLOAT_S8X24) */
      nir_push_if(b, nir_ieq_imm(b, sd->bits1, 32));
         write_conversion(b, nir_trim_vector(b, pixel, 2), buffer_offset, sd);
      nir_push_else(b, NULL);
         write_conversion(b, nir_channel(b, pixel, 0), buffer_offset, sd);
      nir_pop_if(b, NULL);
   } else {
      nir_push_if(b, nir_ieq_imm(b, sd->channels, 1));
         write_conversion(b, nir_channel(b, pixel, 0), buffer_offset, sd);
      nir_push_else(b, NULL);
         nir_push_if(b, nir_ieq_imm(b, sd->channels, 2));
            write_conversion(b, nir_trim_vector(b, pixel, 2), buffer_offset,
                             sd);
         nir_push_else(b, NULL);
            nir_push_if(b, nir_ieq_imm(b, sd->channels, 3));
               write_conversion(b, nir_trim_vector(b, pixel, 3),
                                buffer_offset, sd);
            nir_push_else(b, NULL);
               write_conversion(b, nir_trim_vector(b, pixel, 4),
                                buffer_offset, sd);
            nir_pop_if(b, NULL);
         nir_pop_if(b, NULL);
      nir_pop_if(b, NULL);
   }
}
461 
/* if byteswap is enabled, handle that and then write the components */
/* Chooses 16-bit (swap2) vs 32-bit (swap4) swapping at runtime based on
 * bytes-per-component (blocksize / num_components), then hands off to
 * grab_components for the store. */
static void
handle_swap(nir_builder *b, nir_def *pixel, nir_def *buffer_offset,
            struct pbo_shader_data *sd, unsigned num_components, bool weird_packed)
{
   nir_push_if(b, sd->swap); {
      nir_push_if(b, nir_ieq_imm(b, nir_udiv_imm(b, sd->blocksize, num_components), 2)); {
         /* this is a single high/low swap per component */
         nir_def *components[4];
         for (unsigned i = 0; i < 4; i++)
            components[i] = swap2(b, nir_channel(b, pixel, i));
         nir_def *v = nir_vec(b, components, 4);
         grab_components(b, v, buffer_offset, sd, weird_packed);
      } nir_push_else(b, NULL); {
         /* this is a pair of high/low swaps for each half of the component */
         nir_def *components[4];
         for (unsigned i = 0; i < 4; i++)
            components[i] = swap4(b, nir_channel(b, pixel, i));
         nir_def *v = nir_vec(b, components, 4);
         grab_components(b, v, buffer_offset, sd, weird_packed);
      } nir_pop_if(b, NULL);
   } nir_push_else(b, NULL); {
      /* swap disabled */
      grab_components(b, pixel, buffer_offset, sd, weird_packed);
   } nir_pop_if(b, NULL);
}
488 
/* Runtime test of whether channel `component` (1-based) makes the format
 * "weird packed": its size differs from channel 0's, or it isn't a whole
 * number of bytes.  Channels beyond the active count report false. */
static nir_def *
check_for_weird_packing(nir_builder *b, struct pbo_shader_data *sd, unsigned component)
{
   nir_def *c = nir_channel(b, sd->bits, component - 1);

   return nir_bcsel(b,
                    nir_ige_imm(b, sd->channels, component),
                    nir_ior(b,
                            nir_ine(b, c, sd->bits1),
                            nir_ine_imm(b, nir_imod_imm(b, c, 8), 0)),
                    nir_imm_false(b));
}
501 
/* convenience function for clamping signed integers */
/* NOTE(review): the parameter names are inverted relative to their roles —
 * `clamp_to_min` is used as the UPPER bound (imin) and `clamp_to_max` as the
 * LOWER bound (imax); the caller passes (max_positive, min_negative). */
static inline nir_def *
nir_imin_imax(nir_builder *build, nir_def *src, nir_def *clamp_to_min, nir_def *clamp_to_max)
{
   return nir_imax(build, nir_imin(build, src, clamp_to_min), clamp_to_max);
}
508 
509 static inline nir_def *
nir_format_float_to_unorm_with_factor(nir_builder * b,nir_def * f,nir_def * factor)510 nir_format_float_to_unorm_with_factor(nir_builder *b, nir_def *f, nir_def *factor)
511 {
512    /* Clamp to the range [0, 1] */
513    f = nir_fsat(b, f);
514 
515    return nir_f2u32(b, nir_fround_even(b, nir_fmul(b, f, factor)));
516 }
517 
518 static inline nir_def *
nir_format_float_to_snorm_with_factor(nir_builder * b,nir_def * f,nir_def * factor)519 nir_format_float_to_snorm_with_factor(nir_builder *b, nir_def *f, nir_def *factor)
520 {
521    /* Clamp to the range [-1, 1] */
522    f = nir_fmin(b, nir_fmax(b, f, nir_imm_float(b, -1)), nir_imm_float(b, 1));
523 
524    return nir_f2i32(b, nir_fround_even(b, nir_fmul(b, f, factor)));
525 }
526 
/* Per-channel max positive signed value: subtracts 1 from each ACTIVE
 * channel's bit count (inactive lanes are left untouched to avoid
 * underflow), then builds the (1 << n) - 1 mask via nir_mask. */
static nir_def *
clamp_and_mask(nir_builder *b, nir_def *src, nir_def *channels)
{
   nir_def *one = nir_imm_ivec4(b, 1, 0, 0, 0);
   nir_def *two = nir_imm_ivec4(b, 1, 1, 0, 0);
   nir_def *three = nir_imm_ivec4(b, 1, 1, 1, 0);
   nir_def *four = nir_imm_ivec4(b, 1, 1, 1, 1);
   /* avoid underflow by clamping to channel count */
   /* NOTE(review): the first comparison uses the vec4 `one` while the rest
    * use _imm scalars — relies on builder comparison semantics; confirm. */
   src = nir_bcsel(b,
                   nir_ieq(b, channels, one),
                   nir_isub(b, src, one),
                   nir_bcsel(b,
                             nir_ieq_imm(b, channels, 2),
                             nir_isub(b, src, two),
                             nir_bcsel(b,
                                       nir_ieq_imm(b, channels, 3),
                                       nir_isub(b, src, three),
                                       nir_isub(b, src, four))));

   return nir_mask(b, src, 32);
}
548 
/* Pack weird formats (R11G11B10F, R9G9B9E5, Z32_FLOAT_S8X24, generic
 * bit-packed) into dwords when needed, then byteswap and write.  The
 * weird-packed decision is made at shader runtime from sd->bits. */
static void
convert_swap_write(nir_builder *b, nir_def *pixel, nir_def *buffer_offset,
                   unsigned num_components,
                   struct pbo_shader_data *sd)
{

   nir_def *weird_packed = nir_ior(b,
                                       nir_ior(b,
                                               check_for_weird_packing(b, sd, 4),
                                               check_for_weird_packing(b, sd, 3)),
                                       check_for_weird_packing(b, sd, 2));
   if (num_components == 1) {
      nir_push_if(b, weird_packed);
         nir_push_if(b, sd->r11g11b10_or_sint);
            handle_swap(b, nir_pad_vec4(b, nir_format_pack_11f11f10f(b, pixel)), buffer_offset, sd, 1, true);
         nir_push_else(b, NULL);
            nir_push_if(b, sd->r9g9b9e5);
               handle_swap(b, nir_pad_vec4(b, nir_format_pack_r9g9b9e5(b, pixel)), buffer_offset, sd, 1, true);
            nir_push_else(b, NULL);
               nir_push_if(b, nir_ieq_imm(b, sd->bits1, 32)); { //PIPE_FORMAT_Z32_FLOAT_S8X24_UINT
                  nir_def *pack[2];
                  pack[0] = nir_format_pack_uint_unmasked_ssa(b, nir_channel(b, pixel, 0), nir_channel(b, sd->bits, 0));
                  pack[1] = nir_format_pack_uint_unmasked_ssa(b, nir_channels(b, pixel, 6), nir_channels(b, sd->bits, 6));
                  handle_swap(b, nir_pad_vec4(b, nir_vec2(b, pack[0], pack[1])), buffer_offset, sd, 2, true);
               } nir_push_else(b, NULL);
                  /* generic bit-packed: fold all channels into one dword */
                  handle_swap(b, nir_pad_vec4(b, nir_format_pack_uint_unmasked_ssa(b, pixel, sd->bits)), buffer_offset, sd, 1, true);
               nir_pop_if(b, NULL);
            nir_pop_if(b, NULL);
         nir_pop_if(b, NULL);
      nir_push_else(b, NULL);
         handle_swap(b, pixel, buffer_offset, sd, num_components, false);
      nir_pop_if(b, NULL);
   } else {
      nir_push_if(b, weird_packed);
         handle_swap(b, pixel, buffer_offset, sd, num_components, true);
      nir_push_else(b, NULL);
         handle_swap(b, pixel, buffer_offset, sd, num_components, false);
      nir_pop_if(b, NULL);
   }
}
589 
/* Top-level per-pixel conversion: branch on the destination's class
 * (normalized / pure integer / float|half) and convert, clamp, swap, and
 * store accordingly.  `coord` is the zero-based grid coordinate used for
 * the buffer offset. */
static void
do_shader_conversion(nir_builder *b, nir_def *pixel,
                     unsigned num_components,
                     nir_def *coord, struct pbo_shader_data *sd)
{
   nir_def *buffer_offset = get_buffer_offset(b, coord, sd);

   nir_def *signed_bit_mask = clamp_and_mask(b, sd->bits, sd->channels);

#define CONVERT_SWAP_WRITE(PIXEL) \
   convert_swap_write(b, PIXEL, buffer_offset, num_components, sd);
   nir_push_if(b, sd->normalized);
      nir_push_if(b, sd->clamp_uint); //unorm
         CONVERT_SWAP_WRITE(nir_format_float_to_unorm_with_factor(b, pixel, nir_u2f32(b, nir_mask(b, sd->bits, 32))));
      nir_push_else(b, NULL);
         CONVERT_SWAP_WRITE(nir_format_float_to_snorm_with_factor(b, pixel, nir_u2f32(b, signed_bit_mask)));
      nir_pop_if(b, NULL);
   nir_push_else(b, NULL);
      nir_push_if(b, sd->integer);
         nir_push_if(b, sd->r11g11b10_or_sint); //sint
            nir_push_if(b, sd->clamp_uint); //uint -> sint
               CONVERT_SWAP_WRITE(nir_umin(b, pixel, signed_bit_mask));
            nir_push_else(b, NULL);
               CONVERT_SWAP_WRITE(nir_imin_imax(b, pixel, signed_bit_mask, nir_iadd_imm(b, nir_ineg(b, signed_bit_mask), -1)));
            nir_pop_if(b, NULL);
         nir_push_else(b, NULL);
            nir_push_if(b, sd->clamp_uint); //uint
               /* nir_format_clamp_uint */
               CONVERT_SWAP_WRITE(nir_umin(b, pixel, nir_mask(b, sd->bits, 32)));
            nir_pop_if(b, NULL);
            /* no else: fill_pbo_data sets clamp_uint for every pure-uint dest,
             * so an unclamped integer here should be unreachable */
         nir_pop_if(b, NULL);
      nir_push_else(b, NULL);
         nir_push_if(b, nir_ieq_imm(b, sd->bits1, 16)); //half
            CONVERT_SWAP_WRITE(nir_format_float_to_half(b, pixel));
         nir_push_else(b, NULL);
            CONVERT_SWAP_WRITE(pixel);
         nir_pop_if(b, NULL);
   nir_pop_if(b, NULL);
}
629 
/* Build the PBO-download compute shader: one invocation per output pixel,
 * txf from the source texture, convert per pbo_shader_data, and store to
 * the destination SSBO. */
static nir_shader *
create_conversion_shader(struct st_context *st, enum pipe_texture_target target, unsigned num_components)
{
   const nir_shader_compiler_options *options = st_get_nir_compiler_options(st, MESA_SHADER_COMPUTE);
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "%s", "convert");
   /* 64x1 for 1D targets, 8x8 tiles otherwise */
   b.shader->info.workgroup_size[0] = target != PIPE_TEXTURE_1D ? 8 : 64;
   b.shader->info.workgroup_size[1] = target != PIPE_TEXTURE_1D ? 8 : 1;

   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.textures_used[0] = 1;
   b.shader->info.num_ssbos = 1;
   b.shader->num_uniforms = 2;
   nir_variable_create(b.shader, nir_var_mem_ssbo, glsl_array_type(glsl_float_type(), 0, 4), "ssbo");
   nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, st_pbo_sampler_type_for_target(target, ST_PBO_CONVERT_FLOAT), "sampler");
   unsigned coord_components = glsl_get_sampler_coordinate_components(sampler->type);
   sampler->data.explicit_binding = 1;

   struct pbo_shader_data sd;
   init_pbo_shader_data(&b, &sd, coord_components);

   /* global invocation id = workgroup id * workgroup size + local id */
   nir_def *bsize = nir_imm_ivec4(&b,
                                      b.shader->info.workgroup_size[0],
                                      b.shader->info.workgroup_size[1],
                                      b.shader->info.workgroup_size[2],
                                      0);
   nir_def *wid = nir_load_workgroup_id(&b);
   nir_def *iid = nir_load_local_invocation_id(&b);
   nir_def *tile = nir_imul(&b, wid, bsize);
   nir_def *global_id = nir_iadd(&b, tile, iid);
   nir_def *start = nir_iadd(&b, nir_trim_vector(&b, global_id, 2), sd.offset);

   nir_def *coord;
   if (coord_components < 3)
      coord = start;
   else {
      /* pad offset vec with global_id to get correct z offset */
      assert(coord_components == 3);
      coord = nir_vec3(&b, nir_channel(&b, start, 0),
                           nir_channel(&b, start, 1),
                           nir_channel(&b, global_id, 2));
   }
   coord = nir_trim_vector(&b, coord, coord_components);
   /* bounds check: workgroup counts are rounded up, so skip overshoot */
   nir_def *offset = coord_components > 2 ?
                         nir_pad_vector_imm_int(&b, sd.offset, 0, 3) :
                         nir_trim_vector(&b, sd.offset, coord_components);
   nir_def *range = nir_trim_vector(&b, sd.range, coord_components);
   nir_def *max = nir_iadd(&b, offset, range);
   nir_push_if(&b, nir_ball(&b, nir_ilt(&b, coord, max)));
   nir_tex_instr *txf = nir_tex_instr_create(b.shader, 3);
   txf->is_array = glsl_sampler_type_is_array(sampler->type);
   txf->op = nir_texop_txf;
   txf->sampler_dim = glsl_get_sampler_dim(sampler->type);
   txf->dest_type = nir_type_float32;
   txf->coord_components = coord_components;
   txf->texture_index = 0;
   txf->sampler_index = 0;
   txf->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
   txf->src[1] = nir_tex_src_for_ssa(nir_tex_src_lod, nir_imm_int(&b, 0));
   txf->src[2].src_type = nir_tex_src_texture_deref;
   nir_deref_instr *sampler_deref = nir_build_deref_var(&b, sampler);
   txf->src[2].src = nir_src_for_ssa(&sampler_deref->def);

   nir_def_init(&txf->instr, &txf->def, 4, 32);
   nir_builder_instr_insert(&b, &txf->instr);

   /* pass the grid offset as the coord to get the zero-indexed buffer offset */
   do_shader_conversion(&b, &txf->def, num_components, global_id, &sd);

   nir_pop_if(&b, NULL);

   nir_validate_shader(b.shader, NULL);
   gl_nir_opts(b.shader);
   st_nir_finish_builtin_nir(st, b.shader);
   return b.shader;
}
705 
706 static void
invert_swizzle(uint8_t * out,const uint8_t * in)707 invert_swizzle(uint8_t *out, const uint8_t *in)
708 {
709    /* First, default to all zeroes to prevent uninitialized junk */
710    for (unsigned c = 0; c < 4; ++c)
711       out[c] = PIPE_SWIZZLE_0;
712 
713    /* Now "do" what the swizzle says */
714    for (unsigned c = 0; c < 4; ++c) {
715       unsigned char i = in[c];
716 
717       /* Who cares? */
718       assert(PIPE_SWIZZLE_X == 0);
719       if (i > PIPE_SWIZZLE_W)
720          continue;
721       /* Invert */
722       unsigned idx = i - PIPE_SWIZZLE_X;
723       out[idx] = PIPE_SWIZZLE_X + c;
724    }
725 }
726 
727 static uint32_t
compute_shader_key(enum pipe_texture_target target,unsigned num_components)728 compute_shader_key(enum pipe_texture_target target, unsigned num_components)
729 {
730    uint8_t key_target[] = {
731       [PIPE_BUFFER] = UINT8_MAX,
732       [PIPE_TEXTURE_1D] = 1,
733       [PIPE_TEXTURE_2D] = 2,
734       [PIPE_TEXTURE_3D] = 3,
735       [PIPE_TEXTURE_CUBE] = 4,
736       [PIPE_TEXTURE_RECT] = UINT8_MAX,
737       [PIPE_TEXTURE_1D_ARRAY] = 5,
738       [PIPE_TEXTURE_2D_ARRAY] = 6,
739       [PIPE_TEXTURE_CUBE_ARRAY] = UINT8_MAX,
740    };
741    assert(target < ARRAY_SIZE(key_target));
742    assert(key_target[target] != UINT8_MAX);
743    return key_target[target] | (num_components << 3);
744 }
745 
746 static unsigned
get_dim_from_target(enum pipe_texture_target target)747 get_dim_from_target(enum pipe_texture_target target)
748 {
749    switch (target) {
750    case PIPE_TEXTURE_1D:
751       return 1;
752    case PIPE_TEXTURE_2D_ARRAY:
753    case PIPE_TEXTURE_3D:
754       return 3;
755    default:
756       return 2;
757    }
758 }
759 
760 static enum pipe_texture_target
get_target_from_texture(struct pipe_resource * src)761 get_target_from_texture(struct pipe_resource *src)
762 {
763    enum pipe_texture_target view_target;
764    switch (src->target) {
765    case PIPE_TEXTURE_RECT:
766       view_target = PIPE_TEXTURE_2D;
767       break;
768    case PIPE_TEXTURE_CUBE:
769    case PIPE_TEXTURE_CUBE_ARRAY:
770       view_target = PIPE_TEXTURE_2D_ARRAY;
771       break;
772    default:
773       view_target = src->target;
774       break;
775    }
776    return view_target;
777 }
778 
/* force swizzling behavior for sampling.
 * The low values (1-5) are mutually exclusive base-format modes; GREEN,
 * BLUE and BGRA are distinct flag bits that get OR'd in on top and
 * stripped back out (see download_texture_compute) before the base mode
 * is examined.
 */
enum swizzle_clamp {
   /* force component selection for named format */
   SWIZZLE_CLAMP_LUMINANCE = 1,
   SWIZZLE_CLAMP_ALPHA = 2,
   SWIZZLE_CLAMP_LUMINANCE_ALPHA = 3,
   SWIZZLE_CLAMP_INTENSITY = 4,
   SWIZZLE_CLAMP_RGBX = 5,

   /* select only 1 component */
   SWIZZLE_CLAMP_GREEN = 8,
   SWIZZLE_CLAMP_BLUE = 16,

   /* reverse ordering for format emulation */
   SWIZZLE_CLAMP_BGRA = 32,
};
795 
796 static bool
can_copy_direct(const struct gl_pixelstore_attrib * pack)797 can_copy_direct(const struct gl_pixelstore_attrib *pack)
798 {
799    return !(pack->RowLength ||
800             pack->SkipPixels ||
801             pack->SkipRows ||
802             pack->ImageHeight ||
803             pack->SkipImages);
804 }
805 
/* util_queue job: build the generic conversion shader for this
 * target/component-count combination on a driver thread.  A clone of the
 * NIR is stashed in async->copy to serve as the immutable base for later
 * specializations.
 */
static void
create_conversion_shader_async(void *data, void *gdata, int thread_index)
{
   struct pbo_async_data *async = data;
   async->nir = create_conversion_shader(async->st, async->target, async->num_components);
   /* this is hefty, but specialized shaders need a base to work from */
   async->copy = nir_shader_clone(NULL, async->nir);
}
814 
/* util_queue job: produce a specialized conversion shader by inlining
 * uniform dwords 2 and 3 of pbo_data into a private clone of the base NIR.
 * Dwords 0 and 1 (the geometry) stay as real uniforms so one
 * specialization covers all sizes and offsets.
 */
static void
create_spec_shader_async(void *data, void *gdata, int thread_index)
{
   struct pbo_spec_async_data *spec = data;
   /* this is still the immutable clone: create our own copy */
   spec->nir = nir_shader_clone(NULL, spec->nir);
   /* do not inline geometry */
   uint16_t offsets[2] = {2, 3};
   nir_inline_uniforms(spec->nir, ARRAY_SIZE(offsets), &spec->data[2], offsets);
   spec->created = true;
}
826 
/* Hash a pbo_data key by its 3rd and 4th dwords only: those are the words
 * that get inlined into specialized shaders (see create_spec_shader_async),
 * so they define the specialization's identity.  Must stay in sync with
 * equals_pbo_data.
 */
static uint32_t
hash_pbo_data(const void *data)
{
   const struct pbo_data *p = data;
   return _mesa_hash_data(&p->vec[2], sizeof(uint32_t) * 2);
}
833 
834 static bool
equals_pbo_data(const void * a,const void * b)835 equals_pbo_data(const void *a, const void *b)
836 {
837    const struct pbo_data *pa = a, *pb = b;
838    return !memcmp(&pa->vec[2], &pb->vec[2], sizeof(uint32_t) * 2);
839 }
840 
/* Find or create the specialization record matching pd's inlineable dwords
 * in the async shader's set.  On first sight, the set entry's key is
 * swapped from pd (caller stack memory) to a heap-allocated record holding
 * its own copy of the data; the stored hash remains valid because only the
 * dwords compared/hashed are copied verbatim.  Every call bumps spec->uses
 * so frequently-hit specializations can be promoted to compiled shaders.
 */
static struct pbo_spec_async_data *
add_spec_data(struct pbo_async_data *async, struct pbo_data *pd)
{
   bool found = false;
   struct pbo_spec_async_data *spec;
   struct set_entry *entry = _mesa_set_search_or_add(&async->specialized, pd, &found);
   if (!found) {
      /* NOTE(review): calloc result is used unchecked — OOM would crash;
       * matches surrounding code's convention.
       */
      spec = calloc(1, sizeof(struct pbo_spec_async_data));
      util_queue_fence_init(&spec->fence);
      memcpy(spec->data, pd, sizeof(struct pbo_data));
      entry->key = spec;
   }
   spec = (void*)entry->key;
   /* hand the immutable base clone to the record until the specialization
    * job clones it for itself (create_spec_shader_async sets 'created')
    */
   if (!spec->nir && !spec->created)
      spec->nir = async->copy;
   spec->uses++;
   return spec;
}
859 
/* Allocate the per-(target, component count) async shader record, register
 * it in st->pbo.shaders under hash_key, and initialize its fence and the
 * set that tracks per-pbo_data specializations.
 */
static struct pbo_async_data *
add_async_data(struct st_context *st, enum pipe_texture_target view_target, unsigned num_components, uint32_t hash_key)
{
   struct pbo_async_data *async = calloc(1, sizeof(struct pbo_async_data));
   async->st = st;
   async->target = view_target;
   async->num_components = num_components;
   util_queue_fence_init(&async->fence);
   _mesa_hash_table_insert(st->pbo.shaders, (void*)(uintptr_t)hash_key, async);
   _mesa_set_init(&async->specialized, NULL, hash_pbo_data, equals_pbo_data);
   return async;
}
872 
/* Execute the PBO conversion compute shader to read back a texture
 * (sub)image into a buffer.  Returns the destination buffer — either the
 * bound PBO's buffer (direct-copy case) or a newly created staging buffer —
 * or NULL when the shader is still compiling asynchronously (caller should
 * fall back and retry later) or on failure.
 */
static struct pipe_resource *
download_texture_compute(struct st_context *st,
                         const struct gl_pixelstore_attrib *pack,
                         GLint xoffset, GLint yoffset, GLint zoffset,
                         GLsizei width, GLsizei height, GLint depth,
                         unsigned level, unsigned layer,
                         GLenum format, GLenum type,
                         enum pipe_format src_format,
                         enum pipe_texture_target view_target,
                         struct pipe_resource *src,
                         enum pipe_format dst_format,
                         enum swizzle_clamp swizzle_clamp)
{
   struct pipe_context *pipe = st->pipe;
   struct pipe_screen *screen = st->screen;
   struct pipe_resource *dst = NULL;
   unsigned dim = get_dim_from_target(view_target);

   /* clamp 3d offsets based on slice */
   if (view_target == PIPE_TEXTURE_3D)
      zoffset += layer;

   unsigned num_components = 0;
   /* Upload constants */
   struct pipe_constant_buffer cb;
   assert(view_target != PIPE_TEXTURE_1D_ARRAY || !yoffset);
   /* geometry fields are clamped to 16 bits to keep pbo_data compact */
   struct pbo_data pd = {
      .x = MIN2(xoffset, 65535),
      .y = view_target == PIPE_TEXTURE_1D_ARRAY ? 0 : MIN2(yoffset, 65535),
      .width = MIN2(width, 65535),
      .height = MIN2(height, 65535),
      .depth = MIN2(depth, 65535),
      .invert = pack->Invert,
      .blocksize = util_format_get_blocksize(dst_format) - 1,
      .alignment = ffs(MAX2(pack->Alignment, 1)) - 1,
   };
   num_components = fill_pbo_data(&pd, src_format, dst_format, pack->SwapBytes == 1);

   cb.buffer = NULL;
   cb.user_buffer = &pd;
   cb.buffer_offset = 0;
   cb.buffer_size = sizeof(pd);

   uint32_t hash_key = compute_shader_key(view_target, num_components);
   assert(hash_key != 0);

   /* look up a cached shader (plain CS, async record, or specialization) */
   struct hash_entry *he = _mesa_hash_table_search(st->pbo.shaders, (void*)(uintptr_t)hash_key);
   void *cs = NULL;
   if (he) {
      /* disable async if MESA_COMPUTE_PBO is set */
      if (st->force_specialized_compute_transfer) {
         struct pbo_async_data *async = he->data;
         struct pbo_spec_async_data *spec = add_spec_data(async, &pd);
         if (spec->cs) {
            cs = spec->cs;
         } else {
            /* synchronous specialization: run the job inline */
            create_spec_shader_async(spec, NULL, 0);
            struct pipe_shader_state state = {
               .type = PIPE_SHADER_IR_NIR,
               .ir.nir = spec->nir,
            };
            cs = spec->cs = st_create_nir_shader(st, &state);
            spec->nir = NULL;
         }
         /* dwords 2/3 are inlined into the specialized shader, so only the
          * two geometry dwords need uploading
          */
         cb.buffer_size = 2 * sizeof(uint32_t);
      } else if (!st->force_compute_based_texture_transfer && screen->driver_thread_add_job) {
         struct pbo_async_data *async = he->data;
         struct pbo_spec_async_data *spec = add_spec_data(async, &pd);
         /* NIR build still running on the driver thread: bail and retry */
         if (!util_queue_fence_is_signalled(&async->fence))
            return NULL;
         /* nir is definitely done */
         if (!async->cs) {
            /* cs job not yet started */
            assert(async->nir && !async->cs);
            async->cs = pipe_shader_from_nir(pipe, async->nir);
            async->nir = NULL;
         }
         /* cs *may* be done */
         if (screen->is_parallel_shader_compilation_finished &&
             !screen->is_parallel_shader_compilation_finished(screen, async->cs, MESA_SHADER_COMPUTE))
            return NULL;
         cs = async->cs;
         /* promote hot specializations: prefer a finished specialized CS,
          * otherwise kick off its compile job in the background
          */
         if (spec->uses > SPEC_USES_THRESHOLD && util_queue_fence_is_signalled(&spec->fence)) {
            if (spec->created) {
               if (!spec->cs) {
                  spec->cs = pipe_shader_from_nir(pipe, spec->nir);
                  spec->nir = NULL;
               }
               if (screen->is_parallel_shader_compilation_finished &&
                   screen->is_parallel_shader_compilation_finished(screen, spec->cs, MESA_SHADER_COMPUTE)) {
                  cs = spec->cs;
                  cb.buffer_size = 2 * sizeof(uint32_t);
               }
            } else {
               screen->driver_thread_add_job(screen, spec, &spec->fence, create_spec_shader_async, NULL, 0);
            }
         }
      } else {
         /* synchronous path: entry data is the compiled CS itself */
         cs = he->data;
      }
   } else {
      if (!st->force_compute_based_texture_transfer && screen->driver_thread_add_job) {
         /* first use of this key: queue the NIR build and bail for now */
         struct pbo_async_data *async = add_async_data(st, view_target, num_components, hash_key);
         screen->driver_thread_add_job(screen, async, &async->fence, create_conversion_shader_async, NULL, 0);
         add_spec_data(async, &pd);
         return NULL;
      }

      if (st->force_specialized_compute_transfer) {
         /* build base + specialization synchronously on first use */
         struct pbo_async_data *async = add_async_data(st, view_target, num_components, hash_key);
         create_conversion_shader_async(async, NULL, 0);
         struct pbo_spec_async_data *spec = add_spec_data(async, &pd);
         create_spec_shader_async(spec, NULL, 0);
         struct pipe_shader_state state = {
            .type = PIPE_SHADER_IR_NIR,
            .ir.nir = spec->nir,
         };
         cs = spec->cs = st_create_nir_shader(st, &state);
         spec->nir = NULL;
         cb.buffer_size = 2 * sizeof(uint32_t);
      } else {
         /* plain synchronous compile; cache the CS directly in the table */
         nir_shader *nir = create_conversion_shader(st, view_target, num_components);
         struct pipe_shader_state state = {
            .type = PIPE_SHADER_IR_NIR,
            .ir.nir = nir,
         };
         cs = st_create_nir_shader(st, &state);
         he = _mesa_hash_table_insert(st->pbo.shaders, (void*)(uintptr_t)hash_key, cs);
      }
   }
   assert(cs);
   struct cso_context *cso = st->cso_context;

   pipe->set_constant_buffer(pipe, PIPE_SHADER_COMPUTE, 0, false, &cb);

   cso_save_compute_state(cso, CSO_BIT_COMPUTE_SHADER | CSO_BIT_COMPUTE_SAMPLERS);
   cso_set_compute_shader_handle(cso, cs);

   /* Set up the sampler_view */
   {
      struct pipe_sampler_view templ;
      struct pipe_sampler_view *sampler_view;
      struct pipe_sampler_state sampler = {0};
      const struct pipe_sampler_state *samplers[1] = {&sampler};
      const struct util_format_description *desc = util_format_description(dst_format);

      u_sampler_view_default_template(&templ, src, src_format);
      if (util_format_is_depth_or_stencil(dst_format)) {
         /* depth/stencil reads replicate the X component everywhere */
         templ.swizzle_r = PIPE_SWIZZLE_X;
         templ.swizzle_g = PIPE_SWIZZLE_X;
         templ.swizzle_b = PIPE_SWIZZLE_X;
         templ.swizzle_a = PIPE_SWIZZLE_X;
      } else {
         uint8_t invswizzle[4];
         const uint8_t *swizzle;

         /* these swizzle output bits require explicit component selection/ordering */
         if (swizzle_clamp & SWIZZLE_CLAMP_GREEN) {
            for (unsigned i = 0; i < 4; i++)
               invswizzle[i] = PIPE_SWIZZLE_Y;
         } else if (swizzle_clamp & SWIZZLE_CLAMP_BLUE) {
            for (unsigned i = 0; i < 4; i++)
               invswizzle[i] = PIPE_SWIZZLE_Z;
         } else {
            if (swizzle_clamp & SWIZZLE_CLAMP_BGRA) {
               if (util_format_get_nr_components(dst_format) == 3)
                  swizzle = util_format_description(PIPE_FORMAT_B8G8R8_UNORM)->swizzle;
               else
                  swizzle = util_format_description(PIPE_FORMAT_B8G8R8A8_UNORM)->swizzle;
            } else {
               swizzle = desc->swizzle;
            }
            invert_swizzle(invswizzle, swizzle);
         }
         /* flag bits handled; only the base mode remains below */
         swizzle_clamp &= ~(SWIZZLE_CLAMP_BGRA | SWIZZLE_CLAMP_GREEN | SWIZZLE_CLAMP_BLUE);

         /* these swizzle input modes clamp unused components to 0 and (sometimes) alpha to 1 */
         switch (swizzle_clamp) {
         case SWIZZLE_CLAMP_LUMINANCE:
            if (util_format_is_luminance(dst_format))
               break;
            for (unsigned i = 0; i < 4; i++) {
               if (invswizzle[i] != PIPE_SWIZZLE_X)
                  invswizzle[i] = invswizzle[i] == PIPE_SWIZZLE_W ? PIPE_SWIZZLE_1 : PIPE_SWIZZLE_0;
            }
            break;
         case SWIZZLE_CLAMP_ALPHA:
            for (unsigned i = 0; i < 4; i++) {
               if (invswizzle[i] != PIPE_SWIZZLE_W)
                  invswizzle[i] = PIPE_SWIZZLE_0;
            }
            break;
         case SWIZZLE_CLAMP_LUMINANCE_ALPHA:
            if (util_format_is_luminance_alpha(dst_format))
               break;
            for (unsigned i = 0; i < 4; i++) {
               if (invswizzle[i] != PIPE_SWIZZLE_X && invswizzle[i] != PIPE_SWIZZLE_W)
                  invswizzle[i] = PIPE_SWIZZLE_0;
            }
            break;
         case SWIZZLE_CLAMP_INTENSITY:
            for (unsigned i = 0; i < 4; i++) {
               if (invswizzle[i] == PIPE_SWIZZLE_W)
                  invswizzle[i] = PIPE_SWIZZLE_1;
               else if (invswizzle[i] != PIPE_SWIZZLE_X)
                  invswizzle[i] = PIPE_SWIZZLE_0;
            }
            break;
         case SWIZZLE_CLAMP_RGBX:
            for (unsigned i = 0; i < 4; i++) {
               if (invswizzle[i] == PIPE_SWIZZLE_W)
                  invswizzle[i] = PIPE_SWIZZLE_1;
            }
            break;
         default: break;
         }
         templ.swizzle_r = invswizzle[0];
         templ.swizzle_g = invswizzle[1];
         templ.swizzle_b = invswizzle[2];
         templ.swizzle_a = invswizzle[3];
      }
      templ.target = view_target;
      templ.u.tex.first_level = level;
      templ.u.tex.last_level = level;

      /* array textures expect to have array index provided */
      if (view_target != PIPE_TEXTURE_3D && src->array_size) {
         templ.u.tex.first_layer = layer;
         if (view_target == PIPE_TEXTURE_1D_ARRAY) {
            /* 1D arrays address layers via the y params (see pbo_data setup) */
            templ.u.tex.first_layer += yoffset;
            templ.u.tex.last_layer = templ.u.tex.first_layer + height - 1;
         } else {
            templ.u.tex.first_layer += zoffset;
            templ.u.tex.last_layer = templ.u.tex.first_layer + depth - 1;
         }
      }

      sampler_view = pipe->create_sampler_view(pipe, src, &templ);
      if (sampler_view == NULL)
         goto fail;

      pipe->set_sampler_views(pipe, PIPE_SHADER_COMPUTE, 0, 1, 0, false,
                              &sampler_view);
      st->state.num_sampler_views[PIPE_SHADER_COMPUTE] =
         MAX2(st->state.num_sampler_views[PIPE_SHADER_COMPUTE], 1);

      pipe_sampler_view_reference(&sampler_view, NULL);

      cso_set_samplers(cso, PIPE_SHADER_COMPUTE, 1, samplers);
   }

   /* Set up destination buffer */
   intptr_t img_stride = src->target == PIPE_TEXTURE_3D ||
                         src->target == PIPE_TEXTURE_2D_ARRAY ||
                         src->target == PIPE_TEXTURE_CUBE_ARRAY ?
                         /* only use image stride for 3d images to avoid pulling in IMAGE_HEIGHT pixelstore */
                         _mesa_image_image_stride(pack, width, height, format, type) :
                         _mesa_image_row_stride(pack, width, format, type) * height;
   intptr_t buffer_size = (depth + (dim == 3 ? pack->SkipImages : 0)) * img_stride;
   assert(buffer_size <= UINT32_MAX);
   {
      struct pipe_shader_buffer buffer;
      memset(&buffer, 0, sizeof(buffer));
      if (can_copy_direct(pack) && pack->BufferObj) {
         /* write straight into the bound PBO's buffer */
         dst = pack->BufferObj->buffer;
         assert(pack->BufferObj->Size >= buffer_size);
      } else {
         /* staging buffer; copy_converted_buffer repacks it afterwards */
         dst = pipe_buffer_create(screen, PIPE_BIND_SHADER_BUFFER, PIPE_USAGE_STAGING, buffer_size);
         if (!dst)
            goto fail;
      }
      buffer.buffer = dst;
      buffer.buffer_size = buffer_size;

      pipe->set_shader_buffers(pipe, PIPE_SHADER_COMPUTE, 0, 1, &buffer, 0x1);
   }

   /* 8x8 workgroups, except 1D textures which use 64x1 */
   struct pipe_grid_info info = { 0 };
   info.block[0] = src->target != PIPE_TEXTURE_1D ? 8 : 64;
   info.block[1] = src->target != PIPE_TEXTURE_1D ? 8 : 1;
   info.last_block[0] = width % info.block[0];
   info.last_block[1] = height % info.block[1];
   info.block[2] = 1;
   info.grid[0] = DIV_ROUND_UP(width, info.block[0]);
   info.grid[1] = DIV_ROUND_UP(height, info.block[1]);
   info.grid[2] = depth;

   pipe->launch_grid(pipe, &info);

fail:
   /* success path falls through here too: restore and unbind everything */
   cso_restore_compute_state(cso);

   /* Unbind all because st/mesa won't do it if the current shader doesn't
    * use them.
    */
   pipe->set_sampler_views(pipe, PIPE_SHADER_COMPUTE, 0, 0,
                           st->state.num_sampler_views[PIPE_SHADER_COMPUTE],
                           false, NULL);
   st->state.num_sampler_views[PIPE_SHADER_COMPUTE] = 0;
   pipe->set_shader_buffers(pipe, PIPE_SHADER_COMPUTE, 0, 1, NULL, 0);

   st->ctx->NewDriverState |= ST_NEW_CS_CONSTANTS |
                              ST_NEW_CS_SSBOS |
                              ST_NEW_CS_SAMPLER_VIEWS;

   return dst;
}
1180 
/* Map the compute shader's output buffer and copy it into the user's
 * destination, applying the pixelstore pack parameters (row length, skips,
 * image height) that the shader deliberately does not handle.  When the
 * pack state allows a direct copy, the whole buffer is copied in one go.
 */
static void
copy_converted_buffer(struct gl_context * ctx,
                    struct gl_pixelstore_attrib *pack,
                    enum pipe_texture_target view_target,
                    struct pipe_resource *dst, enum pipe_format dst_format,
                    GLint xoffset, GLint yoffset, GLint zoffset,
                    GLsizei width, GLsizei height, GLint depth,
                    GLenum format, GLenum type, void *pixels)
{
   struct pipe_transfer *xfer;
   struct st_context *st = st_context(ctx);
   unsigned dim = get_dim_from_target(view_target);
   uint8_t *map = pipe_buffer_map(st->pipe, dst, PIPE_MAP_READ | PIPE_MAP_ONCE, &xfer);
   if (!map)
      return;

   pixels = _mesa_map_pbo_dest(ctx, pack, pixels);
   /* compute shader doesn't handle these to cut down on uniform size */
   if (!can_copy_direct(pack)) {
      /* 1D arrays were sampled with layers on the y axis; remap them onto
       * depth so the addressing below matches the buffer layout
       */
      if (view_target == PIPE_TEXTURE_1D_ARRAY) {
         depth = height;
         height = 1;
         zoffset = yoffset;
         yoffset = 0;
      }

      struct gl_pixelstore_attrib packing = *pack;

      /* source image is tightly packed */
      packing.RowLength = 0;
      packing.SkipPixels = 0;
      packing.SkipRows = 0;
      packing.ImageHeight = 0;
      packing.SkipImages = 0;

      /* row-by-row copy: dst addressed with the caller's packing, src with
       * the tightly-packed layout the shader produced
       */
      for (unsigned z = 0; z < depth; z++) {
         for (unsigned y = 0; y < height; y++) {
            GLubyte *dst = _mesa_image_address(dim, pack, pixels,
                                       width, height, format, type,
                                       z, y, 0);
            GLubyte *srcpx = _mesa_image_address(dim, &packing, map,
                                                 width, height, format, type,
                                                 z, y, 0);
            util_streaming_load_memcpy(dst, srcpx, util_format_get_stride(dst_format, width));
         }
      }
   } else {
      /* direct copy for all other cases */
      util_streaming_load_memcpy(pixels, map, dst->width0);
   }

   _mesa_unmap_pbo_dest(ctx, pack);
   pipe_buffer_unmap(st->pipe, xfer);
}
1235 
/* Compute-shader-based texture download for glGetTexSubImage and friends.
 * Returns true when the readback was performed (results copied to
 * 'pixels' or left in the bound PBO), false to make the caller fall back
 * to another path — including while an async shader compile is still in
 * flight (download_texture_compute returning NULL).
 */
bool
st_GetTexSubImage_shader(struct gl_context * ctx,
                         GLint xoffset, GLint yoffset, GLint zoffset,
                         GLsizei width, GLsizei height, GLint depth,
                         GLenum format, GLenum type, void * pixels,
                         struct gl_texture_image *texImage)
{
   struct st_context *st = st_context(ctx);
   struct pipe_screen *screen = st->screen;
   struct gl_texture_object *stObj = texImage->TexObject;
   struct pipe_resource *src = texImage->pt;
   struct pipe_resource *dst = NULL;
   enum pipe_format dst_format, src_format;
   /* standalone (non-pt-backed) images always use level 0 of their own resource */
   unsigned level = (texImage->pt != stObj->pt ? 0 : texImage->Level) + texImage->TexObject->Attrib.MinLevel;
   unsigned layer = texImage->Face + texImage->TexObject->Attrib.MinLayer;
   enum pipe_texture_target view_target;

   /* compressed formats must have been decompressed before reaching here */
   assert(!_mesa_is_format_etc2(texImage->TexFormat) &&
          !_mesa_is_format_astc_2d(texImage->TexFormat) &&
          texImage->TexFormat != MESA_FORMAT_ETC1_RGB8);

   /* See if the texture format already matches the format and type,
    * in which case the memcpy-based fast path will be used. */
   if (_mesa_format_matches_format_and_type(texImage->TexFormat, format,
                                            type, ctx->Pack.SwapBytes, NULL)) {
      return false;
   }
   enum swizzle_clamp swizzle_clamp = 0;
   src_format = st_pbo_get_src_format(screen, stObj->surface_based ? stObj->surface_format : src->format, src);
   if (src_format == PIPE_FORMAT_NONE)
      return false;

   /* special case for stencil extraction */
   if (format == GL_STENCIL_INDEX && util_format_is_depth_and_stencil(src_format))
      src_format = PIPE_FORMAT_X24S8_UINT;

   if (texImage->_BaseFormat != _mesa_get_format_base_format(texImage->TexFormat)) {
      /* special handling for drivers that don't support these formats natively */
      if (texImage->_BaseFormat == GL_LUMINANCE)
         swizzle_clamp = SWIZZLE_CLAMP_LUMINANCE;
      else if (texImage->_BaseFormat == GL_LUMINANCE_ALPHA)
         swizzle_clamp = SWIZZLE_CLAMP_LUMINANCE_ALPHA;
      else if (texImage->_BaseFormat == GL_ALPHA)
         swizzle_clamp = SWIZZLE_CLAMP_ALPHA;
      else if (texImage->_BaseFormat == GL_INTENSITY)
         swizzle_clamp = SWIZZLE_CLAMP_INTENSITY;
      else if (texImage->_BaseFormat == GL_RGB)
         swizzle_clamp = SWIZZLE_CLAMP_RGBX;
   }

   dst_format = st_pbo_get_dst_format(ctx, PIPE_BUFFER, src_format, false, format, type, 0);

   if (dst_format == PIPE_FORMAT_NONE) {
      /* no direct dst format: try a conversion format, possibly with
       * component reordering/selection via swizzle flags
       */
      bool need_bgra_swizzle = false;
      dst_format = get_convert_format(ctx, src_format, format, type, &need_bgra_swizzle);
      if (dst_format == PIPE_FORMAT_NONE)
         return false;
      /* special swizzling for component selection */
      if (need_bgra_swizzle)
         swizzle_clamp |= SWIZZLE_CLAMP_BGRA;
      else if (format == GL_GREEN_INTEGER)
         swizzle_clamp |= SWIZZLE_CLAMP_GREEN;
      else if (format == GL_BLUE_INTEGER)
         swizzle_clamp |= SWIZZLE_CLAMP_BLUE;
   }

   /* check with the driver to see if memcpy is likely to be faster */
   if (!st->force_compute_based_texture_transfer &&
       !screen->is_compute_copy_faster(screen, src_format, dst_format, width, height, depth, true))
      return false;

   view_target = get_target_from_texture(src);
   /* I don't know why this works
    * only for the texture rects
    * but that's how it is
    */
   if ((src->target != PIPE_TEXTURE_RECT &&
       /* this would need multiple samplerviews */
       ((util_format_is_depth_and_stencil(src_format) && util_format_is_depth_and_stencil(dst_format)) ||
       /* these format just doesn't work and science can't explain why */
       dst_format == PIPE_FORMAT_Z32_FLOAT)) ||
       /* L8 -> L32_FLOAT is another thinker */
       (!util_format_is_float(src_format) && dst_format == PIPE_FORMAT_L32_FLOAT))
      return false;

   dst = download_texture_compute(st, &ctx->Pack, xoffset, yoffset, zoffset, width, height, depth,
                                  level, layer, format, type, src_format, view_target, src, dst_format,
                                  swizzle_clamp);
   if (!dst)
      return false;

   /* a staging buffer was used (or there is no PBO): repack to the user's
    * memory and drop our reference
    */
   if (!can_copy_direct(&ctx->Pack) || !ctx->Pack.BufferObj) {
      copy_converted_buffer(ctx, &ctx->Pack, view_target, dst, dst_format, xoffset, yoffset, zoffset,
                          width, height, depth, format, type, pixels);

      pipe_resource_reference(&dst, NULL);
   }

   return true;
}
1336 
1337 void
st_pbo_compute_deinit(struct st_context * st)1338 st_pbo_compute_deinit(struct st_context *st)
1339 {
1340    struct pipe_screen *screen = st->screen;
1341    if (!st->pbo.shaders)
1342       return;
1343    hash_table_foreach(st->pbo.shaders, entry) {
1344       if (st->force_specialized_compute_transfer ||
1345           (!st->force_compute_based_texture_transfer && screen->driver_thread_add_job)) {
1346          struct pbo_async_data *async = entry->data;
1347          util_queue_fence_wait(&async->fence);
1348          if (async->cs)
1349             st->pipe->delete_compute_state(st->pipe, async->cs);
1350          util_queue_fence_destroy(&async->fence);
1351          ralloc_free(async->nir);
1352          ralloc_free(async->copy);
1353          set_foreach_remove(&async->specialized, se) {
1354             struct pbo_spec_async_data *spec = (void*)se->key;
1355             util_queue_fence_wait(&spec->fence);
1356             util_queue_fence_destroy(&spec->fence);
1357             if (spec->created) {
1358                ralloc_free(spec->nir);
1359                st->pipe->delete_compute_state(st->pipe, spec->cs);
1360             }
1361             free(spec);
1362          }
1363          ralloc_free(async->specialized.table);
1364          free(async);
1365       } else {
1366          st->pipe->delete_compute_state(st->pipe, entry->data);
1367       }
1368    }
1369    _mesa_hash_table_destroy(st->pbo.shaders, NULL);
1370 }
1371