/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * Copyright 2007 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Code generate the whole fragment pipeline.
 *
 * The fragment pipeline consists of the following stages:
 * - early depth test
 * - fragment shader
 * - alpha test
 * - depth/stencil test
 * - blending
 *
 * This file has only the glue to assemble the fragment pipeline.  The actual
 * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
 * lp_bld_*.[ch] files, and in a completely generic and reusable way. Here we
 * muster the LLVM JIT execution engine to create a function that follows an
 * established binary interface and that can be called from C directly.
 *
 * A big source of complexity here is that we often want to run different
 * stages with different data types and precisions. For example, the fragment
 * shader typically needs to be done in floats, but the depth/stencil test
 * and blending are better done in the types that most closely match the
 * depth/stencil and color buffers respectively.
 *
 * Since the width of a SIMD vector register stays the same regardless of the
 * element type, different types imply different numbers of elements, so we
 * must code generate more instances of the stages with larger types to be
 * able to feed/consume the stages with smaller types.
 *
 * @author Jose Fonseca <[email protected]>
 */
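
/*
 * Illustrative sketch (numbers assumed, not part of the pipeline): with
 * 256-bit SIMD vectors, a 32-bit float shader type is lp_type{width=32,
 * length=8} while an 8-bit unorm color type is lp_type{width=8,
 * length=32}, so one iteration of an 8-bit blend stage consumes the
 * output of four iterations of the float shader stage.
 */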

#include <limits.h>
#include "pipe/p_defines.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_pointer.h"
#include "util/format/u_format.h"
#include "util/u_dump.h"
#include "util/u_string.h"
#include "util/u_dual_blend.h"
#include "util/u_upload_mgr.h"
#include "util/os_time.h"
#include "pipe/p_shader_tokens.h"
#include "draw/draw_context.h"
#include "nir/tgsi_to_nir.h"
#include "gallivm/lp_bld_type.h"
#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_conv.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_tgsi.h"
#include "gallivm/lp_bld_nir.h"
#include "gallivm/lp_bld_swizzle.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_bitarit.h"
#include "gallivm/lp_bld_pack.h"
#include "gallivm/lp_bld_format.h"
#include "gallivm/lp_bld_quad.h"
#include "gallivm/lp_bld_gather.h"
#include "gallivm/lp_bld_jit_sample.h"

#include "lp_bld_alpha.h"
#include "lp_bld_blend.h"
#include "lp_bld_depth.h"
#include "lp_bld_interp.h"
#include "lp_context.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_setup.h"
#include "lp_state.h"
#include "lp_tex_sample.h"
#include "lp_flush.h"
#include "lp_state_fs.h"
#include "lp_rast.h"
#include "nir/nir_to_tgsi_info.h"

#include "lp_screen.h"
#include "compiler/nir/nir_serialize.h"
#include "util/mesa-sha1.h"


/** Fragment shader number (for debugging) */
static unsigned fs_no = 0;


static void
load_unswizzled_block(struct gallivm_state *gallivm,
                      LLVMTypeRef base_type,
                      LLVMValueRef base_ptr,
                      LLVMValueRef stride,
                      unsigned block_width,
                      unsigned block_height,
                      LLVMValueRef* dst,
                      struct lp_type dst_type,
                      unsigned dst_count,
                      unsigned dst_alignment);

/**
 * Checks if a format description is an arithmetic format
 *
 * That is, a format which has irregular channel sizes, such as R3_G3_B2
 * or R5_G6_B5.
 */
static inline bool
is_arithmetic_format(const struct util_format_description *format_desc)
{
   bool arith = false;

   for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
      arith |= format_desc->channel[i].size != format_desc->channel[0].size;
      arith |= (format_desc->channel[i].size % 8) != 0;
   }

   return arith;
}
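
/*
 * Informal example: PIPE_FORMAT_B5G6R5_UNORM has channel sizes 5/6/5,
 * none a multiple of 8, so it counts as arithmetic; a format with four
 * equal byte-sized channels such as PIPE_FORMAT_R8G8B8A8_UNORM does not.
 */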


/**
 * Checks if this format requires special handling due to required expansion
 * to floats for blending, and furthermore has "natural" packed AoS ->
 * unpacked SoA conversion.
 */
static inline bool
format_expands_to_float_soa(const struct util_format_description *format_desc)
{
   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
      return true;
   }
   return false;
}


/**
 * Retrieves the type representing the memory layout for a format
 *
 * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
 */
static inline void
lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
                             struct lp_type* type)
{
   if (format_expands_to_float_soa(format_desc)) {
      /* just make this a uint with width of block */
      type->floating = false;
      type->fixed = false;
      type->sign = false;
      type->norm = false;
      type->width = format_desc->block.bits;
      type->length = 1;
      return;
   }

   int chan = util_format_get_first_non_void_channel(format_desc->format);

   memset(type, 0, sizeof(struct lp_type));
   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
   type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
   type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
   type->norm     = format_desc->channel[chan].normalized;

   if (is_arithmetic_format(format_desc)) {
      type->width = 0;
      type->length = 1;

      for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
         type->width += format_desc->channel[i].size;
      }
   } else {
      type->width = format_desc->channel[chan].size;
      type->length = format_desc->nr_channels;
   }
}
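
/*
 * Informal examples of the resulting memory type: RGBA16F yields a
 * floating type with width=16, length=4 (4x half-float), while an
 * arithmetic format such as R5G6B5 collapses to a single normalized
 * value with width=16, length=1 covering the whole block.
 */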


/**
 * Expand the relevant bits of mask_input to an n*4-dword mask for the
 * n*4 pixels in n 2x2 quads.  This will set the n*4 elements of the
 * quad mask vector to 0 or ~0.
 * Grouping is 01, 23 for 2-quad mode, hence only 0 and 2 are valid
 * first_quad arguments when the fs length is 8.
 *
 * \param first_quad  which quad(s) of the quad group to test, in [0,3]
 * \param mask_input  bitwise mask for the whole 4x4 stamp
 */
static LLVMValueRef
generate_quad_mask(struct gallivm_state *gallivm,
                   struct lp_type fs_type,
                   unsigned first_quad,
                   unsigned sample,
                   LLVMValueRef mask_input) /* int64 */
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMValueRef bits[16];
   LLVMValueRef mask, bits_vec;

   /*
    * XXX: We'll need a different path for 16 x u8
    */
   assert(fs_type.width == 32);
   assert(fs_type.length <= ARRAY_SIZE(bits));
   struct lp_type mask_type = lp_int_type(fs_type);

   /*
    * mask_input >>= (quad * 4)
    */
   int shift;
   switch (first_quad) {
   case 0:
      shift = 0;
      break;
   case 1:
      assert(fs_type.length == 4);
      shift = 2;
      break;
   case 2:
      shift = 8;
      break;
   case 3:
      assert(fs_type.length == 4);
      shift = 10;
      break;
   default:
      assert(0);
      shift = 0;
   }

   mask_input = LLVMBuildLShr(builder, mask_input,
                              lp_build_const_int64(gallivm, 16 * sample), "");
   mask_input = LLVMBuildTrunc(builder, mask_input, i32t, "");
   mask_input = LLVMBuildAnd(builder, mask_input,
                             lp_build_const_int32(gallivm, 0xffff), "");
   mask_input = LLVMBuildLShr(builder, mask_input,
                              LLVMConstInt(i32t, shift, 0), "");

   /*
    * mask = { mask_input & (1 << i), for i in [0,3] }
    */
   mask = lp_build_broadcast(gallivm,
                             lp_build_vec_type(gallivm, mask_type),
                             mask_input);

   for (int i = 0; i < fs_type.length / 4; i++) {
      unsigned j = 2 * (i % 2) + (i / 2) * 8;
      bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0);
      bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0);
      bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
      bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
   }
   bits_vec = LLVMConstVector(bits, fs_type.length);
   mask = LLVMBuildAnd(builder, mask, bits_vec, "");

   /*
    * mask = mask == bits ? ~0 : 0
    */
   mask = lp_build_compare(gallivm,
                           mask_type, PIPE_FUNC_EQUAL,
                           mask, bits_vec);

   return mask;
}
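
/*
 * Worked example (informal): for fs_type.length == 4 and first_quad == 0,
 * the loop above selects bits 0, 1, 4 and 5 of the (shifted) mask_input,
 * i.e. the top-left 2x2 quad of the 4x4 stamp, and expands each bit to a
 * 0 / ~0 vector element.
 */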


#define EARLY_DEPTH_TEST  0x1
#define LATE_DEPTH_TEST   0x2
#define EARLY_DEPTH_WRITE 0x4
#define LATE_DEPTH_WRITE  0x8
#define EARLY_DEPTH_TEST_INFERRED  0x10 /* only with EARLY_DEPTH_TEST */

static unsigned
get_cbuf_location(nir_variable *var, unsigned slot)
{
   return (var->data.location - FRAG_RESULT_DATA0) + var->data.index + slot;
}
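
/*
 * Informal example: a dual-source blend output declared with
 * location == FRAG_RESULT_DATA0 and index == 1 maps to cbuf slot 1,
 * next to the index-0 color output at slot 0.
 */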

static int
find_output_by_frag_result(struct nir_shader *shader,
                           gl_frag_result frag_result)
{
   nir_foreach_shader_out_variable(var, shader) {
      int slots = nir_variable_count_slots(var, var->type);
      for (unsigned s = 0; s < slots; s++) {
         if (var->data.location + var->data.index + s == frag_result)
            return var->data.driver_location + s;
      }
   }

   return -1;
}

/**
 * Fetch the specified lp_jit_viewport structure for a given viewport_index.
 */
static LLVMValueRef
lp_llvm_viewport(LLVMTypeRef context_type,
                 LLVMValueRef context_ptr,
                 struct gallivm_state *gallivm,
                 LLVMValueRef viewport_index)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ptr;
   LLVMValueRef res;
   struct lp_type viewport_type =
      lp_type_float_vec(32, 32 * LP_JIT_VIEWPORT_NUM_FIELDS);
   LLVMTypeRef vtype = lp_build_vec_type(gallivm, viewport_type);

   ptr = lp_jit_context_viewports(gallivm, context_type, context_ptr);
   ptr = LLVMBuildPointerCast(builder, ptr,
            LLVMPointerType(vtype, 0), "");

   res = lp_build_pointer_get2(builder, vtype, ptr, viewport_index);

   return res;
}


static LLVMValueRef
lp_build_depth_clamp(struct gallivm_state *gallivm,
                     LLVMBuilderRef builder,
                     bool depth_clamp,
                     bool restrict_depth,
                     struct lp_type type,
                     LLVMTypeRef context_type,
                     LLVMValueRef context_ptr,
                     LLVMTypeRef thread_data_type,
                     LLVMValueRef thread_data_ptr,
                     LLVMValueRef z)
{
   LLVMValueRef viewport, min_depth, max_depth;
   LLVMValueRef viewport_index;
   struct lp_build_context f32_bld;

   assert(type.floating);
   lp_build_context_init(&f32_bld, gallivm, type);

   if (restrict_depth)
      z = lp_build_clamp(&f32_bld, z, f32_bld.zero, f32_bld.one);

   if (!depth_clamp)
      return z;

   /*
    * Assumes clamping of the viewport index will occur in setup/gs. Value
    * is passed through the rasterization stage via lp_rast_shader_inputs.
    *
    * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
    *      semantics.
    */
   viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
                                                                   thread_data_type,
                                                                   thread_data_ptr);

   /*
    * Load the min and max depth from the lp_jit_context.viewports
    * array of lp_jit_viewport structures.
    */
   viewport = lp_llvm_viewport(context_type, context_ptr, gallivm, viewport_index);

   /* viewports[viewport_index].min_depth */
   min_depth = LLVMBuildExtractElement(builder, viewport,
                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), "");
   min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);

   /* viewports[viewport_index].max_depth */
   max_depth = LLVMBuildExtractElement(builder, viewport,
                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), "");
   max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);

   /*
    * Clamp to the min and max depth values for the given viewport.
    */
   return lp_build_clamp(&f32_bld, z, min_depth, max_depth);
}


static void
lp_build_sample_alpha_to_coverage(struct gallivm_state *gallivm,
                                  struct lp_type type,
                                  unsigned coverage_samples,
                                  LLVMValueRef num_loop,
                                  LLVMValueRef loop_counter,
                                  LLVMTypeRef coverage_mask_type,
                                  LLVMValueRef coverage_mask_store,
                                  LLVMValueRef alpha)
{
   struct lp_build_context bld;
   LLVMBuilderRef builder = gallivm->builder;
   float step = 1.0 / coverage_samples;

   lp_build_context_init(&bld, gallivm, type);
   for (unsigned s = 0; s < coverage_samples; s++) {
      LLVMValueRef alpha_ref_value = lp_build_const_vec(gallivm, type, step * s);
      LLVMValueRef test = lp_build_cmp(&bld, PIPE_FUNC_GREATER, alpha, alpha_ref_value);

      LLVMValueRef s_mask_idx = LLVMBuildMul(builder, lp_build_const_int32(gallivm, s), num_loop, "");
      s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_counter, "");
      LLVMValueRef s_mask_ptr = LLVMBuildGEP2(builder, coverage_mask_type,
                                              coverage_mask_store, &s_mask_idx, 1, "");
      LLVMValueRef s_mask = LLVMBuildLoad2(builder, coverage_mask_type, s_mask_ptr, "");
      s_mask = LLVMBuildAnd(builder, s_mask, test, "");
      LLVMBuildStore(builder, s_mask, s_mask_ptr);
   }
}
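
/*
 * Worked example (informal): with coverage_samples == 4 the reference
 * values above are 0.0, 0.25, 0.5 and 0.75, so a fragment with
 * alpha == 0.6 passes the GREATER test for samples 0, 1 and 2 and keeps
 * 3 of the 4 coverage samples.
 */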


struct lp_build_fs_llvm_iface {
   struct lp_build_fs_iface base;
   struct lp_build_interp_soa_context *interp;
   struct lp_build_for_loop_state *loop_state;
   LLVMTypeRef mask_type;
   LLVMValueRef mask_store;
   LLVMValueRef sample_id;
   LLVMValueRef color_ptr_ptr;
   LLVMValueRef color_stride_ptr;
   LLVMValueRef color_sample_stride_ptr;
   LLVMValueRef zs_base_ptr;
   LLVMValueRef zs_stride;
   LLVMValueRef zs_sample_stride;
   const struct lp_fragment_shader_variant_key *key;
};


static LLVMValueRef
fs_interp(const struct lp_build_fs_iface *iface,
          struct lp_build_context *bld,
          unsigned attrib, unsigned chan,
          bool centroid, bool sample,
          LLVMValueRef attrib_indir,
          LLVMValueRef offsets[2])
{
   struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
   struct lp_build_interp_soa_context *interp = fs_iface->interp;
   unsigned loc = TGSI_INTERPOLATE_LOC_CENTER;
   if (centroid)
      loc = TGSI_INTERPOLATE_LOC_CENTROID;
   if (sample)
      loc = TGSI_INTERPOLATE_LOC_SAMPLE;

   return lp_build_interp_soa(interp, bld->gallivm, fs_iface->loop_state->counter,
                              fs_iface->mask_type, fs_iface->mask_store,
                              attrib, chan, loc, attrib_indir, offsets);
}


/**
 * Convert a depth-stencil format to a single-component one, returning
 * PIPE_FORMAT_NONE if it doesn't contain the required component.
 */
static enum pipe_format
select_zs_component_format(enum pipe_format format,
                           bool fetch_stencil)
{
   const struct util_format_description* desc = util_format_description(format);
   if (fetch_stencil && !util_format_has_stencil(desc))
      return PIPE_FORMAT_NONE;
   if (!fetch_stencil && !util_format_has_depth(desc))
      return PIPE_FORMAT_NONE;

   switch (format) {
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      return fetch_stencil ? PIPE_FORMAT_X24S8_UINT : PIPE_FORMAT_Z24X8_UNORM;
   case PIPE_FORMAT_S8_UINT_Z24_UNORM:
      return fetch_stencil ? PIPE_FORMAT_S8X24_UINT : PIPE_FORMAT_X8Z24_UNORM;
   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
      return fetch_stencil ? PIPE_FORMAT_X32_S8X24_UINT : format;
   default:
      return format;
   }
}

static void
fs_fb_fetch(const struct lp_build_fs_iface *iface,
            struct lp_build_context *bld,
            int location,
            LLVMValueRef result[4])
{
   struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
   LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
   LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
   const struct lp_fragment_shader_variant_key *key = fs_iface->key;

   LLVMValueRef buf_ptr;
   LLVMValueRef stride;
   enum pipe_format buf_format;

   const bool fetch_stencil = location == FRAG_RESULT_STENCIL;
   const bool fetch_zs = fetch_stencil || location == FRAG_RESULT_DEPTH;
   if (fetch_zs) {
      buf_ptr = fs_iface->zs_base_ptr;
      stride = fs_iface->zs_stride;
      buf_format = select_zs_component_format(key->zsbuf_format, fetch_stencil);
   } else {
      assert(location >= FRAG_RESULT_DATA0 && location <= FRAG_RESULT_DATA7);
      const int cbuf = location - FRAG_RESULT_DATA0;
      LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);

      buf_ptr = LLVMBuildLoad2(builder, int8p_type,
                               LLVMBuildGEP2(builder, int8p_type,
                                             fs_iface->color_ptr_ptr, &index, 1, ""), "");
      stride = LLVMBuildLoad2(builder, int32_type,
                              LLVMBuildGEP2(builder, int32_type,
                                            fs_iface->color_stride_ptr, &index, 1, ""), "");
      buf_format = key->cbuf_format[cbuf];
   }

   const struct util_format_description* out_format_desc = util_format_description(buf_format);
   if (out_format_desc->format == PIPE_FORMAT_NONE) {
      result[0] = result[1] = result[2] = result[3] = bld->undef;
      return;
   }

   unsigned block_size = bld->type.length;
   unsigned block_height = key->resource_1d ? 1 : 2;
   unsigned block_width = block_size / block_height;

   if (key->multisample) {
      LLVMValueRef sample_stride;

      if (fetch_zs) {
         sample_stride = fs_iface->zs_sample_stride;
      } else {
         LLVMValueRef index = lp_build_const_int32(gallivm, location - FRAG_RESULT_DATA0);
         sample_stride = LLVMBuildLoad2(builder, int32_type,
                                       LLVMBuildGEP2(builder,
                                                     int32_type,
                                                     fs_iface->color_sample_stride_ptr,
                                                     &index, 1, ""), "");
      }

      LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_stride, fs_iface->sample_id, "");
      buf_ptr = LLVMBuildGEP2(builder, int8_type,
                              buf_ptr, &sample_offset, 1, "");
   }

   /* The fragment shader executes on 4x4 blocks; depending on the vector
    * width this takes 2 or 4 iterations.  Only move to the next row once
    * the top row has completed: 8-wide needs 1 iteration per row, 4-wide
    * needs 2. */
   LLVMValueRef x_offset = NULL, y_offset = NULL;
   if (!key->resource_1d) {
      LLVMValueRef counter = fs_iface->loop_state->counter;

      if (block_size == 4) {
         x_offset = LLVMBuildShl(builder,
                                 LLVMBuildAnd(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), ""),
                                 lp_build_const_int32(gallivm, 1), "");
         counter = LLVMBuildLShr(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), "");
      }
      y_offset = LLVMBuildMul(builder, counter, lp_build_const_int32(gallivm, 2), "");
   }

   LLVMValueRef offsets[4 * 4];
   for (unsigned i = 0; i < block_size; i++) {
      unsigned x = i % block_width;
      unsigned y = i / block_width;

      if (block_size == 8) {
         /* Remap the raw slots into the fragment shader execution mode.
          * (This math took me way too long to work out, I'm sure it's
          * overkill.)
          */
         x = (i & 1) + ((i >> 2) << 1);
         if (!key->resource_1d)
            y = (i & 2) >> 1;
      }
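      /*
       * Informal lane map for the 8-wide remap above (2D case): lanes 0-3
       * cover pixels (0,0) (1,0) (0,1) (1,1) of the left quad and lanes
       * 4-7 cover (2,0) (3,0) (2,1) (3,1) of the right quad.
       */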

      LLVMValueRef x_val;
      if (x_offset) {
         x_val = LLVMBuildAdd(builder, lp_build_const_int32(gallivm, x), x_offset, "");
         x_val = LLVMBuildMul(builder, x_val, lp_build_const_int32(gallivm, out_format_desc->block.bits / 8), "");
      } else {
         x_val = lp_build_const_int32(gallivm, x * (out_format_desc->block.bits / 8));
      }

      LLVMValueRef y_val = lp_build_const_int32(gallivm, y);
      if (y_offset)
         y_val = LLVMBuildAdd(builder, y_val, y_offset, "");
      y_val = LLVMBuildMul(builder, y_val, stride, "");

      offsets[i] = LLVMBuildAdd(builder, x_val, y_val, "");
   }
   LLVMValueRef offset = lp_build_gather_values(gallivm, offsets, block_size);

   struct lp_type texel_type = bld->type;
   if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
       out_format_desc->channel[0].pure_integer) {
      if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
         texel_type = lp_type_int_vec(bld->type.width, bld->type.width * bld->type.length);
      } else if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
         texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
      }
   } else if (fetch_stencil) {
      texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
   }

   lp_build_fetch_rgba_soa(gallivm, out_format_desc, texel_type,
                           true, buf_ptr, offset,
                           NULL, NULL, NULL, result);
}

/**
 * Generate the fragment shader, depth/stencil test, and alpha tests.
 */
static void
generate_fs_loop(struct gallivm_state *gallivm,
                 struct lp_fragment_shader *shader,
                 const struct lp_fragment_shader_variant_key *key,
                 LLVMBuilderRef builder,
                 struct lp_type type,
                 LLVMTypeRef context_type,
                 LLVMValueRef context_ptr,
                 LLVMTypeRef resources_type,
                 LLVMValueRef resources_ptr,
                 LLVMTypeRef sample_pos_type,
                 LLVMValueRef sample_pos_array,
                 LLVMValueRef num_loop,
                 struct lp_build_interp_soa_context *interp,
                 const struct lp_build_sampler_soa *sampler,
                 const struct lp_build_image_soa *image,
                 LLVMTypeRef mask_type,
                 LLVMValueRef mask_store,
                 LLVMValueRef (*out_color)[4],
                 LLVMValueRef depth_base_ptr,
                 LLVMValueRef depth_stride,
                 LLVMValueRef depth_sample_stride,
                 LLVMValueRef color_ptr_ptr,
                 LLVMValueRef color_stride_ptr,
                 LLVMValueRef color_sample_stride_ptr,
                 LLVMValueRef facing,
                 LLVMTypeRef thread_data_type,
                 LLVMValueRef thread_data_ptr)
{
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef mask_ptr = NULL, mask_val = NULL;
   LLVMValueRef z;
   LLVMValueRef z_value, s_value;
   LLVMValueRef z_fb, s_fb;
   LLVMValueRef zs_samples = lp_build_const_int32(gallivm, key->zsbuf_nr_samples);
   LLVMValueRef z_out = NULL, s_out = NULL;
   struct lp_build_for_loop_state loop_state, sample_loop_state = {0};
   struct lp_build_mask_context mask;
   struct nir_shader *nir = shader->base.ir.nir;
   const bool dual_source_blend = key->blend.rt[0].blend_enable &&
                                  util_blend_state_is_dual(&key->blend, 0);
   const bool post_depth_coverage = nir->info.fs.post_depth_coverage;

   struct lp_bld_tgsi_system_values system_values;

   memset(&system_values, 0, sizeof(system_values));

   /* truncate then sign extend. */
   system_values.front_facing =
      LLVMBuildTrunc(gallivm->builder, facing,
                     LLVMInt1TypeInContext(gallivm->context), "");
   system_values.front_facing =
      LLVMBuildSExt(gallivm->builder, system_values.front_facing,
                    LLVMInt32TypeInContext(gallivm->context), "");
   system_values.view_index =
      lp_jit_thread_data_raster_state_view_index(gallivm,
                                                 thread_data_type,
                                                 thread_data_ptr);

   unsigned depth_mode;
   const struct util_format_description *zs_format_desc = NULL;
   if (key->depth.enabled ||
       key->stencil[0].enabled) {
      zs_format_desc = util_format_description(key->zsbuf_format);

      if (nir->info.fs.early_fragment_tests || nir->info.fs.post_depth_coverage) {
         depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
      } else if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) &&
                 !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) &&
                 !nir->info.fs.uses_fbfetch_output && !nir->info.writes_memory) {
         if (key->alpha.enabled ||
             key->blend.alpha_to_coverage ||
             nir->info.fs.uses_discard ||
             nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
            /* With alpha test and kill, can do the depth test early
             * and hopefully eliminate some quads.  But need to do a
             * special deferred depth write once the final mask value
             * is known. This only works though if there's either no
             * stencil test or the stencil value isn't written.
             */
            if (key->stencil[0].enabled && (key->stencil[0].writemask ||
                                            (key->stencil[1].enabled &&
                                             key->stencil[1].writemask)))
               depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
            else
               depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
         } else {
            depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
         }
      } else {
         depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
      }

      if (!(key->depth.enabled && key->depth.writemask) &&
          !(key->stencil[0].enabled && (key->stencil[0].writemask ||
                                        (key->stencil[1].enabled &&
                                         key->stencil[1].writemask))))
         depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
   } else {
      depth_mode = 0;
   }
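
   /*
    * Informal summary of the depth_mode choice above: early_fragment_tests
    * (or post_depth_coverage) forces an early test+write; writing Z/S from
    * the shader, framebuffer fetch or memory writes force a late test+write;
    * alpha test, alpha-to-coverage, discard or a sample-mask output still
    * allow an early (inferred) test but defer the depth write until the
    * final mask is known, unless stencil writes make that unsafe.
    */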

   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, int_type);

   LLVMValueRef stencil_refs[2];
   stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_type, context_ptr);
   stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_type, context_ptr);
   /* convert scalar stencil refs into vectors */
   stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
   stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);

   LLVMValueRef consts_ptr = lp_jit_resources_constants(gallivm, resources_type, resources_ptr);

   LLVMValueRef ssbo_ptr = lp_jit_resources_ssbos(gallivm, resources_type, resources_ptr);

   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
   memset(outputs, 0, sizeof outputs);

   /* Allocate color storage for each fragment sample */
   LLVMValueRef color_store_size = num_loop;
   if (key->min_samples > 1)
      color_store_size = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, key->min_samples), "");

   for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
      for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
                                                       lp_build_vec_type(gallivm,
                                                                         type),
                                                       color_store_size, "color");
      }
   }
   if (dual_source_blend) {
      assert(key->nr_cbufs <= 1);
      for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[1][chan] = lp_build_array_alloca(gallivm,
                                                    lp_build_vec_type(gallivm,
                                                                      type),
                                                    color_store_size, "color1");
      }
   }
   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      z_out = lp_build_array_alloca(gallivm,
                                    lp_build_vec_type(gallivm, type),
                                    color_store_size, "depth");
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
      s_out = lp_build_array_alloca(gallivm,
                                    lp_build_vec_type(gallivm, type),
                                    color_store_size, "stencil");
   }

   lp_build_for_loop_begin(&loop_state, gallivm,
                           lp_build_const_int32(gallivm, 0),
                           LLVMIntULT,
                           num_loop,
                           lp_build_const_int32(gallivm, 1));

   LLVMValueRef sample_mask_in;
   if (key->multisample) {
      sample_mask_in = lp_build_const_int_vec(gallivm, type, 0);
      /* create shader execution mask by combining all sample masks. */
      for (unsigned s = 0; s < key->coverage_samples; s++) {
         LLVMValueRef s_mask_idx = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, s), "");
         s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
         LLVMValueRef s_mask = lp_build_pointer_get2(builder, mask_type, mask_store, s_mask_idx);
         if (s == 0)
            mask_val = s_mask;
         else
            mask_val = LLVMBuildOr(builder, s_mask, mask_val, "");

         LLVMValueRef mask_in = LLVMBuildAnd(builder, s_mask, lp_build_const_int_vec(gallivm, type, (1ll << s)), "");
         sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
      }
   } else {
      sample_mask_in = lp_build_const_int_vec(gallivm, type, 1);
      mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
                              &loop_state.counter, 1, "mask_ptr");
      mask_val = LLVMBuildLoad2(builder, mask_type, mask_ptr, "");

      LLVMValueRef mask_in = LLVMBuildAnd(builder, mask_val, lp_build_const_int_vec(gallivm, type, 1), "");
      sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
   }
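
   /*
    * Layout note (informal): mask_store is indexed as
    * mask_store[sample * num_loop + loop_counter], i.e. sample-major, one
    * coverage mask vector per (sample, fragment-loop iteration) pair.
    */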

   /* 'mask' will control execution based on the quad's pixel alive/killed state */
   lp_build_mask_begin(&mask, gallivm, type, mask_val);

   if (!(depth_mode & EARLY_DEPTH_TEST))
      lp_build_mask_check(&mask);

   /* Create storage for recombining sample masks after early Z pass. */
   LLVMValueRef s_mask_or = lp_build_alloca(gallivm, int_vec_type, "cov_mask_early_depth");
   LLVMBuildStore(builder, LLVMConstNull(int_vec_type), s_mask_or);

   /* Create storage for post depth sample mask */
   LLVMValueRef post_depth_sample_mask_in = NULL;
   if (post_depth_coverage)
      post_depth_sample_mask_in = lp_build_alloca(gallivm, int_vec_type, "post_depth_sample_mask_in");

   LLVMValueRef s_mask = NULL, s_mask_ptr = NULL;
   LLVMValueRef z_sample_value_store = NULL, s_sample_value_store = NULL;
   LLVMValueRef z_fb_store = NULL, s_fb_store = NULL;
   LLVMTypeRef z_type = NULL, z_fb_type = NULL;

   /* Run early depth once per sample */
   if (key->multisample) {

      if (zs_format_desc) {
         struct lp_type zs_type = lp_depth_type(zs_format_desc, type.length);
         struct lp_type z_type = zs_type;
         struct lp_type s_type = zs_type;
         if (zs_format_desc->block.bits < type.width)
            z_type.width = type.width;
         if (zs_format_desc->block.bits == 8) {
            s_type.width = type.width;
         } else if (zs_format_desc->block.bits > 32) {
            z_type.width = z_type.width / 2;
            s_type.width = s_type.width / 2;
            s_type.floating = 0;
         }
         z_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
                                                      zs_samples, "z_sample_store");
         s_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
                                                      zs_samples, "s_sample_store");
         z_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, z_type),
                                            zs_samples, "z_fb_store");
         s_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, s_type),
                                            zs_samples, "s_fb_store");
      }
      lp_build_for_loop_begin(&sample_loop_state, gallivm,
                              lp_build_const_int32(gallivm, 0),
                              LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
                              lp_build_const_int32(gallivm, 1));

      LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
      s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
      s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");

      s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
      s_mask = LLVMBuildAnd(builder, s_mask, mask_val, "");
   }


   /* For multisample, Z needs to be interpolated at sample points for testing. */
   lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter,
                                      key->multisample
                                      ? sample_loop_state.counter : NULL);
   z = interp->pos[2];

   LLVMValueRef depth_ptr = depth_base_ptr;
   if (key->multisample) {
      LLVMValueRef sample_offset =
         LLVMBuildMul(builder, sample_loop_state.counter,
                      depth_sample_stride, "");
      depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
                                depth_ptr, &sample_offset, 1, "");
   }

   if (depth_mode & EARLY_DEPTH_TEST) {
      z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
                               key->restrict_depth_values, type,
                               context_type, context_ptr,
                               thread_data_type, thread_data_ptr, z);

      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);
      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  key->multisample ? NULL : &mask,
                                  &s_mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  !key->multisample,
                                  key->restrict_depth_values);

      if (depth_mode & EARLY_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
      /*
       * Note that if stencil is enabled, the mask check must happen after
       * the depth/stencil write, not after the stencil test; otherwise new
       * stencil values may not get written if all fragments got killed by
       * the depth/stencil test.
       */
      if (key->stencil[0].enabled && !key->multisample)
         lp_build_mask_check(&mask);

      if (key->multisample) {
         z_fb_type = LLVMTypeOf(z_fb);
         z_type = LLVMTypeOf(z_value);
         lp_build_pointer_set(builder, z_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, z_value, lp_build_int_vec_type(gallivm, type), ""));
         lp_build_pointer_set(builder, s_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, s_value, lp_build_int_vec_type(gallivm, type), ""));
         lp_build_pointer_set(builder, z_fb_store, sample_loop_state.counter, z_fb);
         lp_build_pointer_set(builder, s_fb_store, sample_loop_state.counter, s_fb);
      }
      if (key->occlusion_count && !(depth_mode & EARLY_DEPTH_TEST_INFERRED)) {
         LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
         lp_build_name(counter, "counter");
         lp_build_occlusion_count(gallivm, type,
                                 key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
      }
   }

   if (key->multisample) {
      /*
       * Store the post-early Z coverage mask.
       * Recombine the resulting coverage masks post early Z into the fragment
       * shader execution mask.
       */
      LLVMValueRef tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
      tmp_s_mask_or = LLVMBuildOr(builder, tmp_s_mask_or, s_mask, "");
      LLVMBuildStore(builder, tmp_s_mask_or, s_mask_or);

      if (post_depth_coverage) {
         LLVMValueRef mask_bit_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
         LLVMValueRef post_depth_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
         mask_bit_idx = LLVMBuildAnd(builder, s_mask, lp_build_broadcast(gallivm, int_vec_type, mask_bit_idx), "");
         post_depth_mask_in = LLVMBuildOr(builder, post_depth_mask_in, mask_bit_idx, "");
         LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
      }

      LLVMBuildStore(builder, s_mask, s_mask_ptr);

      lp_build_for_loop_end(&sample_loop_state);

      /* recombine all the coverage masks into the shader exec mask. */
      tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
      lp_build_mask_update(&mask, tmp_s_mask_or);

      if (key->min_samples == 1) {
         /* for multisample, Z needs to be re-interpolated at the pixel center */
         lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, NULL);
         z = interp->pos[2];
         lp_build_mask_update(&mask, tmp_s_mask_or);
      }
   } else {
      if (post_depth_coverage) {
         LLVMValueRef post_depth_mask_in = LLVMBuildAnd(builder, lp_build_mask_value(&mask), lp_build_const_int_vec(gallivm, type, 1), "");
         LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
      }
   }

   LLVMValueRef out_sample_mask_storage = NULL;
   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
      out_sample_mask_storage = lp_build_alloca(gallivm, int_vec_type, "write_mask");
      if (key->min_samples > 1)
         LLVMBuildStore(builder, LLVMConstNull(int_vec_type), out_sample_mask_storage);
   }

   if (post_depth_coverage) {
      system_values.sample_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
   } else {
      system_values.sample_mask_in = sample_mask_in;
   }
   if (key->multisample && key->min_samples > 1) {
      lp_build_for_loop_begin(&sample_loop_state, gallivm,
                              lp_build_const_int32(gallivm, 0),
                              LLVMIntULT,
                              lp_build_const_int32(gallivm, key->min_samples),
                              lp_build_const_int32(gallivm, 1));

      LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
      s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
      s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
      s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
      lp_build_mask_force(&mask, s_mask);
      lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, sample_loop_state.counter);
      system_values.sample_id = sample_loop_state.counter;
      system_values.sample_mask_in = LLVMBuildAnd(builder, system_values.sample_mask_in,
                                                  lp_build_broadcast(gallivm, int_vec_type,
                                                                     LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "")), "");
   } else {
      system_values.sample_id = lp_build_const_int32(gallivm, 0);
   }
   system_values.sample_pos = sample_pos_array;
   system_values.sample_pos_type = sample_pos_type;

   lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter,
                                         mask_type, mask_store, sample_loop_state.counter);

   struct lp_build_fs_llvm_iface fs_iface = {
     .base.interp_fn = fs_interp,
     .base.fb_fetch = fs_fb_fetch,
     .interp = interp,
     .loop_state = &loop_state,
     .sample_id = system_values.sample_id,
     .mask_type = mask_type,
     .mask_store = mask_store,
     .color_ptr_ptr = color_ptr_ptr,
     .color_stride_ptr = color_stride_ptr,
     .color_sample_stride_ptr = color_sample_stride_ptr,
     .zs_base_ptr = depth_base_ptr,
     .zs_stride = depth_stride,
     .zs_sample_stride = depth_sample_stride,
     .key = key,
   };

   struct lp_build_tgsi_params params;
   memset(&params, 0, sizeof(params));

   params.type = type;
   params.mask = &mask;
   params.fs_iface = &fs_iface.base;
   params.consts_ptr = consts_ptr;
   params.system_values = &system_values;
   params.inputs = interp->inputs;
   params.num_inputs = interp->num_attribs - 1;
   params.context_type = context_type;
   params.context_ptr = context_ptr;
   params.resources_type = resources_type;
   params.resources_ptr = resources_ptr;
   params.thread_data_type = thread_data_type;
   params.thread_data_ptr = thread_data_ptr;
   params.sampler = sampler;
   params.info = &shader->info.base;
   params.ssbo_ptr = ssbo_ptr;
   params.image = image;
   params.aniso_filter_table = lp_jit_resources_aniso_filter_table(gallivm, resources_type, resources_ptr);

   /* Build the actual shader */
   lp_build_nir_soa(gallivm, nir, &params, outputs);

   /*
    * Must not count ps invocations if there's a null shader.
    * (It would be ok to count with a null shader if there are depth/stencil
    * tests, but only if there are depth/stencil buffers too, which is
    * different from implicit rasterization disable, which must not depend
    * on the depth/stencil buffers.)
    * Could disable if there's no stats query, but maybe not worth it.
    */
   if (shader->info.base.num_instructions > 1) {
      LLVMValueRef invocs = lp_jit_thread_data_ps_invocations(gallivm, thread_data_type, thread_data_ptr);
      lp_build_occlusion_count(gallivm, type, lp_build_mask_value(&mask), invocs);
   }

   /* Alpha test */
   if (key->alpha.enabled) {
      int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);

      if (color0 != -1 && outputs[color0][3]) {
         const struct util_format_description *cbuf_format_desc;
         LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");
         LLVMValueRef alpha_ref_value;

         alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_type, context_ptr);
         alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);

         cbuf_format_desc = util_format_description(key->cbuf_format[0]);

         lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
                             &mask, alpha, alpha_ref_value,
                             ((depth_mode & LATE_DEPTH_TEST) != 0) && !key->multisample);
      }
   }

   /* Emulate Alpha to Coverage with Alpha test */
   if (key->blend.alpha_to_coverage) {
      int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);

      if (color0 != -1 && outputs[color0][3]) {
         LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");

         if (!key->multisample) {
            lp_build_alpha_to_coverage(gallivm, type,
                                       &mask, alpha,
                                       (depth_mode & LATE_DEPTH_TEST) != 0);
         } else {
            lp_build_sample_alpha_to_coverage(gallivm, type, key->coverage_samples, num_loop,
                                              loop_state.counter,
                                              mask_type, mask_store, alpha);
         }
      }
   }

   if (key->blend.alpha_to_one) {
      nir_foreach_shader_out_variable(var, nir) {
         if (var->data.location < FRAG_RESULT_DATA0)
            continue;
         int slots = nir_variable_count_slots(var, var->type);
         for (unsigned s = 0; s < slots; s++) {
            unsigned cbuf = get_cbuf_location(var, s);
            if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend))
               if (outputs[cbuf][3]) {
                  LLVMBuildStore(builder, lp_build_const_vec(gallivm, type, 1.0),
                                 outputs[cbuf][3]);
               }
         }
      }
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
      LLVMValueRef output_smask = NULL;
      int smaski = find_output_by_frag_result(nir, FRAG_RESULT_SAMPLE_MASK);

      struct lp_build_context smask_bld;
      lp_build_context_init(&smask_bld, gallivm, int_type);

      assert(smaski >= 0);
      output_smask = LLVMBuildLoad2(builder, vec_type, outputs[smaski][0], "smask");
      output_smask = LLVMBuildBitCast(builder, output_smask, smask_bld.vec_type, "");
      if (!key->multisample && key->no_ms_sample_mask_out) {
         output_smask = lp_build_and(&smask_bld, output_smask, smask_bld.one);
         output_smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, output_smask, smask_bld.zero);
         lp_build_mask_update(&mask, output_smask);
      }

      if (key->min_samples > 1) {
         /* only the bit corresponding to this sample is to be used. */
         LLVMValueRef tmp_mask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "tmp_mask");
         LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
         LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, lp_build_broadcast(gallivm, int_vec_type, out_smask_idx), "");
         output_smask = LLVMBuildOr(builder, tmp_mask, smask_bit, "");
      }

      LLVMBuildStore(builder, output_smask, out_sample_mask_storage);
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      int pos0 = find_output_by_frag_result(nir, FRAG_RESULT_DEPTH);

      LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[pos0][2], "");
      LLVMValueRef idx = loop_state.counter;
      if (key->min_samples > 1)
         idx = LLVMBuildAdd(builder, idx,
                            LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
      LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
      LLVMBuildStore(builder, out, ptr);
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
      int sten_out = find_output_by_frag_result(nir, FRAG_RESULT_STENCIL);

      LLVMValueRef out = LLVMBuildLoad2(builder, vec_type,
                                        outputs[sten_out][1], "output.s");
      LLVMValueRef idx = loop_state.counter;
      if (key->min_samples > 1)
         idx = LLVMBuildAdd(builder, idx,
                            LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
      LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
      LLVMBuildStore(builder, out, ptr);
   }
1189 
1190    bool has_cbuf0_write = false;
1191    /* Color write - per fragment sample */
1192    nir_foreach_shader_out_variable(var, nir) {
1193       if (var->data.location < FRAG_RESULT_DATA0)
1194          continue;
1195       int slots = nir_variable_count_slots(var, var->type);
1196 
1197       for (unsigned s = 0; s < slots; s++) {
1198          unsigned cbuf = get_cbuf_location(var, s);
1199          unsigned attrib = var->data.driver_location + s;
1200          if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)) {
1201             if (cbuf == 0) {
1202                /* XXX: there is an edge case with FB fetch where gl_FragColor and
1203                 * gl_LastFragData[0] are used together. This creates both
1204                 * FRAG_RESULT_COLOR and FRAG_RESULT_DATA* output variables. This
1205                 * loop then writes to cbuf 0 twice, overwriting the correct value
1206                 * from gl_FragColor with garbage. This case is exercised by one
1207                 * of the dEQP tests.  A similar bug can happen if
1208                 * gl_SecondaryFragColorEXT and gl_LastFragData[1] are mixed in
1209                 * the same fashion...  This workaround will break if
1210                 * gl_LastFragData[0] appears in the outputs list before
1211                 * gl_FragColor. This doesn't seem to happen though.
1212                 */
1213                if (has_cbuf0_write)
1214                   continue;
1215                has_cbuf0_write = true;
1216             }
1217 
1218             for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
1219                if (outputs[attrib][chan]) {
1220                   /* XXX: just initialize outputs to point at colors[] and
1221                    * skip this.
1222                    */
1223                   LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[attrib][chan], "");
1224                   LLVMValueRef color_ptr;
1225                   LLVMValueRef color_idx = loop_state.counter;
1226                   if (key->min_samples > 1)
1227                      color_idx = LLVMBuildAdd(builder, color_idx,
1228                                               LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1229                   color_ptr = LLVMBuildGEP2(builder, vec_type, out_color[cbuf][chan],
1230                                             &color_idx, 1, "");
1231                   lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
1232                   LLVMBuildStore(builder, out, color_ptr);
1233                }
1234             }
1235          }
1236       }
1237    }
1238 
1239    if (key->multisample && key->min_samples > 1) {
1240       LLVMBuildStore(builder, lp_build_mask_value(&mask), s_mask_ptr);
1241       lp_build_for_loop_end(&sample_loop_state);
1242    }
1243 
1244    if (key->multisample) {
1245       /* execute depth test for each sample */
1246       lp_build_for_loop_begin(&sample_loop_state, gallivm,
1247                               lp_build_const_int32(gallivm, 0),
1248                               LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
1249                               lp_build_const_int32(gallivm, 1));
1250 
1251       /* load the per-sample coverage mask */
1252       LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
1253       s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
1254       s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
1255 
1256       /* combine the post-fragment-shader execution mask with the coverage mask. */
1257       s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
1258       if (key->min_samples == 1)
1259          s_mask = LLVMBuildAnd(builder, s_mask, lp_build_mask_value(&mask), "");
1260 
1261       /* if the shader writes the sample mask, use that, but only if this
1262        * isn't genuine early-depth, to avoid breaking occlusion queries */
1263       if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
1264           (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & (EARLY_DEPTH_TEST_INFERRED)))) {
1265          LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1266          out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
1267          LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
1268          LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
1269          LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
1270          smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");
1271 
1272          s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
1273       }
1274    }
1275 
1276    depth_ptr = depth_base_ptr;
1277    if (key->multisample) {
1278       LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_loop_state.counter, depth_sample_stride, "");
1279       depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
1280                                 depth_ptr, &sample_offset, 1, "");
1281    }
1282 
1283    /* Late Z test */
1284    if (depth_mode & LATE_DEPTH_TEST) {
1285       if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1286          LLVMValueRef idx = loop_state.counter;
1287          if (key->min_samples > 1)
1288             idx = LLVMBuildAdd(builder, idx,
1289                                LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1290          LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
1291          z = LLVMBuildLoad2(builder, vec_type, ptr, "output.z");
1292       } else {
1293          if (key->multisample) {
1294             lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, key->multisample ? sample_loop_state.counter : NULL);
1295             z = interp->pos[2];
1296          }
1297       }
1298 
1299       /*
1300        * Clamp according to ARB_depth_clamp semantics.
1301        */
1302       z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
1303                                key->restrict_depth_values, type,
1304                                context_type, context_ptr,
1305                                thread_data_type, thread_data_ptr, z);
1306 
1307       if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
1308          LLVMValueRef idx = loop_state.counter;
1309          if (key->min_samples > 1)
1310             idx = LLVMBuildAdd(builder, idx,
1311                                LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1312          LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
1313          stencil_refs[0] = LLVMBuildLoad2(builder, vec_type, ptr, "output.s");
1314          /* there's only one value, and the spec says to discard additional bits */
1315          LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
1316          stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
1317          stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
1318          stencil_refs[1] = stencil_refs[0];
1319       }
1320 
1321       lp_build_depth_stencil_load_swizzled(gallivm, type,
1322                                            zs_format_desc, key->resource_1d,
1323                                            depth_ptr, depth_stride,
1324                                            &z_fb, &s_fb, loop_state.counter);
1325 
1326       lp_build_depth_stencil_test(gallivm,
1327                                   &key->depth,
1328                                   key->stencil,
1329                                   type,
1330                                   zs_format_desc,
1331                                   key->multisample ? NULL : &mask,
1332                                   &s_mask,
1333                                   stencil_refs,
1334                                   z, z_fb, s_fb,
1335                                   facing,
1336                                   &z_value, &s_value,
1337                                   false,
1338                                   key->restrict_depth_values);
1339       /* Late Z write */
1340       if (depth_mode & LATE_DEPTH_WRITE) {
1341          lp_build_depth_stencil_write_swizzled(gallivm, type,
1342                                                zs_format_desc, key->resource_1d,
1343                                                NULL, NULL, NULL, loop_state.counter,
1344                                                depth_ptr, depth_stride,
1345                                                z_value, s_value);
1346       }
1347    } else if ((depth_mode & EARLY_DEPTH_TEST) &&
1348               (depth_mode & LATE_DEPTH_WRITE)) {
1349       /* Need to apply a reduced mask to the depth write.  Reload the
1350        * depth value, update from zs_value with the new mask value and
1351        * write that out.
1352        */
1353       if (key->multisample) {
1354          z_value = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_sample_value_store, sample_loop_state.counter), z_type, "");
1355          s_value = lp_build_pointer_get2(builder, int_vec_type, s_sample_value_store, sample_loop_state.counter);
1356          z_fb = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_fb_store, sample_loop_state.counter), z_fb_type, "");
1357          s_fb = lp_build_pointer_get2(builder, int_vec_type, s_fb_store, sample_loop_state.counter);
1358       }
1359       lp_build_depth_stencil_write_swizzled(gallivm, type,
1360                                             zs_format_desc, key->resource_1d,
1361                                             key->multisample ? s_mask : lp_build_mask_value(&mask), z_fb, s_fb, loop_state.counter,
1362                                             depth_ptr, depth_stride,
1363                                             z_value, s_value);
1364    }
1365 
1366    if (key->occlusion_count && (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & EARLY_DEPTH_TEST_INFERRED))) {
1367       LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
1368       lp_build_name(counter, "counter");
1369 
1370       lp_build_occlusion_count(gallivm, type,
1371                                key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
1372    }
1373 
1374    /* if this is genuine early-depth in the shader, write the sample mask
1375     * now, after the occlusion count has been updated
1376     */
1377    if (key->multisample &&
1378        nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
1379        (depth_mode & (EARLY_DEPTH_TEST_INFERRED | EARLY_DEPTH_TEST)) == EARLY_DEPTH_TEST) {
1380       /* if the shader writes the sample mask, use that */
1381       LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1382       out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
1383       LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
1384       LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
1385       LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
1386       smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");
1387 
1388       s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
1389    }
1390 
1391 
1392    if (key->multisample) {
1393       /* store the sample mask for this loop */
1394       LLVMBuildStore(builder, s_mask, s_mask_ptr);
1395       lp_build_for_loop_end(&sample_loop_state);
1396    }
1397 
1398    mask_val = lp_build_mask_end(&mask);
1399    if (!key->multisample)
1400       LLVMBuildStore(builder, mask_val, mask_ptr);
1401    lp_build_for_loop_end(&loop_state);
1402 }
1403 
1404 
1405 /**
1406  * This function reorders pixels from the fragment shader's SoA layout
1407  * to the AoS layout used in memory.
1408  *
1409  * Fragment Shader outputs pixels in small 2x2 blocks
1410  *  e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
1411  *
1412  * However, in memory, pixels are stored in rows
1413  *  e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
1414  *
1415  * @param type            fragment shader type (4x or 8x float)
1416  * @param num_fs          number of fs_src
1417  * @param is_1d           whether we're outputting to a 1d resource
1418  * @param dst_channels    number of output channels
1419  * @param fs_src          output from fragment shader
1420  * @param dst             pointer to store result
1421  * @param pad_inline      is channel padding inline or at end of row
1422  * @return                the number of dsts
1423  */
1424 static int
1425 generate_fs_twiddle(struct gallivm_state *gallivm,
1426                     struct lp_type type,
1427                     unsigned num_fs,
1428                     unsigned dst_channels,
1429                     LLVMValueRef fs_src[][4],
1430                     LLVMValueRef* dst,
1431                     bool pad_inline)
1432 {
1433    LLVMValueRef src[16];
1434    unsigned pixels = type.length / 4;
1435    unsigned src_channels = dst_channels < 3 ? dst_channels : 4;
1436    unsigned src_count = num_fs * src_channels;
1437 
1438    assert(pixels == 2 || pixels == 1);
1439    assert(num_fs * src_channels <= ARRAY_SIZE(src));
1440 
1441    /*
1442     * Transpose from SoA -> AoS
1443     */
1444    for (unsigned i = 0; i < num_fs; ++i) {
1445       lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels,
1446                                &src[i * src_channels]);
1447    }
1448 
1449    /*
1450     * Pick transformation options
1451     */
1452    bool swizzle_pad = false;
1453    bool twiddle = false;
1454    bool split = false;
1455    unsigned reorder_group = 0;
1456 
1457    if (dst_channels == 1) {
1458       twiddle = true;
1459       if (pixels == 2) {
1460          split = true;
1461       }
1462    } else if (dst_channels == 2) {
1463       if (pixels == 1) {
1464          reorder_group = 1;
1465       }
1466    } else if (dst_channels > 2) {
1467       if (pixels == 1) {
1468          reorder_group = 2;
1469       } else {
1470          twiddle = true;
1471       }
1472 
1473       if (!pad_inline && dst_channels == 3 && pixels > 1) {
1474          swizzle_pad = true;
1475       }
1476    }
1477 
1478    /*
1479     * Split the src in half
1480     */
1481    if (split) {
1482       for (unsigned i = num_fs; i > 0; --i) {
1483          src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
1484          src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
1485       }
1486 
1487       src_count *= 2;
1488       type.length = 4;
1489    }
1490 
1491    /*
1492     * Ensure pixels are in memory order
1493     */
1494    if (reorder_group) {
1495       /* Twiddle pixels by reordering the array, e.g.:
1496        *
1497        * src_count =  8 -> 0 2 1 3 4 6 5 7
1498        * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
1499        */
1500       const unsigned reorder_sw[] = { 0, 2, 1, 3 };
1501 
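      /* reorder_group == 1 yields the src_count = 8 table above
       * (j = (i / 4) * 4 + reorder_sw[i % 4]); reorder_group == 2 yields
       * the src_count = 16 table, moving pairs of elements at a time.
       */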
1502       for (unsigned i = 0; i < src_count; ++i) {
1503          unsigned group = i / reorder_group;
1504          unsigned block = (group / 4) * 4 * reorder_group;
1505          unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
1506          dst[i] = src[j];
1507       }
1508    } else if (twiddle) {
1509       /* Twiddle pixels across elements of array */
1510       /*
1511        * XXX: we should avoid this in some cases, but would need to tell
1512        * lp_build_conv to reorder (or deal with it ourselves).
1513        */
1514       lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
1515    } else {
1516       /* Do nothing */
1517       memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
1518    }
1519 
1520    /*
1521     * Moves any padding between pixels to the end
1522     * e.g. RGBXRGBX -> RGBRGBXX
1523     */
1524    if (swizzle_pad) {
1525       unsigned char swizzles[16];
1526       unsigned elems = pixels * dst_channels;
1527 
1528       for (unsigned i = 0; i < type.length; ++i) {
1529          if (i < elems)
1530             swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
1531          else
1532             swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
1533       }
1534 
1535       for (unsigned i = 0; i < src_count; ++i) {
1536          dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles,
1537                                          type.length, type.length);
1538       }
1539    }
1540 
1541    return src_count;
1542 }
1543 
1544 
1545 /*
1546  * Untwiddle and transpose, much like the above.
1547  * However, this is after conversion, so we get packed vectors.
1548  * At this time this only handles 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data;
1549  * the vectors will look like:
1550  * r0r1r4r5r2r3r6r7r8r9r12... (albeit the color channels may
1551  * be swizzled here). Extending this to 16 bit should be trivial.
1552  * It should also be extended to handle twice-wide vectors with AVX2...
1553  */
1554 static void
1555 fs_twiddle_transpose(struct gallivm_state *gallivm,
1556                      struct lp_type type,
1557                      LLVMValueRef *src,
1558                      unsigned src_count,
1559                      LLVMValueRef *dst)
1560 {
1561    struct lp_type type64, type16, type32;
1562    LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
1563    LLVMBuilderRef builder = gallivm->builder;
1564    LLVMValueRef tmp[4], shuf[8];
1565    for (unsigned j = 0; j < 2; j++) {
1566       shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
1567       shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
1568       shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
1569       shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
1570    }
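   /* shuf now holds 0 2 1 3 4 6 5 7: swapping the middle two elements of
    * each group of four wider units is what undoes the quad twiddle below.
    */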
1571 
1572    assert(src_count == 4 || src_count == 2 || src_count == 1);
1573    assert(type.width == 8);
1574    assert(type.length == 16);
1575 
1576    type8_t = lp_build_vec_type(gallivm, type);
1577 
1578    type64 = type;
1579    type64.length /= 8;
1580    type64.width *= 8;
1581    type64_t = lp_build_vec_type(gallivm, type64);
1582 
1583    type16 = type;
1584    type16.length /= 2;
1585    type16.width *= 2;
1586    type16_t = lp_build_vec_type(gallivm, type16);
1587 
1588    type32 = type;
1589    type32.length /= 4;
1590    type32.width *= 4;
1591    type32_t = lp_build_vec_type(gallivm, type32);
1592 
1593    lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
1594 
1595    if (src_count == 1) {
1596       /* the transpose was a no-op, just untwiddle */
1597       LLVMValueRef shuf_vec;
1598       shuf_vec = LLVMConstVector(shuf, 8);
1599       tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
1600       tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
1601       dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
1602    } else if (src_count == 2) {
1603       LLVMValueRef shuf_vec;
1604       shuf_vec = LLVMConstVector(shuf, 4);
1605 
1606       for (unsigned i = 0; i < 2; i++) {
1607          tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
1608          tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
1609          dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
1610       }
1611    } else {
1612       for (unsigned j = 0; j < 2; j++) {
1613          LLVMValueRef lo, hi, lo2, hi2;
1614          /*
1615           * Note that if we only really have 3 valid channels (rgb)
1616           * and we don't need alpha we could substitute an undef here
1617           * for the respective channel (causing llvm to drop the
1618           * conversion for alpha).
1619           */
1620          /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
1621          lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
1622          hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
1623          lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
1624          hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
1625          dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
1626          dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
1627       }
1628    }
1629 }
1630 
1631 
1632 /**
1633  * Load an unswizzled block of pixels from memory
1634  */
1635 static void
1636 load_unswizzled_block(struct gallivm_state *gallivm,
1637                       LLVMTypeRef base_type,
1638                       LLVMValueRef base_ptr,
1639                       LLVMValueRef stride,
1640                       unsigned block_width,
1641                       unsigned block_height,
1642                       LLVMValueRef* dst,
1643                       struct lp_type dst_type,
1644                       unsigned dst_count,
1645                       unsigned dst_alignment)
1646 {
1647    LLVMBuilderRef builder = gallivm->builder;
1648    const unsigned row_size = dst_count / block_height;
1649 
1650    /* Ensure block exactly fits into dst */
1651    assert((block_width * block_height) % dst_count == 0);
1652 
1653    for (unsigned i = 0; i < dst_count; ++i) {
1654       unsigned x = i % row_size;
1655       unsigned y = i / row_size;
1656 
1657       LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
1658       LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1659 
1660       LLVMValueRef gep[2];
1661       LLVMValueRef dst_ptr;
1662 
1663       gep[0] = lp_build_const_int32(gallivm, 0);
1664       gep[1] = LLVMBuildAdd(builder, bx, by, "");
1665 
1666       dst_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
1667       dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
1668                                  LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
1669 
1670       dst[i] = LLVMBuildLoad2(builder,
1671                               lp_build_vec_type(gallivm, dst_type),
1672                               dst_ptr, "");
1673 
1674       LLVMSetAlignment(dst[i], dst_alignment);
1675    }
1676 }
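
/*
 * Example of the addressing in load_unswizzled_block above (hypothetical
 * parameters): with dst_count == 8 vectors of 4x32bit over a 4x4 block,
 * row_size == 2, so vector i is loaded from byte offset
 * (i / 2) * stride + (i % 2) * 16.
 */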
1677 
1678 
1679 /**
1680  * Store an unswizzled block of pixels to memory
1681  */
1682 static void
1683 store_unswizzled_block(struct gallivm_state *gallivm,
1684                        LLVMTypeRef base_type,
1685                        LLVMValueRef base_ptr,
1686                        LLVMValueRef stride,
1687                        unsigned block_width,
1688                        unsigned block_height,
1689                        LLVMValueRef src[],   // [src_count]
1690                        struct lp_type src_type,
1691                        unsigned src_count,
1692                        unsigned src_alignment)
1693 {
1694    LLVMBuilderRef builder = gallivm->builder;
1695    const unsigned row_size = src_count / block_height;
1696 
1697    /* Ensure src exactly fits into block */
1698    assert((block_width * block_height) % src_count == 0);
1699 
1700    for (unsigned i = 0; i < src_count; ++i) {
1701       unsigned x = i % row_size;
1702       unsigned y = i / row_size;
1703 
1704       LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
1705       LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1706 
1707       LLVMValueRef gep[2];
1708       LLVMValueRef src_ptr;
1709 
1710       gep[0] = lp_build_const_int32(gallivm, 0);
1711       gep[1] = LLVMBuildAdd(builder, bx, by, "");
1712 
1713       src_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
1714       src_ptr = LLVMBuildBitCast(builder, src_ptr,
1715                                  LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
1716 
1717       src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
1718 
1719       LLVMSetAlignment(src_ptr, src_alignment);
1720    }
1721 }
1722 
1723 
1724 
1725 /**
1726  * Retrieves the type for a format which is usable in the blending code.
1727  *
1728  * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
1729  */
1730 static inline void
1731 lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
1732                                struct lp_type* type)
1733 {
1734    if (format_expands_to_float_soa(format_desc)) {
1735       /* always use ordinary floats for blending */
1736       type->floating = true;
1737       type->fixed = false;
1738       type->sign = true;
1739       type->norm = false;
1740       type->width = 32;
1741       type->length = 4;
1742       return;
1743    }
1744 
1745    const int chan = util_format_get_first_non_void_channel(format_desc->format);
1746 
1747    memset(type, 0, sizeof(struct lp_type));
1748    type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
1749    type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
1750    type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
1751    type->norm     = format_desc->channel[chan].normalized;
1752    type->width    = format_desc->channel[chan].size;
1753    type->length   = format_desc->nr_channels;
1754 
1755    for (unsigned i = 1; i < format_desc->nr_channels; ++i) {
1756       if (format_desc->channel[i].size > type->width)
1757          type->width = format_desc->channel[i].size;
1758    }
1759 
1760    if (type->floating) {
1761       type->width = 32;
1762    } else {
1763       if (type->width <= 8) {
1764          type->width = 8;
1765       } else if (type->width <= 16) {
1766          type->width = 16;
1767       } else {
1768          type->width = 32;
1769       }
1770    }
1771 
1772    if (is_arithmetic_format(format_desc) && type->length == 3) {
1773       type->length = 4;
1774    }
1775 }
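
/*
 * e.g. PIPE_FORMAT_R5G6B5_UNORM: the widest channel is 6 bits, rounded up
 * to 8; the format is arithmetic, so the 3 channels get padded to 4, and
 * blending operates on 4x8bit unorm values per pixel.
 */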
1776 
1777 
1778 /**
1779  * Scale a normalized value from src_bits to dst_bits.
1780  *
1781  * The exact calculation is
1782  *
1783  *    dst = iround(src * dst_mask / src_mask)
1784  *
1785  *  or with integer rounding
1786  *
1787  *    dst = src * (2*dst_mask + sign(src)*src_mask) / (2*src_mask)
1788  *
1789  *  where
1790  *
1791  *    src_mask = (1 << src_bits) - 1
1792  *    dst_mask = (1 << dst_bits) - 1
1793  *
1794  * but we try to avoid division and multiplication through shifts.
1795  */
1796 static inline LLVMValueRef
1797 scale_bits(struct gallivm_state *gallivm,
1798            int src_bits,
1799            int dst_bits,
1800            LLVMValueRef src,
1801            struct lp_type src_type)
1802 {
1803    LLVMBuilderRef builder = gallivm->builder;
1804    LLVMValueRef result = src;
1805 
1806    if (dst_bits < src_bits) {
1807       int delta_bits = src_bits - dst_bits;
1808 
1809       if (delta_bits <= dst_bits) {
1810 
1811          if (dst_bits == 4) {
1812             struct lp_type flt_type =
1813                lp_type_float_vec(32, src_type.length * 32);
1814 
1815             result = lp_build_unsigned_norm_to_float(gallivm, src_bits,
1816                                                      flt_type, src);
1817             result = lp_build_clamped_float_to_unsigned_norm(gallivm, flt_type,
1818                                                              dst_bits, result);
1819             result = LLVMBuildTrunc(gallivm->builder, result,
1820                                     lp_build_int_vec_type(gallivm, src_type),
1821                                     "");
1822             return result;
1823          }
1824 
1825          /*
1826           * Approximate the rescaling with a single shift.
1827           *
1828           * This gives the wrong rounding.
1829           */
1830 
1831          result = LLVMBuildLShr(builder, src,
1832                                 lp_build_const_int_vec(gallivm, src_type,
1833                                                        delta_bits),
1834                                 "");
1835       } else {
1836          /*
1837           * Try more accurate rescaling.
1838           */
1839 
1840          /*
1841           * Drop the least significant bits to make space for the
1842           * multiplication.
1843           *
1844           * XXX: A better approach would be to use a wider integer type as
1845           * intermediate.  But this is enough to convert alpha from 16bits ->
1846           * 2 when rendering to PIPE_FORMAT_R10G10B10A2_UNORM.
1847           */
1848          result = LLVMBuildLShr(builder, src,
1849                                 lp_build_const_int_vec(gallivm, src_type,
1850                                                        dst_bits),
1851                                 "");
1852 
1853 
1854          result = LLVMBuildMul(builder, result,
1855                                lp_build_const_int_vec(gallivm, src_type,
1856                                                       (1LL << dst_bits) - 1),
1857                                "");
1858 
1859          /*
1860           * Add a rounding term before the division.
1861           *
1862           * TODO: Handle signed integers too.
1863           */
1864          if (!src_type.sign) {
1865             result = LLVMBuildAdd(builder, result,
1866                                   lp_build_const_int_vec(gallivm, src_type,
1867                                                     (1LL << (delta_bits - 1))),
1868                                   "");
1869          }
1870 
1871          /*
1872           * Approximate the division by src_mask with a src_bits shift.
1873           *
1874           * Given the src has already been shifted by dst_bits, all we need
1875           * to do is to shift by the difference.
1876           */
1877 
1878          result = LLVMBuildLShr(builder,
1879                                 result,
1880                                 lp_build_const_int_vec(gallivm, src_type, delta_bits),
1881                                 "");
1882       }
1883 
1884    } else if (dst_bits > src_bits) {
1885       /* Scale up bits */
1886       int db = dst_bits - src_bits;
1887 
1888       /* Shift left by difference in bits */
1889       result = LLVMBuildShl(builder,
1890                             src,
1891                             lp_build_const_int_vec(gallivm, src_type, db),
1892                             "");
1893 
1894       if (db <= src_bits) {
1895          /* Enough bits in src to fill the remainder */
1896          LLVMValueRef lower = LLVMBuildLShr(builder,
1897                                             src,
1898                                             lp_build_const_int_vec(gallivm, src_type, src_bits - db),
1899                                             "");
1900 
1901          result = LLVMBuildOr(builder, result, lower, "");
1902       } else if (db > src_bits) {
1903          /* Need to repeatedly copy src bits to fill remainder in dst */
1904          unsigned n;
1905 
1906          for (n = src_bits; n < dst_bits; n *= 2) {
1907             LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
1908 
1909             result = LLVMBuildOr(builder,
1910                                  result,
1911                                  LLVMBuildLShr(builder, result, shuv, ""),
1912                                  "");
1913          }
1914       }
1915    }
1916 
1917    return result;
1918 }
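
/*
 * Illustrative scalar sketch of the upscale path above (an example only,
 * not used by the driver): replicating the source bits fills the wider
 * destination, e.g. 5 -> 8 bits maps 31 to 255.
 */
#if 0
static unsigned
scale_bits_up_sketch(unsigned src, int src_bits, int dst_bits)
{
   int db = dst_bits - src_bits;
   unsigned result = src << db;            /* shift into the top bits */
   if (db <= src_bits) {
      /* enough source bits to fill the remainder in one copy */
      result |= src >> (src_bits - db);    /* e.g. (31 << 3) | (31 >> 2) = 255 */
   } else {
      /* repeatedly replicate the pattern downwards */
      for (int n = src_bits; n < dst_bits; n *= 2)
         result |= result >> n;
   }
   return result;
}
#endif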
1919 
1920 /**
1921  * Return whether the RT is a smallfloat format (one needing denorms).
1922  */
1923 static inline int
1924 have_smallfloat_format(struct lp_type dst_type,
1925                        enum pipe_format format)
1926 {
1927    return ((dst_type.floating && dst_type.width != 32) ||
1928     /* due to format handling hacks this format doesn't have floating set
1929      * here (and actually has width set to 32 too) so special case this.
1930      */
1931     (format == PIPE_FORMAT_R11G11B10_FLOAT));
1932 }
1933 
1934 
1935 /**
1936  * Convert from memory format to blending format
1937  *
1938  * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
1939  */
1940 static void
1941 convert_to_blend_type(struct gallivm_state *gallivm,
1942                       unsigned block_size,
1943                       const struct util_format_description *src_fmt,
1944                       struct lp_type src_type,
1945                       struct lp_type dst_type,
1946                       LLVMValueRef* src, // and dst
1947                       unsigned num_srcs)
1948 {
1949    LLVMValueRef *dst = src;
1950    LLVMBuilderRef builder = gallivm->builder;
1951    struct lp_type blend_type;
1952    struct lp_type mem_type;
1953    unsigned i, j;
1954    unsigned pixels = block_size / num_srcs;
1955    bool is_arith;
1956 
1957    /*
1958     * full custom path for packed floats and srgb formats - none of the later
1959     * functions would do anything useful, and given the lp_type representation
1960     * they can't be fixed. Should really have some SoA blend path for these
1961     * kind of formats rather than hacking them in here.
1962     * kinds of formats rather than hacking them in here.
1963    if (format_expands_to_float_soa(src_fmt)) {
1964       LLVMValueRef tmpsrc[4];
1965       /*
1966        * This is pretty suboptimal; blending in SoA would be much better here,
1967        * since conversion gets us SoA values that we then have to convert back.
1968        */
1969       assert(src_type.width == 32 || src_type.width == 16);
1970       assert(dst_type.floating);
1971       assert(dst_type.width == 32);
1972       assert(dst_type.length % 4 == 0);
1973       assert(num_srcs % 4 == 0);
1974 
1975       if (src_type.width == 16) {
1976          /* expand 4x16bit values to 4x32bit */
1977          struct lp_type type32x4 = src_type;
1978          LLVMTypeRef ltype32x4;
1979          unsigned num_fetch = dst_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
1980          type32x4.width = 32;
1981          ltype32x4 = lp_build_vec_type(gallivm, type32x4);
1982          for (i = 0; i < num_fetch; i++) {
1983             src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, "");
1984          }
1985          src_type.width = 32;
1986       }
1987       for (i = 0; i < 4; i++) {
1988          tmpsrc[i] = src[i];
1989       }
1990       for (i = 0; i < num_srcs / 4; i++) {
1991          LLVMValueRef tmpsoa[4];
1992          LLVMValueRef tmps = tmpsrc[i];
1993          if (dst_type.length == 8) {
1994             LLVMValueRef shuffles[8];
1995             unsigned j;
1996             /* fetch was 4 values but need 8-wide output values */
1997             tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
1998             /*
1999              * for 8-wide aos the transpose would give us the wrong order, not
2000              * matching the incoming converted fs values and mask. ARGH.
2001              */
2002             for (j = 0; j < 4; j++) {
2003                shuffles[j] = lp_build_const_int32(gallivm, j * 2);
2004                shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
2005             }
2006             tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
2007                                           LLVMConstVector(shuffles, 8), "");
2008          }
2009          if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
2010             lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
2011          } else {
2012             lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa);
2013          }
2014          lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
2015       }
2016       return;
2017    }
2018 
2019    lp_mem_type_from_format_desc(src_fmt, &mem_type);
2020    lp_blend_type_from_format_desc(src_fmt, &blend_type);
2021 
2022    /* Is the format arithmetic */
2023    is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
2024    is_arith &= !(mem_type.width == 16 && mem_type.floating);
2025 
2026    /* Pad if necessary */
2027    if (!is_arith && src_type.length < dst_type.length) {
2028       for (i = 0; i < num_srcs; ++i) {
2029          dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
2030       }
2031 
2032       src_type.length = dst_type.length;
2033    }
2034 
2035    /* Special case for half-floats */
2036    if (mem_type.width == 16 && mem_type.floating) {
2037       assert(blend_type.width == 32 && blend_type.floating);
2038       lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
2039       is_arith = false;
2040    }
2041 
2042    if (!is_arith) {
2043       return;
2044    }
2045 
2046    src_type.width = blend_type.width * blend_type.length;
2047    blend_type.length *= pixels;
2048    src_type.length *= pixels / (src_type.length / mem_type.length);
2049 
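   /* e.g. for PIPE_FORMAT_R5G6B5_UNORM each 16-bit memory pixel is widened
    * to a 32-bit lane below; every 5/6-bit channel is shifted out, masked,
    * scaled up to 8 bits and re-inserted at an 8-bit-aligned position.
    */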
2050    for (i = 0; i < num_srcs; ++i) {
2051       LLVMValueRef chans;
2052       LLVMValueRef res = NULL;
2053 
2054       dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
2055 
2056       for (j = 0; j < src_fmt->nr_channels; ++j) {
2057          unsigned mask = 0;
2058          unsigned sa = src_fmt->channel[j].shift;
2059 #if UTIL_ARCH_LITTLE_ENDIAN
2060          unsigned from_lsb = j;
2061 #else
2062          unsigned from_lsb = (blend_type.length / pixels) - j - 1;
2063 #endif
2064 
2065          mask = (1 << src_fmt->channel[j].size) - 1;
2066 
2067          /* Extract bits from source */
2068          chans = LLVMBuildLShr(builder,
2069                                dst[i],
2070                                lp_build_const_int_vec(gallivm, src_type, sa),
2071                                "");
2072 
2073          chans = LLVMBuildAnd(builder,
2074                               chans,
2075                               lp_build_const_int_vec(gallivm, src_type, mask),
2076                               "");
2077 
2078          /* Scale bits */
2079          if (src_type.norm) {
2080             chans = scale_bits(gallivm, src_fmt->channel[j].size,
2081                                blend_type.width, chans, src_type);
2082          }
2083 
2084          /* Insert bits into correct position */
2085          chans = LLVMBuildShl(builder,
2086                               chans,
2087                               lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
2088                               "");
2089 
2090          if (j == 0) {
2091             res = chans;
2092          } else {
2093             res = LLVMBuildOr(builder, res, chans, "");
2094          }
2095       }
2096 
2097       dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
2098    }
2099 }
2100 
2101 
2102 /**
2103  * Convert from blending format to memory format
2104  *
2105  * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
2106  */
2107 static void
2108 convert_from_blend_type(struct gallivm_state *gallivm,
2109                         unsigned block_size,
2110                         const struct util_format_description *src_fmt,
2111                         struct lp_type src_type,
2112                         struct lp_type dst_type,
2113                         LLVMValueRef* src, // and dst
2114                         unsigned num_srcs)
2115 {
2116    LLVMValueRef* dst = src;
2117    unsigned i, j, k;
2118    struct lp_type mem_type;
2119    struct lp_type blend_type;
2120    LLVMBuilderRef builder = gallivm->builder;
2121    unsigned pixels = block_size / num_srcs;
2122    bool is_arith;
2123 
2124    /*
2125     * full custom path for packed floats and srgb formats - none of the later
2126     * functions would do anything useful, and given the lp_type representation
2127     * they can't be fixed. Should really have some SoA blend path for these
2128     * kinds of formats rather than hacking them in here.
2129     */
2130    if (format_expands_to_float_soa(src_fmt)) {
2131       /*
2132        * This is pretty suboptimal; blending in SoA would be much better for
2133        * this case - we need to transpose the AoS values back to SoA values for
2134        * conversion/packing.
2135        */
2136       assert(src_type.floating);
2137       assert(src_type.width == 32);
2138       assert(src_type.length % 4 == 0);
2139       assert(dst_type.width == 32 || dst_type.width == 16);
2140 
2141       for (i = 0; i < num_srcs / 4; i++) {
2142          LLVMValueRef tmpsoa[4], tmpdst;
2143          lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
2144          /* really really need SoA here */
2145 
2146          if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
2147             tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
2148          } else {
2149             tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt,
2150                                                    src_type, tmpsoa);
2151          }
2152 
2153          if (src_type.length == 8) {
2154             LLVMValueRef tmpaos, shuffles[8];
2155             unsigned j;
2156             /*
2157              * for 8-wide aos the transpose has given us the wrong order, not
2158              * matching the output order. HMPF. Also need to split the output
2159              * values manually.
2160              */
2161             for (j = 0; j < 4; j++) {
2162                shuffles[j * 2] = lp_build_const_int32(gallivm, j);
2163                shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
2164             }
2165             tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
2166                                             LLVMConstVector(shuffles, 8), "");
2167             src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
2168             src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
2169          } else {
2170             src[i] = tmpdst;
2171          }
2172       }
2173       if (dst_type.width == 16) {
2174          struct lp_type type16x8 = dst_type;
2175          struct lp_type type32x4 = dst_type;
2176          LLVMTypeRef ltype16x4, ltypei64, ltypei128;
2177          unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
2178          type16x8.length = 8;
2179          type32x4.width = 32;
2180          ltypei128 = LLVMIntTypeInContext(gallivm->context, 128);
2181          ltypei64 = LLVMIntTypeInContext(gallivm->context, 64);
2182          ltype16x4 = lp_build_vec_type(gallivm, dst_type);
2183          /* We could do vector truncation but it doesn't generate very good code */
2184          for (i = 0; i < num_fetch; i++) {
2185             src[i] = lp_build_pack2(gallivm, type32x4, type16x8,
2186                                     src[i], lp_build_zero(gallivm, type32x4));
2187             src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, "");
2188             src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, "");
2189             src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, "");
2190          }
2191       }
2192       return;
2193    }
2194 
2195    lp_mem_type_from_format_desc(src_fmt, &mem_type);
2196    lp_blend_type_from_format_desc(src_fmt, &blend_type);
2197 
2198    is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);
2199 
2200    /* Special case for half-floats */
2201    if (mem_type.width == 16 && mem_type.floating) {
2202       int length = dst_type.length;
2203       assert(blend_type.width == 32 && blend_type.floating);
2204 
2205       dst_type.length = src_type.length;
2206 
2207       lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
2208 
2209       dst_type.length = length;
2210       is_arith = false;
2211    }
2212 
2213    /* Remove any padding */
2214    if (!is_arith && (src_type.length % mem_type.length)) {
2215       src_type.length -= (src_type.length % mem_type.length);
2216 
2217       for (i = 0; i < num_srcs; ++i) {
2218          dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
2219       }
2220    }
2221 
2222    /* No bit arithmetic to do */
2223    if (!is_arith) {
2224       return;
2225    }
2226 
2227    src_type.length = pixels;
2228    src_type.width = blend_type.length * blend_type.width;
2229    dst_type.length = pixels;
2230 
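   /* This is the reverse of convert_to_blend_type: each pixel's blend value
    * is viewed as one wide lane; channels are shifted down, scaled (or
    * clamped) back to their memory size and OR'd together at the format's
    * channel shifts before the final truncation to the memory width.
    */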
2231    for (i = 0; i < num_srcs; ++i) {
2232       LLVMValueRef chans;
2233       LLVMValueRef res = NULL;
2234 
2235       dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
2236 
2237       for (j = 0; j < src_fmt->nr_channels; ++j) {
2238          unsigned mask = 0;
2239          unsigned sa = src_fmt->channel[j].shift;
2240          unsigned sz_a = src_fmt->channel[j].size;
2241 #if UTIL_ARCH_LITTLE_ENDIAN
2242          unsigned from_lsb = j;
2243 #else
2244          unsigned from_lsb = blend_type.length - j - 1;
2245 #endif
2246 
2247          assert(blend_type.width > src_fmt->channel[j].size);
2248 
2249          for (k = 0; k < blend_type.width; ++k) {
2250             mask |= 1 << k;
2251          }
2252 
2253          /* Extract bits */
2254          chans = LLVMBuildLShr(builder,
2255                                dst[i],
2256                                lp_build_const_int_vec(gallivm, src_type,
2257                                                       from_lsb * blend_type.width),
2258                                "");
2259 
2260          chans = LLVMBuildAnd(builder,
2261                               chans,
2262                               lp_build_const_int_vec(gallivm, src_type, mask),
2263                               "");
2264 
2265          /* Scale down bits */
2266          if (src_type.norm) {
2267             chans = scale_bits(gallivm, blend_type.width,
2268                                src_fmt->channel[j].size, chans, src_type);
2269          } else if (!src_type.floating && sz_a < blend_type.width) {
2270             LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, src_type, (1UL << sz_a) - 1);
2271             LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chans, mask_val, "");
2272             chans = LLVMBuildSelect(builder, mask, mask_val, chans, "");
2273          }
2274 
2275          /* Insert bits */
2276          chans = LLVMBuildShl(builder,
2277                               chans,
2278                               lp_build_const_int_vec(gallivm, src_type, sa),
2279                               "");
2280 
2281          sa += src_fmt->channel[j].size;
2282 
2283          if (j == 0) {
2284             res = chans;
2285          } else {
2286             res = LLVMBuildOr(builder, res, chans, "");
2287          }
2288       }
2289 
2290       assert(dst_type.width != 24);
2291 
2292       dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
2293    }
2294 }
2295 
2296 
2297 /**
2298  * Convert alpha to same blend type as src
2299  */
2300 static void
2301 convert_alpha(struct gallivm_state *gallivm,
2302               struct lp_type row_type,
2303               struct lp_type alpha_type,
2304               const unsigned block_size,
2305               const unsigned block_height,
2306               const unsigned src_count,
2307               const unsigned dst_channels,
2308               const bool pad_inline,
2309               LLVMValueRef* src_alpha)
2310 {
2311    LLVMBuilderRef builder = gallivm->builder;
2312    const unsigned length = row_type.length;
2313    row_type.length = alpha_type.length;
2314 
2315    /* Twiddle the alpha to match pixels */
2316    lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha);
2317 
2318    /*
2319     * TODO this should use a single lp_build_conv call for the
2320     * src_count == 1 && dst_channels == 1 case (dropping the concat below)
2321     */
2322    for (unsigned i = 0; i < block_height; ++i) {
2323       lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1,
2324                     &src_alpha[i], 1);
2325    }
2326 
2327    alpha_type = row_type;
2328    row_type.length = length;
2329 
2330    /* If there's only one channel we only need a single alpha value per pixel */
2331    if (src_count == 1 && dst_channels == 1) {
2332       lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height,
2333                         src_alpha, src_count);
2334    } else {
2335       /* If there are more srcs than rows then we need to split alpha up */
2336       if (src_count > block_height) {
2337          for (unsigned i = src_count; i > 0; --i) {
2338             unsigned pixels = block_size / src_count;
2339             unsigned idx = i - 1;
2340 
2341             src_alpha[idx] =
2342                lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
2343                                       (idx * pixels) % 4, pixels);
2344          }
2345       }
2346 
2347       /* If there is a src for each pixel, broadcast the alpha across the
2348        * whole row
2349        */
2350       if (src_count == block_size) {
2351          for (unsigned i = 0; i < src_count; ++i) {
2352             src_alpha[i] = lp_build_broadcast(gallivm,
2353                               lp_build_vec_type(gallivm, row_type), src_alpha[i]);
2354          }
2355       } else {
2356          unsigned pixels = block_size / src_count;
2357          unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
2358          unsigned alpha_span = 1;
2359          LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
2360 
2361          /* Check if we need 2 src_alphas for our shuffles */
2362          if (pixels > alpha_type.length) {
2363             alpha_span = 2;
2364          }
2365 
2366          /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
2367          for (unsigned j = 0; j < row_type.length; ++j) {
2368             if (j < pixels * channels) {
2369                shuffles[j] = lp_build_const_int32(gallivm, j / channels);
2370             } else {
2371                shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
2372             }
2373          }
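         /* e.g. 4 pixels with 4 channels per row gives shuffles
          * 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3: each pixel's alpha is repeated
          * across its four channel slots.
          */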
2374 
2375          for (unsigned i = 0; i < src_count; ++i) {
2376             unsigned idx1 = i, idx2 = i;
2377 
2378             if (alpha_span > 1) {
2379                idx1 *= alpha_span;
2380                idx2 = idx1 + 1;
2381             }
2382 
2383             src_alpha[i] = LLVMBuildShuffleVector(builder,
2384                                                   src_alpha[idx1],
2385                                                   src_alpha[idx2],
2386                                                   LLVMConstVector(shuffles, row_type.length),
2387                                                   "");
2388          }
2389       }
2390    }
2391 }
2392 
2393 
2394 /**
2395  * Generates the blend function for unswizzled colour buffers
2396  * Also generates the read & write from the colour buffer.
2397  */
2398 static void
2399 generate_unswizzled_blend(struct gallivm_state *gallivm,
2400                           unsigned rt,
2401                           struct lp_fragment_shader_variant *variant,
2402                           enum pipe_format out_format,
2403                           unsigned int num_fs,
2404                           struct lp_type fs_type,
2405                           LLVMValueRef* fs_mask,
2406                           LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
2407                           LLVMTypeRef context_type,
2408                           LLVMValueRef context_ptr,
2409                           LLVMTypeRef color_type,
2410                           LLVMValueRef color_ptr,
2411                           LLVMValueRef stride,
2412                           unsigned partial_mask,
2413                           bool do_branch)
2414 {
2415    const unsigned alpha_channel = 3;
2416    const unsigned block_width = LP_RASTER_BLOCK_SIZE;
2417    const unsigned block_height = LP_RASTER_BLOCK_SIZE;
2418    const unsigned block_size = block_width * block_height;
2419    const unsigned lp_integer_vector_width = 128;
2420 
2421    LLVMBuilderRef builder = gallivm->builder;
2422    LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
2423    LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
2424    LLVMValueRef src_alpha[4 * 4];
2425    LLVMValueRef src1_alpha[4 * 4] = { NULL };
2426    LLVMValueRef src_mask[4 * 4];
2427    LLVMValueRef src[4 * 4];
2428    LLVMValueRef src1[4 * 4];
2429    LLVMValueRef dst[4 * 4];
2430 
2431    struct lp_build_mask_context mask_ctx;
2432 
2433    unsigned char swizzle[TGSI_NUM_CHANNELS];
2434    unsigned src_channels = TGSI_NUM_CHANNELS;
2435 
2436    const struct util_format_description *out_format_desc =
2437       util_format_description(out_format);
2438 
2439    bool pad_inline = is_arithmetic_format(out_format_desc);
2440    const bool dual_source_blend =
2441       variant->key.blend.rt[0].blend_enable &&
2442       util_blend_state_is_dual(&variant->key.blend, 0);
2443 
2444    const bool is_1d = variant->key.resource_1d;
2445    const unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
2446    LLVMValueRef fpstate = NULL;
2447 
2448    LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2449 
2450    /* Get type from output format */
2451    struct lp_type row_type, dst_type;
2452    lp_blend_type_from_format_desc(out_format_desc, &row_type);
2453    lp_mem_type_from_format_desc(out_format_desc, &dst_type);
2454 
2455    /*
2456     * Technically this code should go into lp_build_smallfloat_to_float
2457     * and lp_build_float_to_smallfloat but due to the
2458     * http://llvm.org/bugs/show_bug.cgi?id=6393
2459     * llvm reorders the mxcsr intrinsics in a way that breaks the code.
2460     * So the ordering is important here and there shouldn't be any
2461     * llvm ir instructions in this function before
2462     * this, otherwise half-float format conversions won't work
2463     * (again due to llvm bug #6393).
2464     */
2465    if (have_smallfloat_format(dst_type, out_format)) {
2466       /* We need to make sure that denorms are ok for half float
2467          conversions */
2468       fpstate = lp_build_fpstate_get(gallivm);
2469       lp_build_fpstate_set_denorms_zero(gallivm, false);
2470    }
2471 
2472    struct lp_type mask_type = lp_int32_vec4_type();
2473    mask_type.length = fs_type.length;
2474 
2475    for (unsigned i = num_fs; i < num_fullblock_fs; i++) {
2476       fs_mask[i] = lp_build_zero(gallivm, mask_type);
2477    }
2478 
2479    /* Do not bother executing code when the mask is empty. */
2480    if (do_branch) {
2481       LLVMValueRef check_mask =
2482          LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
2483 
2484       for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2485          check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
2486       }
2487 
2488       lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
2489       lp_build_mask_check(&mask_ctx);
2490    }
2491 
2492    partial_mask |= !variant->opaque;
2493    LLVMValueRef i32_zero = lp_build_const_int32(gallivm, 0);
2494 
2495    LLVMValueRef undef_src_val = lp_build_undef(gallivm, fs_type);
2496 
2497    row_type.length = fs_type.length;
2498    unsigned vector_width =
2499       dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
2500 
2501    /* Compute correct swizzle and count channels */
2502    memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS);
2503    unsigned dst_channels = 0;
2504 
2505    bool has_alpha = false;
2506    for (unsigned i = 0; i < TGSI_NUM_CHANNELS; ++i) {
2507       /* Ensure channel is used */
2508       if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
2509          continue;
2510       }
2511 
2512       /* Ensure not already written to (happens in case with GL_ALPHA) */
2513       if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
2514          continue;
2515       }
2516 
2517       /* Ensure we haven't already found all channels */
2518       if (dst_channels >= out_format_desc->nr_channels) {
2519          continue;
2520       }
2521 
2522       swizzle[out_format_desc->swizzle[i]] = i;
2523       ++dst_channels;
2524 
2525       if (i == alpha_channel) {
2526          has_alpha = true;
2527       }
2528    }
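   /*
    * For example (illustrative): with PIPE_FORMAT_B8G8R8A8_UNORM the format
    * description swizzle is {2,1,0,3}, so the loop above produces
    * swizzle = {2,1,0,3} (memory channel -> fs output channel),
    * dst_channels = 4 and has_alpha = true.
    */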
2529 
2530    if (format_expands_to_float_soa(out_format_desc)) {
2531       /*
2532        * The code above can't work for layout_other formats.
2533        * For srgb it would sort of work, but we short-circuit swizzles, etc.,
2534        * as that is done as part of unpack / pack.
2535        */
2536       dst_channels = 4; /* HACK: this is really a fake 4, but we need it due to the transpose stuff later */
2537       has_alpha = true;
2538       swizzle[0] = 0;
2539       swizzle[1] = 1;
2540       swizzle[2] = 2;
2541       swizzle[3] = 3;
2542       pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
2543    }
2544 
2545    /* If 3 channels then pad to include alpha for 4 element transpose */
2546    if (dst_channels == 3) {
2547       assert(!has_alpha);
2548       for (unsigned i = 0; i < TGSI_NUM_CHANNELS; i++) {
2549          if (swizzle[i] > TGSI_NUM_CHANNELS)
2550             swizzle[i] = 3;
2551       }
2552       if (out_format_desc->nr_channels == 4) {
2553          dst_channels = 4;
2554          /*
2555           * We use the alpha from the color conversion, not a separate one.
2556           * We had to include it for the transpose, hence it will get
2557           * converted too (albeit when doing the transpose after conversion,
2558           * that would not necessarily be the case anymore).
2559           * (It works only with 4 channel dsts, e.g. rgbx formats, because
2560           * otherwise we really have padding, not alpha, included.)
2561           */
2562          has_alpha = true;
2563       }
2564    }
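   /*
    * For example (illustrative): PIPE_FORMAT_R8G8B8X8_UNORM has
    * nr_channels == 4 but its alpha swizzle is the constant "1", so the
    * loop above only finds 3 channels; the padding here bumps dst_channels
    * back to 4 and treats the X byte as alpha so the 4-element transpose
    * works.
    */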
2565 
2566    /*
2567     * Load shader output
2568     */
2569    for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2570       /* Always load alpha for use in blending */
2571       LLVMValueRef alpha;
2572       if (i < num_fs) {
2573          alpha = LLVMBuildLoad2(builder, fs_vec_type,
2574                                 fs_out_color[rt][alpha_channel][i], "");
2575       } else {
2576          alpha = undef_src_val;
2577       }
2578 
2579       /* Load each channel */
2580       for (unsigned j = 0; j < dst_channels; ++j) {
2581          assert(swizzle[j] < 4);
2582          if (i < num_fs) {
2583             fs_src[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2584                                           fs_out_color[rt][swizzle[j]][i], "");
2585          } else {
2586             fs_src[i][j] = undef_src_val;
2587          }
2588       }
2589 
2590       /* If 3 channels then pad to include alpha for 4 element transpose */
2591       /*
2592        * XXX If we include that here, maybe we could actually use it instead
2593        * of the separate alpha for blending?
2594        * (Difficult though, since we actually convert pad channels, not alpha.)
2595        */
2596       if (dst_channels == 3 && !has_alpha) {
2597          fs_src[i][3] = alpha;
2598       }
2599 
2600       /* We split the row_mask and row_alpha as we want 128bit interleave */
2601       if (fs_type.length == 8) {
2602          src_mask[i*2 + 0]  = lp_build_extract_range(gallivm, fs_mask[i],
2603                                                      0, src_channels);
2604          src_mask[i*2 + 1]  = lp_build_extract_range(gallivm, fs_mask[i],
2605                                                      src_channels,
2606                                                      src_channels);
2607 
2608          src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha,
2609                                                      0, src_channels);
2610          src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2611                                                      src_channels,
2612                                                      src_channels);
2613       } else {
2614          src_mask[i] = fs_mask[i];
2615          src_alpha[i] = alpha;
2616       }
2617    }
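   /*
    * E.g. with 8-wide fs vectors each fs_mask[i]/alpha vector covers two
    * 2x2 quads, so the extracts above split them into two 128-bit
    * (4-wide) halves to match the 128-bit interleave done later.
    */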
2618    if (dual_source_blend) {
2619       /* same as above except different src/dst, skip masks and comments... */
2620       for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2621          LLVMValueRef alpha;
2622          if (i < num_fs) {
2623             alpha = LLVMBuildLoad2(builder, fs_vec_type,
2624                                    fs_out_color[1][alpha_channel][i], "");
2625          } else {
2626             alpha = undef_src_val;
2627          }
2628 
2629          for (unsigned j = 0; j < dst_channels; ++j) {
2630             assert(swizzle[j] < 4);
2631             if (i < num_fs) {
2632                fs_src1[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2633                                               fs_out_color[1][swizzle[j]][i], "");
2634             } else {
2635                fs_src1[i][j] = undef_src_val;
2636             }
2637          }
2638          if (dst_channels == 3 && !has_alpha) {
2639             fs_src1[i][3] = alpha;
2640          }
2641          if (fs_type.length == 8) {
2642             src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
2643             src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2644                                                          src_channels, src_channels);
2645          } else {
2646             src1_alpha[i] = alpha;
2647          }
2648       }
2649    }
2650 
2651    if (util_format_is_pure_integer(out_format)) {
2652       /*
2653        * In this case fs_type was really ints or uints disguised as floats,
2654        * fix that up now.
2655        */
2656       fs_type.floating = 0;
2657       fs_type.sign = dst_type.sign;
2658       fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2659       for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2660          for (unsigned j = 0; j < dst_channels; ++j) {
2661             fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j],
2662                                             fs_vec_type, "");
2663          }
2664          if (dst_channels == 3 && !has_alpha) {
2665             fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3],
2666                                             fs_vec_type, "");
2667          }
2668       }
2669    }
2670 
2671    /*
2672     * We actually should generally do conversion first (for non-1d cases)
2673     * when the blend format is 8 or 16 bits. The reason is obvious:
2674     * there are 2 or 4 times fewer vectors to deal with for the interleave...
2675     * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
2676     * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
2677     * unpack only with 128bit vectors).
2678     * Note: for 16bit sizes we'd really need matching pack conversion code.
2679     */
2680    bool twiddle_after_convert = false;
2681    if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
2682       twiddle_after_convert = true;
2683    }
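   /*
    * E.g. for an 8-bit format with 8-wide float fs vectors, converting
    * first shrinks the data 4x (f32 -> u8), so the subsequent
    * transpose/untwiddle has correspondingly fewer vectors to shuffle.
    */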
2684 
2685    /*
2686     * Pixel twiddle from fragment shader order to memory order
2687     */
2688    unsigned src_count;
2689    if (!twiddle_after_convert) {
2690       src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
2691                                       dst_channels, fs_src, src, pad_inline);
2692       if (dual_source_blend) {
2693          generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
2694                              fs_src1, src1, pad_inline);
2695       }
2696    } else {
2697       src_count = num_fullblock_fs * dst_channels;
2698       /*
2699        * We reorder things a bit here, so the cases for 4-wide and 8-wide
2700        * (AVX) turn out the same later when untwiddling/transposing (albeit
2701        * for a true AVX2 path the untwiddle would need to be different).
2702        * For now just order by colors first (so we can use unpack later).
2703        */
2704       for (unsigned j = 0; j < num_fullblock_fs; j++) {
2705          for (unsigned i = 0; i < dst_channels; i++) {
2706             src[i*num_fullblock_fs + j] = fs_src[j][i];
2707             if (dual_source_blend) {
2708                src1[i*num_fullblock_fs + j] = fs_src1[j][i];
2709             }
2710          }
2711       }
2712    }
2713 
2714    src_channels = dst_channels < 3 ? dst_channels : 4;
2715    if (src_count != num_fullblock_fs * src_channels) {
2716       unsigned ds = src_count / (num_fullblock_fs * src_channels);
2717       row_type.length /= ds;
2718       fs_type.length = row_type.length;
2719       fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2720    }
2721 
2722    struct lp_type blend_type = row_type;
2723    mask_type.length = 4;
2724 
2725    /* Convert src to row_type */
2726    if (dual_source_blend) {
2727       struct lp_type old_row_type = row_type;
2728       lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
2729       src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type,
2730                                      src1, src_count, src1);
2731    } else {
2732       src_count = lp_build_conv_auto(gallivm, fs_type, &row_type,
2733                                      src, src_count, src);
2734    }
2735 
2736    /* If the rows are not an SSE vector, combine them to become SSE size! */
2737    if ((row_type.width * row_type.length) % 128) {
2738       unsigned bits = row_type.width * row_type.length;
2739       unsigned combined;
2740 
2741       assert(src_count >= (vector_width / bits));
2742 
2743       const unsigned dst_count = src_count / (vector_width / bits);
2744 
2745       combined = lp_build_concat_n(gallivm, row_type, src, src_count,
2746                                    src, dst_count);
2747       if (dual_source_blend) {
2748          lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count);
2749       }
2750 
2751       row_type.length *= combined;
2752       src_count /= combined;
2753 
2754       bits = row_type.width * row_type.length;
2755       assert(bits == 128 || bits == 256);
2756    }
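   /*
    * E.g. unorm8 rows of length 4 are only 32 bits wide; with the 128-bit
    * integer vector width four of them get concatenated into one u8x16
    * vector here (combined == 4), shrinking src_count accordingly.
    */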
2757 
2758    if (twiddle_after_convert) {
2759       fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
2760       if (dual_source_blend) {
2761          fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
2762       }
2763    }
2764 
2765    /*
2766     * Blend color conversion
2767     */
2768    LLVMValueRef blend_color =
2769       lp_jit_context_f_blend_color(gallivm, context_type, context_ptr);
2770    blend_color = LLVMBuildPointerCast(builder, blend_color,
2771                                       LLVMPointerType(fs_vec_type, 0),
2772                                       "");
2773    blend_color = LLVMBuildLoad2(builder, fs_vec_type,
2774                                 LLVMBuildGEP2(builder, fs_vec_type,
2775                                               blend_color,
2776                                               &i32_zero, 1, ""), "");
2777 
2778    /* Convert */
2779    lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1,
2780                  &blend_color, 1);
2781 
2782    if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
2783       /*
2784        * Since blending is done with floats, there was no conversion.
2785        * However, the rules for fixed point renderbuffers still
2786        * apply, that is, we must clamp inputs to 0.0/1.0.
2787        * (This would apply to separate alpha conversion too but we currently
2788        * force has_alpha to be true.)
2789        * TODO: should skip this with "fake" blend, since post-blend conversion
2790        * will clamp anyway.
2791        * TODO: could also skip this if fragment color clamping is enabled.
2792        * We don't support it natively, however, so it gets baked into the
2793        * shader and we can't really tell here.
2794        */
2795       struct lp_build_context f32_bld;
2796       assert(row_type.floating);
2797       lp_build_context_init(&f32_bld, gallivm, row_type);
2798       for (unsigned i = 0; i < src_count; i++) {
2799          src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]);
2800       }
2801       if (dual_source_blend) {
2802          for (unsigned i = 0; i < src_count; i++) {
2803             src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]);
2804          }
2805       }
2806       /* probably can't be different from row_type, but better safe than sorry... */
2807       lp_build_context_init(&f32_bld, gallivm, blend_type);
2808       blend_color = lp_build_clamp(&f32_bld, blend_color,
2809                                    f32_bld.zero, f32_bld.one);
2810    }
2811 
2812    /* Extract alpha */
2813    LLVMValueRef blend_alpha =
2814       lp_build_extract_broadcast(gallivm, blend_type, row_type,
2815                                  blend_color,
2816                                  lp_build_const_int32(gallivm, 3));
2817 
2818    /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */
2819    pad_inline &= (dst_channels * (block_size / src_count) * row_type.width)
2820       != vector_width;
2821    if (pad_inline) {
2822       /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */
2823       blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2824                                            TGSI_NUM_CHANNELS, row_type.length);
2825    } else {
2826       /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */
2827       blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2828                                            dst_channels, row_type.length);
2829    }
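   /*
    * blend_color now holds the constant blend color replicated in memory
    * channel order, e.g. for a bgra8 target roughly {b,g,r,a, b,g,r,a, ...}.
    */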
2830 
2831    /*
2832     * Mask conversion
2833     */
2834    lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0],
2835                        block_height, &src_mask[0]);
2836 
2837    if (src_count < block_height) {
2838       lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
2839    } else if (src_count > block_height) {
2840       for (unsigned i = src_count; i > 0; --i) {
2841          unsigned pixels = block_size / src_count;
2842          unsigned idx = i - 1;
2843 
2844          src_mask[idx] = lp_build_extract_range(gallivm,
2845                                                 src_mask[(idx * pixels) / 4],
2846                                                 (idx * pixels) % 4, pixels);
2847       }
2848    }
2849 
2850    assert(mask_type.width == 32);
2851 
2852    for (unsigned i = 0; i < src_count; ++i) {
2853       unsigned pixels = block_size / src_count;
2854       unsigned pixel_width = row_type.width * dst_channels;
2855 
2856       if (pixel_width == 24) {
2857          mask_type.width = 8;
2858          mask_type.length = vector_width / mask_type.width;
2859       } else {
2860          mask_type.length = pixels;
2861          mask_type.width = row_type.width * dst_channels;
2862 
2863          /*
2864           * If mask_type width is smaller than 32bit, this doesn't quite
2865           * generate the most efficient code (could use some pack).
2866           */
2867          src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
2868                                         lp_build_int_vec_type(gallivm,
2869                                                               mask_type), "");
2870 
2871          mask_type.length *= dst_channels;
2872          mask_type.width /= dst_channels;
2873       }
2874 
2875       src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
2876                                      lp_build_int_vec_type(gallivm, mask_type),
2877                                      "");
2878       src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
2879    }
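   /*
    * E.g. for rgba8 each pixel's 32-bit all-ones/all-zeros coverage mask
    * is reinterpreted as four 8-bit lanes, yielding a per-channel byte
    * mask that lines up with the u8 color data for the blend.
    */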
2880 
2881    /*
2882     * Alpha conversion
2883     */
2884    if (!has_alpha) {
2885       struct lp_type alpha_type = fs_type;
2886       alpha_type.length = 4;
2887       convert_alpha(gallivm, row_type, alpha_type,
2888                     block_size, block_height,
2889                     src_count, dst_channels,
2890                     pad_inline, src_alpha);
2891       if (dual_source_blend) {
2892          convert_alpha(gallivm, row_type, alpha_type,
2893                        block_size, block_height,
2894                        src_count, dst_channels,
2895                        pad_inline, src1_alpha);
2896       }
2897    }
2898 
2899 
2900    /*
2901     * Load dst from memory
2902     */
2903    unsigned dst_count;
2904    if (src_count < block_height) {
2905       dst_count = block_height;
2906    } else {
2907       dst_count = src_count;
2908    }
2909 
2910    dst_type.length *= block_size / dst_count;
2911 
2912    if (format_expands_to_float_soa(out_format_desc)) {
2913       /*
2914        * we need multiple values at once for the conversion, so we may as well
2915        * load them vectorized here too instead of concatenating later.
2916        * (Still need concatenation later for 8-wide vectors).
2917        */
2918       dst_count = block_height;
2919       dst_type.length = block_width;
2920    }
2921 
2922    /*
2923     * Compute the alignment of the destination pointer in bytes.
2924     * We fetch 1-4 pixels; if the format has pot alignment then those fetches
2925     * are always aligned by MIN2(16, fetch_width), except for buffers (not
2926     * 1d textures, but we can't distinguish them here), so we need to stick
2927     * with per-pixel alignment in that case.
2928     */
2929    unsigned dst_alignment;
2930    if (is_1d) {
2931       dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
2932    } else {
2933       dst_alignment = dst_type.length * dst_type.width / 8;
2934    }
2935    /* Force power-of-two alignment by extracting only the least significant set bit */
2936    dst_alignment = 1 << (ffs(dst_alignment) - 1);
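   /* E.g. dst_alignment == 12 (0b1100): ffs(12) == 3, so 12 becomes 1 << 2 == 4. */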
2937    /*
2938     * Resource base and stride pointers are aligned to 16 bytes, so that's
2939     * the maximum alignment we can guarantee.
2940     */
2941    dst_alignment = MIN2(16, dst_alignment);
2942 
2943    struct lp_type ls_type = dst_type;
2944 
2945    if (dst_count > src_count) {
2946       if ((dst_type.width == 8 || dst_type.width == 16) &&
2947           util_is_power_of_two_or_zero(dst_type.length) &&
2948           dst_type.length * dst_type.width < 128) {
2949          /*
2950           * Never try to load values as 4xi8 which we will then
2951           * concatenate to larger vectors. This gives llvm a real
2952           * headache (the problem is the type legalizer (?) will
2953           * try to load that as 4xi8 zext to 4xi32 to fill the vector,
2954           * then the shuffles to concatenate are more or less impossible
2955           * - llvm is easily capable of generating a sequence of 32
2956           * pextrb/pinsrb instructions for that. Albeit it appears to
2957           * be fixed in llvm 4.0. So, load and concatenate with 32bit
2958           * width to avoid the trouble (16bit seems not as bad, llvm
2959           * probably recognizes the load+shuffle as only one shuffle
2960           * is necessary, but we can do just the same anyway).
2961           */
2962          ls_type.length = dst_type.length * dst_type.width / 32;
2963          ls_type.width = 32;
2964       }
2965    }
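   /*
    * E.g. a u8x4 row (32 bits) is loaded as a single i32 instead
    * (ls_type.length == 1, width == 32); it only gets bitcast back to
    * bytes after the concatenation below.
    */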
2966 
2967    if (is_1d) {
2968       load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
2969                             dst, ls_type, dst_count / 4, dst_alignment);
2970       for (unsigned i = dst_count / 4; i < dst_count; i++) {
2971          dst[i] = lp_build_undef(gallivm, ls_type);
2972       }
2973    } else {
2974       load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
2975                             block_height, dst, ls_type, dst_count,
2976                             dst_alignment);
2977    }
2978 
2979 
2980    /*
2981     * Convert from dst/output format to src/blending format.
2982     *
2983     * This is necessary as we can only read 1 row from memory at a time,
2984     * so the minimum dst_count we will ever have at this point is 4.
2985     *
2986     * With, for example, the R8 format you can have all 16 pixels in a 128 bit
2987     * vector; this will take the 4 dsts and combine them into 1 src so we can
2988     * perform blending on all 16 pixels in that single vector at once.
2989     */
2990    if (dst_count > src_count) {
2991       if (ls_type.length != dst_type.length && ls_type.length == 1) {
2992          LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type);
2993          LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1);
2994          for (unsigned i = 0; i < dst_count; i++) {
2995             dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, "");
2996          }
2997       }
2998 
2999       lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count);
3000 
3001       if (ls_type.length != dst_type.length) {
3002          struct lp_type tmp_type = dst_type;
3003          tmp_type.length = dst_type.length * 4 / src_count;
3004          for (unsigned i = 0; i < src_count; i++) {
3005             dst[i] = LLVMBuildBitCast(builder, dst[i],
3006                                       lp_build_vec_type(gallivm, tmp_type), "");
3007          }
3008       }
3009    }
3010 
3011    /*
3012     * Blending
3013     */
3014    /* XXX this is broken for RGB8 formats -
3015     * they get expanded from 12 to 16 elements (to include alpha)
3016     * by convert_to_blend_type then reduced to 15 instead of 12
3017     * by convert_from_blend_type (a simple fix though breaks A8...).
3018     * R16G16B16 also crashes, though differently; seemingly something goes
3019     * wrong inside llvm when handling npot vector sizes.
3020     * It seems some cleanup could be done here (like skipping conversion/blend
3021     * when not needed).
3022     */
3023    convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
3024                          row_type, dst, src_count);
3025 
3026    /*
3027     * FIXME: Really should get logic ops / masks out of generic blend / row
3028     * format. Logic ops will definitely not work on the blend float format
3029     * used for SRGB here, and I think OpenGL expects this to just work
3030     * (that is, incoming values converted to srgb, then the logic op applied).
3031     */
3032    for (unsigned i = 0; i < src_count; ++i) {
3033       dst[i] = lp_build_blend_aos(gallivm,
3034                                   &variant->key.blend,
3035                                   out_format,
3036                                   row_type,
3037                                   rt,
3038                                   src[i],
3039                                   has_alpha ? NULL : src_alpha[i],
3040                                   src1[i],
3041                                   has_alpha ? NULL : src1_alpha[i],
3042                                   dst[i],
3043                                   partial_mask ? src_mask[i] : NULL,
3044                                   blend_color,
3045                                   has_alpha ? NULL : blend_alpha,
3046                                   swizzle,
3047                                   pad_inline ? 4 : dst_channels);
3048    }
3049 
3050    convert_from_blend_type(gallivm, block_size, out_format_desc,
3051                            row_type, dst_type, dst, src_count);
3052 
3053    /* Split the blend rows back to memory rows */
3054    if (dst_count > src_count) {
3055       row_type.length = dst_type.length * (dst_count / src_count);
3056 
3057       if (src_count == 1) {
3058          dst[1] = lp_build_extract_range(gallivm, dst[0],
3059                                          row_type.length / 2,
3060                                          row_type.length / 2);
3061          dst[0] = lp_build_extract_range(gallivm, dst[0],
3062                                          0, row_type.length / 2);
3063 
3064          row_type.length /= 2;
3065          src_count *= 2;
3066       }
3067 
3068       dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2,
3069                                       row_type.length / 2);
3070       dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2);
3071       dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2,
3072                                       row_type.length / 2);
3073       dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
3074 
3075       row_type.length /= 2;
3076       src_count *= 2;
3077    }
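   /*
    * E.g. for R8 all 16 pixels were blended in a single u8x16 vector
    * (src_count == 1); the extracts above split it back into four u8x4
    * rows, one per memory row of the 4x4 block.
    */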
3078 
3079    /*
3080     * Store blend result to memory
3081     */
3082    if (is_1d) {
3083       store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
3084                              dst, dst_type, dst_count / 4, dst_alignment);
3085    } else {
3086       store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
3087                              block_height,
3088                              dst, dst_type, dst_count, dst_alignment);
3089    }
3090 
3091    if (do_branch) {
3092       lp_build_mask_end(&mask_ctx);
3093    }
3094 
3095    if (fpstate) {
3096       lp_build_fpstate_set(gallivm, fpstate);
3097    }
3098 }
3099 
3100 
3101 /**
3102  * Generate the runtime callable function for the whole fragment pipeline.
3103  * Note that the function which we generate operates on a block of 16
3104  * pixels at a time.  The block contains 2x2 quads.  Each quad contains
3105  * 2x2 pixels.
3106  */
3107 static void
3108 generate_fragment(struct llvmpipe_context *lp,
3109                   struct lp_fragment_shader *shader,
3110                   struct lp_fragment_shader_variant *variant,
3111                   unsigned partial_mask)
3112 {
3113    assert(partial_mask == RAST_WHOLE ||
3114           partial_mask == RAST_EDGE_TEST);
3115 
3116    struct nir_shader *nir = shader->base.ir.nir;
3117    struct gallivm_state *gallivm = variant->gallivm;
3118    struct lp_fragment_shader_variant_key *key = &variant->key;
3119    struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
3120    LLVMTypeRef fs_elem_type;
3121    LLVMTypeRef blend_vec_type;
3122    LLVMTypeRef arg_types[16];
3123    LLVMTypeRef func_type;
3124    LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
3125    LLVMTypeRef int32p_type = LLVMPointerType(int32_type, 0);
3126    LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
3127    LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
3128    LLVMValueRef context_ptr;
3129    LLVMValueRef resources_ptr;
3130    LLVMValueRef x;
3131    LLVMValueRef y;
3132    LLVMValueRef a0_ptr;
3133    LLVMValueRef dadx_ptr;
3134    LLVMValueRef dady_ptr;
3135    LLVMValueRef color_ptr_ptr;
3136    LLVMValueRef stride_ptr;
3137    LLVMValueRef color_sample_stride_ptr;
3138    LLVMValueRef depth_ptr;
3139    LLVMValueRef depth_stride;
3140    LLVMValueRef depth_sample_stride;
3141    LLVMValueRef mask_input;
3142    LLVMValueRef thread_data_ptr;
3143    LLVMBasicBlockRef block;
3144    LLVMBuilderRef builder;
3145    struct lp_build_interp_soa_context interp;
3146    LLVMValueRef fs_mask[(16 / 4) * LP_MAX_SAMPLES];
3147    LLVMValueRef fs_out_color[LP_MAX_SAMPLES][PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
3148    LLVMValueRef function;
3149    LLVMValueRef facing;
3150    const bool dual_source_blend = key->blend.rt[0].blend_enable &&
3151                                   util_blend_state_is_dual(&key->blend, 0);
3152 
3153    assert(lp_native_vector_width / 32 >= 4);
3154 
3155    /* Adjust color input interpolation according to flatshade state:
3156     */
3157    nir_foreach_shader_in_variable(var, nir) {
3158       unsigned idx = var->data.driver_location;
3159       unsigned slots = nir_variable_count_slots(var, var->type);
3160       memcpy(&inputs[idx], &shader->inputs[idx], (sizeof inputs[0] * slots));
3161       for (unsigned s = 0; s < slots; s++) {
3162          if (inputs[idx + s].interp == LP_INTERP_COLOR)
3163             inputs[idx + s].interp = key->flatshade ? LP_INTERP_CONSTANT : LP_INTERP_PERSPECTIVE;
3164       }
3165    }
3166 
3167    /* TODO: actually pick these based on the fs and color buffer
3168     * characteristics. */
3169 
3170    struct lp_type fs_type;
3171    memset(&fs_type, 0, sizeof fs_type);
3172    fs_type.floating = true;      /* floating point values */
3173    fs_type.sign = true;          /* values are signed */
3174    fs_type.norm = false;         /* values are not limited to [0,1] or [-1,1] */
3175    fs_type.width = 32;           /* 32-bit float */
3176    fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
3177 
3178    struct lp_type blend_type;
3179    memset(&blend_type, 0, sizeof blend_type);
3180    blend_type.floating = false; /* values are integers */
3181    blend_type.sign = false;     /* values are unsigned */
3182    blend_type.norm = true;      /* values are in [0,1] or [-1,1] */
3183    blend_type.width = 8;        /* 8-bit ubyte values */
3184    blend_type.length = 16;      /* 16 elements per vector */
3185 
3186    /*
3187     * Generate the function prototype. Any change here must be reflected in
3188     * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
3189     */
3190 
3191    fs_elem_type = lp_build_elem_type(gallivm, fs_type);
3192 
3193    blend_vec_type = lp_build_vec_type(gallivm, blend_type);
3194 
3195    char func_name[64];
3196    snprintf(func_name, sizeof(func_name), "fs_variant_%s",
3197             partial_mask ? "partial" : "whole");
3198 
3199    arg_types[0] = variant->jit_context_ptr_type;       /* context */
3200    arg_types[1] = variant->jit_resources_ptr_type;     /* resources */
3201    arg_types[2] = int32_type;                          /* x */
3202    arg_types[3] = int32_type;                          /* y */
3203    arg_types[4] = int32_type;                          /* facing */
3204    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
3205    arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
3206    arg_types[7] = LLVMPointerType(fs_elem_type, 0);    /* dady */
3207    arg_types[8] = LLVMPointerType(int8p_type, 0);  /* color */
3208    arg_types[9] = int8p_type;       /* depth */
3209    arg_types[10] = LLVMInt64TypeInContext(gallivm->context);  /* mask_input */
3210    arg_types[11] = variant->jit_thread_data_ptr_type;  /* per thread data */
3211    arg_types[12] = int32p_type;     /* stride */
3212    arg_types[13] = int32_type;                         /* depth_stride */
3213    arg_types[14] = int32p_type;     /* color sample strides */
3214    arg_types[15] = int32_type;                         /* depth sample stride */
3215 
3216    func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
3217                                 arg_types, ARRAY_SIZE(arg_types), 0);
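   /*
    * For reference, the matching C-side function pointer looks roughly
    * like this (a sketch only; lp_jit.h has the authoritative definition
    * and exact parameter types):
    *
    *    typedef void
    *    (*lp_jit_frag_func)(struct lp_jit_context *context,
    *                        struct lp_jit_resources *resources,
    *                        uint32_t x, uint32_t y, uint32_t facing,
    *                        const void *a0, const void *dadx, const void *dady,
    *                        uint8_t **color, uint8_t *depth, uint64_t mask_input,
    *                        struct lp_jit_thread_data *thread_data,
    *                        unsigned *stride, unsigned depth_stride,
    *                        unsigned *color_sample_stride,
    *                        unsigned depth_sample_stride);
    */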
3218 
3219    function = LLVMAddFunction(gallivm->module, func_name, func_type);
3220    LLVMSetFunctionCallConv(function, LLVMCCallConv);
3221 
3222    variant->function[partial_mask] = function;
3223    variant->function_name[partial_mask] = MALLOC(strlen(func_name)+1);
3224    strcpy(variant->function_name[partial_mask], func_name);
3225 
3226    /* XXX: need to propagate noalias down into color param now we are
3227     * passing a pointer-to-pointer?
3228     */
3229    for (unsigned i = 0; i < ARRAY_SIZE(arg_types); ++i)
3230       if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
3231          lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3232 
3233    if (variant->gallivm->cache->data_size) {
3234       gallivm_stub_func(gallivm, function);
3235       return;
3236    }
3237 
3238    context_ptr  = LLVMGetParam(function, 0);
3239    resources_ptr  = LLVMGetParam(function, 1);
3240    x            = LLVMGetParam(function, 2);
3241    y            = LLVMGetParam(function, 3);
3242    facing       = LLVMGetParam(function, 4);
3243    a0_ptr       = LLVMGetParam(function, 5);
3244    dadx_ptr     = LLVMGetParam(function, 6);
3245    dady_ptr     = LLVMGetParam(function, 7);
3246    color_ptr_ptr = LLVMGetParam(function, 8);
3247    depth_ptr    = LLVMGetParam(function, 9);
3248    mask_input   = LLVMGetParam(function, 10);
3249    thread_data_ptr  = LLVMGetParam(function, 11);
3250    stride_ptr   = LLVMGetParam(function, 12);
3251    depth_stride = LLVMGetParam(function, 13);
3252    color_sample_stride_ptr = LLVMGetParam(function, 14);
3253    depth_sample_stride = LLVMGetParam(function, 15);
3254 
3255    lp_build_name(context_ptr, "context");
3256    lp_build_name(resources_ptr, "resources");
3257    lp_build_name(x, "x");
3258    lp_build_name(y, "y");
3259    lp_build_name(a0_ptr, "a0");
3260    lp_build_name(dadx_ptr, "dadx");
3261    lp_build_name(dady_ptr, "dady");
3262    lp_build_name(color_ptr_ptr, "color_ptr_ptr");
3263    lp_build_name(depth_ptr, "depth");
3264    lp_build_name(mask_input, "mask_input");
3265    lp_build_name(thread_data_ptr, "thread_data");
3266    lp_build_name(stride_ptr, "stride_ptr");
3267    lp_build_name(depth_stride, "depth_stride");
3268    lp_build_name(color_sample_stride_ptr, "color_sample_stride_ptr");
3269    lp_build_name(depth_sample_stride, "depth_sample_stride");
3270 
3271    /*
3272     * Function body
3273     */
3274 
3275    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3276    builder = gallivm->builder;
3277    assert(builder);
3278    LLVMPositionBuilderAtEnd(builder, block);
3279 
3280    /* code generated texture sampling */
3281    struct lp_build_sampler_soa *sampler =
3282       lp_llvm_sampler_soa_create(lp_fs_variant_key_samplers(key),
3283                                  MAX2(key->nr_samplers,
3284                                       key->nr_sampler_views));
3285    struct lp_build_image_soa *image =
3286       lp_bld_llvm_image_soa_create(lp_fs_variant_key_images(key), key->nr_images);
3287 
3288    unsigned num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
3289    /* for 1d resources only run "upper half" of stamp */
3290    if (key->resource_1d)
3291       num_fs /= 2;
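   /* E.g. with 8-wide vectors num_fs == 16 / 8 == 2, halved to 1 for 1d. */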
3292 
3293    {
3294       LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs);
3295       LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type);
3296       LLVMValueRef num_loop_samp =
3297          lp_build_const_int32(gallivm, num_fs * key->coverage_samples);
3298       LLVMValueRef mask_store =
3299          lp_build_array_alloca(gallivm, mask_type,
3300                                num_loop_samp, "mask_store");
3301       LLVMTypeRef flt_type = LLVMFloatTypeInContext(gallivm->context);
3302       LLVMValueRef glob_sample_pos =
3303          LLVMAddGlobal(gallivm->module,
3304                        LLVMArrayType(flt_type, key->coverage_samples * 2), "");
3305       LLVMSetLinkage(glob_sample_pos, LLVMInternalLinkage);
3306       LLVMValueRef sample_pos_array;
3307 
3308       if (key->multisample && key->coverage_samples == 4) {
3309          LLVMValueRef sample_pos_arr[8];
3310          for (unsigned i = 0; i < 4; i++) {
3311             sample_pos_arr[i * 2] = LLVMConstReal(flt_type,
3312                                                   lp_sample_pos_4x[i][0]);
3313             sample_pos_arr[i * 2 + 1] = LLVMConstReal(flt_type,
3314                                                       lp_sample_pos_4x[i][1]);
3315          }
3316          sample_pos_array =
3317             LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3318                            sample_pos_arr, 8);
3319       } else {
3320          LLVMValueRef sample_pos_arr[2];
3321          sample_pos_arr[0] = LLVMConstReal(flt_type, 0.5);
3322          sample_pos_arr[1] = LLVMConstReal(flt_type, 0.5);
3323          sample_pos_array =
3324             LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3325                            sample_pos_arr, 2);
3326       }
3327       LLVMSetInitializer(glob_sample_pos, sample_pos_array);
3328 
3329       LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
3330       bool pixel_center_integer = nir->info.fs.pixel_center_integer;
3331 
3332       /*
3333        * The shader input interpolation info is not explicitly baked in the
3334        * shader key, but everything it derives from (TGSI, and flatshade) is
3335        * already included in the shader key.
3336        */
3337       lp_build_interp_soa_init(&interp,
3338                                gallivm,
3339                                nir->num_inputs,
3340                                inputs,
3341                                pixel_center_integer,
3342                                key->coverage_samples,
3343                                LLVMTypeOf(sample_pos_array),
3344                                glob_sample_pos,
3345                                num_loop,
3346                                builder, fs_type,
3347                                a0_ptr, dadx_ptr, dady_ptr,
3348                                x, y);
3349 
3350       for (unsigned i = 0; i < num_fs; i++) {
3351          if (key->multisample) {
3352             LLVMValueRef smask_val =
3353                LLVMBuildLoad2(builder, int32_type,
3354                               lp_jit_context_sample_mask(gallivm, variant->jit_context_type, context_ptr),
3355                               "");
3356 
3357             /*
3358              * For multisampling, extract the per-sample mask from the
3359              * incoming 64-bit mask and store it to the per-sample mask storage.
3360              * OR all of them together to generate the fragment shader
3361              * mask (sample shading TODO).  Take the incoming state coverage
3362              * mask into account.
3363              */
3364             for (unsigned s = 0; s < key->coverage_samples; s++) {
3365                LLVMValueRef sindexi =
3366                   lp_build_const_int32(gallivm, i + (s * num_fs));
3367                LLVMValueRef sample_mask_ptr =
3368                   LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1,
3369                                 "sample_mask_ptr");
3370                LLVMValueRef s_mask =
3371                   generate_quad_mask(gallivm, fs_type,
3372                                      i * fs_type.length / 4, s, mask_input);
3373                LLVMValueRef smask_bit =
3374                   LLVMBuildAnd(builder, smask_val,
3375                                lp_build_const_int32(gallivm, (1 << s)), "");
3376                LLVMValueRef cmp =
3377                   LLVMBuildICmp(builder, LLVMIntNE, smask_bit,
3378                                 lp_build_const_int32(gallivm, 0), "");
3379                smask_bit = LLVMBuildSExt(builder, cmp, int32_type, "");
3380                smask_bit = lp_build_broadcast(gallivm, mask_type, smask_bit);
3381 
3382                s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
3383                LLVMBuildStore(builder, s_mask, sample_mask_ptr);
3384             }
3385          } else {
3386             LLVMValueRef mask;
3387             LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
3388             LLVMValueRef mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
3389                                                   &indexi, 1, "mask_ptr");
3390 
3391             if (partial_mask) {
3392                mask = generate_quad_mask(gallivm, fs_type,
3393                                          i * fs_type.length / 4, 0, mask_input);
3394             } else {
3395                mask = lp_build_const_int_vec(gallivm, fs_type, ~0);
3396             }
3397             LLVMBuildStore(builder, mask, mask_ptr);
3398          }
3399       }
3400 
3401       generate_fs_loop(gallivm,
3402                        shader, key,
3403                        builder,
3404                        fs_type,
3405                        variant->jit_context_type,
3406                        context_ptr,
3407                        variant->jit_resources_type,
3408                        resources_ptr,
3409                        LLVMTypeOf(sample_pos_array),
3410                        glob_sample_pos,
3411                        num_loop,
3412                        &interp,
3413                        sampler,
3414                        image,
3415                        mask_type,
3416                        mask_store, /* output */
3417                        color_store,
3418                        depth_ptr,
3419                        depth_stride,
3420                        depth_sample_stride,
3421                        color_ptr_ptr,
3422                        stride_ptr,
3423                        color_sample_stride_ptr,
3424                        facing,
3425                        variant->jit_thread_data_type,
3426                        thread_data_ptr);
3427 
3428       LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);
3429       for (unsigned i = 0; i < num_fs; i++) {
3430          LLVMValueRef ptr;
3431          for (unsigned s = 0; s < key->coverage_samples; s++) {
3432             int idx = (i + (s * num_fs));
3433             LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3434             ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1, "");
3435 
3436             fs_mask[idx] = LLVMBuildLoad2(builder, mask_type, ptr, "smask");
3437          }
3438 
3439          for (unsigned s = 0; s < key->min_samples; s++) {
3440             /* This layout is messed up; we need to reorganize things */
3441             int idx = s * num_fs + i;
3442             LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3443             for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3444                for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3445                   ptr = LLVMBuildGEP2(builder, fs_vec_type,
3446                                       color_store[cbuf][chan],
3447                                       &sindexi, 1, "");
3448                   fs_out_color[s][cbuf][chan][i] = ptr;
3449                }
3450             }
3451             if (dual_source_blend) {
3452                /* we only support one dual source blend target, hence always use
3453                 * output 1
3454                 */
3455                for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3456                   ptr = LLVMBuildGEP2(builder, fs_vec_type,
3457                                       color_store[1][chan],
3458                                       &sindexi, 1, "");
3459                   fs_out_color[s][1][chan][i] = ptr;
3460                }
3461             }
3462          }
3463       }
3464    }
3465 
3466    lp_bld_llvm_sampler_soa_destroy(sampler);
3467    lp_bld_llvm_image_soa_destroy(image);
3468 
3469    /* Loop over color outputs / color buffers to do blending */
3470    for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3471       if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE &&
3472           (key->blend.rt[cbuf].blend_enable || key->blend.logicop_enable ||
3473            find_output_by_frag_result(nir, FRAG_RESULT_DATA0 + cbuf) != -1)) {
3474          LLVMValueRef color_ptr;
3475          LLVMValueRef stride;
3476          LLVMValueRef sample_stride = NULL;
3477          LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
3478 
3479          bool do_branch = ((key->depth.enabled
3480                             || key->stencil[0].enabled
3481                             || key->alpha.enabled)
3482                            && !nir->info.fs.uses_discard);
3483 
3484          color_ptr = LLVMBuildLoad2(builder, int8p_type,
3485                                     LLVMBuildGEP2(builder, int8p_type, color_ptr_ptr,
3486                                                  &index, 1, ""),
3487                                     "");
3488 
3489          stride = LLVMBuildLoad2(builder, int32_type,
3490                                  LLVMBuildGEP2(builder, int32_type, stride_ptr,
3491                                              &index, 1, ""),
3492                                  "");
3493 
3494          if (key->cbuf_nr_samples[cbuf] > 1)
3495             sample_stride = LLVMBuildLoad2(builder, int32_type,
3496                                            LLVMBuildGEP2(builder,
3497                                                          int32_type,
3498                                                          color_sample_stride_ptr,
3499                                                          &index, 1, ""), "");
3500 
3501          for (unsigned s = 0; s < key->cbuf_nr_samples[cbuf]; s++) {
3502             unsigned mask_idx = num_fs * (key->multisample ? s : 0);
3503             unsigned out_idx = key->min_samples == 1 ? 0 : s;
3504             LLVMValueRef out_ptr = color_ptr;
3505 
3506             if (sample_stride) {
3507                LLVMValueRef sample_offset =
3508                   LLVMBuildMul(builder, sample_stride,
3509                                lp_build_const_int32(gallivm, s), "");
3510                out_ptr = LLVMBuildGEP2(builder, int8_type, out_ptr, &sample_offset, 1, "");
3511             }
3512             out_ptr = LLVMBuildBitCast(builder, out_ptr,
3513                                        LLVMPointerType(blend_vec_type, 0), "");
3514 
3515             lp_build_name(out_ptr, "color_ptr%d", cbuf);
3516 
3517             generate_unswizzled_blend(gallivm, cbuf, variant,
3518                                       key->cbuf_format[cbuf],
3519                                       num_fs, fs_type, &fs_mask[mask_idx],
3520                                       fs_out_color[out_idx],
3521                                       variant->jit_context_type,
3522                                       context_ptr, blend_vec_type, out_ptr, stride,
3523                                       partial_mask, do_branch);
3524          }
3525       }
3526    }
3527 
3528    LLVMBuildRetVoid(builder);
3529 
3530    gallivm_verify_function(gallivm, function);
3531 }
3532 
3533 
3534 static void
3535 dump_fs_variant_key(struct lp_fragment_shader_variant_key *key)
3536 {
3537    debug_printf("fs variant %p:\n", (void *) key);
3538 
3539    if (key->flatshade) {
3540       debug_printf("flatshade = 1\n");
3541    }
3542    if (key->depth_clamp)
3543       debug_printf("depth_clamp = 1\n");
3544 
3545    if (key->restrict_depth_values)
3546       debug_printf("restrict_depth_values = 1\n");
3547 
3548    if (key->multisample) {
3549       debug_printf("multisample = 1\n");
3550       debug_printf("coverage samples = %d\n", key->coverage_samples);
3551       debug_printf("min samples = %d\n", key->min_samples);
3552    }
3553    for (unsigned i = 0; i < key->nr_cbufs; ++i) {
3554       debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
3555       debug_printf("cbuf nr_samples[%u] = %d\n", i, key->cbuf_nr_samples[i]);
3556    }
3557    if (key->depth.enabled || key->stencil[0].enabled) {
3558       debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
3559       debug_printf("depth nr_samples = %d\n", key->zsbuf_nr_samples);
3560    }
3561    if (key->depth.enabled) {
3562       debug_printf("depth.func = %s\n", util_str_func(key->depth.func, true));
3563       debug_printf("depth.writemask = %u\n", key->depth.writemask);
3564    }
3565 
3566    for (unsigned i = 0; i < 2; ++i) {
3567       if (key->stencil[i].enabled) {
3568          debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, true));
3569          debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, true));
3570          debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, true));
3571          debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, true));
3572          debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
3573          debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
3574       }
3575    }
3576 
3577    if (key->alpha.enabled) {
3578       debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, true));
3579    }
3580 
3581    if (key->occlusion_count) {
3582       debug_printf("occlusion_count = 1\n");
3583    }
3584 
3585    if (key->blend.logicop_enable) {
3586       debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, true));
3587    } else if (key->blend.rt[0].blend_enable) {
3588       debug_printf("blend.rgb_func = %s\n",   util_str_blend_func  (key->blend.rt[0].rgb_func, true));
3589       debug_printf("blend.rgb_src_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_src_factor, true));
3590       debug_printf("blend.rgb_dst_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, true));
3591       debug_printf("blend.alpha_func = %s\n",       util_str_blend_func  (key->blend.rt[0].alpha_func, true));
3592       debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, true));
3593       debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, true));
3594    }
3595    debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
3596    if (key->blend.alpha_to_coverage) {
3597       debug_printf("blend.alpha_to_coverage is enabled\n");
3598    }
3599    for (unsigned i = 0; i < key->nr_samplers; ++i) {
3600       const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3601       const struct lp_static_sampler_state *sampler = &samplers[i].sampler_state;
3602       debug_printf("sampler[%u] = \n", i);
3603       debug_printf("  .wrap = %s %s %s\n",
3604                    util_str_tex_wrap(sampler->wrap_s, true),
3605                    util_str_tex_wrap(sampler->wrap_t, true),
3606                    util_str_tex_wrap(sampler->wrap_r, true));
3607       debug_printf("  .min_img_filter = %s\n",
3608                    util_str_tex_filter(sampler->min_img_filter, true));
3609       debug_printf("  .min_mip_filter = %s\n",
3610                    util_str_tex_mipfilter(sampler->min_mip_filter, true));
3611       debug_printf("  .mag_img_filter = %s\n",
3612                    util_str_tex_filter(sampler->mag_img_filter, true));
3613       if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
3614          debug_printf("  .compare_func = %s\n", util_str_func(sampler->compare_func, true));
3615       debug_printf("  .normalized_coords = %u\n", sampler->normalized_coords);
3616       debug_printf("  .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
3617       debug_printf("  .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
3618       debug_printf("  .apply_min_lod = %u\n", sampler->apply_min_lod);
3619       debug_printf("  .apply_max_lod = %u\n", sampler->apply_max_lod);
3620       debug_printf("  .reduction_mode = %u\n", sampler->reduction_mode);
3621       debug_printf("  .aniso = %u\n", sampler->aniso);
3622    }
3623    for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
3624       const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3625       const struct lp_static_texture_state *texture = &samplers[i].texture_state;
3626       debug_printf("texture[%u] = \n", i);
3627       debug_printf("  .format = %s\n",
3628                    util_format_name(texture->format));
3629       debug_printf("  .target = %s\n",
3630                    util_str_tex_target(texture->target, true));
3631       debug_printf("  .level_zero_only = %u\n",
3632                    texture->level_zero_only);
3633       debug_printf("  .pot = %u %u %u\n",
3634                    texture->pot_width,
3635                    texture->pot_height,
3636                    texture->pot_depth);
3637    }
3638    struct lp_image_static_state *images = lp_fs_variant_key_images(key);
3639    for (unsigned i = 0; i < key->nr_images; ++i) {
3640       const struct lp_static_texture_state *image = &images[i].image_state;
3641       debug_printf("image[%u] = \n", i);
3642       debug_printf("  .format = %s\n",
3643                    util_format_name(image->format));
3644       debug_printf("  .target = %s\n",
3645                    util_str_tex_target(image->target, true));
3646       debug_printf("  .level_zero_only = %u\n",
3647                    image->level_zero_only);
3648       debug_printf("  .pot = %u %u %u\n",
3649                    image->pot_width,
3650                    image->pot_height,
3651                    image->pot_depth);
3652    }
3653 }
3654 
3655 
3656 const char *
3657 lp_debug_fs_kind(enum lp_fs_kind kind)
3658 {
3659    switch (kind) {
3660    case LP_FS_KIND_GENERAL:
3661       return "GENERAL";
3662    case LP_FS_KIND_BLIT_RGBA:
3663       return "BLIT_RGBA";
3664    case LP_FS_KIND_BLIT_RGB1:
3665       return "BLIT_RGB1";
3666    case LP_FS_KIND_AERO_MINIFICATION:
3667       return "AERO_MINIFICATION";
3668    case LP_FS_KIND_LLVM_LINEAR:
3669       return "LLVM_LINEAR";
3670    default:
3671       return "unknown";
3672    }
3673 }
3674 
3675 
3676 void
3677 lp_debug_fs_variant(struct lp_fragment_shader_variant *variant)
3678 {
3679    debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n",
3680                 variant->shader->no, variant->no);
3681    nir_print_shader(variant->shader->base.ir.nir, stderr);
3682    dump_fs_variant_key(&variant->key);
3683    debug_printf("variant->opaque = %u\n", variant->opaque);
3684    debug_printf("variant->potentially_opaque = %u\n", variant->potentially_opaque);
3685    debug_printf("variant->blit = %u\n", variant->blit);
3686    debug_printf("shader->kind = %s\n", lp_debug_fs_kind(variant->shader->kind));
3687    debug_printf("\n");
3688 }
3689 
3690 
3691 static void
3692 lp_fs_get_ir_cache_key(struct lp_fragment_shader_variant *variant,
3693                        unsigned char ir_sha1_cache_key[20])
3694 {
3695    struct blob blob = { 0 };
3696    unsigned ir_size;
3697    void *ir_binary;
3698 
3699    blob_init(&blob);
3700    nir_serialize(&blob, variant->shader->base.ir.nir, true);
3701    ir_binary = blob.data;
3702    ir_size = blob.size;
3703 
3704    struct mesa_sha1 ctx;
3705    _mesa_sha1_init(&ctx);
3706    _mesa_sha1_update(&ctx, &variant->key, variant->shader->variant_key_size);
3707    _mesa_sha1_update(&ctx, ir_binary, ir_size);
3708    _mesa_sha1_final(&ctx, ir_sha1_cache_key);
3709 
3710    blob_finish(&blob);
3711 }
3712 
3713 
3714 /**
3715  * Generate a new fragment shader variant from the shader code and
3716  * other state indicated by the key.
3717  */
3718 static struct lp_fragment_shader_variant *
3719 generate_variant(struct llvmpipe_context *lp,
3720                  struct lp_fragment_shader *shader,
3721                  const struct lp_fragment_shader_variant_key *key)
3722 {
3723    struct nir_shader *nir = shader->base.ir.nir;
3724    struct lp_fragment_shader_variant *variant =
3725       MALLOC(sizeof *variant + shader->variant_key_size - sizeof variant->key);
3726    if (!variant)
3727       return NULL;
3728 
3729    memset(variant, 0, sizeof(*variant));
3730 
3731    pipe_reference_init(&variant->reference, 1);
3732    lp_fs_reference(lp, &variant->shader, shader);
3733 
3734    memcpy(&variant->key, key, shader->variant_key_size);
3735 
3736    struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
3737    struct lp_cached_code cached = { 0 };
3738    unsigned char ir_sha1_cache_key[20];
3739    bool needs_caching = false;
3740    if (shader->base.ir.nir) {
3741       lp_fs_get_ir_cache_key(variant, ir_sha1_cache_key);
3742 
3743       lp_disk_cache_find_shader(screen, &cached, ir_sha1_cache_key);
3744       if (!cached.data_size)
3745          needs_caching = true;
3746    }
3747 
3748    char module_name[64];
3749    snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
3750             shader->no, shader->variants_created);
3751    variant->gallivm = gallivm_create(module_name, &lp->context, &cached);
3752    if (!variant->gallivm) {
3753       FREE(variant);
3754       return NULL;
3755    }
3756 
3757    variant->list_item_global.base = variant;
3758    variant->list_item_local.base = variant;
3759    variant->no = shader->variants_created++;
3760 
3761    /*
3762     * Determine whether we are touching all channels in the color buffer.
3763     */
3764    const struct util_format_description *cbuf0_format_desc = NULL;
3765    bool fullcolormask = false;
3766    if (key->nr_cbufs == 1) {
3767       cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
3768       fullcolormask = util_format_colormask_full(cbuf0_format_desc,
3769                                                  key->blend.rt[0].colormask);
3770    }
3771 
3772    /* The scissor can be ignored here, since only tiles inside the
3773     * scissor rectangle will ever refer to this variant.
3774     */
3775    const bool no_kill =
3776          fullcolormask &&
3777          !key->stencil[0].enabled &&
3778          !key->alpha.enabled &&
3779          !key->multisample &&
3780          !key->blend.alpha_to_coverage &&
3781          !key->depth.enabled &&
3782          !nir->info.fs.uses_discard &&
3783          !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) &&
3784          !nir->info.fs.uses_fbfetch_output;
3785 
3786    variant->opaque =
3787          no_kill &&
3788          !key->blend.logicop_enable &&
3789          !key->blend.rt[0].blend_enable
3790          ? true : false;
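   /*
    * "Opaque" here means every pixel of a fully-covered tile is
    * unconditionally overwritten, which is what allows the rasterizer to
    * skip reading the color buffer for whole tiles (see the RAST_WHOLE
    * specialization below).
    */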
3791 
3792    variant->potentially_opaque =
3793          no_kill &&
3794          !key->blend.logicop_enable &&
3795          key->blend.rt[0].blend_enable &&
3796          key->blend.rt[0].rgb_func == PIPE_BLEND_ADD &&
3797          key->blend.rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
3798          key->blend.rt[0].alpha_func == key->blend.rt[0].rgb_func &&
3799          key->blend.rt[0].alpha_dst_factor == key->blend.rt[0].rgb_dst_factor &&
3800          shader->base.type == PIPE_SHADER_IR_TGSI &&
3801          /*
3802           * FIXME: for NIR, all of the fields of info.xxx (except info.base)
3803           * are zeros, so shader analysis (here and elsewhere) relying on
3804           * those bits cannot work and silently fails; cbuf is the only
3805           * pointer field, so using it would instead cause a crash.
3806           */
3807          shader->info.cbuf[0][3].file != TGSI_FILE_NULL
3808          ? true : false;
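   /*
    * Note (for orientation): the blend state matched above is
    * premultiplied-alpha "over" (dst = src * src_factor +
    * dst * (1 - src_alpha)), which degenerates to a plain overwrite
    * whenever the incoming alpha is 1.0; the cbuf[0][3] check ensures the
    * shader's alpha output is traceable, so later analysis can test
    * whether that alpha is in fact a constant 1.0.
    */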
3809 
3810    /* We only care about opaque blits for now */
3811    if (variant->opaque &&
3812        (shader->kind == LP_FS_KIND_BLIT_RGBA ||
3813         shader->kind == LP_FS_KIND_BLIT_RGB1)) {
3814       const struct lp_sampler_static_state *samp0 =
3815          lp_fs_variant_key_sampler_idx(key, 0);
3816       assert(samp0);
3817 
3818       const enum pipe_format texture_format = samp0->texture_state.format;
3819       const enum pipe_texture_target target = samp0->texture_state.target;
3820       const unsigned min_img_filter = samp0->sampler_state.min_img_filter;
3821       const unsigned mag_img_filter = samp0->sampler_state.mag_img_filter;
3822 
3823       unsigned min_mip_filter;
3824       if (samp0->texture_state.level_zero_only) {
3825          min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
3826       } else {
3827          min_mip_filter = samp0->sampler_state.min_mip_filter;
3828       }
3829 
3830       if (target == PIPE_TEXTURE_2D &&
3831           min_img_filter == PIPE_TEX_FILTER_NEAREST &&
3832           mag_img_filter == PIPE_TEX_FILTER_NEAREST &&
3833           min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
3834           ((texture_format &&
3835             util_is_format_compatible(util_format_description(texture_format),
3836                                       cbuf0_format_desc)) ||
3837            (shader->kind == LP_FS_KIND_BLIT_RGB1 &&
3838             (texture_format == PIPE_FORMAT_B8G8R8A8_UNORM ||
3839              texture_format == PIPE_FORMAT_B8G8R8X8_UNORM) &&
3840             (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
3841              key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM)))) {
3842          variant->blit = 1;
3843       }
3844    }
3845 
3846    /* Determine whether this shader + pipeline state is a candidate for
3847     * the linear path.
3848     */
3849    const bool linear_pipeline =
3850          !key->stencil[0].enabled &&
3851          !key->depth.enabled &&
3852          !nir->info.fs.uses_discard &&
3853          !key->blend.logicop_enable &&
3854          (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
3855           key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM ||
3856           key->cbuf_format[0] == PIPE_FORMAT_R8G8B8A8_UNORM ||
3857           key->cbuf_format[0] == PIPE_FORMAT_R8G8B8X8_UNORM);
3858 
3859    memcpy(&variant->key, key, sizeof *key);
3860 
3861    if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
3862       lp_debug_fs_variant(variant);
3863    }
3864 
3865    llvmpipe_fs_variant_fastpath(variant);
3866 
3867    lp_jit_init_types(variant);
3868 
3869    if (variant->jit_function[RAST_EDGE_TEST] == NULL)
3870       generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
3871 
3872    if (variant->jit_function[RAST_WHOLE] == NULL) {
3873       if (variant->opaque) {
3874          /* Specialized shader, which doesn't need to read the color buffer. */
3875          generate_fragment(lp, shader, variant, RAST_WHOLE);
3876       }
3877    }
3878 
3879    if (linear_pipeline) {
3880       /* Currently keeping both the old fastpaths and new linear path
3881        * active.  The older code is still somewhat faster for the cases
3882        * it covers.
3883        *
3884        * XXX: consider restricting this to aero-mode only.
3885        */
3886       if (fullcolormask &&
3887           !key->alpha.enabled &&
3888           !key->blend.alpha_to_coverage) {
3889          llvmpipe_fs_variant_linear_fastpath(variant);
3890       }
3891 
3892       /* If the original fastpath doesn't cover this variant, try the new
3893        * code:
3894        */
3895       if (variant->jit_linear == NULL) {
3896          if (shader->kind == LP_FS_KIND_BLIT_RGBA ||
3897              shader->kind == LP_FS_KIND_BLIT_RGB1 ||
3898              shader->kind == LP_FS_KIND_LLVM_LINEAR) {
3899             llvmpipe_fs_variant_linear_llvm(lp, shader, variant);
3900          }
3901       }
3902    } else {
3903       if (LP_DEBUG & DEBUG_LINEAR) {
3904          lp_debug_fs_variant(variant);
3905          debug_printf("    ----> no linear path for this variant\n");
3906       }
3907    }
3908 
3909    /*
3910     * Compile everything
3911     */
3912 
3913 #if GALLIVM_USE_ORCJIT
3914 /* gallivm_compile_module moves the module into ORCJIT, so count the IR
3915  * instructions before compiling. */
3915    variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);
3916 
3917    gallivm_compile_module(variant->gallivm);
3918 #else
3919    gallivm_compile_module(variant->gallivm);
3920 
3921    variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);
3922 #endif
3923 
3924    if (variant->function[RAST_EDGE_TEST]) {
3925       variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
3926             gallivm_jit_function(variant->gallivm,
3927                                  variant->function[RAST_EDGE_TEST],
3928                                  variant->function_name[RAST_EDGE_TEST]);
3929    }
3930 
3931    if (variant->function[RAST_WHOLE]) {
3932       variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
3933          gallivm_jit_function(variant->gallivm,
3934                               variant->function[RAST_WHOLE],
3935                               variant->function_name[RAST_WHOLE]);
3936    } else if (!variant->jit_function[RAST_WHOLE]) {
3937       variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
3938          variant->jit_function[RAST_EDGE_TEST];
3939    }
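   /*
    * With no specialized whole-tile function (non-opaque variants),
    * fully-covered tiles fall back to the edge-test version, which is
    * correct but performs redundant per-pixel coverage work.
    */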
3940 
3941    if (linear_pipeline) {
3942       if (variant->linear_function) {
3943          variant->jit_linear_llvm = (lp_jit_linear_llvm_func)
3944             gallivm_jit_function(variant->gallivm, variant->linear_function,
3945                                  variant->linear_function_name);
3946       }
3947 
3948       /*
3949        * This must be done after LLVM compilation, as it will call the JIT'ed
3950        * code to determine active inputs.
3951        */
3952       lp_linear_check_variant(variant);
3953    }
3954 
3955    if (needs_caching) {
3956       lp_disk_cache_insert_shader(screen, &cached, ir_sha1_cache_key);
3957    }
3958 
3959    gallivm_free_ir(variant->gallivm);
3960 
3961    return variant;
3962 }
3963 
3964 
3965 static void *
3966 llvmpipe_create_fs_state(struct pipe_context *pipe,
3967                          const struct pipe_shader_state *templ)
3968 {
3969    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
3970 
3971    struct lp_fragment_shader *shader = CALLOC_STRUCT(lp_fragment_shader);
3972    if (!shader)
3973       return NULL;
3974 
3975    pipe_reference_init(&shader->reference, 1);
3976    shader->no = fs_no++;
3977    list_inithead(&shader->variants.list);
3978 
3979    shader->base.type = PIPE_SHADER_IR_NIR;
3980 
3981    if (templ->type == PIPE_SHADER_IR_TGSI) {
3982       shader->base.ir.nir = tgsi_to_nir(templ->tokens, pipe->screen, false);
3983    } else {
3984       shader->base.ir.nir = templ->ir.nir;
3985    }
3986 
3987    /* lower FRAG_RESULT_COLOR -> DATA[0-7] to correctly handle unused attachments */
3988    nir_shader *nir = shader->base.ir.nir;
3989    NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);
3990 
3991    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
3992    nir_tgsi_scan_shader(nir, &shader->info.base, true);
3993    shader->info.num_texs = shader->info.base.opcode_count[TGSI_OPCODE_TEX];
3994 
3995    llvmpipe_register_shader(pipe, &shader->base);
3996 
3997    shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
3998    if (shader->draw_data == NULL) {
3999       FREE(shader);
4000       return NULL;
4001    }
4002 
4003    const int nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
4004    const int nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);
4005    const int nr_images = BITSET_LAST_BIT(nir->info.images_used);
4006 
4007    shader->variant_key_size = lp_fs_variant_key_size(MAX2(nr_samplers,
4008                                                           nr_sampler_views),
4009                                                      nr_images);
4010 
4011    nir_foreach_shader_in_variable(var, nir) {
4012       unsigned idx = var->data.driver_location;
4013       unsigned slots = nir_variable_count_slots(var, var->type);
4014 
4015       if (var->data.centroid)
4016          shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_CENTROID;
4017       if (var->data.sample)
4018          shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_SAMPLE;
4019 
4020       enum glsl_base_type base_type =
4021          glsl_get_base_type(glsl_without_array(var->type));
4022       switch (var->data.interpolation) {
4023       case INTERP_MODE_NONE:
4024          if (glsl_base_type_is_integer(base_type) || var->data.per_primitive) {
4025             shader->inputs[idx].interp = LP_INTERP_CONSTANT;
4026             break;
4027          }
4028          if (var->data.location == VARYING_SLOT_COL0 ||
4029              var->data.location == VARYING_SLOT_COL1) {
4030             shader->inputs[idx].interp = LP_INTERP_COLOR;
4031             break;
4032          }
4033          FALLTHROUGH;
4034       case INTERP_MODE_SMOOTH:
4035          shader->inputs[idx].interp = LP_INTERP_PERSPECTIVE;
4036          break;
4037       case INTERP_MODE_NOPERSPECTIVE:
4038          shader->inputs[idx].interp = LP_INTERP_LINEAR;
4039          break;
4040       case INTERP_MODE_FLAT:
4041          shader->inputs[idx].interp = LP_INTERP_CONSTANT;
4042          break;
4043       }
4044 
4045       /* XXX this is a completely pointless index map... */
4046       shader->inputs[idx].src_index = idx + 1;
4047       if (var->data.location == VARYING_SLOT_FACE)
4048          shader->inputs[idx].interp = LP_INTERP_FACING;
4049       else if (var->data.location == VARYING_SLOT_POS) {
4050          shader->inputs[idx].src_index = 0;
4051          shader->inputs[idx].interp = LP_INTERP_POSITION;
4052       }
4053 
4054       shader->inputs[idx].usage_mask = shader->info.base.input_usage_mask[idx];
4055       for (unsigned s = 1; s < slots; s++) {
4056          shader->inputs[idx + s] = shader->inputs[idx];
4057          shader->inputs[idx + s].src_index = idx + s + 1;
4058          shader->inputs[idx + s].usage_mask = shader->info.base.input_usage_mask[idx + s];
4059       }
4060    }
4061 
4062    llvmpipe_fs_analyse_nir(shader);
4063 
4064    return shader;
4065 }
4066 
4067 
4068 static void
4069 llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
4070 {
4071    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4072    struct lp_fragment_shader *lp_fs = (struct lp_fragment_shader *)fs;
4073    if (llvmpipe->fs == lp_fs)
4074       return;
4075 
4076    draw_bind_fragment_shader(llvmpipe->draw,
4077                              (lp_fs ? lp_fs->draw_data : NULL));
4078 
4079    lp_fs_reference(llvmpipe, &llvmpipe->fs, lp_fs);
4080 
4081    /* invalidate the setup link, NEW_FS will make it update */
4082    lp_setup_set_fs_variant(llvmpipe->setup, NULL);
4083    llvmpipe->dirty |= LP_NEW_FS;
4084 }
4085 
4086 
4087 /**
4088  * Remove shader variant from two lists: the shader's variant list
4089  * and the context's variant list.
4090  */
4091 static void
4092 llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
4093                                struct lp_fragment_shader_variant *variant)
4094 {
4095    if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
4096       debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
4097                    "v total cached %u inst %u total inst %u\n",
4098                    variant->shader->no, variant->no,
4099                    variant->shader->variants_created,
4100                    variant->shader->variants_cached,
4101                    lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
4102    }
4103 
4104    /* remove from shader's list */
4105    list_del(&variant->list_item_local.list);
4106    variant->shader->variants_cached--;
4107 
4108    /* remove from context's list */
4109    list_del(&variant->list_item_global.list);
4110    lp->nr_fs_variants--;
4111    lp->nr_fs_instrs -= variant->nr_instrs;
4112 }
4113 
4114 
4115 void
4116 llvmpipe_destroy_shader_variant(struct llvmpipe_context *lp,
4117                                 struct lp_fragment_shader_variant *variant)
4118 {
4119    gallivm_destroy(variant->gallivm);
4120    lp_fs_reference(lp, &variant->shader, NULL);
4121    if (variant->function_name[RAST_EDGE_TEST])
4122       FREE(variant->function_name[RAST_EDGE_TEST]);
4123    if (variant->function_name[RAST_WHOLE])
4124       FREE(variant->function_name[RAST_WHOLE]);
4125    if (variant->linear_function_name)
4126       FREE(variant->linear_function_name);
4127    FREE(variant);
4128 }
4129 
4130 
4131 void
4132 llvmpipe_destroy_fs(struct llvmpipe_context *llvmpipe,
4133                     struct lp_fragment_shader *shader)
4134 {
4135    /* Delete draw module's data */
4136    draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
4137 
4138    ralloc_free(shader->base.ir.nir);
4139    assert(shader->variants_cached == 0);
4140    FREE(shader);
4141 }
4142 
4143 
4144 static void
4145 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
4146 {
4147    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4148    struct lp_fragment_shader *shader = fs;
4149    struct lp_fs_variant_list_item *li, *next;
4150 
4151    /* Delete all the variants */
4152    LIST_FOR_EACH_ENTRY_SAFE(li, next, &shader->variants.list, list) {
4153       struct lp_fragment_shader_variant *variant;
4154       variant = li->base;
4155       llvmpipe_remove_shader_variant(llvmpipe, li->base);
4156       lp_fs_variant_reference(llvmpipe, &variant, NULL);
4157    }
4158 
4159    lp_fs_reference(llvmpipe, &shader, NULL);
4160 }
4161 
4162 
4163 static void
4164 llvmpipe_set_constant_buffer(struct pipe_context *pipe,
4165                              enum pipe_shader_type shader, uint index,
4166                              bool take_ownership,
4167                              const struct pipe_constant_buffer *cb)
4168 {
4169    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4170    struct pipe_constant_buffer *constants = &llvmpipe->constants[shader][index];
4171 
4172    assert(shader < PIPE_SHADER_MESH_TYPES);
4173    assert(index < ARRAY_SIZE(llvmpipe->constants[shader]));
4174 
4175    /* note: reference counting */
4176    util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb,
4177                              take_ownership);
4178 
4179    /* user_buffer is only valid until the next set_constant_buffer (at most,
4180     * possibly until shader deletion), so we need to upload it now to make
4181     * sure it doesn't get updated/freed out from under us.
4182     */
4183    if (constants->user_buffer) {
4184       u_upload_data(llvmpipe->pipe.const_uploader, 0, constants->buffer_size,
4185                     16, constants->user_buffer, &constants->buffer_offset,
4186                     &constants->buffer);
4187    }
4188    if (constants->buffer) {
4189        if (!(constants->buffer->bind & PIPE_BIND_CONSTANT_BUFFER)) {
4190          debug_printf("Illegal set constant without bind flag\n");
4191          constants->buffer->bind |= PIPE_BIND_CONSTANT_BUFFER;
4192       }
4193       llvmpipe_flush_resource(pipe, constants->buffer, 0, true, true, false, "set_constant_buffer");
4194    }
4195 
4196    switch (shader) {
4197    case PIPE_SHADER_VERTEX:
4198    case PIPE_SHADER_GEOMETRY:
4199    case PIPE_SHADER_TESS_CTRL:
4200    case PIPE_SHADER_TESS_EVAL: {
4201       const unsigned size = cb ? cb->buffer_size : 0;
4202 
4203       const uint8_t *data = NULL;
4204       if (constants->buffer) {
4205          data = (uint8_t *) llvmpipe_resource_data(constants->buffer)
4206             + constants->buffer_offset;
4207       }
4208 
4209       draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
4210                                       index, data, size);
4211       break;
4212    }
4213    case PIPE_SHADER_COMPUTE:
4214       llvmpipe->cs_dirty |= LP_CSNEW_CONSTANTS;
4215       break;
4216    case PIPE_SHADER_FRAGMENT:
4217       llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
4218       break;
4219    case PIPE_SHADER_TASK:
4220       llvmpipe->dirty |= LP_NEW_TASK_CONSTANTS;
4221       break;
4222    case PIPE_SHADER_MESH:
4223       llvmpipe->dirty |= LP_NEW_MESH_CONSTANTS;
4224       break;
4225    default:
4226       unreachable("Illegal shader type");
4227       break;
4228    }
4229 }
4230 
4231 
4232 static void
4233 llvmpipe_set_shader_buffers(struct pipe_context *pipe,
4234                             enum pipe_shader_type shader, unsigned start_slot,
4235                             unsigned count,
4236                             const struct pipe_shader_buffer *buffers,
4237                             unsigned writable_bitmask)
4238 {
4239    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4240 
4241    unsigned i, idx;
4242    for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
4243       const struct pipe_shader_buffer *buffer = buffers ? &buffers[idx] : NULL;
4244 
4245       util_copy_shader_buffer(&llvmpipe->ssbos[shader][i], buffer);
4246 
4247       if (buffer && buffer->buffer) {
4248          bool read_only = !(writable_bitmask & (1 << idx));
4249          llvmpipe_flush_resource(pipe, buffer->buffer, 0, read_only, false,
4250                                  false, "buffer");
4251       }
4252 
4253       switch (shader) {
4254       case PIPE_SHADER_VERTEX:
4255       case PIPE_SHADER_GEOMETRY:
4256       case PIPE_SHADER_TESS_CTRL:
4257       case PIPE_SHADER_TESS_EVAL: {
4258          const unsigned size = buffer ? buffer->buffer_size : 0;
4259          const uint8_t *data = NULL;
4260          if (buffer && buffer->buffer)
4261             data = (uint8_t *) llvmpipe_resource_data(buffer->buffer);
4262          if (data)
4263             data += buffer->buffer_offset;
4264          draw_set_mapped_shader_buffer(llvmpipe->draw, shader,
4265                                        i, data, size);
4266          break;
4267       }
4268       case PIPE_SHADER_COMPUTE:
4269          llvmpipe->cs_dirty |= LP_CSNEW_SSBOS;
4270          break;
4271       case PIPE_SHADER_TASK:
4272          llvmpipe->dirty |= LP_NEW_TASK_SSBOS;
4273          break;
4274       case PIPE_SHADER_MESH:
4275          llvmpipe->dirty |= LP_NEW_MESH_SSBOS;
4276          break;
4277       case PIPE_SHADER_FRAGMENT:
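         /* Replace the writable bits for just the slots being rebound;
          * e.g. (illustrative values) start_slot=2, count=3,
          * writable_bitmask=0b101: clear bits 2..4 of the mask, then set
          * bits 2 and 4. */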
4278          llvmpipe->fs_ssbo_write_mask &= ~(((1 << count) - 1) << start_slot);
4279          llvmpipe->fs_ssbo_write_mask |= writable_bitmask << start_slot;
4280          llvmpipe->dirty |= LP_NEW_FS_SSBOS;
4281          break;
4282       default:
4283          unreachable("Illegal shader type");
4284          break;
4285       }
4286    }
4287 }
4288 
4289 
4290 static void
4291 llvmpipe_set_shader_images(struct pipe_context *pipe,
4292                            enum pipe_shader_type shader, unsigned start_slot,
4293                            unsigned count, unsigned unbind_num_trailing_slots,
4294                            const struct pipe_image_view *images)
4295 {
4296    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4297    unsigned i, idx;
4298 
4299    draw_flush(llvmpipe->draw);
4300    for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
4301       const struct pipe_image_view *image = images ? &images[idx] : NULL;
4302 
4303       util_copy_image_view(&llvmpipe->images[shader][i], image);
4304 
4305       if (image && image->resource) {
4306          bool read_only = !(image->access & PIPE_IMAGE_ACCESS_WRITE);
4307          llvmpipe_flush_resource(pipe, image->resource, 0, read_only, false,
4308                                  false, "image");
4309       }
4310    }
4311 
4312    llvmpipe->num_images[shader] = start_slot + count;
4313    switch (shader) {
4314    case PIPE_SHADER_VERTEX:
4315    case PIPE_SHADER_GEOMETRY:
4316    case PIPE_SHADER_TESS_CTRL:
4317    case PIPE_SHADER_TESS_EVAL:
4318       draw_set_images(llvmpipe->draw, shader, llvmpipe->images[shader],
4319                       start_slot + count);
4320       break;
4321    case PIPE_SHADER_COMPUTE:
4322       llvmpipe->cs_dirty |= LP_CSNEW_IMAGES;
4323       break;
4324    case PIPE_SHADER_FRAGMENT:
4325       llvmpipe->dirty |= LP_NEW_FS_IMAGES;
4326       break;
4327    case PIPE_SHADER_TASK:
4328       llvmpipe->dirty |= LP_NEW_TASK_IMAGES;
4329       break;
4330    case PIPE_SHADER_MESH:
4331       llvmpipe->dirty |= LP_NEW_MESH_IMAGES;
4332       break;
4333    default:
4334       unreachable("Illegal shader type");
4335       break;
4336    }
4337 
4338    if (unbind_num_trailing_slots) {
4339       llvmpipe_set_shader_images(pipe, shader, start_slot + count,
4340                                  unbind_num_trailing_slots, 0, NULL);
4341    }
4342 }
4343 
4344 
4345 /**
4346  * Return the blend factor equivalent to a destination alpha of one.
4347  */
4348 static inline enum pipe_blendfactor
4349 force_dst_alpha_one(enum pipe_blendfactor factor, bool clamped_zero)
4350 {
4351    switch (factor) {
4352    case PIPE_BLENDFACTOR_DST_ALPHA:
4353       return PIPE_BLENDFACTOR_ONE;
4354    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
4355       return PIPE_BLENDFACTOR_ZERO;
4356    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
4357       if (clamped_zero)
4358          return PIPE_BLENDFACTOR_ZERO;
4359       else
4360          return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE;
4361    default:
4362       return factor;
4363    }
4364 }
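
/*
 * Worked example (illustrative): when rendering to
 * PIPE_FORMAT_B8G8R8X8_UNORM, where the swizzled tile reads alpha back as
 * 1.0, classic blend factors such as
 *
 *    PIPE_BLENDFACTOR_DST_ALPHA      -> PIPE_BLENDFACTOR_ONE
 *    PIPE_BLENDFACTOR_INV_DST_ALPHA  -> PIPE_BLENDFACTOR_ZERO
 *
 * collapse per the switch above, so the generated blend code never needs
 * to source the non-existent destination alpha channel.
 */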
4365 
4366 
4367 /**
4368  * We need to generate several variants of the fragment pipeline to match
4369  * all the combinations of the contributing state atoms.
4370  *
4371  * TODO: there is actually no reason to tie this to context state -- the
4372  * generated code could be cached globally in the screen.
4373  */
4374 static struct lp_fragment_shader_variant_key *
4375 make_variant_key(struct llvmpipe_context *lp,
4376                  struct lp_fragment_shader *shader,
4377                  char *store)
4378 {
4379    struct lp_fragment_shader_variant_key *key =
4380       (struct lp_fragment_shader_variant_key *)store;
4381    struct nir_shader *nir = shader->base.ir.nir;
4382 
4383    memset(key, 0, sizeof(*key));
4384 
4385    if (lp->framebuffer.zsbuf) {
4386       const enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
4387       const struct util_format_description *zsbuf_desc =
4388          util_format_description(zsbuf_format);
4389 
4390       if (lp->depth_stencil->depth_enabled &&
4391           util_format_has_depth(zsbuf_desc)) {
4392          key->zsbuf_format = zsbuf_format;
4393          key->depth.enabled = lp->depth_stencil->depth_enabled;
4394          key->depth.writemask = lp->depth_stencil->depth_writemask;
4395          key->depth.func = lp->depth_stencil->depth_func;
4396       }
4397       if (lp->depth_stencil->stencil[0].enabled &&
4398           util_format_has_stencil(zsbuf_desc)) {
4399          key->zsbuf_format = zsbuf_format;
4400          memcpy(&key->stencil, &lp->depth_stencil->stencil,
4401                 sizeof key->stencil);
4402       }
4403       if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
4404          key->resource_1d = true;
4405       }
4406       key->zsbuf_nr_samples =
4407          util_res_sample_count(lp->framebuffer.zsbuf->texture);
4408 
4409       /*
4410        * Restrict depth values if the API requires clamped depth
4411        * (GL, or VK with the extension) and the Z buffer is not float.
4412        */
4413       key->restrict_depth_values =
4414          !(lp->rasterizer->unclamped_fragment_depth_values &&
4415            util_format_get_depth_only(zsbuf_format) == PIPE_FORMAT_Z32_FLOAT);
4416    }
4417 
4418    /*
4419     * Propagate the depth clamp setting from the rasterizer state.
4420     */
4421    key->depth_clamp = lp->rasterizer->depth_clamp;
4422 
4423    /* alpha test only applies if render buffer 0 is non-integer
4424     * (or does not exist)
4425     */
4426    if (!lp->framebuffer.nr_cbufs ||
4427        !lp->framebuffer.cbufs[0] ||
4428        !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) {
4429       key->alpha.enabled = lp->depth_stencil->alpha_enabled;
4430    }
4431    if (key->alpha.enabled) {
4432       key->alpha.func = lp->depth_stencil->alpha_func;
4433       /* alpha.ref_value is passed in jit_context */
4434    }
4435 
4436    key->flatshade = lp->rasterizer->flatshade;
4437    key->multisample = lp->rasterizer->multisample;
4438    key->no_ms_sample_mask_out = lp->rasterizer->no_ms_sample_mask_out;
4439    if (lp->active_occlusion_queries && !lp->queries_disabled) {
4440       key->occlusion_count = true;
4441    }
4442 
4443    memcpy(&key->blend, lp->blend, sizeof key->blend);
4444 
4445    key->coverage_samples = 1;
4446    key->min_samples = 1;
4447    if (key->multisample) {
4448       key->coverage_samples =
4449          util_framebuffer_get_num_samples(&lp->framebuffer);
4450       /* Per EXT_shader_framebuffer_fetch spec:
4451        *
4452        *   "1. How is framebuffer data treated during multisample rendering?
4453        *
4454        *    RESOLVED: Reading the value of gl_LastFragData produces a
4455        *    different result for each sample. This implies that all or part
4456        *    of the shader be run once for each sample, but has no additional
4457        *    implications on fragment shader input variables which may still
4458        *    be interpolated per pixel by the implementation."
4459        *
4460        * ARM_shader_framebuffer_fetch_depth_stencil spec further says:
4461        *
4462        *   "(1) When multisampling is enabled, does the shader run per sample?
4463        *
4464        *    RESOLVED.
4465        *
4466        *    This behavior is inherited from either
4467        *    EXT_shader_framebuffer_fetch or ARM_shader_framebuffer_fetch as
4468        *    described in the interactions section.  If neither extension is
4469        *    supported, the shader runs once per fragment."
4470        *
4471        * Therefore we should always enable per-sample shading when FB fetch is
4472        * used.
4473        */
4474       if (lp->min_samples > 1 || nir->info.fs.uses_fbfetch_output)
4475          key->min_samples = key->coverage_samples;
4476    }
4477    key->nr_cbufs = lp->framebuffer.nr_cbufs;
4478 
4479    if (!key->blend.independent_blend_enable) {
4480       // we always need independent blend; otherwise the fixups below won't work
4481       for (unsigned i = 1; i < key->nr_cbufs; i++) {
4482          memcpy(&key->blend.rt[i], &key->blend.rt[0],
4483                 sizeof(key->blend.rt[0]));
4484       }
4485       key->blend.independent_blend_enable = 1;
4486    }
4487 
4488    for (unsigned i = 0; i < lp->framebuffer.nr_cbufs; i++) {
4489       struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];
4490 
4491       if (lp->framebuffer.cbufs[i]) {
4492          const enum pipe_format format = lp->framebuffer.cbufs[i]->format;
4493 
4494          key->cbuf_format[i] = format;
4495          key->cbuf_nr_samples[i] =
4496             util_res_sample_count(lp->framebuffer.cbufs[i]->texture);
4497 
4498          /*
4499           * Figure out if this is a 1d resource. Note that OpenGL allows crazy
4500           * mixing of 2d textures with height 1 and 1d textures, so make sure
4501           * we pick 1d if any cbuf or zsbuf is 1d.
4502           */
4503          if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) {
4504             key->resource_1d = true;
4505          }
4506 
4507          const struct util_format_description *format_desc =
4508             util_format_description(format);
4509          assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
4510                 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
4511 
4512          /*
4513           * Mask out color channels not present in the color buffer.
4514           */
4515          blend_rt->colormask &= util_format_colormask(format_desc);
4516 
4517          /*
4518           * Disable blend for integer formats.
4519           */
4520          if (util_format_is_pure_integer(format)) {
4521             blend_rt->blend_enable = 0;
4522          }
4523 
4524          /*
4525           * Our swizzled render tiles always have an alpha channel, but the
4526           * linear render target format often does not, so force here the dst
4527           * alpha to be one.
4528           *
4529           * This is not a mere optimization. Wrong results will be produced if
4530           * the dst alpha is used, the dst format does not have alpha, and the
4531           * previous rendering was not flushed from the swizzled to linear
4532           * buffer. For example, NonPowTwo DCT.
4533           *
4534           * TODO: This should be generalized to all channels for better
4535           * performance, but only alpha causes correctness issues.
4536           *
4537           * Also, force rgb/alpha func/factors match, to make AoS blending
4538           * easier.
4539           */
4540          if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
4541              format_desc->swizzle[3] == format_desc->swizzle[0]) {
4542             // Doesn't cover mixed snorm/unorm but can't render to them anyway
4543             bool clamped_zero = !util_format_is_float(format) &&
4544                                 !util_format_is_snorm(format);
4545             blend_rt->rgb_src_factor =
4546                force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero);
4547             blend_rt->rgb_dst_factor =
4548                force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero);
4549             blend_rt->alpha_func       = blend_rt->rgb_func;
4550             blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
4551             blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
4552          }
4553       } else {
4554          /* no color buffer for this fragment output */
4555          key->cbuf_format[i] = PIPE_FORMAT_NONE;
4556          key->cbuf_nr_samples[i] = 0;
4557          blend_rt->colormask = 0x0;
4558          blend_rt->blend_enable = 0;
4559       }
4560    }
4561 
4562    /* This value will be the same for all the variants of a given shader:
4563     */
4564    key->nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
4565    key->nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);
4566 
4567    struct lp_sampler_static_state *fs_sampler =
4568       lp_fs_variant_key_samplers(key);
4569 
4570    memset(fs_sampler, 0,
4571           MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *fs_sampler);
4572 
4573    for (unsigned i = 0; i < key->nr_samplers; ++i) {
4574       if (BITSET_TEST(nir->info.samplers_used, i)) {
4575          lp_sampler_static_sampler_state(&fs_sampler[i].sampler_state,
4576                                          lp->samplers[PIPE_SHADER_FRAGMENT][i]);
4577       }
4578    }
4579 
4580    /*
4581     * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes
4582     * are dx10-style? Can't really have mixed opcodes, at least not
4583     * if we want to skip the holes here (without rescanning tgsi).
4584     */
4585    if (key->nr_sampler_views) {
4586       for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
4587          /*
4588           * Note sview may exceed what's representable by file_mask.
4589           * This will still work; the only downside is that views which are
4590           * not actually used may be included in the shader key.
4591           */
4592          if (BITSET_TEST(nir->info.textures_used, i)) {
4593             lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
4594                                   lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
4595          }
4596       }
4597    } else {
4598       key->nr_sampler_views = key->nr_samplers;
4599       for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
4600          if (BITSET_TEST(nir->info.samplers_used, i)) {
4601             lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
4602                                  lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
4603          }
4604       }
4605    }
4606 
4607    struct lp_image_static_state *lp_image = lp_fs_variant_key_images(key);
4608    key->nr_images = BITSET_LAST_BIT(nir->info.images_used);
4609    if (key->nr_images)
4610       memset(lp_image, 0,
4611              key->nr_images * sizeof *lp_image);
4612    for (unsigned i = 0; i < key->nr_images; ++i) {
4613       if (BITSET_TEST(nir->info.images_used, i)) {
4614          lp_sampler_static_texture_state_image(&lp_image[i].image_state,
4615                                       &lp->images[PIPE_SHADER_FRAGMENT][i]);
4616       }
4617    }
4618 
4619    if (shader->kind == LP_FS_KIND_AERO_MINIFICATION) {
4620       struct lp_sampler_static_state *samp0 =
4621          lp_fs_variant_key_sampler_idx(key, 0);
4622       assert(samp0);
4623       samp0->sampler_state.min_img_filter = PIPE_TEX_FILTER_NEAREST;
4624       samp0->sampler_state.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
4625    }
4626 
4627    return key;
4628 }
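
/*
 * Layout sketch (illustrative): the variant key is a variable-length blob,
 *
 *    [ lp_fragment_shader_variant_key | sampler[0..N-1] | image[0..M-1] ]
 *
 * with N = MAX2(nr_samplers, nr_sampler_views) and M = nr_images, which is
 * why callers reserve LP_FS_MAX_VARIANT_KEY_SIZE bytes of storage and
 * compare shader->variant_key_size bytes rather than sizeof(*key).
 */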
4629 
4630 
4631 /**
4632  * Update fragment shader state.  This is called just prior to drawing
4633  * something when some fragment-related state has changed.
4634  */
4635 void
4636 llvmpipe_update_fs(struct llvmpipe_context *lp)
4637 {
4638    struct lp_fragment_shader *shader = lp->fs;
4639 
4640    char store[LP_FS_MAX_VARIANT_KEY_SIZE];
4641    const struct lp_fragment_shader_variant_key *key =
4642       make_variant_key(lp, shader, store);
4643 
4644    struct lp_fragment_shader_variant *variant = NULL;
4645    struct lp_fs_variant_list_item *li;
4646    /* Search the variants for one which matches the key */
4647    LIST_FOR_EACH_ENTRY(li, &shader->variants.list, list) {
4648       if (memcmp(&li->base->key, key, shader->variant_key_size) == 0) {
4649          variant = li->base;
4650          break;
4651       }
4652    }
4653 
4654    if (variant) {
4655       /* Move this variant to the head of the list to implement LRU
4656        * deletion of shaders when we have too many.
4657        */
4658       list_move_to(&variant->list_item_global.list, &lp->fs_variants_list.list);
4659    } else {
4660       /* variant not found, create it now */
4661 
4662       if (LP_DEBUG & DEBUG_FS) {
4663          debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
4664                       lp->nr_fs_variants,
4665                       lp->nr_fs_instrs,
4666                       lp->nr_fs_variants ? lp->nr_fs_instrs / lp->nr_fs_variants : 0);
4667       }
4668 
4669       /* First, check if we've exceeded the max number of shader variants.
4670        * If so, free 6.25% of them (the least recently used ones).
4671        */
4672       const unsigned variants_to_cull =
4673          lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS
4674          ? LP_MAX_SHADER_VARIANTS / 16 : 0;
4675 
4676       if (variants_to_cull ||
4677           lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
4678          if (gallivm_debug & GALLIVM_DEBUG_PERF) {
4679             debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
4680                          "\t%u instrs,\t%u instrs/variant\n",
4681                          shader->variants_cached,
4682                          lp->nr_fs_variants, lp->nr_fs_instrs,
4683                          lp->nr_fs_instrs / lp->nr_fs_variants);
4684          }
4685 
4686          /*
4687           * We need to re-check lp->nr_fs_variants because an arbitrarily
4688           * large number of shader variants (potentially all of them) could
4689           * be pending for destruction on flush.
4690           */
4691 
4692          for (unsigned i = 0;
4693               i < variants_to_cull ||
4694                  lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS;
4695               i++) {
4696             struct lp_fs_variant_list_item *item;
4697             if (list_is_empty(&lp->fs_variants_list.list)) {
4698                break;
4699             }
4700             item = list_last_entry(&lp->fs_variants_list.list,
4701                                    struct lp_fs_variant_list_item, list);
4702             assert(item);
4703             assert(item->base);
4704             llvmpipe_remove_shader_variant(lp, item->base);
4705             struct lp_fragment_shader_variant *variant = item->base;
4706             lp_fs_variant_reference(lp, &variant, NULL);
4707          }
4708       }
4709 
4710       /*
4711        * Generate the new variant.
4712        */
4713       int64_t t0 = os_time_get();
4714       variant = generate_variant(lp, shader, key);
4715       int64_t t1 = os_time_get();
4716       int64_t dt = t1 - t0;
4717       LP_COUNT_ADD(llvm_compile_time, dt);
4718       LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */
4719 
4720       /* Put the new variant into the list */
4721       if (variant) {
4722          list_add(&variant->list_item_local.list, &shader->variants.list);
4723          list_add(&variant->list_item_global.list, &lp->fs_variants_list.list);
4724          lp->nr_fs_variants++;
4725          lp->nr_fs_instrs += variant->nr_instrs;
4726          shader->variants_cached++;
4727       }
4728    }
4729 
4730    /* Bind this variant */
4731    lp_setup_set_fs_variant(lp->setup, variant);
4732 }
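
/*
 * Note (illustrative): the variant lookup above is a linear list walk with
 * memcmp() over shader->variant_key_size bytes, so make_variant_key() must
 * leave no uninitialized bytes anywhere in the key -- the memset() of the
 * fixed part plus the explicit memsets of the trailing sampler/image
 * arrays are what make the byte-wise comparison reliable.
 */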
4733 
4734 
4735 void
4736 llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
4737 {
4738    llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
4739    llvmpipe->pipe.bind_fs_state   = llvmpipe_bind_fs_state;
4740    llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
4741    llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
4742    llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers;
4743    llvmpipe->pipe.set_shader_images = llvmpipe_set_shader_images;
4744 }
4745