1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * Copyright 2007 VMware, Inc.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 /**
30 * @file
31 * Code generate the whole fragment pipeline.
32 *
33 * The fragment pipeline consists of the following stages:
34 * - early depth test
35 * - fragment shader
36 * - alpha test
37 * - depth/stencil test
38 * - blending
39 *
40 * This file has only the glue to assemble the fragment pipeline. The actual
41 * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
* lp_bld_*.[ch] files, and in a completely generic and reusable way. Here we
43 * muster the LLVM JIT execution engine to create a function that follows an
44 * established binary interface and that can be called from C directly.
45 *
* A big source of complexity here is that we often want to run different
* stages with different data types and precisions. For example, the
* fragment shader typically needs to be done in floats, but the
* depth/stencil test and blending are better done in the type that most
* closely matches the depth/stencil and color buffer respectively.
51 *
52 * Since the width of a SIMD vector register stays the same regardless of the
* element type, different types imply a different number of elements, so we must
54 * code generate more instances of the stages with larger types to be able to
55 * feed/consume the stages with smaller types.
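*
* For example, with 256-bit SIMD the fragment shader operates on 8 x f32
* values, whereas an 8-bit unorm color buffer packs 32 x u8 into the same
* register width, so several shader output vectors must be converted and
* consumed for each blend iteration.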
56 *
57 * @author Jose Fonseca <[email protected]>
58 */
59
60 #include <limits.h>
61 #include "pipe/p_defines.h"
62 #include "util/u_inlines.h"
63 #include "util/u_memory.h"
64 #include "util/u_pointer.h"
65 #include "util/format/u_format.h"
66 #include "util/u_dump.h"
67 #include "util/u_string.h"
68 #include "util/u_dual_blend.h"
69 #include "util/u_upload_mgr.h"
70 #include "util/os_time.h"
71 #include "pipe/p_shader_tokens.h"
72 #include "draw/draw_context.h"
73 #include "nir/tgsi_to_nir.h"
74 #include "gallivm/lp_bld_type.h"
75 #include "gallivm/lp_bld_const.h"
76 #include "gallivm/lp_bld_conv.h"
77 #include "gallivm/lp_bld_init.h"
78 #include "gallivm/lp_bld_intr.h"
79 #include "gallivm/lp_bld_logic.h"
80 #include "gallivm/lp_bld_tgsi.h"
81 #include "gallivm/lp_bld_nir.h"
82 #include "gallivm/lp_bld_swizzle.h"
83 #include "gallivm/lp_bld_flow.h"
84 #include "gallivm/lp_bld_debug.h"
85 #include "gallivm/lp_bld_arit.h"
86 #include "gallivm/lp_bld_bitarit.h"
87 #include "gallivm/lp_bld_pack.h"
88 #include "gallivm/lp_bld_format.h"
89 #include "gallivm/lp_bld_quad.h"
90 #include "gallivm/lp_bld_gather.h"
91 #include "gallivm/lp_bld_jit_sample.h"
92
93 #include "lp_bld_alpha.h"
94 #include "lp_bld_blend.h"
95 #include "lp_bld_depth.h"
96 #include "lp_bld_interp.h"
97 #include "lp_context.h"
98 #include "lp_debug.h"
99 #include "lp_perf.h"
100 #include "lp_setup.h"
101 #include "lp_state.h"
102 #include "lp_tex_sample.h"
103 #include "lp_flush.h"
104 #include "lp_state_fs.h"
105 #include "lp_rast.h"
106 #include "nir/nir_to_tgsi_info.h"
107
108 #include "lp_screen.h"
109 #include "compiler/nir/nir_serialize.h"
110 #include "util/mesa-sha1.h"
111
112
113 /** Fragment shader number (for debugging) */
114 static unsigned fs_no = 0;
115
116
117 static void
118 load_unswizzled_block(struct gallivm_state *gallivm,
119 LLVMTypeRef base_type,
120 LLVMValueRef base_ptr,
121 LLVMValueRef stride,
122 unsigned block_width,
123 unsigned block_height,
124 LLVMValueRef* dst,
125 struct lp_type dst_type,
126 unsigned dst_count,
127 unsigned dst_alignment);
128 /**
129 * Checks if a format description is an arithmetic format
130 *
131 * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
132 */
133 static inline bool
is_arithmetic_format(const struct util_format_description *format_desc)
135 {
136 bool arith = false;
137
138 for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
139 arith |= format_desc->channel[i].size != format_desc->channel[0].size;
140 arith |= (format_desc->channel[i].size % 8) != 0;
141 }
142
143 return arith;
144 }
145
146
147 /**
148 * Checks if this format requires special handling due to required expansion
149 * to floats for blending, and furthermore has "natural" packed AoS ->
150 * unpacked SoA conversion.
151 */
152 static inline bool
format_expands_to_float_soa(const struct util_format_description *format_desc)
154 {
155 if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
156 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
157 return true;
158 }
159 return false;
160 }
161
162
163 /**
164 * Retrieves the type representing the memory layout for a format
165 *
166 * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
167 */
168 static inline void
lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
170 struct lp_type* type)
171 {
172 if (format_expands_to_float_soa(format_desc)) {
173 /* just make this a uint with width of block */
174 type->floating = false;
175 type->fixed = false;
176 type->sign = false;
177 type->norm = false;
178 type->width = format_desc->block.bits;
179 type->length = 1;
180 return;
181 }
182
183 int chan = util_format_get_first_non_void_channel(format_desc->format);
184
185 memset(type, 0, sizeof(struct lp_type));
186 type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
187 type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
188 type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
189 type->norm = format_desc->channel[chan].normalized;
190
191 if (is_arithmetic_format(format_desc)) {
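/* Sum all channel bits into a single element, e.g. R5G6B5: 5 + 6 + 5 = one 16-bit element. */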
192 type->width = 0;
193 type->length = 1;
194
195 for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
196 type->width += format_desc->channel[i].size;
197 }
198 } else {
199 type->width = format_desc->channel[chan].size;
200 type->length = format_desc->nr_channels;
201 }
202 }
203
204
205 /**
* Expand the relevant bits of mask_input to an n*4-dword mask for the
* n*4 pixels in n 2x2 quads. This will set the n*4 elements of the
* quad mask vector to 0 or ~0.
* Grouping is 01, 23 for 2-quad mode, hence only 0 and 2 are valid
* quad arguments with fs length 8.
211 *
212 * \param first_quad which quad(s) of the quad group to test, in [0,3]
213 * \param mask_input bitwise mask for the whole 4x4 stamp
214 */
215 static LLVMValueRef
generate_quad_mask(struct gallivm_state *gallivm,
217 struct lp_type fs_type,
218 unsigned first_quad,
219 unsigned sample,
220 LLVMValueRef mask_input) /* int64 */
221 {
222 LLVMBuilderRef builder = gallivm->builder;
223 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
224 LLVMValueRef bits[16];
225 LLVMValueRef mask, bits_vec;
226
227 /*
228 * XXX: We'll need a different path for 16 x u8
229 */
230 assert(fs_type.width == 32);
231 assert(fs_type.length <= ARRAY_SIZE(bits));
232 struct lp_type mask_type = lp_int_type(fs_type);
233
234 /*
235 * mask_input >>= (quad * 4)
236 */
237 int shift;
238 switch (first_quad) {
239 case 0:
240 shift = 0;
241 break;
242 case 1:
243 assert(fs_type.length == 4);
244 shift = 2;
245 break;
246 case 2:
247 shift = 8;
248 break;
249 case 3:
250 assert(fs_type.length == 4);
251 shift = 10;
252 break;
253 default:
254 assert(0);
255 shift = 0;
256 }
257
258 mask_input = LLVMBuildLShr(builder, mask_input,
259 lp_build_const_int64(gallivm, 16 * sample), "");
260 mask_input = LLVMBuildTrunc(builder, mask_input, i32t, "");
261 mask_input = LLVMBuildAnd(builder, mask_input,
262 lp_build_const_int32(gallivm, 0xffff), "");
263 mask_input = LLVMBuildLShr(builder, mask_input,
264 LLVMConstInt(i32t, shift, 0), "");
265
266 /*
267 * mask = { mask_input & (1 << i), for i in [0,3] }
268 */
269 mask = lp_build_broadcast(gallivm,
270 lp_build_vec_type(gallivm, mask_type),
271 mask_input);
272
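/*
 * The 16-bit stamp mask is row-major over the 4x4 stamp, so a 2x2 quad
 * occupies bits j, j+1, j+4 and j+5; j below selects the base bit of
 * quad i after the per-quad shift applied above.
 */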
273 for (int i = 0; i < fs_type.length / 4; i++) {
274 unsigned j = 2 * (i % 2) + (i / 2) * 8;
275 bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0);
276 bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0);
277 bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
278 bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
279 }
280 bits_vec = LLVMConstVector(bits, fs_type.length);
281 mask = LLVMBuildAnd(builder, mask, bits_vec, "");
282
283 /*
284 * mask = mask == bits ? ~0 : 0
285 */
286 mask = lp_build_compare(gallivm,
287 mask_type, PIPE_FUNC_EQUAL,
288 mask, bits_vec);
289
290 return mask;
291 }
292
293
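/*
 * Flags describing when the depth/stencil test runs and when the resulting
 * values are written back, relative to fragment shader execution.
 */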
294 #define EARLY_DEPTH_TEST 0x1
295 #define LATE_DEPTH_TEST 0x2
296 #define EARLY_DEPTH_WRITE 0x4
297 #define LATE_DEPTH_WRITE 0x8
298 #define EARLY_DEPTH_TEST_INFERRED 0x10 //only with EARLY_DEPTH_TEST
299
300 static unsigned
get_cbuf_location(nir_variable *var, unsigned slot)
302 {
303 return (var->data.location - FRAG_RESULT_DATA0) + var->data.index + slot;
304 }
305
306 static int
find_output_by_frag_result(struct nir_shader *shader,
308 gl_frag_result frag_result)
309 {
310 nir_foreach_shader_out_variable(var, shader) {
311 int slots = nir_variable_count_slots(var, var->type);
312 for (unsigned s = 0; s < slots; s++) {
313 if (var->data.location + var->data.index + s == frag_result)
314 return var->data.driver_location + s;
315 }
316 }
317
318 return -1;
319 }
320
321 /**
322 * Fetch the specified lp_jit_viewport structure for a given viewport_index.
323 */
324 static LLVMValueRef
lp_llvm_viewport(LLVMTypeRef context_type,
326 LLVMValueRef context_ptr,
327 struct gallivm_state *gallivm,
328 LLVMValueRef viewport_index)
329 {
330 LLVMBuilderRef builder = gallivm->builder;
331 LLVMValueRef ptr;
332 LLVMValueRef res;
333 struct lp_type viewport_type =
334 lp_type_float_vec(32, 32 * LP_JIT_VIEWPORT_NUM_FIELDS);
335 LLVMTypeRef vtype = lp_build_vec_type(gallivm, viewport_type);
336
337 ptr = lp_jit_context_viewports(gallivm, context_type, context_ptr);
338 ptr = LLVMBuildPointerCast(builder, ptr,
339 LLVMPointerType(vtype, 0), "");
340
341 res = lp_build_pointer_get2(builder, vtype, ptr, viewport_index);
342
343 return res;
344 }
345
346
347 static LLVMValueRef
lp_build_depth_clamp(struct gallivm_state *gallivm,
349 LLVMBuilderRef builder,
350 bool depth_clamp,
351 bool restrict_depth,
352 struct lp_type type,
353 LLVMTypeRef context_type,
354 LLVMValueRef context_ptr,
355 LLVMTypeRef thread_data_type,
356 LLVMValueRef thread_data_ptr,
357 LLVMValueRef z)
358 {
359 LLVMValueRef viewport, min_depth, max_depth;
360 LLVMValueRef viewport_index;
361 struct lp_build_context f32_bld;
362
363 assert(type.floating);
364 lp_build_context_init(&f32_bld, gallivm, type);
365
366 if (restrict_depth)
367 z = lp_build_clamp(&f32_bld, z, f32_bld.zero, f32_bld.one);
368
369 if (!depth_clamp)
370 return z;
371
372 /*
373 * Assumes clamping of the viewport index will occur in setup/gs. Value
374 * is passed through the rasterization stage via lp_rast_shader_inputs.
375 *
376 * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
377 * semantics.
378 */
379 viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
380 thread_data_type,
381 thread_data_ptr);
382
383 /*
384 * Load the min and max depth from the lp_jit_context.viewports
385 * array of lp_jit_viewport structures.
386 */
387 viewport = lp_llvm_viewport(context_type, context_ptr, gallivm, viewport_index);
388
389 /* viewports[viewport_index].min_depth */
390 min_depth = LLVMBuildExtractElement(builder, viewport,
391 lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), "");
392 min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);
393
394 /* viewports[viewport_index].max_depth */
395 max_depth = LLVMBuildExtractElement(builder, viewport,
396 lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), "");
397 max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);
398
399 /*
400 * Clamp to the min and max depth values for the given viewport.
401 */
402 return lp_build_clamp(&f32_bld, z, min_depth, max_depth);
403 }
404
405
406 static void
lp_build_sample_alpha_to_coverage(struct gallivm_state *gallivm,
408 struct lp_type type,
409 unsigned coverage_samples,
410 LLVMValueRef num_loop,
411 LLVMValueRef loop_counter,
412 LLVMTypeRef coverage_mask_type,
413 LLVMValueRef coverage_mask_store,
414 LLVMValueRef alpha)
415 {
416 struct lp_build_context bld;
417 LLVMBuilderRef builder = gallivm->builder;
418 float step = 1.0 / coverage_samples;
419
420 lp_build_context_init(&bld, gallivm, type);
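/*
 * Keep sample s covered only if alpha > s / coverage_samples; e.g. with 4x
 * MSAA an alpha of 0.6 passes the 0.0, 0.25 and 0.5 thresholds and thus
 * covers three of the four samples.
 */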
421 for (unsigned s = 0; s < coverage_samples; s++) {
422 LLVMValueRef alpha_ref_value = lp_build_const_vec(gallivm, type, step * s);
423 LLVMValueRef test = lp_build_cmp(&bld, PIPE_FUNC_GREATER, alpha, alpha_ref_value);
424
425 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, lp_build_const_int32(gallivm, s), num_loop, "");
426 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_counter, "");
427 LLVMValueRef s_mask_ptr = LLVMBuildGEP2(builder, coverage_mask_type,
428 coverage_mask_store, &s_mask_idx, 1, "");
429 LLVMValueRef s_mask = LLVMBuildLoad2(builder, coverage_mask_type, s_mask_ptr, "");
430 s_mask = LLVMBuildAnd(builder, s_mask, test, "");
431 LLVMBuildStore(builder, s_mask, s_mask_ptr);
432 }
}
434
435
436 struct lp_build_fs_llvm_iface {
437 struct lp_build_fs_iface base;
438 struct lp_build_interp_soa_context *interp;
439 struct lp_build_for_loop_state *loop_state;
440 LLVMTypeRef mask_type;
441 LLVMValueRef mask_store;
442 LLVMValueRef sample_id;
443 LLVMValueRef color_ptr_ptr;
444 LLVMValueRef color_stride_ptr;
445 LLVMValueRef color_sample_stride_ptr;
446 LLVMValueRef zs_base_ptr;
447 LLVMValueRef zs_stride;
448 LLVMValueRef zs_sample_stride;
449 const struct lp_fragment_shader_variant_key *key;
450 };
451
452
453 static LLVMValueRef
fs_interp(const struct lp_build_fs_iface *iface,
455 struct lp_build_context *bld,
456 unsigned attrib, unsigned chan,
457 bool centroid, bool sample,
458 LLVMValueRef attrib_indir,
459 LLVMValueRef offsets[2])
460 {
461 struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
462 struct lp_build_interp_soa_context *interp = fs_iface->interp;
463 unsigned loc = TGSI_INTERPOLATE_LOC_CENTER;
464 if (centroid)
465 loc = TGSI_INTERPOLATE_LOC_CENTROID;
466 if (sample)
467 loc = TGSI_INTERPOLATE_LOC_SAMPLE;
468
469 return lp_build_interp_soa(interp, bld->gallivm, fs_iface->loop_state->counter,
470 fs_iface->mask_type, fs_iface->mask_store,
471 attrib, chan, loc, attrib_indir, offsets);
472 }
473
474
475 /**
476 * Convert depth-stencil format to a single component one, returning
477 * PIPE_FORMAT_NONE if it doesn't contain the required component.
478 */
479 static enum pipe_format
select_zs_component_format(enum pipe_format format,
481 bool fetch_stencil)
482 {
483 const struct util_format_description* desc = util_format_description(format);
484 if (fetch_stencil && !util_format_has_stencil(desc))
485 return PIPE_FORMAT_NONE;
486 if (!fetch_stencil && !util_format_has_depth(desc))
487 return PIPE_FORMAT_NONE;
488
489 switch (format) {
490 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
491 return fetch_stencil ? PIPE_FORMAT_X24S8_UINT : PIPE_FORMAT_Z24X8_UNORM;
492 case PIPE_FORMAT_S8_UINT_Z24_UNORM:
493 return fetch_stencil ? PIPE_FORMAT_S8X24_UINT : PIPE_FORMAT_X8Z24_UNORM;
494 case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
495 return fetch_stencil ? PIPE_FORMAT_X32_S8X24_UINT : format;
496 default:
497 return format;
498 }
499 }
500
501 static void
fs_fb_fetch(const struct lp_build_fs_iface *iface,
503 struct lp_build_context *bld,
504 int location,
505 LLVMValueRef result[4])
506 {
507 struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
508 struct gallivm_state *gallivm = bld->gallivm;
509 LLVMBuilderRef builder = gallivm->builder;
510 LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
511 LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
512 LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
513 const struct lp_fragment_shader_variant_key *key = fs_iface->key;
514
515 LLVMValueRef buf_ptr;
516 LLVMValueRef stride;
517 enum pipe_format buf_format;
518
519 const bool fetch_stencil = location == FRAG_RESULT_STENCIL;
520 const bool fetch_zs = fetch_stencil || location == FRAG_RESULT_DEPTH;
521 if (fetch_zs) {
522 buf_ptr = fs_iface->zs_base_ptr;
523 stride = fs_iface->zs_stride;
524 buf_format = select_zs_component_format(key->zsbuf_format, fetch_stencil);
525 } else {
526 assert(location >= FRAG_RESULT_DATA0 && location <= FRAG_RESULT_DATA7);
527 const int cbuf = location - FRAG_RESULT_DATA0;
528 LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
529
530 buf_ptr = LLVMBuildLoad2(builder, int8p_type,
531 LLVMBuildGEP2(builder, int8p_type,
532 fs_iface->color_ptr_ptr, &index, 1, ""), "");
533 stride = LLVMBuildLoad2(builder, int32_type,
534 LLVMBuildGEP2(builder, int32_type,
535 fs_iface->color_stride_ptr, &index, 1, ""), "");
536 buf_format = key->cbuf_format[cbuf];
537 }
538
539 const struct util_format_description* out_format_desc = util_format_description(buf_format);
540 if (out_format_desc->format == PIPE_FORMAT_NONE) {
541 result[0] = result[1] = result[2] = result[3] = bld->undef;
542 return;
543 }
544
545 unsigned block_size = bld->type.length;
546 unsigned block_height = key->resource_1d ? 1 : 2;
547 unsigned block_width = block_size / block_height;
548
549 if (key->multisample) {
550 LLVMValueRef sample_stride;
551
552 if (fetch_zs) {
553 sample_stride = fs_iface->zs_sample_stride;
554 } else {
555 LLVMValueRef index = lp_build_const_int32(gallivm, location - FRAG_RESULT_DATA0);
556 sample_stride = LLVMBuildLoad2(builder, int32_type,
557 LLVMBuildGEP2(builder,
558 int32_type,
559 fs_iface->color_sample_stride_ptr,
560 &index, 1, ""), "");
561 }
562
563 LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_stride, fs_iface->sample_id, "");
564 buf_ptr = LLVMBuildGEP2(builder, int8_type,
565 buf_ptr, &sample_offset, 1, "");
566 }
567
/* The fragment shader executes on 4x4 blocks. Depending on the vector width
 * it takes 2 or 4 iterations per block. Only move to the next row once the
 * top row has completed: 1 iteration for 8-wide, 2 iterations for 4-wide. */
571 LLVMValueRef x_offset = NULL, y_offset = NULL;
572 if (!key->resource_1d) {
573 LLVMValueRef counter = fs_iface->loop_state->counter;
574
575 if (block_size == 4) {
576 x_offset = LLVMBuildShl(builder,
577 LLVMBuildAnd(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), ""),
578 lp_build_const_int32(gallivm, 1), "");
579 counter = LLVMBuildLShr(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), "");
580 }
581 y_offset = LLVMBuildMul(builder, counter, lp_build_const_int32(gallivm, 2), "");
582 }
583
584 LLVMValueRef offsets[4 * 4];
585 for (unsigned i = 0; i < block_size; i++) {
586 unsigned x = i % block_width;
587 unsigned y = i / block_width;
588
589 if (block_size == 8) {
590 /* remap the raw slots into the fragment shader execution mode. */
591 /* this math took me way too long to work out, I'm sure it's
592 * overkill.
593 */
594 x = (i & 1) + ((i >> 2) << 1);
595 if (!key->resource_1d)
596 y = (i & 2) >> 1;
597 }
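/* For the 8-wide 2D case this maps lanes 0..7 to
 * (0,0),(1,0),(0,1),(1,1),(2,0),(3,0),(2,1),(3,1): lanes 0-3 form the
 * left 2x2 quad and lanes 4-7 the right one.
 */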
598
599 LLVMValueRef x_val;
600 if (x_offset) {
601 x_val = LLVMBuildAdd(builder, lp_build_const_int32(gallivm, x), x_offset, "");
602 x_val = LLVMBuildMul(builder, x_val, lp_build_const_int32(gallivm, out_format_desc->block.bits / 8), "");
603 } else {
604 x_val = lp_build_const_int32(gallivm, x * (out_format_desc->block.bits / 8));
605 }
606
607 LLVMValueRef y_val = lp_build_const_int32(gallivm, y);
608 if (y_offset)
609 y_val = LLVMBuildAdd(builder, y_val, y_offset, "");
610 y_val = LLVMBuildMul(builder, y_val, stride, "");
611
612 offsets[i] = LLVMBuildAdd(builder, x_val, y_val, "");
613 }
614 LLVMValueRef offset = lp_build_gather_values(gallivm, offsets, block_size);
615
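/* Pure integer color buffers (and stencil fetches) must be read back as
 * int/uint vectors; everything else keeps the fs float type.
 */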
616 struct lp_type texel_type = bld->type;
617 if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
618 out_format_desc->channel[0].pure_integer) {
619 if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
620 texel_type = lp_type_int_vec(bld->type.width, bld->type.width * bld->type.length);
621 } else if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
622 texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
623 }
624 } else if (fetch_stencil) {
625 texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
626 }
627
628 lp_build_fetch_rgba_soa(gallivm, out_format_desc, texel_type,
629 true, buf_ptr, offset,
630 NULL, NULL, NULL, result);
631 }
632
633 /**
634 * Generate the fragment shader, depth/stencil test, and alpha tests.
635 */
636 static void
generate_fs_loop(struct gallivm_state *gallivm,
638 struct lp_fragment_shader *shader,
639 const struct lp_fragment_shader_variant_key *key,
640 LLVMBuilderRef builder,
641 struct lp_type type,
642 LLVMTypeRef context_type,
643 LLVMValueRef context_ptr,
644 LLVMTypeRef resources_type,
645 LLVMValueRef resources_ptr,
646 LLVMTypeRef sample_pos_type,
647 LLVMValueRef sample_pos_array,
648 LLVMValueRef num_loop,
649 struct lp_build_interp_soa_context *interp,
650 const struct lp_build_sampler_soa *sampler,
651 const struct lp_build_image_soa *image,
652 LLVMTypeRef mask_type,
653 LLVMValueRef mask_store,
654 LLVMValueRef (*out_color)[4],
655 LLVMValueRef depth_base_ptr,
656 LLVMValueRef depth_stride,
657 LLVMValueRef depth_sample_stride,
658 LLVMValueRef color_ptr_ptr,
659 LLVMValueRef color_stride_ptr,
660 LLVMValueRef color_sample_stride_ptr,
661 LLVMValueRef facing,
662 LLVMTypeRef thread_data_type,
663 LLVMValueRef thread_data_ptr)
664 {
665 struct lp_type int_type = lp_int_type(type);
666 LLVMValueRef mask_ptr = NULL, mask_val = NULL;
667 LLVMValueRef z;
668 LLVMValueRef z_value, s_value;
669 LLVMValueRef z_fb, s_fb;
670 LLVMValueRef zs_samples = lp_build_const_int32(gallivm, key->zsbuf_nr_samples);
671 LLVMValueRef z_out = NULL, s_out = NULL;
672 struct lp_build_for_loop_state loop_state, sample_loop_state = {0};
673 struct lp_build_mask_context mask;
674 struct nir_shader *nir = shader->base.ir.nir;
675 const bool dual_source_blend = key->blend.rt[0].blend_enable &&
676 util_blend_state_is_dual(&key->blend, 0);
677 const bool post_depth_coverage = nir->info.fs.post_depth_coverage;
678
679 struct lp_bld_tgsi_system_values system_values;
680
681 memset(&system_values, 0, sizeof(system_values));
682
/* truncate then sign extend: keep only bit 0 of 'facing' and smear it to 0 / ~0 */
684 system_values.front_facing =
685 LLVMBuildTrunc(gallivm->builder, facing,
686 LLVMInt1TypeInContext(gallivm->context), "");
687 system_values.front_facing =
688 LLVMBuildSExt(gallivm->builder, system_values.front_facing,
689 LLVMInt32TypeInContext(gallivm->context), "");
690 system_values.view_index =
691 lp_jit_thread_data_raster_state_view_index(gallivm,
692 thread_data_type,
693 thread_data_ptr);
694
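/*
 * Decide when the depth/stencil test runs relative to the fragment shader:
 * early when the shader cannot change the outcome, late otherwise, with an
 * inferred-early variant that keeps the test early but defers the write
 * until alpha test/discard has settled the final mask.
 */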
695 unsigned depth_mode;
696 const struct util_format_description *zs_format_desc = NULL;
697 if (key->depth.enabled ||
698 key->stencil[0].enabled) {
699 zs_format_desc = util_format_description(key->zsbuf_format);
700
701 if (nir->info.fs.early_fragment_tests || nir->info.fs.post_depth_coverage) {
702 depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
703 } else if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) &&
704 !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) &&
705 !nir->info.fs.uses_fbfetch_output && !nir->info.writes_memory) {
706 if (key->alpha.enabled ||
707 key->blend.alpha_to_coverage ||
708 nir->info.fs.uses_discard ||
709 nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
710 /* With alpha test and kill, can do the depth test early
711 * and hopefully eliminate some quads. But need to do a
712 * special deferred depth write once the final mask value
713 * is known. This only works though if there's either no
714 * stencil test or the stencil value isn't written.
715 */
716 if (key->stencil[0].enabled && (key->stencil[0].writemask ||
717 (key->stencil[1].enabled &&
718 key->stencil[1].writemask)))
719 depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
720 else
721 depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
722 } else {
723 depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
724 }
725 } else {
726 depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
727 }
728
729 if (!(key->depth.enabled && key->depth.writemask) &&
730 !(key->stencil[0].enabled && (key->stencil[0].writemask ||
731 (key->stencil[1].enabled &&
732 key->stencil[1].writemask))))
733 depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
734 } else {
735 depth_mode = 0;
736 }
737
738 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
739 LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, int_type);
740
741 LLVMValueRef stencil_refs[2];
742 stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_type, context_ptr);
743 stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_type, context_ptr);
744 /* convert scalar stencil refs into vectors */
745 stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
746 stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);
747
748 LLVMValueRef consts_ptr = lp_jit_resources_constants(gallivm, resources_type, resources_ptr);
749
750 LLVMValueRef ssbo_ptr = lp_jit_resources_ssbos(gallivm, resources_type, resources_ptr);
751
752 LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
753 memset(outputs, 0, sizeof outputs);
754
755 /* Allocate color storage for each fragment sample */
756 LLVMValueRef color_store_size = num_loop;
757 if (key->min_samples > 1)
758 color_store_size = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, key->min_samples), "");
759
760 for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
761 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
762 out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
763 lp_build_vec_type(gallivm,
764 type),
765 color_store_size, "color");
766 }
767 }
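/* Dual-source blending consumes a second fragment shader output (SRC1) as a
 * blend factor, so reserve a second color store even though only one color
 * buffer is bound.
 */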
768 if (dual_source_blend) {
769 assert(key->nr_cbufs <= 1);
770 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
771 out_color[1][chan] = lp_build_array_alloca(gallivm,
772 lp_build_vec_type(gallivm,
773 type),
774 color_store_size, "color1");
775 }
776 }
777 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
778 z_out = lp_build_array_alloca(gallivm,
779 lp_build_vec_type(gallivm, type),
780 color_store_size, "depth");
781 }
782
783 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
784 s_out = lp_build_array_alloca(gallivm,
785 lp_build_vec_type(gallivm, type),
color_store_size, "stencil");
787 }
788
789 lp_build_for_loop_begin(&loop_state, gallivm,
790 lp_build_const_int32(gallivm, 0),
791 LLVMIntULT,
792 num_loop,
793 lp_build_const_int32(gallivm, 1));
794
795 LLVMValueRef sample_mask_in;
796 if (key->multisample) {
797 sample_mask_in = lp_build_const_int_vec(gallivm, type, 0);
798 /* create shader execution mask by combining all sample masks. */
799 for (unsigned s = 0; s < key->coverage_samples; s++) {
800 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, s), "");
801 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
802 LLVMValueRef s_mask = lp_build_pointer_get2(builder, mask_type, mask_store, s_mask_idx);
803 if (s == 0)
804 mask_val = s_mask;
805 else
806 mask_val = LLVMBuildOr(builder, s_mask, mask_val, "");
807
808 LLVMValueRef mask_in = LLVMBuildAnd(builder, s_mask, lp_build_const_int_vec(gallivm, type, (1ll << s)), "");
809 sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
810 }
811 } else {
812 sample_mask_in = lp_build_const_int_vec(gallivm, type, 1);
813 mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
814 &loop_state.counter, 1, "mask_ptr");
815 mask_val = LLVMBuildLoad2(builder, mask_type, mask_ptr, "");
816
817 LLVMValueRef mask_in = LLVMBuildAnd(builder, mask_val, lp_build_const_int_vec(gallivm, type, 1), "");
818 sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
819 }
820
821 /* 'mask' will control execution based on quad's pixel alive/killed state */
822 lp_build_mask_begin(&mask, gallivm, type, mask_val);
823
824 if (!(depth_mode & EARLY_DEPTH_TEST))
825 lp_build_mask_check(&mask);
826
827 /* Create storage for recombining sample masks after early Z pass. */
828 LLVMValueRef s_mask_or = lp_build_alloca(gallivm, int_vec_type, "cov_mask_early_depth");
829 LLVMBuildStore(builder, LLVMConstNull(int_vec_type), s_mask_or);
830
831 /* Create storage for post depth sample mask */
832 LLVMValueRef post_depth_sample_mask_in = NULL;
833 if (post_depth_coverage)
834 post_depth_sample_mask_in = lp_build_alloca(gallivm, int_vec_type, "post_depth_sample_mask_in");
835
836 LLVMValueRef s_mask = NULL, s_mask_ptr = NULL;
837 LLVMValueRef z_sample_value_store = NULL, s_sample_value_store = NULL;
838 LLVMValueRef z_fb_store = NULL, s_fb_store = NULL;
839 LLVMTypeRef z_type = NULL, z_fb_type = NULL;
840
841 /* Run early depth once per sample */
842 if (key->multisample) {
843
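/* Cache the per-sample Z/S values produced by the early test and the loaded
 * framebuffer values so the deferred depth/stencil write after the shader
 * can replay them per sample.
 */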
844 if (zs_format_desc) {
845 struct lp_type zs_type = lp_depth_type(zs_format_desc, type.length);
846 struct lp_type z_type = zs_type;
847 struct lp_type s_type = zs_type;
848 if (zs_format_desc->block.bits < type.width)
849 z_type.width = type.width;
850 if (zs_format_desc->block.bits == 8) {
851 s_type.width = type.width;
852 } else if (zs_format_desc->block.bits > 32) {
853 z_type.width = z_type.width / 2;
854 s_type.width = s_type.width / 2;
855 s_type.floating = 0;
856 }
857 z_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
858 zs_samples, "z_sample_store");
859 s_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
860 zs_samples, "s_sample_store");
861 z_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, z_type),
862 zs_samples, "z_fb_store");
863 s_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, s_type),
864 zs_samples, "s_fb_store");
865 }
866 lp_build_for_loop_begin(&sample_loop_state, gallivm,
867 lp_build_const_int32(gallivm, 0),
868 LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
869 lp_build_const_int32(gallivm, 1));
870
871 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
872 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
873 s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
874
875 s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
876 s_mask = LLVMBuildAnd(builder, s_mask, mask_val, "");
877 }
878
879
880 /* for multisample Z needs to be interpolated at sample points for testing. */
881 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter,
882 key->multisample
883 ? sample_loop_state.counter : NULL);
884 z = interp->pos[2];
885
886 LLVMValueRef depth_ptr = depth_base_ptr;
887 if (key->multisample) {
888 LLVMValueRef sample_offset =
889 LLVMBuildMul(builder, sample_loop_state.counter,
890 depth_sample_stride, "");
891 depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
892 depth_ptr, &sample_offset, 1, "");
893 }
894
895 if (depth_mode & EARLY_DEPTH_TEST) {
896 z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
897 key->restrict_depth_values, type,
898 context_type, context_ptr,
899 thread_data_type, thread_data_ptr, z);
900
901 lp_build_depth_stencil_load_swizzled(gallivm, type,
902 zs_format_desc, key->resource_1d,
903 depth_ptr, depth_stride,
904 &z_fb, &s_fb, loop_state.counter);
905 lp_build_depth_stencil_test(gallivm,
906 &key->depth,
907 key->stencil,
908 type,
909 zs_format_desc,
910 key->multisample ? NULL : &mask,
911 &s_mask,
912 stencil_refs,
913 z, z_fb, s_fb,
914 facing,
915 &z_value, &s_value,
916 !key->multisample,
917 key->restrict_depth_values);
918
919 if (depth_mode & EARLY_DEPTH_WRITE) {
920 lp_build_depth_stencil_write_swizzled(gallivm, type,
921 zs_format_desc, key->resource_1d,
922 NULL, NULL, NULL, loop_state.counter,
923 depth_ptr, depth_stride,
924 z_value, s_value);
925 }
/*
 * Note: if stencil is enabled, the mask check must happen after the ds
 * write, not after the stencil test, otherwise new stencil values may not
 * get written if all fragments got killed by the depth/stencil test.
 */
931 if (key->stencil[0].enabled && !key->multisample)
932 lp_build_mask_check(&mask);
933
934 if (key->multisample) {
935 z_fb_type = LLVMTypeOf(z_fb);
936 z_type = LLVMTypeOf(z_value);
937 lp_build_pointer_set(builder, z_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, z_value, lp_build_int_vec_type(gallivm, type), ""));
938 lp_build_pointer_set(builder, s_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, s_value, lp_build_int_vec_type(gallivm, type), ""));
939 lp_build_pointer_set(builder, z_fb_store, sample_loop_state.counter, z_fb);
940 lp_build_pointer_set(builder, s_fb_store, sample_loop_state.counter, s_fb);
941 }
942 if (key->occlusion_count && !(depth_mode & EARLY_DEPTH_TEST_INFERRED)) {
943 LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
944 lp_build_name(counter, "counter");
945 lp_build_occlusion_count(gallivm, type,
946 key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
947 }
948 }
949
950 if (key->multisample) {
951 /*
952 * Store the post-early Z coverage mask.
953 * Recombine the resulting coverage masks post early Z into the fragment
954 * shader execution mask.
955 */
956 LLVMValueRef tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
957 tmp_s_mask_or = LLVMBuildOr(builder, tmp_s_mask_or, s_mask, "");
958 LLVMBuildStore(builder, tmp_s_mask_or, s_mask_or);
959
960 if (post_depth_coverage) {
961 LLVMValueRef mask_bit_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
962 LLVMValueRef post_depth_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
963 mask_bit_idx = LLVMBuildAnd(builder, s_mask, lp_build_broadcast(gallivm, int_vec_type, mask_bit_idx), "");
964 post_depth_mask_in = LLVMBuildOr(builder, post_depth_mask_in, mask_bit_idx, "");
965 LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
966 }
967
968 LLVMBuildStore(builder, s_mask, s_mask_ptr);
969
970 lp_build_for_loop_end(&sample_loop_state);
971
/* recombine all the coverage masks into the shader exec mask. */
973 tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
974 lp_build_mask_update(&mask, tmp_s_mask_or);
975
976 if (key->min_samples == 1) {
/* for multisample, Z needs to be re-interpolated at the pixel center */
978 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, NULL);
979 z = interp->pos[2];
980 lp_build_mask_update(&mask, tmp_s_mask_or);
981 }
982 } else {
983 if (post_depth_coverage) {
984 LLVMValueRef post_depth_mask_in = LLVMBuildAnd(builder, lp_build_mask_value(&mask), lp_build_const_int_vec(gallivm, type, 1), "");
985 LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
986 }
987 }
988
989 LLVMValueRef out_sample_mask_storage = NULL;
990 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
991 out_sample_mask_storage = lp_build_alloca(gallivm, int_vec_type, "write_mask");
992 if (key->min_samples > 1)
993 LLVMBuildStore(builder, LLVMConstNull(int_vec_type), out_sample_mask_storage);
994 }
995
996 if (post_depth_coverage) {
997 system_values.sample_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
998 } else {
999 system_values.sample_mask_in = sample_mask_in;
1000 }
1001 if (key->multisample && key->min_samples > 1) {
1002 lp_build_for_loop_begin(&sample_loop_state, gallivm,
1003 lp_build_const_int32(gallivm, 0),
1004 LLVMIntULT,
1005 lp_build_const_int32(gallivm, key->min_samples),
1006 lp_build_const_int32(gallivm, 1));
1007
1008 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
1009 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
1010 s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
1011 s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
1012 lp_build_mask_force(&mask, s_mask);
1013 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, sample_loop_state.counter);
1014 system_values.sample_id = sample_loop_state.counter;
1015 system_values.sample_mask_in = LLVMBuildAnd(builder, system_values.sample_mask_in,
1016 lp_build_broadcast(gallivm, int_vec_type,
1017 LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "")), "");
1018 } else {
1019 system_values.sample_id = lp_build_const_int32(gallivm, 0);
1020
1021 }
1022 system_values.sample_pos = sample_pos_array;
1023 system_values.sample_pos_type = sample_pos_type;
1024
1025 lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter,
1026 mask_type, mask_store, sample_loop_state.counter);
1027
1028 struct lp_build_fs_llvm_iface fs_iface = {
1029 .base.interp_fn = fs_interp,
1030 .base.fb_fetch = fs_fb_fetch,
1031 .interp = interp,
1032 .loop_state = &loop_state,
1033 .sample_id = system_values.sample_id,
1034 .mask_type = mask_type,
1035 .mask_store = mask_store,
1036 .color_ptr_ptr = color_ptr_ptr,
1037 .color_stride_ptr = color_stride_ptr,
1038 .color_sample_stride_ptr = color_sample_stride_ptr,
1039 .zs_base_ptr = depth_base_ptr,
1040 .zs_stride = depth_stride,
1041 .zs_sample_stride = depth_sample_stride,
1042 .key = key,
1043 };
1044
1045 struct lp_build_tgsi_params params;
1046 memset(¶ms, 0, sizeof(params));
1047
1048 params.type = type;
1049 params.mask = &mask;
1050 params.fs_iface = &fs_iface.base;
1051 params.consts_ptr = consts_ptr;
1052 params.system_values = &system_values;
1053 params.inputs = interp->inputs;
1054 params.num_inputs = interp->num_attribs - 1;
1055 params.context_type = context_type;
1056 params.context_ptr = context_ptr;
1057 params.resources_type = resources_type;
1058 params.resources_ptr = resources_ptr;
1059 params.thread_data_type = thread_data_type;
1060 params.thread_data_ptr = thread_data_ptr;
1061 params.sampler = sampler;
1062 params.info = &shader->info.base;
1063 params.ssbo_ptr = ssbo_ptr;
1064 params.image = image;
1065 params.aniso_filter_table = lp_jit_resources_aniso_filter_table(gallivm, resources_type, resources_ptr);
1066
1067 /* Build the actual shader */
1068 lp_build_nir_soa(gallivm, nir, ¶ms, outputs);
1069
1070 /*
1071 * Must not count ps invocations if there's a null shader.
1072 * (It would be ok to count with null shader if there's d/s tests,
1073 * but only if there's d/s buffers too, which is different
1074 * to implicit rasterization disable which must not depend
1075 * on the d/s buffers.)
1076 * Could disable if there's no stats query, but maybe not worth it.
1077 */
1078 if (shader->info.base.num_instructions > 1) {
1079 LLVMValueRef invocs = lp_jit_thread_data_ps_invocations(gallivm, thread_data_type, thread_data_ptr);
1080 lp_build_occlusion_count(gallivm, type, lp_build_mask_value(&mask), invocs);
1081 }
1082
1083 /* Alpha test */
1084 if (key->alpha.enabled) {
1085 int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);
1086
1087 if (color0 != -1 && outputs[color0][3]) {
1088 const struct util_format_description *cbuf_format_desc;
1089 LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");
1090 LLVMValueRef alpha_ref_value;
1091
1092 alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_type, context_ptr);
1093 alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);
1094
1095 cbuf_format_desc = util_format_description(key->cbuf_format[0]);
1096
1097 lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
1098 &mask, alpha, alpha_ref_value,
1099 ((depth_mode & LATE_DEPTH_TEST) != 0) && !key->multisample);
1100 }
1101 }
1102
1103 /* Emulate Alpha to Coverage with Alpha test */
1104 if (key->blend.alpha_to_coverage) {
1105 int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);
1106
1107 if (color0 != -1 && outputs[color0][3]) {
1108 LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");
1109
1110 if (!key->multisample) {
1111 lp_build_alpha_to_coverage(gallivm, type,
1112 &mask, alpha,
1113 (depth_mode & LATE_DEPTH_TEST) != 0);
1114 } else {
1115 lp_build_sample_alpha_to_coverage(gallivm, type, key->coverage_samples, num_loop,
1116 loop_state.counter,
1117 mask_type, mask_store, alpha);
1118 }
1119 }
1120 }
1121
1122 if (key->blend.alpha_to_one) {
1123 nir_foreach_shader_out_variable(var, nir) {
1124 if (var->data.location < FRAG_RESULT_DATA0)
1125 continue;
1126 int slots = nir_variable_count_slots(var, var->type);
1127 for (unsigned s = 0; s < slots; s++) {
1128 unsigned cbuf = get_cbuf_location(var, s);
1129 if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend))
1130 if (outputs[cbuf][3]) {
1131 LLVMBuildStore(builder, lp_build_const_vec(gallivm, type, 1.0),
1132 outputs[cbuf][3]);
1133 }
1134 }
1135 }
1136 }
1137
1138 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
1139 LLVMValueRef output_smask = NULL;
1140 int smaski = find_output_by_frag_result(nir, FRAG_RESULT_SAMPLE_MASK);
1141
1142 struct lp_build_context smask_bld;
1143 lp_build_context_init(&smask_bld, gallivm, int_type);
1144
1145 assert(smaski >= 0);
1146 output_smask = LLVMBuildLoad2(builder, vec_type, outputs[smaski][0], "smask");
1147 output_smask = LLVMBuildBitCast(builder, output_smask, smask_bld.vec_type, "");
1148 if (!key->multisample && key->no_ms_sample_mask_out) {
1149 output_smask = lp_build_and(&smask_bld, output_smask, smask_bld.one);
1150 output_smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, output_smask, smask_bld.zero);
1151 lp_build_mask_update(&mask, output_smask);
1152 }
1153
1154 if (key->min_samples > 1) {
1155 /* only the bit corresponding to this sample is to be used. */
1156 LLVMValueRef tmp_mask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "tmp_mask");
1157 LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1158 LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, lp_build_broadcast(gallivm, int_vec_type, out_smask_idx), "");
1159 output_smask = LLVMBuildOr(builder, tmp_mask, smask_bit, "");
1160 }
1161
1162 LLVMBuildStore(builder, output_smask, out_sample_mask_storage);
1163 }
1164
1165 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1166 int pos0 = find_output_by_frag_result(nir, FRAG_RESULT_DEPTH);
1167
1168 LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[pos0][2], "");
1169 LLVMValueRef idx = loop_state.counter;
1170 if (key->min_samples > 1)
1171 idx = LLVMBuildAdd(builder, idx,
1172 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1173 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
1174 LLVMBuildStore(builder, out, ptr);
1175 }
1176
1177 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
1178 int sten_out = find_output_by_frag_result(nir, FRAG_RESULT_STENCIL);
1179
1180 LLVMValueRef out = LLVMBuildLoad2(builder, vec_type,
1181 outputs[sten_out][1], "output.s");
1182 LLVMValueRef idx = loop_state.counter;
1183 if (key->min_samples > 1)
1184 idx = LLVMBuildAdd(builder, idx,
1185 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1186 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
1187 LLVMBuildStore(builder, out, ptr);
1188 }
1189
1190 bool has_cbuf0_write = false;
1191 /* Color write - per fragment sample */
1192 nir_foreach_shader_out_variable(var, nir) {
1193 if (var->data.location < FRAG_RESULT_DATA0)
1194 continue;
1195 int slots = nir_variable_count_slots(var, var->type);
1196
1197 for (unsigned s = 0; s < slots; s++) {
1198 unsigned cbuf = get_cbuf_location(var, s);
1199 unsigned attrib = var->data.driver_location + s;
1200 if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)) {
1201 if (cbuf == 0) {
1202 /* XXX: there is an edge case with FB fetch where gl_FragColor and
1203 * gl_LastFragData[0] are used together. This creates both
1204 * FRAG_RESULT_COLOR and FRAG_RESULT_DATA* output variables. This
* loop then writes to cbuf 0 twice, overwriting the correct value
* from gl_FragColor with some garbage. This case is exercised in
* one of the deqp tests. A similar bug can happen if
1208 * gl_SecondaryFragColorEXT and gl_LastFragData[1] are mixed in
1209 * the same fashion... This workaround will break if
1210 * gl_LastFragData[0] goes in outputs list before
1211 * gl_FragColor. This doesn't seem to happen though.
1212 */
1213 if (has_cbuf0_write)
1214 continue;
1215 has_cbuf0_write = true;
1216 }
1217
1218 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
1219 if (outputs[attrib][chan]) {
1220 /* XXX: just initialize outputs to point at colors[] and
1221 * skip this.
1222 */
1223 LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[attrib][chan], "");
1224 LLVMValueRef color_ptr;
1225 LLVMValueRef color_idx = loop_state.counter;
1226 if (key->min_samples > 1)
1227 color_idx = LLVMBuildAdd(builder, color_idx,
1228 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1229 color_ptr = LLVMBuildGEP2(builder, vec_type, out_color[cbuf][chan],
1230 &color_idx, 1, "");
1231 lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
1232 LLVMBuildStore(builder, out, color_ptr);
1233 }
1234 }
1235 }
1236 }
1237 }
1238
1239 if (key->multisample && key->min_samples > 1) {
1240 LLVMBuildStore(builder, lp_build_mask_value(&mask), s_mask_ptr);
1241 lp_build_for_loop_end(&sample_loop_state);
1242 }
1243
1244 if (key->multisample) {
1245 /* execute depth test for each sample */
1246 lp_build_for_loop_begin(&sample_loop_state, gallivm,
1247 lp_build_const_int32(gallivm, 0),
1248 LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
1249 lp_build_const_int32(gallivm, 1));
1250
1251 /* load the per-sample coverage mask */
1252 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
1253 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
1254 s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
1255
1256 /* combine the execution mask post fragment shader with the coverage mask. */
1257 s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
1258 if (key->min_samples == 1)
1259 s_mask = LLVMBuildAnd(builder, s_mask, lp_build_mask_value(&mask), "");
1260
/* if the shader writes a sample mask, use it, but only if this isn't
 * genuine early-depth, to avoid breaking occlusion queries */
1263 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
1264 (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & (EARLY_DEPTH_TEST_INFERRED)))) {
1265 LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1266 out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
1267 LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
1268 LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
1269 LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
1270 smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");
1271
1272 s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
1273 }
1274 }
1275
1276 depth_ptr = depth_base_ptr;
1277 if (key->multisample) {
1278 LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_loop_state.counter, depth_sample_stride, "");
1279 depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
1280 depth_ptr, &sample_offset, 1, "");
1281 }
1282
1283 /* Late Z test */
1284 if (depth_mode & LATE_DEPTH_TEST) {
1285 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1286 LLVMValueRef idx = loop_state.counter;
1287 if (key->min_samples > 1)
1288 idx = LLVMBuildAdd(builder, idx,
1289 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1290 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
1291 z = LLVMBuildLoad2(builder, vec_type, ptr, "output.z");
1292 } else {
1293 if (key->multisample) {
1294 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, key->multisample ? sample_loop_state.counter : NULL);
1295 z = interp->pos[2];
1296 }
1297 }
1298
1299 /*
1300 * Clamp according to ARB_depth_clamp semantics.
1301 */
1302 z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
1303 key->restrict_depth_values, type,
1304 context_type, context_ptr,
1305 thread_data_type, thread_data_ptr, z);
1306
1307 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
1308 LLVMValueRef idx = loop_state.counter;
1309 if (key->min_samples > 1)
1310 idx = LLVMBuildAdd(builder, idx,
1311 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1312 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
1313 stencil_refs[0] = LLVMBuildLoad2(builder, vec_type, ptr, "output.s");
1314 /* there's only one value, and spec says to discard additional bits */
1315 LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
1316 stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
1317 stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
1318 stencil_refs[1] = stencil_refs[0];
1319 }
1320
1321 lp_build_depth_stencil_load_swizzled(gallivm, type,
1322 zs_format_desc, key->resource_1d,
1323 depth_ptr, depth_stride,
1324 &z_fb, &s_fb, loop_state.counter);
1325
1326 lp_build_depth_stencil_test(gallivm,
1327 &key->depth,
1328 key->stencil,
1329 type,
1330 zs_format_desc,
1331 key->multisample ? NULL : &mask,
1332 &s_mask,
1333 stencil_refs,
1334 z, z_fb, s_fb,
1335 facing,
1336 &z_value, &s_value,
1337 false,
1338 key->restrict_depth_values);
1339 /* Late Z write */
1340 if (depth_mode & LATE_DEPTH_WRITE) {
1341 lp_build_depth_stencil_write_swizzled(gallivm, type,
1342 zs_format_desc, key->resource_1d,
1343 NULL, NULL, NULL, loop_state.counter,
1344 depth_ptr, depth_stride,
1345 z_value, s_value);
1346 }
1347 } else if ((depth_mode & EARLY_DEPTH_TEST) &&
1348 (depth_mode & LATE_DEPTH_WRITE)) {
1349 /* Need to apply a reduced mask to the depth write. Reload the
1350 * depth value, update from zs_value with the new mask value and
1351 * write that out.
1352 */
1353 if (key->multisample) {
1354 z_value = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_sample_value_store, sample_loop_state.counter), z_type, "");
1355 s_value = lp_build_pointer_get2(builder, int_vec_type, s_sample_value_store, sample_loop_state.counter);
1356 z_fb = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_fb_store, sample_loop_state.counter), z_fb_type, "");
1357 s_fb = lp_build_pointer_get2(builder, int_vec_type, s_fb_store, sample_loop_state.counter);
1358 }
1359 lp_build_depth_stencil_write_swizzled(gallivm, type,
1360 zs_format_desc, key->resource_1d,
1361 key->multisample ? s_mask : lp_build_mask_value(&mask), z_fb, s_fb, loop_state.counter,
1362 depth_ptr, depth_stride,
1363 z_value, s_value);
1364 }
1365
1366 if (key->occlusion_count && (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & EARLY_DEPTH_TEST_INFERRED))) {
1367 LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
1368 lp_build_name(counter, "counter");
1369
1370 lp_build_occlusion_count(gallivm, type,
1371 key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
1372 }
1373
1374 /* if this is genuine early-depth in the shader, write samplemask now
1375 * after occlusion count has been updated
1376 */
1377 if (key->multisample &&
1378 nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
1379 (depth_mode & (EARLY_DEPTH_TEST_INFERRED | EARLY_DEPTH_TEST)) == EARLY_DEPTH_TEST) {
1380 /* if the shader writes sample mask use that */
1381 LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1382 out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
1383 LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
1384 LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
1385 LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
1386 smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");
1387
1388 s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
1389 }
1390
1391
1392 if (key->multisample) {
1393 /* store the sample mask for this loop */
1394 LLVMBuildStore(builder, s_mask, s_mask_ptr);
1395 lp_build_for_loop_end(&sample_loop_state);
1396 }
1397
1398 mask_val = lp_build_mask_end(&mask);
1399 if (!key->multisample)
1400 LLVMBuildStore(builder, mask_val, mask_ptr);
1401 lp_build_for_loop_end(&loop_state);
1402 }
1403
1404
1405 /**
1406  * This function reorders pixels from the fragment shader's SoA layout to
1407  * the AoS memory layout.
1408 *
1409 * Fragment Shader outputs pixels in small 2x2 blocks
1410 * e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
1411 *
1412 * However in memory pixels are stored in rows
1413 * e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
1414 *
1415 * @param type fragment shader type (4x or 8x float)
1416 * @param num_fs number of fs_src
1417 * @param is_1d whether we're outputting to a 1d resource
1418 * @param dst_channels number of output channels
1419 * @param fs_src output from fragment shader
1420 * @param dst pointer to store result
1421 * @param pad_inline is channel padding inline or at end of row
1422 * @return the number of dsts
1423 */
1424 static int
1425 generate_fs_twiddle(struct gallivm_state *gallivm,
1426 struct lp_type type,
1427 unsigned num_fs,
1428 unsigned dst_channels,
1429 LLVMValueRef fs_src[][4],
1430 LLVMValueRef* dst,
1431 bool pad_inline)
1432 {
1433 LLVMValueRef src[16];
1434 unsigned pixels = type.length / 4;
1435 unsigned src_channels = dst_channels < 3 ? dst_channels : 4;
1436 unsigned src_count = num_fs * src_channels;
1437
1438 assert(pixels == 2 || pixels == 1);
1439 assert(num_fs * src_channels <= ARRAY_SIZE(src));
1440
1441 /*
1442 * Transpose from SoA -> AoS
1443 */
1444 for (unsigned i = 0; i < num_fs; ++i) {
1445 lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels,
1446 &src[i * src_channels]);
1447 }
1448
1449 /*
1450 * Pick transformation options
1451 */
1452 bool swizzle_pad = false;
1453 bool twiddle = false;
1454 bool split = false;
1455 unsigned reorder_group = 0;
1456
1457 if (dst_channels == 1) {
1458 twiddle = true;
1459 if (pixels == 2) {
1460 split = true;
1461 }
1462 } else if (dst_channels == 2) {
1463 if (pixels == 1) {
1464 reorder_group = 1;
1465 }
1466 } else if (dst_channels > 2) {
1467 if (pixels == 1) {
1468 reorder_group = 2;
1469 } else {
1470 twiddle = true;
1471 }
1472
1473 if (!pad_inline && dst_channels == 3 && pixels > 1) {
1474 swizzle_pad = true;
1475 }
1476 }
1477
1478 /*
1479 * Split the src in half
1480 */
1481 if (split) {
1482 for (unsigned i = num_fs; i > 0; --i) {
1483 src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
1484 src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
1485 }
1486
1487 src_count *= 2;
1488 type.length = 4;
1489 }
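   /*
    * After the split each original 8-wide vector (two quads of the single
    * channel) has become two 4-wide vectors, so src_count has doubled and
    * type.length is now 4.
    */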
1490
1491 /*
1492 * Ensure pixels are in memory order
1493 */
1494 if (reorder_group) {
1495 /* Twiddle pixels by reordering the array, e.g.:
1496 *
1497 * src_count = 8 -> 0 2 1 3 4 6 5 7
1498 * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
1499 */
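      /*
       * Worked example (illustration only): with reorder_group == 2 and
       * src_count == 16, i == 2 gives group = 1, block = 0, so
       * j = reorder_sw[1] * 2 + 0 = 4, i.e. dst[2] = src[4], matching the
       * mapping listed above.
       */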
1500 const unsigned reorder_sw[] = { 0, 2, 1, 3 };
1501
1502 for (unsigned i = 0; i < src_count; ++i) {
1503 unsigned group = i / reorder_group;
1504 unsigned block = (group / 4) * 4 * reorder_group;
1505 unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
1506 dst[i] = src[j];
1507 }
1508 } else if (twiddle) {
1509 /* Twiddle pixels across elements of array */
1510 /*
1511 * XXX: we should avoid this in some cases, but would need to tell
1512 * lp_build_conv to reorder (or deal with it ourselves).
1513 */
1514 lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
1515 } else {
1516 /* Do nothing */
1517 memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
1518 }
1519
1520 /*
1521 * Moves any padding between pixels to the end
1522 * e.g. RGBXRGBX -> RGBRGBXX
1523 */
1524 if (swizzle_pad) {
1525 unsigned char swizzles[16];
1526 unsigned elems = pixels * dst_channels;
1527
1528 for (unsigned i = 0; i < type.length; ++i) {
1529 if (i < elems)
1530 swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
1531 else
1532 swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
1533 }
1534
1535 for (unsigned i = 0; i < src_count; ++i) {
1536 dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles,
1537 type.length, type.length);
1538 }
1539 }
1540
1541 return src_count;
1542 }
1543
1544
1545 /*
1546 * Untwiddle and transpose, much like the above.
1547 * However, this is after conversion, so we get packed vectors.
1548  * At this time we only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data;
1549  * the vectors will look like:
1550 * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
1551 * be swizzled here). Extending to 16bit should be trivial.
1552 * Should also be extended to handle twice wide vectors with AVX2...
1553 */
1554 static void
1555 fs_twiddle_transpose(struct gallivm_state *gallivm,
1556 struct lp_type type,
1557 LLVMValueRef *src,
1558 unsigned src_count,
1559 LLVMValueRef *dst)
1560 {
1561 struct lp_type type64, type16, type32;
1562 LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
1563 LLVMBuilderRef builder = gallivm->builder;
1564 LLVMValueRef tmp[4], shuf[8];
1565 for (unsigned j = 0; j < 2; j++) {
1566 shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
1567 shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
1568 shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
1569 shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
1570 }
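   /*
    * The shuffle pattern is 0 2 1 3 within each group of four, i.e. it swaps
    * the middle two elements. Applied at two-pixel granularity in the one-
    * and two-vector cases below, this undoes the quad twiddle, e.g.
    * r0r1 r4r5 r2r3 r6r7 -> r0r1 r2r3 r4r5 r6r7.
    */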
1571
1572 assert(src_count == 4 || src_count == 2 || src_count == 1);
1573 assert(type.width == 8);
1574 assert(type.length == 16);
1575
1576 type8_t = lp_build_vec_type(gallivm, type);
1577
1578 type64 = type;
1579 type64.length /= 8;
1580 type64.width *= 8;
1581 type64_t = lp_build_vec_type(gallivm, type64);
1582
1583 type16 = type;
1584 type16.length /= 2;
1585 type16.width *= 2;
1586 type16_t = lp_build_vec_type(gallivm, type16);
1587
1588 type32 = type;
1589 type32.length /= 4;
1590 type32.width *= 4;
1591 type32_t = lp_build_vec_type(gallivm, type32);
1592
1593 lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
1594
1595 if (src_count == 1) {
1596 /* transpose was no-op, just untwiddle */
1597 LLVMValueRef shuf_vec;
1598 shuf_vec = LLVMConstVector(shuf, 8);
1599 tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
1600 tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
1601 dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
1602 } else if (src_count == 2) {
1603 LLVMValueRef shuf_vec;
1604 shuf_vec = LLVMConstVector(shuf, 4);
1605
1606 for (unsigned i = 0; i < 2; i++) {
1607 tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
1608 tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
1609 dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
1610 }
1611 } else {
1612 for (unsigned j = 0; j < 2; j++) {
1613 LLVMValueRef lo, hi, lo2, hi2;
1614 /*
1615 * Note that if we only really have 3 valid channels (rgb)
1616           * and we don't need alpha we could substitute an undef here
1617 * for the respective channel (causing llvm to drop conversion
1618 * for alpha).
1619 */
1620 /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
1621 lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
1622 hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
1623 lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
1624 hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
1625 dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
1626 dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
1627 }
1628 }
1629 }
1630
1631
1632 /**
1633 * Load an unswizzled block of pixels from memory
1634 */
1635 static void
1636 load_unswizzled_block(struct gallivm_state *gallivm,
1637 LLVMTypeRef base_type,
1638 LLVMValueRef base_ptr,
1639 LLVMValueRef stride,
1640 unsigned block_width,
1641 unsigned block_height,
1642 LLVMValueRef* dst,
1643 struct lp_type dst_type,
1644 unsigned dst_count,
1645 unsigned dst_alignment)
1646 {
1647 LLVMBuilderRef builder = gallivm->builder;
1648 const unsigned row_size = dst_count / block_height;
1649
1650 /* Ensure block exactly fits into dst */
1651 assert((block_width * block_height) % dst_count == 0);
1652
1653 for (unsigned i = 0; i < dst_count; ++i) {
1654 unsigned x = i % row_size;
1655 unsigned y = i / row_size;
1656
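      /*
       * bx is the byte offset of this vector within its row, by the byte
       * offset of the row itself (y * stride).
       */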
1657 LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
1658 LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1659
1660 LLVMValueRef gep[2];
1661 LLVMValueRef dst_ptr;
1662
1663 gep[0] = lp_build_const_int32(gallivm, 0);
1664 gep[1] = LLVMBuildAdd(builder, bx, by, "");
1665
1666 dst_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
1667 dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
1668 LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
1669
1670 dst[i] = LLVMBuildLoad2(builder,
1671 lp_build_vec_type(gallivm, dst_type),
1672 dst_ptr, "");
1673
1674 LLVMSetAlignment(dst[i], dst_alignment);
1675 }
1676 }
1677
1678
1679 /**
1680 * Store an unswizzled block of pixels to memory
1681 */
1682 static void
1683 store_unswizzled_block(struct gallivm_state *gallivm,
1684 LLVMTypeRef base_type,
1685 LLVMValueRef base_ptr,
1686 LLVMValueRef stride,
1687 unsigned block_width,
1688 unsigned block_height,
1689 LLVMValueRef src[], // [src_count]
1690 struct lp_type src_type,
1691 unsigned src_count,
1692 unsigned src_alignment)
1693 {
1694 LLVMBuilderRef builder = gallivm->builder;
1695 const unsigned row_size = src_count / block_height;
1696
1697 /* Ensure src exactly fits into block */
1698 assert((block_width * block_height) % src_count == 0);
1699
1700 for (unsigned i = 0; i < src_count; ++i) {
1701 unsigned x = i % row_size;
1702 unsigned y = i / row_size;
1703
1704 LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
1705 LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1706
1707 LLVMValueRef gep[2];
1708 LLVMValueRef src_ptr;
1709
1710 gep[0] = lp_build_const_int32(gallivm, 0);
1711 gep[1] = LLVMBuildAdd(builder, bx, by, "");
1712
1713 src_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
1714 src_ptr = LLVMBuildBitCast(builder, src_ptr,
1715 LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
1716
1717 src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
1718
1719 LLVMSetAlignment(src_ptr, src_alignment);
1720 }
1721 }
1722
1723
1724
1725 /**
1726 * Retrieves the type for a format which is usable in the blending code.
1727 *
1728 * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
1729 */
1730 static inline void
1731 lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
1732 struct lp_type* type)
1733 {
1734 if (format_expands_to_float_soa(format_desc)) {
1735 /* always use ordinary floats for blending */
1736 type->floating = true;
1737 type->fixed = false;
1738 type->sign = true;
1739 type->norm = false;
1740 type->width = 32;
1741 type->length = 4;
1742 return;
1743 }
1744
1745 const int chan = util_format_get_first_non_void_channel(format_desc->format);
1746
1747 memset(type, 0, sizeof(struct lp_type));
1748 type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
1749 type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
1750 type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
1751 type->norm = format_desc->channel[chan].normalized;
1752 type->width = format_desc->channel[chan].size;
1753 type->length = format_desc->nr_channels;
1754
1755 for (unsigned i = 1; i < format_desc->nr_channels; ++i) {
1756 if (format_desc->channel[i].size > type->width)
1757 type->width = format_desc->channel[i].size;
1758 }
1759
1760 if (type->floating) {
1761 type->width = 32;
1762 } else {
1763 if (type->width <= 8) {
1764 type->width = 8;
1765 } else if (type->width <= 16) {
1766 type->width = 16;
1767 } else {
1768 type->width = 32;
1769 }
1770 }
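   /*
    * e.g. for a 3-3-2 UNORM format the widest channel is 3 bits, so the
    * blend type ends up with 8-bit channels (the "3x byte" noted above).
    */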
1771
1772 if (is_arithmetic_format(format_desc) && type->length == 3) {
1773 type->length = 4;
1774 }
1775 }
1776
1777
1778 /**
1779 * Scale a normalized value from src_bits to dst_bits.
1780 *
1781 * The exact calculation is
1782 *
1783 * dst = iround(src * dst_mask / src_mask)
1784 *
1785 * or with integer rounding
1786 *
1787 * dst = src * (2*dst_mask + sign(src)*src_mask) / (2*src_mask)
1788 *
1789 * where
1790 *
1791 * src_mask = (1 << src_bits) - 1
1792 * dst_mask = (1 << dst_bits) - 1
1793 *
1794 * but we try to avoid division and multiplication through shifts.
1795 */
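/*
 * For instance, with unsigned normalized values the code below scales
 * 5 bits up to 8 bits as (x << 3) | (x >> 2), so 31 -> 255, and scales
 * 8 bits down to 5 bits via the approximate single-shift path, x >> 3.
 */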
1796 static inline LLVMValueRef
1797 scale_bits(struct gallivm_state *gallivm,
1798 int src_bits,
1799 int dst_bits,
1800 LLVMValueRef src,
1801 struct lp_type src_type)
1802 {
1803 LLVMBuilderRef builder = gallivm->builder;
1804 LLVMValueRef result = src;
1805
1806 if (dst_bits < src_bits) {
1807 int delta_bits = src_bits - dst_bits;
1808
1809 if (delta_bits <= dst_bits) {
1810
1811 if (dst_bits == 4) {
1812 struct lp_type flt_type =
1813 lp_type_float_vec(32, src_type.length * 32);
1814
1815 result = lp_build_unsigned_norm_to_float(gallivm, src_bits,
1816 flt_type, src);
1817 result = lp_build_clamped_float_to_unsigned_norm(gallivm, flt_type,
1818 dst_bits, result);
1819 result = LLVMBuildTrunc(gallivm->builder, result,
1820 lp_build_int_vec_type(gallivm, src_type),
1821 "");
1822 return result;
1823 }
1824
1825 /*
1826 * Approximate the rescaling with a single shift.
1827 *
1828 * This gives the wrong rounding.
1829 */
1830
1831 result = LLVMBuildLShr(builder, src,
1832 lp_build_const_int_vec(gallivm, src_type,
1833 delta_bits),
1834 "");
1835 } else {
1836 /*
1837 * Try more accurate rescaling.
1838 */
1839
1840 /*
1841 * Drop the least significant bits to make space for the
1842 * multiplication.
1843 *
1844 * XXX: A better approach would be to use a wider integer type as
1845 * intermediate. But this is enough to convert alpha from 16bits ->
1846 * 2 when rendering to PIPE_FORMAT_R10G10B10A2_UNORM.
1847 */
1848 result = LLVMBuildLShr(builder, src,
1849 lp_build_const_int_vec(gallivm, src_type,
1850 dst_bits),
1851 "");
1852
1853
1854 result = LLVMBuildMul(builder, result,
1855 lp_build_const_int_vec(gallivm, src_type,
1856 (1LL << dst_bits) - 1),
1857 "");
1858
1859 /*
1860 * Add a rounding term before the division.
1861 *
1862 * TODO: Handle signed integers too.
1863 */
1864 if (!src_type.sign) {
1865 result = LLVMBuildAdd(builder, result,
1866 lp_build_const_int_vec(gallivm, src_type,
1867 (1LL << (delta_bits - 1))),
1868 "");
1869 }
1870
1871 /*
1872 * Approximate the division by src_mask with a src_bits shift.
1873 *
1874 * Given the src has already been shifted by dst_bits, all we need
1875 * to do is to shift by the difference.
1876 */
1877
1878 result = LLVMBuildLShr(builder,
1879 result,
1880 lp_build_const_int_vec(gallivm, src_type, delta_bits),
1881 "");
1882 }
1883
1884 } else if (dst_bits > src_bits) {
1885 /* Scale up bits */
1886 int db = dst_bits - src_bits;
1887
1888 /* Shift left by difference in bits */
1889 result = LLVMBuildShl(builder,
1890 src,
1891 lp_build_const_int_vec(gallivm, src_type, db),
1892 "");
1893
1894 if (db <= src_bits) {
1895 /* Enough bits in src to fill the remainder */
1896 LLVMValueRef lower = LLVMBuildLShr(builder,
1897 src,
1898 lp_build_const_int_vec(gallivm, src_type, src_bits - db),
1899 "");
1900
1901 result = LLVMBuildOr(builder, result, lower, "");
1902 } else if (db > src_bits) {
1903 /* Need to repeatedly copy src bits to fill remainder in dst */
1904 unsigned n;
1905
1906 for (n = src_bits; n < dst_bits; n *= 2) {
1907 LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
1908
1909 result = LLVMBuildOr(builder,
1910 result,
1911 LLVMBuildLShr(builder, result, shuv, ""),
1912 "");
1913 }
1914 }
1915 }
1916
1917 return result;
1918 }
1919
1920 /**
1921  * Return whether the RT is a smallfloat (needing denorms) format.
1922 */
1923 static inline int
1924 have_smallfloat_format(struct lp_type dst_type,
1925 enum pipe_format format)
1926 {
1927 return ((dst_type.floating && dst_type.width != 32) ||
1928 /* due to format handling hacks this format doesn't have floating set
1929 * here (and actually has width set to 32 too) so special case this.
1930 */
1931 (format == PIPE_FORMAT_R11G11B10_FLOAT));
1932 }
1933
1934
1935 /**
1936 * Convert from memory format to blending format
1937 *
1938 * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
1939 */
1940 static void
1941 convert_to_blend_type(struct gallivm_state *gallivm,
1942 unsigned block_size,
1943 const struct util_format_description *src_fmt,
1944 struct lp_type src_type,
1945 struct lp_type dst_type,
1946 LLVMValueRef* src, // and dst
1947 unsigned num_srcs)
1948 {
1949 LLVMValueRef *dst = src;
1950 LLVMBuilderRef builder = gallivm->builder;
1951 struct lp_type blend_type;
1952 struct lp_type mem_type;
1953 unsigned i, j;
1954 unsigned pixels = block_size / num_srcs;
1955 bool is_arith;
1956
1957 /*
1958 * full custom path for packed floats and srgb formats - none of the later
1959 * functions would do anything useful, and given the lp_type representation
1960 * they can't be fixed. Should really have some SoA blend path for these
1961     * kinds of formats rather than hacking them in here.
1962 */
1963 if (format_expands_to_float_soa(src_fmt)) {
1964 LLVMValueRef tmpsrc[4];
1965 /*
1966       * This is pretty suboptimal for this case; blending in SoA would be much
1967       * better, since conversion gets us SoA values, so we need to convert back.
1968 */
1969 assert(src_type.width == 32 || src_type.width == 16);
1970 assert(dst_type.floating);
1971 assert(dst_type.width == 32);
1972 assert(dst_type.length % 4 == 0);
1973 assert(num_srcs % 4 == 0);
1974
1975 if (src_type.width == 16) {
1976 /* expand 4x16bit values to 4x32bit */
1977 struct lp_type type32x4 = src_type;
1978 LLVMTypeRef ltype32x4;
1979 unsigned num_fetch = dst_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
1980 type32x4.width = 32;
1981 ltype32x4 = lp_build_vec_type(gallivm, type32x4);
1982 for (i = 0; i < num_fetch; i++) {
1983 src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, "");
1984 }
1985 src_type.width = 32;
1986 }
1987 for (i = 0; i < 4; i++) {
1988 tmpsrc[i] = src[i];
1989 }
1990 for (i = 0; i < num_srcs / 4; i++) {
1991 LLVMValueRef tmpsoa[4];
1992 LLVMValueRef tmps = tmpsrc[i];
1993 if (dst_type.length == 8) {
1994 LLVMValueRef shuffles[8];
1995 unsigned j;
1996 /* fetch was 4 values but need 8-wide output values */
1997 tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
1998 /*
1999              * for 8-wide aos the transpose would give us the wrong order, not
2000              * matching the incoming converted fs values and mask. ARGH.
2001 */
2002 for (j = 0; j < 4; j++) {
2003 shuffles[j] = lp_build_const_int32(gallivm, j * 2);
2004 shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
2005 }
2006 tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
2007 LLVMConstVector(shuffles, 8), "");
2008 }
2009 if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
2010 lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
2011 } else {
2012 lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa);
2013 }
2014 lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
2015 }
2016 return;
2017 }
2018
2019 lp_mem_type_from_format_desc(src_fmt, &mem_type);
2020 lp_blend_type_from_format_desc(src_fmt, &blend_type);
2021
2022 /* Is the format arithmetic */
2023 is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
2024 is_arith &= !(mem_type.width == 16 && mem_type.floating);
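   /*
    * "Arithmetic" here means the blend representation is wider than the
    * in-memory packing (e.g. 3-3-2 expands to 3x8 bits), so the per-channel
    * bit extraction and rescaling below is needed; packed half-float formats
    * are excluded since they are converted via lp_build_conv_auto instead.
    */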
2025
2026 /* Pad if necessary */
2027 if (!is_arith && src_type.length < dst_type.length) {
2028 for (i = 0; i < num_srcs; ++i) {
2029 dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
2030 }
2031
2032 src_type.length = dst_type.length;
2033 }
2034
2035 /* Special case for half-floats */
2036 if (mem_type.width == 16 && mem_type.floating) {
2037 assert(blend_type.width == 32 && blend_type.floating);
2038 lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
2039 is_arith = false;
2040 }
2041
2042 if (!is_arith) {
2043 return;
2044 }
2045
2046 src_type.width = blend_type.width * blend_type.length;
2047 blend_type.length *= pixels;
2048 src_type.length *= pixels / (src_type.length / mem_type.length);
2049
2050 for (i = 0; i < num_srcs; ++i) {
2051 LLVMValueRef chans;
2052 LLVMValueRef res = NULL;
2053
2054 dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
2055
2056 for (j = 0; j < src_fmt->nr_channels; ++j) {
2057 unsigned mask = 0;
2058 unsigned sa = src_fmt->channel[j].shift;
2059 #if UTIL_ARCH_LITTLE_ENDIAN
2060 unsigned from_lsb = j;
2061 #else
2062 unsigned from_lsb = (blend_type.length / pixels) - j - 1;
2063 #endif
2064
2065 mask = (1 << src_fmt->channel[j].size) - 1;
2066
2067 /* Extract bits from source */
2068 chans = LLVMBuildLShr(builder,
2069 dst[i],
2070 lp_build_const_int_vec(gallivm, src_type, sa),
2071 "");
2072
2073 chans = LLVMBuildAnd(builder,
2074 chans,
2075 lp_build_const_int_vec(gallivm, src_type, mask),
2076 "");
2077
2078 /* Scale bits */
2079 if (src_type.norm) {
2080 chans = scale_bits(gallivm, src_fmt->channel[j].size,
2081 blend_type.width, chans, src_type);
2082 }
2083
2084 /* Insert bits into correct position */
2085 chans = LLVMBuildShl(builder,
2086 chans,
2087 lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
2088 "");
2089
2090 if (j == 0) {
2091 res = chans;
2092 } else {
2093 res = LLVMBuildOr(builder, res, chans, "");
2094 }
2095 }
2096
2097 dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
2098 }
2099 }
2100
2101
2102 /**
2103 * Convert from blending format to memory format
2104 *
2105 * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
2106 */
2107 static void
2108 convert_from_blend_type(struct gallivm_state *gallivm,
2109 unsigned block_size,
2110 const struct util_format_description *src_fmt,
2111 struct lp_type src_type,
2112 struct lp_type dst_type,
2113 LLVMValueRef* src, // and dst
2114 unsigned num_srcs)
2115 {
2116 LLVMValueRef* dst = src;
2117 unsigned i, j, k;
2118 struct lp_type mem_type;
2119 struct lp_type blend_type;
2120 LLVMBuilderRef builder = gallivm->builder;
2121 unsigned pixels = block_size / num_srcs;
2122 bool is_arith;
2123
2124 /*
2125 * full custom path for packed floats and srgb formats - none of the later
2126 * functions would do anything useful, and given the lp_type representation
2127 * they can't be fixed. Should really have some SoA blend path for these
2128     * kinds of formats rather than hacking them in here.
2129 */
2130 if (format_expands_to_float_soa(src_fmt)) {
2131 /*
2132       * This is pretty suboptimal; for this case blending in SoA would be much
2133 * better - we need to transpose the AoS values back to SoA values for
2134 * conversion/packing.
2135 */
2136 assert(src_type.floating);
2137 assert(src_type.width == 32);
2138 assert(src_type.length % 4 == 0);
2139 assert(dst_type.width == 32 || dst_type.width == 16);
2140
2141 for (i = 0; i < num_srcs / 4; i++) {
2142 LLVMValueRef tmpsoa[4], tmpdst;
2143 lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
2144 /* really really need SoA here */
2145
2146 if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
2147 tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
2148 } else {
2149 tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt,
2150 src_type, tmpsoa);
2151 }
2152
2153 if (src_type.length == 8) {
2154 LLVMValueRef tmpaos, shuffles[8];
2155 unsigned j;
2156 /*
2157              * for 8-wide aos the transpose has given us the wrong order, not
2158              * matching the output order. HMPF. Also need to split the output values
2159 * manually.
2160 */
2161 for (j = 0; j < 4; j++) {
2162 shuffles[j * 2] = lp_build_const_int32(gallivm, j);
2163 shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
2164 }
2165 tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
2166 LLVMConstVector(shuffles, 8), "");
2167 src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
2168 src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
2169 } else {
2170 src[i] = tmpdst;
2171 }
2172 }
2173 if (dst_type.width == 16) {
2174 struct lp_type type16x8 = dst_type;
2175 struct lp_type type32x4 = dst_type;
2176 LLVMTypeRef ltype16x4, ltypei64, ltypei128;
2177 unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
2178 type16x8.length = 8;
2179 type32x4.width = 32;
2180 ltypei128 = LLVMIntTypeInContext(gallivm->context, 128);
2181 ltypei64 = LLVMIntTypeInContext(gallivm->context, 64);
2182 ltype16x4 = lp_build_vec_type(gallivm, dst_type);
2183 /* We could do vector truncation but it doesn't generate very good code */
2184 for (i = 0; i < num_fetch; i++) {
2185 src[i] = lp_build_pack2(gallivm, type32x4, type16x8,
2186 src[i], lp_build_zero(gallivm, type32x4));
2187 src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, "");
2188 src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, "");
2189 src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, "");
2190 }
2191 }
2192 return;
2193 }
2194
2195 lp_mem_type_from_format_desc(src_fmt, &mem_type);
2196 lp_blend_type_from_format_desc(src_fmt, &blend_type);
2197
2198 is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);
2199
2200 /* Special case for half-floats */
2201 if (mem_type.width == 16 && mem_type.floating) {
2202 int length = dst_type.length;
2203 assert(blend_type.width == 32 && blend_type.floating);
2204
2205 dst_type.length = src_type.length;
2206
2207 lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
2208
2209 dst_type.length = length;
2210 is_arith = false;
2211 }
2212
2213 /* Remove any padding */
2214 if (!is_arith && (src_type.length % mem_type.length)) {
2215 src_type.length -= (src_type.length % mem_type.length);
2216
2217 for (i = 0; i < num_srcs; ++i) {
2218 dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
2219 }
2220 }
2221
2222 /* No bit arithmetic to do */
2223 if (!is_arith) {
2224 return;
2225 }
2226
2227 src_type.length = pixels;
2228 src_type.width = blend_type.length * blend_type.width;
2229 dst_type.length = pixels;
2230
2231 for (i = 0; i < num_srcs; ++i) {
2232 LLVMValueRef chans;
2233 LLVMValueRef res = NULL;
2234
2235 dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
2236
2237 for (j = 0; j < src_fmt->nr_channels; ++j) {
2238 unsigned mask = 0;
2239 unsigned sa = src_fmt->channel[j].shift;
2240 unsigned sz_a = src_fmt->channel[j].size;
2241 #if UTIL_ARCH_LITTLE_ENDIAN
2242 unsigned from_lsb = j;
2243 #else
2244 unsigned from_lsb = blend_type.length - j - 1;
2245 #endif
2246
2247 assert(blend_type.width > src_fmt->channel[j].size);
2248
2249 for (k = 0; k < blend_type.width; ++k) {
2250 mask |= 1 << k;
2251 }
2252
2253 /* Extract bits */
2254 chans = LLVMBuildLShr(builder,
2255 dst[i],
2256 lp_build_const_int_vec(gallivm, src_type,
2257 from_lsb * blend_type.width),
2258 "");
2259
2260 chans = LLVMBuildAnd(builder,
2261 chans,
2262 lp_build_const_int_vec(gallivm, src_type, mask),
2263 "");
2264
2265 /* Scale down bits */
2266 if (src_type.norm) {
2267 chans = scale_bits(gallivm, blend_type.width,
2268 src_fmt->channel[j].size, chans, src_type);
2269 } else if (!src_type.floating && sz_a < blend_type.width) {
2270 LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, src_type, (1UL << sz_a) - 1);
2271 LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chans, mask_val, "");
2272 chans = LLVMBuildSelect(builder, mask, mask_val, chans, "");
2273 }
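         /*
          * For non-normalized integer channels the blend value may exceed the
          * destination channel's range, so it is clamped to the channel
          * maximum above rather than rescaled.
          */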
2274
2275 /* Insert bits */
2276 chans = LLVMBuildShl(builder,
2277 chans,
2278 lp_build_const_int_vec(gallivm, src_type, sa),
2279 "");
2280
2281 sa += src_fmt->channel[j].size;
2282
2283 if (j == 0) {
2284 res = chans;
2285 } else {
2286 res = LLVMBuildOr(builder, res, chans, "");
2287 }
2288 }
2289
2290 assert (dst_type.width != 24);
2291
2292 dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
2293 }
2294 }
2295
2296
2297 /**
2298 * Convert alpha to same blend type as src
2299 */
2300 static void
2301 convert_alpha(struct gallivm_state *gallivm,
2302 struct lp_type row_type,
2303 struct lp_type alpha_type,
2304 const unsigned block_size,
2305 const unsigned block_height,
2306 const unsigned src_count,
2307 const unsigned dst_channels,
2308 const bool pad_inline,
2309 LLVMValueRef* src_alpha)
2310 {
2311 LLVMBuilderRef builder = gallivm->builder;
2312 const unsigned length = row_type.length;
2313 row_type.length = alpha_type.length;
2314
2315 /* Twiddle the alpha to match pixels */
2316 lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha);
2317
2318 /*
2319 * TODO this should use single lp_build_conv call for
2320 * src_count == 1 && dst_channels == 1 case (dropping the concat below)
2321 */
2322 for (unsigned i = 0; i < block_height; ++i) {
2323 lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1,
2324 &src_alpha[i], 1);
2325 }
2326
2327 alpha_type = row_type;
2328 row_type.length = length;
2329
2330    /* If there is only one channel we only need the single alpha value per pixel */
2331 if (src_count == 1 && dst_channels == 1) {
2332 lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height,
2333 src_alpha, src_count);
2334 } else {
2335 /* If there are more srcs than rows then we need to split alpha up */
2336 if (src_count > block_height) {
2337 for (unsigned i = src_count; i > 0; --i) {
2338 unsigned pixels = block_size / src_count;
2339 unsigned idx = i - 1;
2340
2341 src_alpha[idx] =
2342 lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
2343 (idx * pixels) % 4, pixels);
2344 }
2345 }
2346
2347       /* If there is a src for each pixel, broadcast the alpha across the
2348        * whole row
2349 */
2350 if (src_count == block_size) {
2351 for (unsigned i = 0; i < src_count; ++i) {
2352 src_alpha[i] = lp_build_broadcast(gallivm,
2353 lp_build_vec_type(gallivm, row_type), src_alpha[i]);
2354 }
2355 } else {
2356 unsigned pixels = block_size / src_count;
2357 unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
2358 unsigned alpha_span = 1;
2359 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
2360
2361 /* Check if we need 2 src_alphas for our shuffles */
2362 if (pixels > alpha_type.length) {
2363 alpha_span = 2;
2364 }
2365
2366 /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
2367 for (unsigned j = 0; j < row_type.length; ++j) {
2368 if (j < pixels * channels) {
2369 shuffles[j] = lp_build_const_int32(gallivm, j / channels);
2370 } else {
2371 shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
2372 }
2373 }
2374
2375 for (unsigned i = 0; i < src_count; ++i) {
2376 unsigned idx1 = i, idx2 = i;
2377
2378 if (alpha_span > 1){
2379 idx1 *= alpha_span;
2380 idx2 = idx1 + 1;
2381 }
2382
2383 src_alpha[i] = LLVMBuildShuffleVector(builder,
2384 src_alpha[idx1],
2385 src_alpha[idx2],
2386 LLVMConstVector(shuffles, row_type.length),
2387 "");
2388 }
2389 }
2390 }
2391 }
2392
2393
2394 /**
2395  * Generates the blend function for unswizzled colour buffers.
2396  * Also generates the read & write from the colour buffer.
2397 */
2398 static void
2399 generate_unswizzled_blend(struct gallivm_state *gallivm,
2400 unsigned rt,
2401 struct lp_fragment_shader_variant *variant,
2402 enum pipe_format out_format,
2403 unsigned int num_fs,
2404 struct lp_type fs_type,
2405 LLVMValueRef* fs_mask,
2406 LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
2407 LLVMTypeRef context_type,
2408 LLVMValueRef context_ptr,
2409 LLVMTypeRef color_type,
2410 LLVMValueRef color_ptr,
2411 LLVMValueRef stride,
2412 unsigned partial_mask,
2413 bool do_branch)
2414 {
2415 const unsigned alpha_channel = 3;
2416 const unsigned block_width = LP_RASTER_BLOCK_SIZE;
2417 const unsigned block_height = LP_RASTER_BLOCK_SIZE;
2418 const unsigned block_size = block_width * block_height;
2419 const unsigned lp_integer_vector_width = 128;
2420
2421 LLVMBuilderRef builder = gallivm->builder;
2422 LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
2423 LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
2424 LLVMValueRef src_alpha[4 * 4];
2425 LLVMValueRef src1_alpha[4 * 4] = { NULL };
2426 LLVMValueRef src_mask[4 * 4];
2427 LLVMValueRef src[4 * 4];
2428 LLVMValueRef src1[4 * 4];
2429 LLVMValueRef dst[4 * 4];
2430
2431 struct lp_build_mask_context mask_ctx;
2432
2433 unsigned char swizzle[TGSI_NUM_CHANNELS];
2434 unsigned src_channels = TGSI_NUM_CHANNELS;
2435
2436 const struct util_format_description *out_format_desc =
2437 util_format_description(out_format);
2438
2439 bool pad_inline = is_arithmetic_format(out_format_desc);
2440 const bool dual_source_blend =
2441 variant->key.blend.rt[0].blend_enable &&
2442 util_blend_state_is_dual(&variant->key.blend, 0);
2443
2444 const bool is_1d = variant->key.resource_1d;
2445 const unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
2446 LLVMValueRef fpstate = NULL;
2447
2448 LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2449
2450 /* Get type from output format */
2451 struct lp_type row_type, dst_type;
2452 lp_blend_type_from_format_desc(out_format_desc, &row_type);
2453 lp_mem_type_from_format_desc(out_format_desc, &dst_type);
2454
2455 /*
2456 * Technically this code should go into lp_build_smallfloat_to_float
2457 * and lp_build_float_to_smallfloat but due to the
2458 * http://llvm.org/bugs/show_bug.cgi?id=6393
2459 * llvm reorders the mxcsr intrinsics in a way that breaks the code.
2460 * So the ordering is important here and there shouldn't be any
2461     * llvm ir instructions in this function before
2462 * this, otherwise half-float format conversions won't work
2463 * (again due to llvm bug #6393).
2464 */
2465 if (have_smallfloat_format(dst_type, out_format)) {
2466 /* We need to make sure that denorms are ok for half float
2467 conversions */
2468 fpstate = lp_build_fpstate_get(gallivm);
2469 lp_build_fpstate_set_denorms_zero(gallivm, false);
2470 }
2471
2472 struct lp_type mask_type = lp_int32_vec4_type();
2473 mask_type.length = fs_type.length;
2474
2475 for (unsigned i = num_fs; i < num_fullblock_fs; i++) {
2476 fs_mask[i] = lp_build_zero(gallivm, mask_type);
2477 }
2478
2479    /* Do not bother executing code when the mask is empty. */
2480 if (do_branch) {
2481 LLVMValueRef check_mask =
2482 LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
2483
2484 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2485 check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
2486 }
2487
2488 lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
2489 lp_build_mask_check(&mask_ctx);
2490 }
2491
2492 partial_mask |= !variant->opaque;
2493 LLVMValueRef i32_zero = lp_build_const_int32(gallivm, 0);
2494
2495 LLVMValueRef undef_src_val = lp_build_undef(gallivm, fs_type);
2496
2497 row_type.length = fs_type.length;
2498 unsigned vector_width =
2499 dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
2500
2501 /* Compute correct swizzle and count channels */
2502 memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS);
2503 unsigned dst_channels = 0;
2504
2505 bool has_alpha = false;
2506 for (unsigned i = 0; i < TGSI_NUM_CHANNELS; ++i) {
2507 /* Ensure channel is used */
2508 if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
2509 continue;
2510 }
2511
2512 /* Ensure not already written to (happens in case with GL_ALPHA) */
2513 if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
2514 continue;
2515 }
2516
2517 /* Ensure we haven't already found all channels */
2518 if (dst_channels >= out_format_desc->nr_channels) {
2519 continue;
2520 }
2521
2522 swizzle[out_format_desc->swizzle[i]] = i;
2523 ++dst_channels;
2524
2525 if (i == alpha_channel) {
2526 has_alpha = true;
2527 }
2528 }
2529
2530 if (format_expands_to_float_soa(out_format_desc)) {
2531 /*
2532       * the code above can't work for layout_other;
2533       * for srgb it would sort of work, but we short-circuit swizzles, etc.,
2534 * as that is done as part of unpack / pack.
2535 */
2536 dst_channels = 4; /* HACK: this is fake 4 really but need it due to transpose stuff later */
2537 has_alpha = true;
2538 swizzle[0] = 0;
2539 swizzle[1] = 1;
2540 swizzle[2] = 2;
2541 swizzle[3] = 3;
2542 pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
2543 }
2544
2545 /* If 3 channels then pad to include alpha for 4 element transpose */
2546 if (dst_channels == 3) {
2547 assert (!has_alpha);
2548 for (unsigned i = 0; i < TGSI_NUM_CHANNELS; i++) {
2549 if (swizzle[i] > TGSI_NUM_CHANNELS)
2550 swizzle[i] = 3;
2551 }
2552 if (out_format_desc->nr_channels == 4) {
2553 dst_channels = 4;
2554 /*
2555          * We use the alpha from the color conversion, not a separate one.
2556 * We had to include it for transpose, hence it will get converted
2557 * too (albeit when doing transpose after conversion, that would
2558 * no longer be the case necessarily).
2559 * (It works only with 4 channel dsts, e.g. rgbx formats, because
2560 * otherwise we really have padding, not alpha, included.)
2561 */
2562 has_alpha = true;
2563 }
2564 }
2565
2566 /*
2567 * Load shader output
2568 */
2569 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2570 /* Always load alpha for use in blending */
2571 LLVMValueRef alpha;
2572 if (i < num_fs) {
2573 alpha = LLVMBuildLoad2(builder, fs_vec_type,
2574 fs_out_color[rt][alpha_channel][i], "");
2575 } else {
2576 alpha = undef_src_val;
2577 }
2578
2579 /* Load each channel */
2580 for (unsigned j = 0; j < dst_channels; ++j) {
2581 assert(swizzle[j] < 4);
2582 if (i < num_fs) {
2583 fs_src[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2584 fs_out_color[rt][swizzle[j]][i], "");
2585 } else {
2586 fs_src[i][j] = undef_src_val;
2587 }
2588 }
2589
2590 /* If 3 channels then pad to include alpha for 4 element transpose */
2591 /*
2592       * XXX If we include that here, maybe we could actually use it instead of
2593       * the separate alpha for blending?
2594       * (Difficult though, since we actually convert pad channels, not alpha.)
2595 */
2596 if (dst_channels == 3 && !has_alpha) {
2597 fs_src[i][3] = alpha;
2598 }
2599
2600 /* We split the row_mask and row_alpha as we want 128bit interleave */
2601 if (fs_type.length == 8) {
2602 src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i],
2603 0, src_channels);
2604 src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i],
2605 src_channels,
2606 src_channels);
2607
2608 src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha,
2609 0, src_channels);
2610 src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2611 src_channels,
2612 src_channels);
2613 } else {
2614 src_mask[i] = fs_mask[i];
2615 src_alpha[i] = alpha;
2616 }
2617 }
2618 if (dual_source_blend) {
2619 /* same as above except different src/dst, skip masks and comments... */
2620 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2621 LLVMValueRef alpha;
2622 if (i < num_fs) {
2623 alpha = LLVMBuildLoad2(builder, fs_vec_type,
2624 fs_out_color[1][alpha_channel][i], "");
2625 } else {
2626 alpha = undef_src_val;
2627 }
2628
2629 for (unsigned j = 0; j < dst_channels; ++j) {
2630 assert(swizzle[j] < 4);
2631 if (i < num_fs) {
2632 fs_src1[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2633 fs_out_color[1][swizzle[j]][i], "");
2634 } else {
2635 fs_src1[i][j] = undef_src_val;
2636 }
2637 }
2638 if (dst_channels == 3 && !has_alpha) {
2639 fs_src1[i][3] = alpha;
2640 }
2641 if (fs_type.length == 8) {
2642 src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
2643 src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2644 src_channels, src_channels);
2645 } else {
2646 src1_alpha[i] = alpha;
2647 }
2648 }
2649 }
2650
2651 if (util_format_is_pure_integer(out_format)) {
2652 /*
2653 * In this case fs_type was really ints or uints disguised as floats,
2654 * fix that up now.
2655 */
2656 fs_type.floating = 0;
2657 fs_type.sign = dst_type.sign;
2658 fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2659 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2660 for (unsigned j = 0; j < dst_channels; ++j) {
2661 fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j],
2662 fs_vec_type, "");
2663 }
2664 if (dst_channels == 3 && !has_alpha) {
2665 fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3],
2666 fs_vec_type, "");
2667 }
2668 }
2669 }
2670
2671 /*
2672 * We actually should generally do conversion first (for non-1d cases)
2673     * when the blend format is 8 or 16 bits. The reason is obvious:
2674     * there are 2 or 4 times fewer vectors to deal with for the interleave...
2675 * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
2676 * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
2677 * unpack only with 128bit vectors).
2678 * Note: for 16bit sizes really need matching pack conversion code
2679 */
2680 bool twiddle_after_convert = false;
2681 if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
2682 twiddle_after_convert = true;
2683 }
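   /*
    * In other words: for non-1d, non-3-channel destinations with 8-bit
    * channels we convert the float fs outputs first and only untwiddle /
    * transpose afterwards (via fs_twiddle_transpose), since by then there
    * are fewer, packed vectors to shuffle.
    */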
2684
2685 /*
2686 * Pixel twiddle from fragment shader order to memory order
2687 */
2688 unsigned src_count;
2689 if (!twiddle_after_convert) {
2690 src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
2691 dst_channels, fs_src, src, pad_inline);
2692 if (dual_source_blend) {
2693 generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
2694 fs_src1, src1, pad_inline);
2695 }
2696 } else {
2697 src_count = num_fullblock_fs * dst_channels;
2698 /*
2699 * We reorder things a bit here, so the cases for 4-wide and 8-wide
2700 * (AVX) turn out the same later when untwiddling/transpose (albeit
2701 * for true AVX2 path untwiddle needs to be different).
2702 * For now just order by colors first (so we can use unpack later).
2703 */
2704 for (unsigned j = 0; j < num_fullblock_fs; j++) {
2705 for (unsigned i = 0; i < dst_channels; i++) {
2706 src[i*num_fullblock_fs + j] = fs_src[j][i];
2707 if (dual_source_blend) {
2708 src1[i*num_fullblock_fs + j] = fs_src1[j][i];
2709 }
2710 }
2711 }
2712 }
2713
2714 src_channels = dst_channels < 3 ? dst_channels : 4;
2715 if (src_count != num_fullblock_fs * src_channels) {
2716 unsigned ds = src_count / (num_fullblock_fs * src_channels);
2717 row_type.length /= ds;
2718 fs_type.length = row_type.length;
2719 fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2720 }
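   /*
    * generate_fs_twiddle may have split the 8-wide vectors in half (e.g. for
    * single-channel destinations), in which case there are now twice as many
    * 4-wide vectors; the row/fs vector lengths were shrunk above to match.
    */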
2721
2722 struct lp_type blend_type = row_type;
2723 mask_type.length = 4;
2724
2725 /* Convert src to row_type */
2726 if (dual_source_blend) {
2727 struct lp_type old_row_type = row_type;
2728 lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
2729 src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type,
2730 src1, src_count, src1);
2731 } else {
2732 src_count = lp_build_conv_auto(gallivm, fs_type, &row_type,
2733 src, src_count, src);
2734 }
2735
2736 /* If the rows are not an SSE vector, combine them to become SSE size! */
2737 if ((row_type.width * row_type.length) % 128) {
2738 unsigned bits = row_type.width * row_type.length;
2739 unsigned combined;
2740
2741 assert(src_count >= (vector_width / bits));
2742
2743 const unsigned dst_count = src_count / (vector_width / bits);
2744
2745 combined = lp_build_concat_n(gallivm, row_type, src, src_count,
2746 src, dst_count);
2747 if (dual_source_blend) {
2748 lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count);
2749 }
2750
2751 row_type.length *= combined;
2752 src_count /= combined;
2753
2754 bits = row_type.width * row_type.length;
2755 assert(bits == 128 || bits == 256);
2756 }
2757
2758 if (twiddle_after_convert) {
2759 fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
2760 if (dual_source_blend) {
2761 fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
2762 }
2763 }
2764
2765 /*
2766 * Blend Colour conversion
2767 */
2768 LLVMValueRef blend_color =
2769 lp_jit_context_f_blend_color(gallivm, context_type, context_ptr);
2770 blend_color = LLVMBuildPointerCast(builder, blend_color,
2771 LLVMPointerType(fs_vec_type, 0),
2772 "");
2773 blend_color = LLVMBuildLoad2(builder, fs_vec_type,
2774 LLVMBuildGEP2(builder, fs_vec_type,
2775 blend_color,
2776 &i32_zero, 1, ""), "");
2777
2778 /* Convert */
2779 lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1,
2780 &blend_color, 1);
2781
2782 if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
2783 /*
2784 * since blending is done with floats, there was no conversion.
2785        * However, the rules for fixed point renderbuffers still
2786 * apply, that is we must clamp inputs to 0.0/1.0.
2787 * (This would apply to separate alpha conversion too but we currently
2788 * force has_alpha to be true.)
2789 * TODO: should skip this with "fake" blend, since post-blend conversion
2790 * will clamp anyway.
2791 * TODO: could also skip this if fragment color clamping is enabled.
2792 * We don't support it natively so it gets baked into the shader
2793 * however, so can't really tell here.
2794 */
2795 struct lp_build_context f32_bld;
2796 assert(row_type.floating);
2797 lp_build_context_init(&f32_bld, gallivm, row_type);
2798 for (unsigned i = 0; i < src_count; i++) {
2799 src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]);
2800 }
2801 if (dual_source_blend) {
2802 for (unsigned i = 0; i < src_count; i++) {
2803 src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]);
2804 }
2805 }
2806       /* probably can't be different from row_type but better safe than sorry... */
2807 lp_build_context_init(&f32_bld, gallivm, blend_type);
2808 blend_color = lp_build_clamp(&f32_bld, blend_color,
2809 f32_bld.zero, f32_bld.one);
2810 }
2811
2812 /* Extract alpha */
2813 LLVMValueRef blend_alpha =
2814 lp_build_extract_broadcast(gallivm, blend_type, row_type,
2815 blend_color,
2816 lp_build_const_int32(gallivm, 3));
2817
2818 /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */
2819 pad_inline &= (dst_channels * (block_size / src_count) * row_type.width)
2820 != vector_width;
2821 if (pad_inline) {
2822 /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */
2823 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2824 TGSI_NUM_CHANNELS, row_type.length);
2825 } else {
2826 /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */
2827 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2828 dst_channels, row_type.length);
2829 }
2830
2831 /*
2832 * Mask conversion
2833 */
2834 lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0],
2835 block_height, &src_mask[0]);
2836
2837 if (src_count < block_height) {
2838 lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
2839 } else if (src_count > block_height) {
2840 for (unsigned i = src_count; i > 0; --i) {
2841 unsigned pixels = block_size / src_count;
2842 unsigned idx = i - 1;
2843
2844 src_mask[idx] = lp_build_extract_range(gallivm,
2845 src_mask[(idx * pixels) / 4],
2846 (idx * pixels) % 4, pixels);
2847 }
2848 }
2849
2850 assert(mask_type.width == 32);
2851
2852 for (unsigned i = 0; i < src_count; ++i) {
2853 unsigned pixels = block_size / src_count;
2854 unsigned pixel_width = row_type.width * dst_channels;
2855
2856 if (pixel_width == 24) {
2857 mask_type.width = 8;
2858 mask_type.length = vector_width / mask_type.width;
2859 } else {
2860 mask_type.length = pixels;
2861 mask_type.width = row_type.width * dst_channels;
2862
2863 /*
2864 * If mask_type width is smaller than 32bit, this doesn't quite
2865 * generate the most efficient code (could use some pack).
2866 */
2867 src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
2868 lp_build_int_vec_type(gallivm,
2869 mask_type), "");
2870
2871 mask_type.length *= dst_channels;
2872 mask_type.width /= dst_channels;
2873 }
2874
2875 src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
2876 lp_build_int_vec_type(gallivm, mask_type),
2877 "");
2878 src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
2879 }
2880
2881 /*
2882 * Alpha conversion
2883 */
2884 if (!has_alpha) {
2885 struct lp_type alpha_type = fs_type;
2886 alpha_type.length = 4;
2887 convert_alpha(gallivm, row_type, alpha_type,
2888 block_size, block_height,
2889 src_count, dst_channels,
2890 pad_inline, src_alpha);
2891 if (dual_source_blend) {
2892 convert_alpha(gallivm, row_type, alpha_type,
2893 block_size, block_height,
2894 src_count, dst_channels,
2895 pad_inline, src1_alpha);
2896 }
2897 }
2898
2899
2900 /*
2901 * Load dst from memory
2902 */
2903 unsigned dst_count;
2904 if (src_count < block_height) {
2905 dst_count = block_height;
2906 } else {
2907 dst_count = src_count;
2908 }
2909
2910 dst_type.length *= block_size / dst_count;
2911
2912 if (format_expands_to_float_soa(out_format_desc)) {
2913 /*
2914       * we need multiple values at once for the conversion, so we might as well
2915 * load them vectorized here too instead of concatenating later.
2916 * (Still need concatenation later for 8-wide vectors).
2917 */
2918 dst_count = block_height;
2919 dst_type.length = block_width;
2920 }
2921
2922 /*
2923     * Compute the alignment of the destination pointer in bytes.
2924     * We fetch 1-4 pixels; if the format has pot alignment then those fetches
2925     * are always aligned by MIN2(16, fetch_width), except for buffers (not
2926     * 1d tex, but we can't distinguish that here), so we need to stick with
2927     * per-pixel alignment in that case.
2928 */
2929 unsigned dst_alignment;
2930 if (is_1d) {
2931 dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
2932 } else {
2933 dst_alignment = dst_type.length * dst_type.width / 8;
2934 }
2935    /* Force power-of-two alignment by keeping only the lowest set bit */
2936 dst_alignment = 1 << (ffs(dst_alignment) - 1);
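   /*
    * e.g. a 24-bit (3 bytes per pixel) format in the 1d path starts out with
    * alignment 3, which the lowest-set-bit trick above reduces to 1.
    */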
2937 /*
2938 * Resource base and stride pointers are aligned to 16 bytes, so that's
2939 * the maximum alignment we can guarantee
2940 */
2941 dst_alignment = MIN2(16, dst_alignment);
2942
2943 struct lp_type ls_type = dst_type;
2944
2945 if (dst_count > src_count) {
2946 if ((dst_type.width == 8 || dst_type.width == 16) &&
2947 util_is_power_of_two_or_zero(dst_type.length) &&
2948 dst_type.length * dst_type.width < 128) {
2949 /*
2950 * Never try to load values as 4xi8 which we will then
2951 * concatenate to larger vectors. This gives llvm a real
2952 * headache (the problem is the type legalizer (?) will
2953 * try to load that as 4xi8 zext to 4xi32 to fill the vector,
2954 * then the shuffles to concatenate are more or less impossible
2955 * - llvm is easily capable of generating a sequence of 32
2956 * pextrb/pinsrb instructions for that. Albeit it appears to
2957 * be fixed in llvm 4.0. So, load and concatenate with 32bit
2958 * width to avoid the trouble (16bit seems not as bad, llvm
2959 * probably recognizes the load+shuffle as only one shuffle
2960 * is necessary, but we can do just the same anyway).
2961 */
2962 ls_type.length = dst_type.length * dst_type.width / 32;
2963 ls_type.width = 32;
2964 }
2965 }
2966
2967 if (is_1d) {
2968 load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
2969 dst, ls_type, dst_count / 4, dst_alignment);
2970 for (unsigned i = dst_count / 4; i < dst_count; i++) {
2971 dst[i] = lp_build_undef(gallivm, ls_type);
2972 }
2973 } else {
2974 load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
2975 block_height, dst, ls_type, dst_count,
2976 dst_alignment);
2977 }
2978
2979
2980 /*
2981 * Convert from dst/output format to src/blending format.
2982 *
2983 * This is necessary as we can only read 1 row from memory at a time,
2984    * so the minimum dst_count we will ever have at this point is 4.
2985 *
2986 * With, for example, R8 format you can have all 16 pixels in a 128 bit
2987 * vector, this will take the 4 dsts and combine them into 1 src so we can
2988 * perform blending on all 16 pixels in that single vector at once.
2989 */
2990 if (dst_count > src_count) {
2991 if (ls_type.length != dst_type.length && ls_type.length == 1) {
2992 LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type);
2993 LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1);
2994 for (unsigned i = 0; i < dst_count; i++) {
2995 dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, "");
2996 }
2997 }
2998
2999 lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count);
3000
3001 if (ls_type.length != dst_type.length) {
3002 struct lp_type tmp_type = dst_type;
3003 tmp_type.length = dst_type.length * 4 / src_count;
3004 for (unsigned i = 0; i < src_count; i++) {
3005 dst[i] = LLVMBuildBitCast(builder, dst[i],
3006 lp_build_vec_type(gallivm, tmp_type), "");
3007 }
3008 }
3009 }
3010
3011 /*
3012 * Blending
3013 */
3014 /* XXX this is broken for RGB8 formats -
3015 * they get expanded from 12 to 16 elements (to include alpha)
3016 * by convert_to_blend_type then reduced to 15 instead of 12
3017 * by convert_from_blend_type (a simple fix though breaks A8...).
3018    * R16G16B16 also crashes, though differently; something seems to go wrong
3019    * inside llvm when handling npot vector sizes.
3020 * It seems some cleanup could be done here (like skipping conversion/blend
3021 * when not needed).
3022 */
3023 convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
3024 row_type, dst, src_count);
3025
3026 /*
3027 * FIXME: Really should get logic ops / masks out of generic blend / row
3028 * format. Logic ops will definitely not work on the blend float format
3029 * used for SRGB here and I think OpenGL expects this to work as expected
3030 * (that is incoming values converted to srgb then logic op applied).
3031 */
3032 for (unsigned i = 0; i < src_count; ++i) {
3033 dst[i] = lp_build_blend_aos(gallivm,
3034 &variant->key.blend,
3035 out_format,
3036 row_type,
3037 rt,
3038 src[i],
3039 has_alpha ? NULL : src_alpha[i],
3040 src1[i],
3041 has_alpha ? NULL : src1_alpha[i],
3042 dst[i],
3043 partial_mask ? src_mask[i] : NULL,
3044 blend_color,
3045 has_alpha ? NULL : blend_alpha,
3046 swizzle,
3047 pad_inline ? 4 : dst_channels);
3048 }
3049
3050 convert_from_blend_type(gallivm, block_size, out_format_desc,
3051 row_type, dst_type, dst, src_count);
3052
3053 /* Split the blend rows back to memory rows */
3054 if (dst_count > src_count) {
3055 row_type.length = dst_type.length * (dst_count / src_count);
3056
3057 if (src_count == 1) {
3058 dst[1] = lp_build_extract_range(gallivm, dst[0],
3059 row_type.length / 2,
3060 row_type.length / 2);
3061 dst[0] = lp_build_extract_range(gallivm, dst[0],
3062 0, row_type.length / 2);
3063
3064 row_type.length /= 2;
3065 src_count *= 2;
3066 }
3067
3068 dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2,
3069 row_type.length / 2);
3070 dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2);
3071 dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2,
3072 row_type.length / 2);
3073 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
3074
3075 row_type.length /= 2;
3076 src_count *= 2;
3077 }
3078
3079 /*
3080 * Store blend result to memory
3081 */
3082 if (is_1d) {
3083 store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
3084 dst, dst_type, dst_count / 4, dst_alignment);
3085 } else {
3086 store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
3087 block_height,
3088 dst, dst_type, dst_count, dst_alignment);
3089 }
3090
3091 if (do_branch) {
3092 lp_build_mask_end(&mask_ctx);
3093 }
3094
3095 if (fpstate) {
3096 lp_build_fpstate_set(gallivm, fpstate);
3097 }
3098 }
3099
3100
3101 /**
3102 * Generate the runtime callable function for the whole fragment pipeline.
3103 * Note that the function which we generate operates on a block of 16
3104 * pixels at a time. The block contains 2x2 quads. Each quad contains
3105 * 2x2 pixels.
3106 */
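/*
 * Illustrative layout of the 4x4 pixel block handled per call (a sketch
 * only; the exact pixel ordering inside the stamp is defined by the mask
 * and interpolation code, not by this diagram):
 *
 *    quad 0      quad 1
 *     p0  p1      p4  p5
 *     p2  p3      p6  p7
 *
 *    quad 2      quad 3
 *     p8  p9      p12 p13
 *     p10 p11     p14 p15
 */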
3107 static void
3108 generate_fragment(struct llvmpipe_context *lp,
3109 struct lp_fragment_shader *shader,
3110 struct lp_fragment_shader_variant *variant,
3111 unsigned partial_mask)
3112 {
3113 assert(partial_mask == RAST_WHOLE ||
3114 partial_mask == RAST_EDGE_TEST);
3115
3116 struct nir_shader *nir = shader->base.ir.nir;
3117 struct gallivm_state *gallivm = variant->gallivm;
3118 struct lp_fragment_shader_variant_key *key = &variant->key;
3119 struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
3120 LLVMTypeRef fs_elem_type;
3121 LLVMTypeRef blend_vec_type;
3122 LLVMTypeRef arg_types[16];
3123 LLVMTypeRef func_type;
3124 LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
3125 LLVMTypeRef int32p_type = LLVMPointerType(int32_type, 0);
3126 LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
3127 LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
3128 LLVMValueRef context_ptr;
3129 LLVMValueRef resources_ptr;
3130 LLVMValueRef x;
3131 LLVMValueRef y;
3132 LLVMValueRef a0_ptr;
3133 LLVMValueRef dadx_ptr;
3134 LLVMValueRef dady_ptr;
3135 LLVMValueRef color_ptr_ptr;
3136 LLVMValueRef stride_ptr;
3137 LLVMValueRef color_sample_stride_ptr;
3138 LLVMValueRef depth_ptr;
3139 LLVMValueRef depth_stride;
3140 LLVMValueRef depth_sample_stride;
3141 LLVMValueRef mask_input;
3142 LLVMValueRef thread_data_ptr;
3143 LLVMBasicBlockRef block;
3144 LLVMBuilderRef builder;
3145 struct lp_build_interp_soa_context interp;
3146 LLVMValueRef fs_mask[(16 / 4) * LP_MAX_SAMPLES];
3147 LLVMValueRef fs_out_color[LP_MAX_SAMPLES][PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
3148 LLVMValueRef function;
3149 LLVMValueRef facing;
3150 const bool dual_source_blend = key->blend.rt[0].blend_enable &&
3151 util_blend_state_is_dual(&key->blend, 0);
3152
3153 assert(lp_native_vector_width / 32 >= 4);
3154
3155 /* Adjust color input interpolation according to flatshade state:
3156 */
3157 nir_foreach_shader_in_variable(var, nir) {
3158 unsigned idx = var->data.driver_location;
3159 unsigned slots = nir_variable_count_slots(var, var->type);
3160 memcpy(&inputs[idx], &shader->inputs[idx], (sizeof inputs[0] * slots));
3161 for (unsigned s = 0; s < slots; s++) {
3162 if (inputs[idx + s].interp == LP_INTERP_COLOR)
3163 inputs[idx + s].interp = key->flatshade ? LP_INTERP_CONSTANT : LP_INTERP_PERSPECTIVE;
3164 }
3165 }
3166
3167 /* TODO: actually pick these based on the fs and color buffer
3168 * characteristics. */
3169
3170 struct lp_type fs_type;
3171 memset(&fs_type, 0, sizeof fs_type);
3172 fs_type.floating = true; /* floating point values */
3173 fs_type.sign = true; /* values are signed */
3174 fs_type.norm = false; /* values are not limited to [0,1] or [-1,1] */
3175 fs_type.width = 32; /* 32-bit float */
3176 fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
3177
3178 struct lp_type blend_type;
3179 memset(&blend_type, 0, sizeof blend_type);
3180 blend_type.floating = false; /* values are integers */
3181 blend_type.sign = false; /* values are unsigned */
3182 blend_type.norm = true; /* values are in [0,1] or [-1,1] */
3183 blend_type.width = 8; /* 8-bit ubyte values */
3184 blend_type.length = 16; /* 16 elements per vector */
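/*
 * Worked example (illustrative): with 128-bit SIMD fs_type describes
 * 128/32 = 4 floats per vector, so a 4x4 stamp needs 16/4 = 4 shader
 * loops; with 256-bit SIMD it describes 8 floats and needs 2 loops.
 * blend_type always describes 16 x 8-bit normalized values, i.e. a
 * 128-bit vector.
 */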
3185
3186 /*
3187 * Generate the function prototype. Any change here must be reflected in
3188 * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
3189 */
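/*
 * For orientation, the C-side function pointer this prototype must match
 * looks roughly like the sketch below. This is illustrative only; the
 * authoritative definition is lp_jit_frag_func in lp_jit.h, and the struct
 * names of the pointer arguments are assumptions here.
 *
 *    void fs_variant(struct lp_jit_context *context,
 *                    struct lp_jit_resources *resources,
 *                    uint32_t x, uint32_t y, uint32_t facing,
 *                    const float *a0, const float *dadx, const float *dady,
 *                    uint8_t **color, uint8_t *depth,
 *                    uint64_t mask_input,
 *                    struct lp_jit_thread_data *thread_data,
 *                    int32_t *stride, int32_t depth_stride,
 *                    int32_t *color_sample_stride,
 *                    int32_t depth_sample_stride);
 */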
3190
3191 fs_elem_type = lp_build_elem_type(gallivm, fs_type);
3192
3193 blend_vec_type = lp_build_vec_type(gallivm, blend_type);
3194
3195 char func_name[64];
3196 snprintf(func_name, sizeof(func_name), "fs_variant_%s",
3197 partial_mask ? "partial" : "whole");
3198
3199 arg_types[0] = variant->jit_context_ptr_type; /* context */
3200 arg_types[1] = variant->jit_resources_ptr_type; /* resources */
3201 arg_types[2] = int32_type; /* x */
3202 arg_types[3] = int32_type; /* y */
3203 arg_types[4] = int32_type; /* facing */
3204 arg_types[5] = LLVMPointerType(fs_elem_type, 0); /* a0 */
3205 arg_types[6] = LLVMPointerType(fs_elem_type, 0); /* dadx */
3206 arg_types[7] = LLVMPointerType(fs_elem_type, 0); /* dady */
3207 arg_types[8] = LLVMPointerType(int8p_type, 0); /* color */
3208 arg_types[9] = int8p_type; /* depth */
3209 arg_types[10] = LLVMInt64TypeInContext(gallivm->context); /* mask_input */
3210 arg_types[11] = variant->jit_thread_data_ptr_type; /* per thread data */
3211 arg_types[12] = int32p_type; /* stride */
3212 arg_types[13] = int32_type; /* depth_stride */
3213 arg_types[14] = int32p_type; /* color sample strides */
3214 arg_types[15] = int32_type; /* depth sample stride */
3215
3216 func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
3217 arg_types, ARRAY_SIZE(arg_types), 0);
3218
3219 function = LLVMAddFunction(gallivm->module, func_name, func_type);
3220 LLVMSetFunctionCallConv(function, LLVMCCallConv);
3221
3222 variant->function[partial_mask] = function;
3223 variant->function_name[partial_mask] = MALLOC(strlen(func_name)+1);
3224 strcpy(variant->function_name[partial_mask], func_name);
3225
3226 /* XXX: need to propagate noalias down into color param now that we are
3227 * passing a pointer-to-pointer?
3228 */
3229 for (unsigned i = 0; i < ARRAY_SIZE(arg_types); ++i)
3230 if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
3231 lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3232
3233 if (variant->gallivm->cache->data_size) {
3234 gallivm_stub_func(gallivm, function);
3235 return;
3236 }
3237
3238 context_ptr = LLVMGetParam(function, 0);
3239 resources_ptr = LLVMGetParam(function, 1);
3240 x = LLVMGetParam(function, 2);
3241 y = LLVMGetParam(function, 3);
3242 facing = LLVMGetParam(function, 4);
3243 a0_ptr = LLVMGetParam(function, 5);
3244 dadx_ptr = LLVMGetParam(function, 6);
3245 dady_ptr = LLVMGetParam(function, 7);
3246 color_ptr_ptr = LLVMGetParam(function, 8);
3247 depth_ptr = LLVMGetParam(function, 9);
3248 mask_input = LLVMGetParam(function, 10);
3249 thread_data_ptr = LLVMGetParam(function, 11);
3250 stride_ptr = LLVMGetParam(function, 12);
3251 depth_stride = LLVMGetParam(function, 13);
3252 color_sample_stride_ptr = LLVMGetParam(function, 14);
3253 depth_sample_stride = LLVMGetParam(function, 15);
3254
3255 lp_build_name(context_ptr, "context");
3256 lp_build_name(resources_ptr, "resources");
3257 lp_build_name(x, "x");
3258 lp_build_name(y, "y");
3259 lp_build_name(a0_ptr, "a0");
3260 lp_build_name(dadx_ptr, "dadx");
3261 lp_build_name(dady_ptr, "dady");
3262 lp_build_name(color_ptr_ptr, "color_ptr_ptr");
3263 lp_build_name(depth_ptr, "depth");
3264 lp_build_name(mask_input, "mask_input");
3265 lp_build_name(thread_data_ptr, "thread_data");
3266 lp_build_name(stride_ptr, "stride_ptr");
3267 lp_build_name(depth_stride, "depth_stride");
3268 lp_build_name(color_sample_stride_ptr, "color_sample_stride_ptr");
3269 lp_build_name(depth_sample_stride, "depth_sample_stride");
3270
3271 /*
3272 * Function body
3273 */
3274
3275 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3276 builder = gallivm->builder;
3277 assert(builder);
3278 LLVMPositionBuilderAtEnd(builder, block);
3279
3280 /* code generated texture sampling */
3281 struct lp_build_sampler_soa *sampler =
3282 lp_llvm_sampler_soa_create(lp_fs_variant_key_samplers(key),
3283 MAX2(key->nr_samplers,
3284 key->nr_sampler_views));
3285 struct lp_build_image_soa *image =
3286 lp_bld_llvm_image_soa_create(lp_fs_variant_key_images(key), key->nr_images);
3287
3288 unsigned num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
3289 /* for 1d resources only run "upper half" of stamp */
3290 if (key->resource_1d)
3291 num_fs /= 2;
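/*
 * Worked example (illustrative): with 8-wide fs vectors this gives
 * 16 / 8 = 2 loops per 4x4 stamp, halved to 1 for 1d resources since
 * only the upper half of the stamp carries live pixels there.
 */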
3292
3293 {
3294 LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs);
3295 LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type);
3296 LLVMValueRef num_loop_samp =
3297 lp_build_const_int32(gallivm, num_fs * key->coverage_samples);
3298 LLVMValueRef mask_store =
3299 lp_build_array_alloca(gallivm, mask_type,
3300 num_loop_samp, "mask_store");
3301 LLVMTypeRef flt_type = LLVMFloatTypeInContext(gallivm->context);
3302 LLVMValueRef glob_sample_pos =
3303 LLVMAddGlobal(gallivm->module,
3304 LLVMArrayType(flt_type, key->coverage_samples * 2), "");
3305 LLVMSetLinkage(glob_sample_pos, LLVMInternalLinkage);
3306 LLVMValueRef sample_pos_array;
3307
3308 if (key->multisample && key->coverage_samples == 4) {
3309 LLVMValueRef sample_pos_arr[8];
3310 for (unsigned i = 0; i < 4; i++) {
3311 sample_pos_arr[i * 2] = LLVMConstReal(flt_type,
3312 lp_sample_pos_4x[i][0]);
3313 sample_pos_arr[i * 2 + 1] = LLVMConstReal(flt_type,
3314 lp_sample_pos_4x[i][1]);
3315 }
3316 sample_pos_array =
3317 LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3318 sample_pos_arr, 8);
3319 } else {
3320 LLVMValueRef sample_pos_arr[2];
3321 sample_pos_arr[0] = LLVMConstReal(flt_type, 0.5);
3322 sample_pos_arr[1] = LLVMConstReal(flt_type, 0.5);
3323 sample_pos_array =
3324 LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3325 sample_pos_arr, 2);
3326 }
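/*
 * The initializer built above is a flat array of interleaved x,y sample
 * positions. Illustrative shape for 4x MSAA (values come from
 * lp_sample_pos_4x):
 *
 *    { x0, y0, x1, y1, x2, y2, x3, y3 }
 *
 * and simply { 0.5, 0.5 } for the single-sample case.
 */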
3327 LLVMSetInitializer(glob_sample_pos, sample_pos_array);
3328
3329 LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
3330 bool pixel_center_integer = nir->info.fs.pixel_center_integer;
3331
3332 /*
3333 * The shader input interpolation info is not explicitly baked in the
3334 * shader key, but everything it derives from (TGSI, and flatshade) is
3335 * already included in the shader key.
3336 */
3337 lp_build_interp_soa_init(&interp,
3338 gallivm,
3339 nir->num_inputs,
3340 inputs,
3341 pixel_center_integer,
3342 key->coverage_samples,
3343 LLVMTypeOf(sample_pos_array),
3344 glob_sample_pos,
3345 num_loop,
3346 builder, fs_type,
3347 a0_ptr, dadx_ptr, dady_ptr,
3348 x, y);
3349
3350 for (unsigned i = 0; i < num_fs; i++) {
3351 if (key->multisample) {
3352 LLVMValueRef smask_val =
3353 LLVMBuildLoad2(builder, int32_type,
3354 lp_jit_context_sample_mask(gallivm, variant->jit_context_type, context_ptr),
3355 "");
3356
3357 /*
3358 * For multisampling, extract the per-sample mask from the
3359 * incoming 64-bit mask and store it to the per-sample mask storage.
3360 * OR all of them together to generate the fragment shader mask
3361 * (sample shading TODO). Take the incoming state coverage mask
3362 * into account.
3363 */
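/*
 * Scalar sketch of what the generated IR computes for each sample s of
 * loop iteration i (illustrative only, helper names are not real):
 *
 *    s_mask = quad_mask(i, s, mask_input);
 *    if (!(state_sample_mask & (1 << s)))
 *       s_mask = 0;
 *    mask_store[i + s * num_fs] = s_mask;
 */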
3364 for (unsigned s = 0; s < key->coverage_samples; s++) {
3365 LLVMValueRef sindexi =
3366 lp_build_const_int32(gallivm, i + (s * num_fs));
3367 LLVMValueRef sample_mask_ptr =
3368 LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1,
3369 "sample_mask_ptr");
3370 LLVMValueRef s_mask =
3371 generate_quad_mask(gallivm, fs_type,
3372 i * fs_type.length / 4, s, mask_input);
3373 LLVMValueRef smask_bit =
3374 LLVMBuildAnd(builder, smask_val,
3375 lp_build_const_int32(gallivm, (1 << s)), "");
3376 LLVMValueRef cmp =
3377 LLVMBuildICmp(builder, LLVMIntNE, smask_bit,
3378 lp_build_const_int32(gallivm, 0), "");
3379 smask_bit = LLVMBuildSExt(builder, cmp, int32_type, "");
3380 smask_bit = lp_build_broadcast(gallivm, mask_type, smask_bit);
3381
3382 s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
3383 LLVMBuildStore(builder, s_mask, sample_mask_ptr);
3384 }
3385 } else {
3386 LLVMValueRef mask;
3387 LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
3388 LLVMValueRef mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
3389 &indexi, 1, "mask_ptr");
3390
3391 if (partial_mask) {
3392 mask = generate_quad_mask(gallivm, fs_type,
3393 i * fs_type.length / 4, 0, mask_input);
3394 } else {
3395 mask = lp_build_const_int_vec(gallivm, fs_type, ~0);
3396 }
3397 LLVMBuildStore(builder, mask, mask_ptr);
3398 }
3399 }
3400
3401 generate_fs_loop(gallivm,
3402 shader, key,
3403 builder,
3404 fs_type,
3405 variant->jit_context_type,
3406 context_ptr,
3407 variant->jit_resources_type,
3408 resources_ptr,
3409 LLVMTypeOf(sample_pos_array),
3410 glob_sample_pos,
3411 num_loop,
3412 &interp,
3413 sampler,
3414 image,
3415 mask_type,
3416 mask_store, /* output */
3417 color_store,
3418 depth_ptr,
3419 depth_stride,
3420 depth_sample_stride,
3421 color_ptr_ptr,
3422 stride_ptr,
3423 color_sample_stride_ptr,
3424 facing,
3425 variant->jit_thread_data_type,
3426 thread_data_ptr);
3427
3428 LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);
3429 for (unsigned i = 0; i < num_fs; i++) {
3430 LLVMValueRef ptr;
3431 for (unsigned s = 0; s < key->coverage_samples; s++) {
3432 int idx = (i + (s * num_fs));
3433 LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3434 ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1, "");
3435
3436 fs_mask[idx] = LLVMBuildLoad2(builder, mask_type, ptr, "smask");
3437 }
3438
3439 for (unsigned s = 0; s < key->min_samples; s++) {
3440 /* This layout is messy; it needs to be reorganized */
3441 int idx = s * num_fs + i;
3442 LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3443 for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3444 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3445 ptr = LLVMBuildGEP2(builder, fs_vec_type,
3446 color_store[cbuf][chan],
3447 &sindexi, 1, "");
3448 fs_out_color[s][cbuf][chan][i] = ptr;
3449 }
3450 }
3451 if (dual_source_blend) {
3452 /* we only support one dual-source blend target, hence always use
3453 * output 1
3454 */
3455 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3456 ptr = LLVMBuildGEP2(builder, fs_vec_type,
3457 color_store[1][chan],
3458 &sindexi, 1, "");
3459 fs_out_color[s][1][chan][i] = ptr;
3460 }
3461 }
3462 }
3463 }
3464 }
3465
3466 lp_bld_llvm_sampler_soa_destroy(sampler);
3467 lp_bld_llvm_image_soa_destroy(image);
3468
3469 /* Loop over color outputs / color buffers to do blending */
3470 for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3471 if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE &&
3472 (key->blend.rt[cbuf].blend_enable || key->blend.logicop_enable ||
3473 find_output_by_frag_result(nir, FRAG_RESULT_DATA0 + cbuf) != -1)) {
3474 LLVMValueRef color_ptr;
3475 LLVMValueRef stride;
3476 LLVMValueRef sample_stride = NULL;
3477 LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
3478
3479 bool do_branch = ((key->depth.enabled
3480 || key->stencil[0].enabled
3481 || key->alpha.enabled)
3482 && !nir->info.fs.uses_discard);
3483
3484 color_ptr = LLVMBuildLoad2(builder, int8p_type,
3485 LLVMBuildGEP2(builder, int8p_type, color_ptr_ptr,
3486 &index, 1, ""),
3487 "");
3488
3489 stride = LLVMBuildLoad2(builder, int32_type,
3490 LLVMBuildGEP2(builder, int32_type, stride_ptr,
3491 &index, 1, ""),
3492 "");
3493
3494 if (key->cbuf_nr_samples[cbuf] > 1)
3495 sample_stride = LLVMBuildLoad2(builder, int32_type,
3496 LLVMBuildGEP2(builder,
3497 int32_type,
3498 color_sample_stride_ptr,
3499 &index, 1, ""), "");
3500
3501 for (unsigned s = 0; s < key->cbuf_nr_samples[cbuf]; s++) {
3502 unsigned mask_idx = num_fs * (key->multisample ? s : 0);
3503 unsigned out_idx = key->min_samples == 1 ? 0 : s;
3504 LLVMValueRef out_ptr = color_ptr;
3505
3506 if (sample_stride) {
3507 LLVMValueRef sample_offset =
3508 LLVMBuildMul(builder, sample_stride,
3509 lp_build_const_int32(gallivm, s), "");
3510 out_ptr = LLVMBuildGEP2(builder, int8_type, out_ptr, &sample_offset, 1, "");
3511 }
3512 out_ptr = LLVMBuildBitCast(builder, out_ptr,
3513 LLVMPointerType(blend_vec_type, 0), "");
3514
3515 lp_build_name(out_ptr, "color_ptr%d", cbuf);
3516
3517 generate_unswizzled_blend(gallivm, cbuf, variant,
3518 key->cbuf_format[cbuf],
3519 num_fs, fs_type, &fs_mask[mask_idx],
3520 fs_out_color[out_idx],
3521 variant->jit_context_type,
3522 context_ptr, blend_vec_type, out_ptr, stride,
3523 partial_mask, do_branch);
3524 }
3525 }
3526 }
3527
3528 LLVMBuildRetVoid(builder);
3529
3530 gallivm_verify_function(gallivm, function);
3531 }
3532
3533
3534 static void
3535 dump_fs_variant_key(struct lp_fragment_shader_variant_key *key)
3536 {
3537 debug_printf("fs variant %p:\n", (void *) key);
3538
3539 if (key->flatshade) {
3540 debug_printf("flatshade = 1\n");
3541 }
3542 if (key->depth_clamp)
3543 debug_printf("depth_clamp = 1\n");
3544
3545 if (key->restrict_depth_values)
3546 debug_printf("restrict_depth_values = 1\n");
3547
3548 if (key->multisample) {
3549 debug_printf("multisample = 1\n");
3550 debug_printf("coverage samples = %d\n", key->coverage_samples);
3551 debug_printf("min samples = %d\n", key->min_samples);
3552 }
3553 for (unsigned i = 0; i < key->nr_cbufs; ++i) {
3554 debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
3555 debug_printf("cbuf nr_samples[%u] = %d\n", i, key->cbuf_nr_samples[i]);
3556 }
3557 if (key->depth.enabled || key->stencil[0].enabled) {
3558 debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
3559 debug_printf("depth nr_samples = %d\n", key->zsbuf_nr_samples);
3560 }
3561 if (key->depth.enabled) {
3562 debug_printf("depth.func = %s\n", util_str_func(key->depth.func, true));
3563 debug_printf("depth.writemask = %u\n", key->depth.writemask);
3564 }
3565
3566 for (unsigned i = 0; i < 2; ++i) {
3567 if (key->stencil[i].enabled) {
3568 debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, true));
3569 debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, true));
3570 debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, true));
3571 debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, true));
3572 debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
3573 debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
3574 }
3575 }
3576
3577 if (key->alpha.enabled) {
3578 debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, true));
3579 }
3580
3581 if (key->occlusion_count) {
3582 debug_printf("occlusion_count = 1\n");
3583 }
3584
3585 if (key->blend.logicop_enable) {
3586 debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, true));
3587 } else if (key->blend.rt[0].blend_enable) {
3588 debug_printf("blend.rgb_func = %s\n", util_str_blend_func (key->blend.rt[0].rgb_func, true));
3589 debug_printf("blend.rgb_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_src_factor, true));
3590 debug_printf("blend.rgb_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, true));
3591 debug_printf("blend.alpha_func = %s\n", util_str_blend_func (key->blend.rt[0].alpha_func, true));
3592 debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, true));
3593 debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, true));
3594 }
3595 debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
3596 if (key->blend.alpha_to_coverage) {
3597 debug_printf("blend.alpha_to_coverage is enabled\n");
3598 }
3599 for (unsigned i = 0; i < key->nr_samplers; ++i) {
3600 const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3601 const struct lp_static_sampler_state *sampler = &samplers[i].sampler_state;
3602 debug_printf("sampler[%u] = \n", i);
3603 debug_printf(" .wrap = %s %s %s\n",
3604 util_str_tex_wrap(sampler->wrap_s, true),
3605 util_str_tex_wrap(sampler->wrap_t, true),
3606 util_str_tex_wrap(sampler->wrap_r, true));
3607 debug_printf(" .min_img_filter = %s\n",
3608 util_str_tex_filter(sampler->min_img_filter, true));
3609 debug_printf(" .min_mip_filter = %s\n",
3610 util_str_tex_mipfilter(sampler->min_mip_filter, true));
3611 debug_printf(" .mag_img_filter = %s\n",
3612 util_str_tex_filter(sampler->mag_img_filter, true));
3613 if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
3614 debug_printf(" .compare_func = %s\n", util_str_func(sampler->compare_func, true));
3615 debug_printf(" .normalized_coords = %u\n", sampler->normalized_coords);
3616 debug_printf(" .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
3617 debug_printf(" .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
3618 debug_printf(" .apply_min_lod = %u\n", sampler->apply_min_lod);
3619 debug_printf(" .apply_max_lod = %u\n", sampler->apply_max_lod);
3620 debug_printf(" .reduction_mode = %u\n", sampler->reduction_mode);
3621 debug_printf(" .aniso = %u\n", sampler->aniso);
3622 }
3623 for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
3624 const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3625 const struct lp_static_texture_state *texture = &samplers[i].texture_state;
3626 debug_printf("texture[%u] = \n", i);
3627 debug_printf(" .format = %s\n",
3628 util_format_name(texture->format));
3629 debug_printf(" .target = %s\n",
3630 util_str_tex_target(texture->target, true));
3631 debug_printf(" .level_zero_only = %u\n",
3632 texture->level_zero_only);
3633 debug_printf(" .pot = %u %u %u\n",
3634 texture->pot_width,
3635 texture->pot_height,
3636 texture->pot_depth);
3637 }
3638 struct lp_image_static_state *images = lp_fs_variant_key_images(key);
3639 for (unsigned i = 0; i < key->nr_images; ++i) {
3640 const struct lp_static_texture_state *image = &images[i].image_state;
3641 debug_printf("image[%u] = \n", i);
3642 debug_printf(" .format = %s\n",
3643 util_format_name(image->format));
3644 debug_printf(" .target = %s\n",
3645 util_str_tex_target(image->target, true));
3646 debug_printf(" .level_zero_only = %u\n",
3647 image->level_zero_only);
3648 debug_printf(" .pot = %u %u %u\n",
3649 image->pot_width,
3650 image->pot_height,
3651 image->pot_depth);
3652 }
3653 }
3654
3655
3656 const char *
3657 lp_debug_fs_kind(enum lp_fs_kind kind)
3658 {
3659 switch (kind) {
3660 case LP_FS_KIND_GENERAL:
3661 return "GENERAL";
3662 case LP_FS_KIND_BLIT_RGBA:
3663 return "BLIT_RGBA";
3664 case LP_FS_KIND_BLIT_RGB1:
3665 return "BLIT_RGB1";
3666 case LP_FS_KIND_AERO_MINIFICATION:
3667 return "AERO_MINIFICATION";
3668 case LP_FS_KIND_LLVM_LINEAR:
3669 return "LLVM_LINEAR";
3670 default:
3671 return "unknown";
3672 }
3673 }
3674
3675
3676 void
3677 lp_debug_fs_variant(struct lp_fragment_shader_variant *variant)
3678 {
3679 debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n",
3680 variant->shader->no, variant->no);
3681 nir_print_shader(variant->shader->base.ir.nir, stderr);
3682 dump_fs_variant_key(&variant->key);
3683 debug_printf("variant->opaque = %u\n", variant->opaque);
3684 debug_printf("variant->potentially_opaque = %u\n", variant->potentially_opaque);
3685 debug_printf("variant->blit = %u\n", variant->blit);
3686 debug_printf("shader->kind = %s\n", lp_debug_fs_kind(variant->shader->kind));
3687 debug_printf("\n");
3688 }
3689
3690
3691 static void
3692 lp_fs_get_ir_cache_key(struct lp_fragment_shader_variant *variant,
3693 unsigned char ir_sha1_cache_key[20])
3694 {
3695 struct blob blob = { 0 };
3696 unsigned ir_size;
3697 void *ir_binary;
3698
3699 blob_init(&blob);
3700 nir_serialize(&blob, variant->shader->base.ir.nir, true);
3701 ir_binary = blob.data;
3702 ir_size = blob.size;
3703
3704 struct mesa_sha1 ctx;
3705 _mesa_sha1_init(&ctx);
3706 _mesa_sha1_update(&ctx, &variant->key, variant->shader->variant_key_size);
3707 _mesa_sha1_update(&ctx, ir_binary, ir_size);
3708 _mesa_sha1_final(&ctx, ir_sha1_cache_key);
3709
3710 blob_finish(&blob);
3711 }
3712
3713
3714 /**
3715 * Generate a new fragment shader variant from the shader code and
3716 * other state indicated by the key.
3717 */
3718 static struct lp_fragment_shader_variant *
3719 generate_variant(struct llvmpipe_context *lp,
3720 struct lp_fragment_shader *shader,
3721 const struct lp_fragment_shader_variant_key *key)
3722 {
3723 struct nir_shader *nir = shader->base.ir.nir;
3724 struct lp_fragment_shader_variant *variant =
3725 MALLOC(sizeof *variant + shader->variant_key_size - sizeof variant->key);
3726 if (!variant)
3727 return NULL;
3728
3729 memset(variant, 0, sizeof(*variant));
3730
3731 pipe_reference_init(&variant->reference, 1);
3732 lp_fs_reference(lp, &variant->shader, shader);
3733
3734 memcpy(&variant->key, key, shader->variant_key_size);
3735
3736 struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
3737 struct lp_cached_code cached = { 0 };
3738 unsigned char ir_sha1_cache_key[20];
3739 bool needs_caching = false;
3740 if (shader->base.ir.nir) {
3741 lp_fs_get_ir_cache_key(variant, ir_sha1_cache_key);
3742
3743 lp_disk_cache_find_shader(screen, &cached, ir_sha1_cache_key);
3744 if (!cached.data_size)
3745 needs_caching = true;
3746 }
3747
3748 char module_name[64];
3749 snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
3750 shader->no, shader->variants_created);
3751 variant->gallivm = gallivm_create(module_name, &lp->context, &cached);
3752 if (!variant->gallivm) {
3753 FREE(variant);
3754 return NULL;
3755 }
3756
3757 variant->list_item_global.base = variant;
3758 variant->list_item_local.base = variant;
3759 variant->no = shader->variants_created++;
3760
3761 /*
3762 * Determine whether we are touching all channels in the color buffer.
3763 */
3764 const struct util_format_description *cbuf0_format_desc = NULL;
3765 bool fullcolormask = false;
3766 if (key->nr_cbufs == 1) {
3767 cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
3768 fullcolormask = util_format_colormask_full(cbuf0_format_desc,
3769 key->blend.rt[0].colormask);
3770 }
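/*
 * Illustrative example: for PIPE_FORMAT_B8G8R8X8_UNORM a colormask
 * covering only RGB already counts as full, since the format stores no
 * alpha to preserve, whereas B8G8R8A8_UNORM needs all four colormask bits
 * set. The authoritative check is util_format_colormask_full() above.
 */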
3771
3772 /* The scissor is ignored here as only tiles inside the scissoring
3773 * rectangle will refer to this.
3774 */
3775 const bool no_kill =
3776 fullcolormask &&
3777 !key->stencil[0].enabled &&
3778 !key->alpha.enabled &&
3779 !key->multisample &&
3780 !key->blend.alpha_to_coverage &&
3781 !key->depth.enabled &&
3782 !nir->info.fs.uses_discard &&
3783 !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) &&
3784 !nir->info.fs.uses_fbfetch_output;
3785
3786 variant->opaque =
3787 no_kill &&
3788 !key->blend.logicop_enable &&
3789 !key->blend.rt[0].blend_enable
3790 ? true : false;
3791
3792 variant->potentially_opaque =
3793 no_kill &&
3794 !key->blend.logicop_enable &&
3795 key->blend.rt[0].blend_enable &&
3796 key->blend.rt[0].rgb_func == PIPE_BLEND_ADD &&
3797 key->blend.rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
3798 key->blend.rt[0].alpha_func == key->blend.rt[0].rgb_func &&
3799 key->blend.rt[0].alpha_dst_factor == key->blend.rt[0].rgb_dst_factor &&
3800 shader->base.type == PIPE_SHADER_IR_TGSI &&
3801 /*
3802 * FIXME: for NIR, all of the fields of info.xxx (except info.base)
3803 * are zeros, hence shader analysis (here and elsewhere) using these
3804 * bits cannot work and will silently fail (cbuf is the only pointer
3805 * field, hence causing a crash).
3806 */
3807 shader->info.cbuf[0][3].file != TGSI_FILE_NULL
3808 ? true : false;
3809
3810 /* We only care about opaque blits for now */
3811 if (variant->opaque &&
3812 (shader->kind == LP_FS_KIND_BLIT_RGBA ||
3813 shader->kind == LP_FS_KIND_BLIT_RGB1)) {
3814 const struct lp_sampler_static_state *samp0 =
3815 lp_fs_variant_key_sampler_idx(key, 0);
3816 assert(samp0);
3817
3818 const enum pipe_format texture_format = samp0->texture_state.format;
3819 const enum pipe_texture_target target = samp0->texture_state.target;
3820 const unsigned min_img_filter = samp0->sampler_state.min_img_filter;
3821 const unsigned mag_img_filter = samp0->sampler_state.mag_img_filter;
3822
3823 unsigned min_mip_filter;
3824 if (samp0->texture_state.level_zero_only) {
3825 min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
3826 } else {
3827 min_mip_filter = samp0->sampler_state.min_mip_filter;
3828 }
3829
3830 if (target == PIPE_TEXTURE_2D &&
3831 min_img_filter == PIPE_TEX_FILTER_NEAREST &&
3832 mag_img_filter == PIPE_TEX_FILTER_NEAREST &&
3833 min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
3834 ((texture_format &&
3835 util_is_format_compatible(util_format_description(texture_format),
3836 cbuf0_format_desc)) ||
3837 (shader->kind == LP_FS_KIND_BLIT_RGB1 &&
3838 (texture_format == PIPE_FORMAT_B8G8R8A8_UNORM ||
3839 texture_format == PIPE_FORMAT_B8G8R8X8_UNORM) &&
3840 (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
3841 key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM)))) {
3842 variant->blit = 1;
3843 }
3844 }
3845
3846 /* Determine whether this shader + pipeline state is a candidate for
3847 * the linear path.
3848 */
3849 const bool linear_pipeline =
3850 !key->stencil[0].enabled &&
3851 !key->depth.enabled &&
3852 !nir->info.fs.uses_discard &&
3853 !key->blend.logicop_enable &&
3854 (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
3855 key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM ||
3856 key->cbuf_format[0] == PIPE_FORMAT_R8G8B8A8_UNORM ||
3857 key->cbuf_format[0] == PIPE_FORMAT_R8G8B8X8_UNORM);
3858
3859 memcpy(&variant->key, key, sizeof *key);
3860
3861 if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
3862 lp_debug_fs_variant(variant);
3863 }
3864
3865 llvmpipe_fs_variant_fastpath(variant);
3866
3867 lp_jit_init_types(variant);
3868
3869 if (variant->jit_function[RAST_EDGE_TEST] == NULL)
3870 generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
3871
3872 if (variant->jit_function[RAST_WHOLE] == NULL) {
3873 if (variant->opaque) {
3874 /* Specialized shader, which doesn't need to read the color buffer. */
3875 generate_fragment(lp, shader, variant, RAST_WHOLE);
3876 }
3877 }
3878
3879 if (linear_pipeline) {
3880 /* Currently keeping both the old fastpaths and new linear path
3881 * active. The older code is still somewhat faster for the cases
3882 * it covers.
3883 *
3884 * XXX: consider restricting this to aero-mode only.
3885 */
3886 if (fullcolormask &&
3887 !key->alpha.enabled &&
3888 !key->blend.alpha_to_coverage) {
3889 llvmpipe_fs_variant_linear_fastpath(variant);
3890 }
3891
3892 /* If the original fastpath doesn't cover this variant, try the new
3893 * code:
3894 */
3895 if (variant->jit_linear == NULL) {
3896 if (shader->kind == LP_FS_KIND_BLIT_RGBA ||
3897 shader->kind == LP_FS_KIND_BLIT_RGB1 ||
3898 shader->kind == LP_FS_KIND_LLVM_LINEAR) {
3899 llvmpipe_fs_variant_linear_llvm(lp, shader, variant);
3900 }
3901 }
3902 } else {
3903 if (LP_DEBUG & DEBUG_LINEAR) {
3904 lp_debug_fs_variant(variant);
3905 debug_printf(" ----> no linear path for this variant\n");
3906 }
3907 }
3908
3909 /*
3910 * Compile everything
3911 */
3912
3913 #if GALLIVM_USE_ORCJIT
3914 /* the module is moved into ORCJIT by gallivm_compile_module, so count IR first */
3915 variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);
3916
3917 gallivm_compile_module(variant->gallivm);
3918 #else
3919 gallivm_compile_module(variant->gallivm);
3920
3921 variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);
3922 #endif
3923
3924 if (variant->function[RAST_EDGE_TEST]) {
3925 variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
3926 gallivm_jit_function(variant->gallivm,
3927 variant->function[RAST_EDGE_TEST],
3928 variant->function_name[RAST_EDGE_TEST]);
3929 }
3930
3931 if (variant->function[RAST_WHOLE]) {
3932 variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
3933 gallivm_jit_function(variant->gallivm,
3934 variant->function[RAST_WHOLE],
3935 variant->function_name[RAST_WHOLE]);
3936 } else if (!variant->jit_function[RAST_WHOLE]) {
3937 variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
3938 variant->jit_function[RAST_EDGE_TEST];
3939 }
3940
3941 if (linear_pipeline) {
3942 if (variant->linear_function) {
3943 variant->jit_linear_llvm = (lp_jit_linear_llvm_func)
3944 gallivm_jit_function(variant->gallivm, variant->linear_function,
3945 variant->linear_function_name);
3946 }
3947
3948 /*
3949 * This must be done after LLVM compilation, as it will call the JIT'ed
3950 * code to determine active inputs.
3951 */
3952 lp_linear_check_variant(variant);
3953 }
3954
3955 if (needs_caching) {
3956 lp_disk_cache_insert_shader(screen, &cached, ir_sha1_cache_key);
3957 }
3958
3959 gallivm_free_ir(variant->gallivm);
3960
3961 return variant;
3962 }
3963
3964
3965 static void *
3966 llvmpipe_create_fs_state(struct pipe_context *pipe,
3967 const struct pipe_shader_state *templ)
3968 {
3969 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
3970
3971 struct lp_fragment_shader *shader = CALLOC_STRUCT(lp_fragment_shader);
3972 if (!shader)
3973 return NULL;
3974
3975 pipe_reference_init(&shader->reference, 1);
3976 shader->no = fs_no++;
3977 list_inithead(&shader->variants.list);
3978
3979 shader->base.type = PIPE_SHADER_IR_NIR;
3980
3981 if (templ->type == PIPE_SHADER_IR_TGSI) {
3982 shader->base.ir.nir = tgsi_to_nir(templ->tokens, pipe->screen, false);
3983 } else {
3984 shader->base.ir.nir = templ->ir.nir;
3985 }
3986
3987 /* lower FRAG_RESULT_COLOR -> DATA[0-7] to correctly handle unused attachments */
3988 nir_shader *nir = shader->base.ir.nir;
3989 NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);
3990
3991 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
3992 nir_tgsi_scan_shader(nir, &shader->info.base, true);
3993 shader->info.num_texs = shader->info.base.opcode_count[TGSI_OPCODE_TEX];
3994
3995 llvmpipe_register_shader(pipe, &shader->base);
3996
3997 shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
3998 if (shader->draw_data == NULL) {
3999 FREE(shader);
4000 return NULL;
4001 }
4002
4003 const int nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
4004 const int nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);
4005 const int nr_images = BITSET_LAST_BIT(nir->info.images_used);
4006
4007 shader->variant_key_size = lp_fs_variant_key_size(MAX2(nr_samplers,
4008 nr_sampler_views),
4009 nr_images);
4010
4011 nir_foreach_shader_in_variable(var, nir) {
4012 unsigned idx = var->data.driver_location;
4013 unsigned slots = nir_variable_count_slots(var, var->type);
4014
4015 if (var->data.centroid)
4016 shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_CENTROID;
4017 if (var->data.sample)
4018 shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_SAMPLE;
4019
4020 enum glsl_base_type base_type =
4021 glsl_get_base_type(glsl_without_array(var->type));
4022 switch (var->data.interpolation) {
4023 case INTERP_MODE_NONE:
4024 if (glsl_base_type_is_integer(base_type) || var->data.per_primitive) {
4025 shader->inputs[idx].interp = LP_INTERP_CONSTANT;
4026 break;
4027 }
4028 if (var->data.location == VARYING_SLOT_COL0 ||
4029 var->data.location == VARYING_SLOT_COL1) {
4030 shader->inputs[idx].interp = LP_INTERP_COLOR;
4031 break;
4032 }
4033 FALLTHROUGH;
4034 case INTERP_MODE_SMOOTH:
4035 shader->inputs[idx].interp = LP_INTERP_PERSPECTIVE;
4036 break;
4037 case INTERP_MODE_NOPERSPECTIVE:
4038 shader->inputs[idx].interp = LP_INTERP_LINEAR;
4039 break;
4040 case INTERP_MODE_FLAT:
4041 shader->inputs[idx].interp = LP_INTERP_CONSTANT;
4042 break;
4043 }
4044
4045 /* XXX this is a completely pointless index map... */
4046 shader->inputs[idx].src_index = idx + 1;
4047 if (var->data.location == VARYING_SLOT_FACE)
4048 shader->inputs[idx].interp = LP_INTERP_FACING;
4049 else if (var->data.location == VARYING_SLOT_POS) {
4050 shader->inputs[idx].src_index = 0;
4051 shader->inputs[idx].interp = LP_INTERP_POSITION;
4052 }
4053
4054 shader->inputs[idx].usage_mask = shader->info.base.input_usage_mask[idx];
4055 for (unsigned s = 1; s < slots; s++) {
4056 shader->inputs[idx + s] = shader->inputs[idx];
4057 shader->inputs[idx + s].src_index = idx + s + 1;
4058 shader->inputs[idx + s].usage_mask = shader->info.base.input_usage_mask[idx + s];
4059 }
4060 }
4061
4062 llvmpipe_fs_analyse_nir(shader);
4063
4064 return shader;
4065 }
4066
4067
4068 static void
4069 llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
4070 {
4071 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4072 struct lp_fragment_shader *lp_fs = (struct lp_fragment_shader *)fs;
4073 if (llvmpipe->fs == lp_fs)
4074 return;
4075
4076 draw_bind_fragment_shader(llvmpipe->draw,
4077 (lp_fs ? lp_fs->draw_data : NULL));
4078
4079 lp_fs_reference(llvmpipe, &llvmpipe->fs, lp_fs);
4080
4081 /* invalidate the setup link, NEW_FS will make it update */
4082 lp_setup_set_fs_variant(llvmpipe->setup, NULL);
4083 llvmpipe->dirty |= LP_NEW_FS;
4084 }
4085
4086
4087 /**
4088 * Remove shader variant from two lists: the shader's variant list
4089 * and the context's variant list.
4090 */
4091 static void
4092 llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
4093 struct lp_fragment_shader_variant *variant)
4094 {
4095 if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
4096 debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
4097 "v total cached %u inst %u total inst %u\n",
4098 variant->shader->no, variant->no,
4099 variant->shader->variants_created,
4100 variant->shader->variants_cached,
4101 lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
4102 }
4103
4104 /* remove from shader's list */
4105 list_del(&variant->list_item_local.list);
4106 variant->shader->variants_cached--;
4107
4108 /* remove from context's list */
4109 list_del(&variant->list_item_global.list);
4110 lp->nr_fs_variants--;
4111 lp->nr_fs_instrs -= variant->nr_instrs;
4112 }
4113
4114
4115 void
4116 llvmpipe_destroy_shader_variant(struct llvmpipe_context *lp,
4117 struct lp_fragment_shader_variant *variant)
4118 {
4119 gallivm_destroy(variant->gallivm);
4120 lp_fs_reference(lp, &variant->shader, NULL);
4121 if (variant->function_name[RAST_EDGE_TEST])
4122 FREE(variant->function_name[RAST_EDGE_TEST]);
4123 if (variant->function_name[RAST_WHOLE])
4124 FREE(variant->function_name[RAST_WHOLE]);
4125 if (variant->linear_function_name)
4126 FREE(variant->linear_function_name);
4127 FREE(variant);
4128 }
4129
4130
4131 void
4132 llvmpipe_destroy_fs(struct llvmpipe_context *llvmpipe,
4133 struct lp_fragment_shader *shader)
4134 {
4135 /* Delete draw module's data */
4136 draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
4137
4138 ralloc_free(shader->base.ir.nir);
4139 assert(shader->variants_cached == 0);
4140 FREE(shader);
4141 }
4142
4143
4144 static void
4145 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
4146 {
4147 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4148 struct lp_fragment_shader *shader = fs;
4149 struct lp_fs_variant_list_item *li, *next;
4150
4151 /* Delete all the variants */
4152 LIST_FOR_EACH_ENTRY_SAFE(li, next, &shader->variants.list, list) {
4153 struct lp_fragment_shader_variant *variant;
4154 variant = li->base;
4155 llvmpipe_remove_shader_variant(llvmpipe, li->base);
4156 lp_fs_variant_reference(llvmpipe, &variant, NULL);
4157 }
4158
4159 lp_fs_reference(llvmpipe, &shader, NULL);
4160 }
4161
4162
4163 static void
4164 llvmpipe_set_constant_buffer(struct pipe_context *pipe,
4165 enum pipe_shader_type shader, uint index,
4166 bool take_ownership,
4167 const struct pipe_constant_buffer *cb)
4168 {
4169 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4170 struct pipe_constant_buffer *constants = &llvmpipe->constants[shader][index];
4171
4172 assert(shader < PIPE_SHADER_MESH_TYPES);
4173 assert(index < ARRAY_SIZE(llvmpipe->constants[shader]));
4174
4175 /* note: reference counting */
4176 util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb,
4177 take_ownership);
4178
4179 /* user_buffer is only valid until the next set_constant_buffer (at most,
4180 * possibly until shader deletion), so we need to upload it now to make
4181 * sure it doesn't get updated/freed out from under us.
4182 */
4183 if (constants->user_buffer) {
4184 u_upload_data(llvmpipe->pipe.const_uploader, 0, constants->buffer_size,
4185 16, constants->user_buffer, &constants->buffer_offset,
4186 &constants->buffer);
4187 }
4188 if (constants->buffer) {
4189 if (!(constants->buffer->bind & PIPE_BIND_CONSTANT_BUFFER)) {
4190 debug_printf("Illegal set constant without bind flag\n");
4191 constants->buffer->bind |= PIPE_BIND_CONSTANT_BUFFER;
4192 }
4193 llvmpipe_flush_resource(pipe, constants->buffer, 0, true, true, false, "set_constant_buffer");
4194 }
4195
4196 switch (shader) {
4197 case PIPE_SHADER_VERTEX:
4198 case PIPE_SHADER_GEOMETRY:
4199 case PIPE_SHADER_TESS_CTRL:
4200 case PIPE_SHADER_TESS_EVAL: {
4201 const unsigned size = cb ? cb->buffer_size : 0;
4202
4203 const uint8_t *data = NULL;
4204 if (constants->buffer) {
4205 data = (uint8_t *) llvmpipe_resource_data(constants->buffer)
4206 + constants->buffer_offset;
4207 }
4208
4209 draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
4210 index, data, size);
4211 break;
4212 }
4213 case PIPE_SHADER_COMPUTE:
4214 llvmpipe->cs_dirty |= LP_CSNEW_CONSTANTS;
4215 break;
4216 case PIPE_SHADER_FRAGMENT:
4217 llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
4218 break;
4219 case PIPE_SHADER_TASK:
4220 llvmpipe->dirty |= LP_NEW_TASK_CONSTANTS;
4221 break;
4222 case PIPE_SHADER_MESH:
4223 llvmpipe->dirty |= LP_NEW_MESH_CONSTANTS;
4224 break;
4225 default:
4226 unreachable("Illegal shader type");
4227 break;
4228 }
4229 }
4230
4231
4232 static void
4233 llvmpipe_set_shader_buffers(struct pipe_context *pipe,
4234 enum pipe_shader_type shader, unsigned start_slot,
4235 unsigned count,
4236 const struct pipe_shader_buffer *buffers,
4237 unsigned writable_bitmask)
4238 {
4239 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4240
4241 unsigned i, idx;
4242 for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
4243 const struct pipe_shader_buffer *buffer = buffers ? &buffers[idx] : NULL;
4244
4245 util_copy_shader_buffer(&llvmpipe->ssbos[shader][i], buffer);
4246
4247 if (buffer && buffer->buffer) {
4248 bool read_only = !(writable_bitmask & (1 << idx));
4249 llvmpipe_flush_resource(pipe, buffer->buffer, 0, read_only, false,
4250 false, "buffer");
4251 }
4252
4253 switch (shader) {
4254 case PIPE_SHADER_VERTEX:
4255 case PIPE_SHADER_GEOMETRY:
4256 case PIPE_SHADER_TESS_CTRL:
4257 case PIPE_SHADER_TESS_EVAL: {
4258 const unsigned size = buffer ? buffer->buffer_size : 0;
4259 const uint8_t *data = NULL;
4260 if (buffer && buffer->buffer)
4261 data = (uint8_t *) llvmpipe_resource_data(buffer->buffer);
4262 if (data)
4263 data += buffer->buffer_offset;
4264 draw_set_mapped_shader_buffer(llvmpipe->draw, shader,
4265 i, data, size);
4266 break;
4267 }
4268 case PIPE_SHADER_COMPUTE:
4269 llvmpipe->cs_dirty |= LP_CSNEW_SSBOS;
4270 break;
4271 case PIPE_SHADER_TASK:
4272 llvmpipe->dirty |= LP_NEW_TASK_SSBOS;
4273 break;
4274 case PIPE_SHADER_MESH:
4275 llvmpipe->dirty |= LP_NEW_MESH_SSBOS;
4276 break;
4277 case PIPE_SHADER_FRAGMENT:
4278 llvmpipe->fs_ssbo_write_mask &= ~(((1 << count) - 1) << start_slot);
4279 llvmpipe->fs_ssbo_write_mask |= writable_bitmask << start_slot;
4280 llvmpipe->dirty |= LP_NEW_FS_SSBOS;
4281 break;
4282 default:
4283 unreachable("Illegal shader type");
4284 break;
4285 }
4286 }
4287 }
4288
4289
4290 static void
4291 llvmpipe_set_shader_images(struct pipe_context *pipe,
4292 enum pipe_shader_type shader, unsigned start_slot,
4293 unsigned count, unsigned unbind_num_trailing_slots,
4294 const struct pipe_image_view *images)
4295 {
4296 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4297 unsigned i, idx;
4298
4299 draw_flush(llvmpipe->draw);
4300 for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
4301 const struct pipe_image_view *image = images ? &images[idx] : NULL;
4302
4303 util_copy_image_view(&llvmpipe->images[shader][i], image);
4304
4305 if (image && image->resource) {
4306 bool read_only = !(image->access & PIPE_IMAGE_ACCESS_WRITE);
4307 llvmpipe_flush_resource(pipe, image->resource, 0, read_only, false,
4308 false, "image");
4309 }
4310 }
4311
4312 llvmpipe->num_images[shader] = start_slot + count;
4313 switch (shader) {
4314 case PIPE_SHADER_VERTEX:
4315 case PIPE_SHADER_GEOMETRY:
4316 case PIPE_SHADER_TESS_CTRL:
4317 case PIPE_SHADER_TESS_EVAL:
4318 draw_set_images(llvmpipe->draw, shader, llvmpipe->images[shader],
4319 start_slot + count);
4320 break;
4321 case PIPE_SHADER_COMPUTE:
4322 llvmpipe->cs_dirty |= LP_CSNEW_IMAGES;
4323 break;
4324 case PIPE_SHADER_FRAGMENT:
4325 llvmpipe->dirty |= LP_NEW_FS_IMAGES;
4326 break;
4327 case PIPE_SHADER_TASK:
4328 llvmpipe->dirty |= LP_NEW_TASK_IMAGES;
4329 break;
4330 case PIPE_SHADER_MESH:
4331 llvmpipe->dirty |= LP_NEW_MESH_IMAGES;
4332 break;
4333 default:
4334 unreachable("Illegal shader type");
4335 break;
4336 }
4337
4338 if (unbind_num_trailing_slots) {
4339 llvmpipe_set_shader_images(pipe, shader, start_slot + count,
4340 unbind_num_trailing_slots, 0, NULL);
4341 }
4342 }
4343
4344
4345 /**
4346 * Return the blend factor equivalent to a destination alpha of one.
4347 */
4348 static inline enum pipe_blendfactor
4349 force_dst_alpha_one(enum pipe_blendfactor factor, bool clamped_zero)
4350 {
4351 switch (factor) {
4352 case PIPE_BLENDFACTOR_DST_ALPHA:
4353 return PIPE_BLENDFACTOR_ONE;
4354 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
4355 return PIPE_BLENDFACTOR_ZERO;
4356 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
4357 if (clamped_zero)
4358 return PIPE_BLENDFACTOR_ZERO;
4359 else
4360 return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE;
4361 default:
4362 return factor;
4363 }
4364 }
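/*
 * Worked example (illustrative): when the destination format has no alpha
 * channel the destination alpha is implicitly 1.0, so
 *
 *    force_dst_alpha_one(PIPE_BLENDFACTOR_DST_ALPHA, ...)     -> ONE
 *    force_dst_alpha_one(PIPE_BLENDFACTOR_INV_DST_ALPHA, ...) -> ZERO
 *
 * and SRC_ALPHA_SATURATE, i.e. min(As, 1 - Ad), collapses to ZERO only
 * when values are clamped to [0,1] (clamped_zero), since min(As, 0) is
 * then exactly 0.
 */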
4365
4366
4367 /**
4368 * We need to generate several variants of the fragment pipeline to match
4369 * all the combinations of the contributing state atoms.
4370 *
4371 * TODO: there is actually no reason to tie this to context state -- the
4372 * generated code could be cached globally in the screen.
4373 */
4374 static struct lp_fragment_shader_variant_key *
4375 make_variant_key(struct llvmpipe_context *lp,
4376 struct lp_fragment_shader *shader,
4377 char *store)
4378 {
4379 struct lp_fragment_shader_variant_key *key =
4380 (struct lp_fragment_shader_variant_key *)store;
4381 struct nir_shader *nir = shader->base.ir.nir;
4382
4383 memset(key, 0, sizeof(*key));
4384
4385 if (lp->framebuffer.zsbuf) {
4386 const enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
4387 const struct util_format_description *zsbuf_desc =
4388 util_format_description(zsbuf_format);
4389
4390 if (lp->depth_stencil->depth_enabled &&
4391 util_format_has_depth(zsbuf_desc)) {
4392 key->zsbuf_format = zsbuf_format;
4393 key->depth.enabled = lp->depth_stencil->depth_enabled;
4394 key->depth.writemask = lp->depth_stencil->depth_writemask;
4395 key->depth.func = lp->depth_stencil->depth_func;
4396 }
4397 if (lp->depth_stencil->stencil[0].enabled &&
4398 util_format_has_stencil(zsbuf_desc)) {
4399 key->zsbuf_format = zsbuf_format;
4400 memcpy(&key->stencil, &lp->depth_stencil->stencil,
4401 sizeof key->stencil);
4402 }
4403 if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
4404 key->resource_1d = true;
4405 }
4406 key->zsbuf_nr_samples =
4407 util_res_sample_count(lp->framebuffer.zsbuf->texture);
4408
4409 /*
4410 * Restrict depth values unless the API allows unclamped depth values
4411 * (GL, VK with ext) and the Z buffer is a float format.
4412 */
4413 key->restrict_depth_values =
4414 !(lp->rasterizer->unclamped_fragment_depth_values &&
4415 util_format_get_depth_only(zsbuf_format) == PIPE_FORMAT_Z32_FLOAT);
4416 }
4417
4418 /*
4419 * Propagate the depth clamp setting from the rasterizer state.
4420 */
4421 key->depth_clamp = lp->rasterizer->depth_clamp;
4422
4423 /* alpha test only applies if render buffer 0 is non-integer
4424 * (or does not exist)
4425 */
4426 if (!lp->framebuffer.nr_cbufs ||
4427 !lp->framebuffer.cbufs[0] ||
4428 !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) {
4429 key->alpha.enabled = lp->depth_stencil->alpha_enabled;
4430 }
4431 if (key->alpha.enabled) {
4432 key->alpha.func = lp->depth_stencil->alpha_func;
4433 /* alpha.ref_value is passed in jit_context */
4434 }
4435
4436 key->flatshade = lp->rasterizer->flatshade;
4437 key->multisample = lp->rasterizer->multisample;
4438 key->no_ms_sample_mask_out = lp->rasterizer->no_ms_sample_mask_out;
4439 if (lp->active_occlusion_queries && !lp->queries_disabled) {
4440 key->occlusion_count = true;
4441 }
4442
4443 memcpy(&key->blend, lp->blend, sizeof key->blend);
4444
4445 key->coverage_samples = 1;
4446 key->min_samples = 1;
4447 if (key->multisample) {
4448 key->coverage_samples =
4449 util_framebuffer_get_num_samples(&lp->framebuffer);
4450 /* Per EXT_shader_framebuffer_fetch spec:
4451 *
4452 * "1. How is framebuffer data treated during multisample rendering?
4453 *
4454 * RESOLVED: Reading the value of gl_LastFragData produces a
4455 * different result for each sample. This implies that all or part
4456 * of the shader be run once for each sample, but has no additional
4457 * implications on fragment shader input variables which may still
4458 * be interpolated per pixel by the implementation."
4459 *
4460 * ARM_shader_framebuffer_fetch_depth_stencil spec further says:
4461 *
4462 * "(1) When multisampling is enabled, does the shader run per sample?
4463 *
4464 * RESOLVED.
4465 *
4466 * This behavior is inherited from either
4467 * EXT_shader_framebuffer_fetch or ARM_shader_framebuffer_fetch as
4468 * described in the interactions section. If neither extension is
4469 * supported, the shader runs once per fragment."
4470 *
4471 * Therefore we should always enable per-sample shading when FB fetch is
4472 * used.
4473 */
4474 if (lp->min_samples > 1 || nir->info.fs.uses_fbfetch_output)
4475 key->min_samples = key->coverage_samples;
4476 }
4477 key->nr_cbufs = lp->framebuffer.nr_cbufs;
4478
4479 if (!key->blend.independent_blend_enable) {
4480 // we always need independent blend; otherwise the fixups below won't work
4481 for (unsigned i = 1; i < key->nr_cbufs; i++) {
4482 memcpy(&key->blend.rt[i], &key->blend.rt[0],
4483 sizeof(key->blend.rt[0]));
4484 }
4485 key->blend.independent_blend_enable = 1;
4486 }
4487
4488 for (unsigned i = 0; i < lp->framebuffer.nr_cbufs; i++) {
4489 struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];
4490
4491 if (lp->framebuffer.cbufs[i]) {
4492 const enum pipe_format format = lp->framebuffer.cbufs[i]->format;
4493
4494 key->cbuf_format[i] = format;
4495 key->cbuf_nr_samples[i] =
4496 util_res_sample_count(lp->framebuffer.cbufs[i]->texture);
4497
4498 /*
4499 * Figure out if this is a 1d resource. Note that OpenGL allows crazy
4500 * mixing of 2d textures with height 1 and 1d textures, so make sure
4501 * we pick 1d if any cbuf or zsbuf is 1d.
4502 */
4503 if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) {
4504 key->resource_1d = true;
4505 }
4506
4507 const struct util_format_description *format_desc =
4508 util_format_description(format);
4509 assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
4510 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
4511
4512 /*
4513 * Mask out color channels not present in the color buffer.
4514 */
4515 blend_rt->colormask &= util_format_colormask(format_desc);
4516
4517 /*
4518 * Disable blend for integer formats.
4519 */
4520 if (util_format_is_pure_integer(format)) {
4521 blend_rt->blend_enable = 0;
4522 }
4523
4524 /*
4525 * Our swizzled render tiles always have an alpha channel, but the
4526 * linear render target format often does not, so force here the dst
4527 * alpha to be one.
4528 *
4529 * This is not a mere optimization. Wrong results will be produced if
4530 * the dst alpha is used, the dst format does not have alpha, and the
4531 * previous rendering was not flushed from the swizzled to linear
4532 * buffer. For example, NonPowTwo DCT.
4533 *
4534 * TODO: This should be generalized to all channels for better
4535 * performance, but only alpha causes correctness issues.
4536 *
4537 * Also, force rgb/alpha func/factors match, to make AoS blending
4538 * easier.
4539 */
4540 if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
4541 format_desc->swizzle[3] == format_desc->swizzle[0]) {
4542 // Doesn't cover mixed snorm/unorm but can't render to them anyway
4543 bool clamped_zero = !util_format_is_float(format) &&
4544 !util_format_is_snorm(format);
4545 blend_rt->rgb_src_factor =
4546 force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero);
4547 blend_rt->rgb_dst_factor =
4548 force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero);
4549 blend_rt->alpha_func = blend_rt->rgb_func;
4550 blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
4551 blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
4552 }
4553 } else {
4554 /* no color buffer for this fragment output */
4555 key->cbuf_format[i] = PIPE_FORMAT_NONE;
4556 key->cbuf_nr_samples[i] = 0;
4557 blend_rt->colormask = 0x0;
4558 blend_rt->blend_enable = 0;
4559 }
4560 }
4561
4562 /* This value will be the same for all the variants of a given shader:
4563 */
4564 key->nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
4565 key->nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);
4566
4567 struct lp_sampler_static_state *fs_sampler =
4568 lp_fs_variant_key_samplers(key);
4569
4570 memset(fs_sampler, 0,
4571 MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *fs_sampler);
4572
4573 for (unsigned i = 0; i < key->nr_samplers; ++i) {
4574 if (BITSET_TEST(nir->info.samplers_used, i)) {
4575 lp_sampler_static_sampler_state(&fs_sampler[i].sampler_state,
4576 lp->samplers[PIPE_SHADER_FRAGMENT][i]);
4577 }
4578 }
4579
4580 /*
4581 * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes
4582 * are dx10-style? Can't really have mixed opcodes, at least not
4583 * if we want to skip the holes here (without rescanning tgsi).
4584 */
4585 if (key->nr_sampler_views) {
4586 for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
4587 /*
4588 * Note sview may exceed what's representable by file_mask.
4589 * This will still work, the only downside is that not actually
4590 * used views may be included in the shader key.
4591 */
4592 if (BITSET_TEST(nir->info.textures_used, i)) {
4593 lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
4594 lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
4595 }
4596 }
4597 } else {
4598 key->nr_sampler_views = key->nr_samplers;
4599 for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
4600 if (BITSET_TEST(nir->info.samplers_used, i)) {
4601 lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
4602 lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
4603 }
4604 }
4605 }
4606
4607 struct lp_image_static_state *lp_image = lp_fs_variant_key_images(key);
4608 key->nr_images = BITSET_LAST_BIT(nir->info.images_used);
4609 if (key->nr_images)
4610 memset(lp_image, 0,
4611 key->nr_images * sizeof *lp_image);
4612 for (unsigned i = 0; i < key->nr_images; ++i) {
4613 if (BITSET_TEST(nir->info.images_used, i)) {
4614 lp_sampler_static_texture_state_image(&lp_image[i].image_state,
4615 &lp->images[PIPE_SHADER_FRAGMENT][i]);
4616 }
4617 }
4618
   if (shader->kind == LP_FS_KIND_AERO_MINIFICATION) {
      struct lp_sampler_static_state *samp0 =
         lp_fs_variant_key_sampler_idx(key, 0);
      assert(samp0);
      samp0->sampler_state.min_img_filter = PIPE_TEX_FILTER_NEAREST;
      samp0->sampler_state.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
   }

   return key;
}


/**
 * Update fragment shader state. This is called just prior to drawing
 * something when some fragment-related state has changed.
 */
void
llvmpipe_update_fs(struct llvmpipe_context *lp)
{
   struct lp_fragment_shader *shader = lp->fs;

   char store[LP_FS_MAX_VARIANT_KEY_SIZE];
   const struct lp_fragment_shader_variant_key *key =
      make_variant_key(lp, shader, store);

   struct lp_fragment_shader_variant *variant = NULL;
   struct lp_fs_variant_list_item *li;
   /* Search the variants for one which matches the key */
   LIST_FOR_EACH_ENTRY(li, &shader->variants.list, list) {
      if (memcmp(&li->base->key, key, shader->variant_key_size) == 0) {
         variant = li->base;
         break;
      }
   }

   if (variant) {
      /* Move this variant to the head of the list to implement LRU
       * deletion of shaders when we have too many.
       */
      list_move_to(&variant->list_item_global.list, &lp->fs_variants_list.list);
   } else {
      /* variant not found, create it now */

      if (LP_DEBUG & DEBUG_FS) {
         debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
                      lp->nr_fs_variants,
                      lp->nr_fs_instrs,
                      lp->nr_fs_variants ? lp->nr_fs_instrs / lp->nr_fs_variants : 0);
      }

      /* First, check if we've exceeded the max number of shader variants.
       * If so, free 6.25% of them (the least recently used ones).
       */
      const unsigned variants_to_cull =
         lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS
         ? LP_MAX_SHADER_VARIANTS / 16 : 0;

      if (variants_to_cull ||
          lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
            debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
                         "\t%u instrs,\t%u instrs/variant\n",
                         shader->variants_cached,
                         lp->nr_fs_variants, lp->nr_fs_instrs,
                         lp->nr_fs_instrs / lp->nr_fs_variants);
         }

         /*
          * We need to re-check lp->nr_fs_variants because an arbitrarily
          * large number of shader variants (potentially all of them) could
          * be pending for destruction on flush.
          */

         for (unsigned i = 0;
              i < variants_to_cull ||
              lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS;
              i++) {
            struct lp_fs_variant_list_item *item;
            if (list_is_empty(&lp->fs_variants_list.list)) {
               break;
            }
            item = list_last_entry(&lp->fs_variants_list.list,
                                   struct lp_fs_variant_list_item, list);
            assert(item);
            assert(item->base);
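            /* Unlink the least recently used variant and drop our
             * reference; as noted above, the variant itself may not be
             * destroyed until pending work using it has been flushed.
             */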
            llvmpipe_remove_shader_variant(lp, item->base);
            struct lp_fragment_shader_variant *variant = item->base;
            lp_fs_variant_reference(lp, &variant, NULL);
         }
      }

      /*
       * Generate the new variant.
       */
      int64_t t0 = os_time_get();
      variant = generate_variant(lp, shader, key);
      int64_t t1 = os_time_get();
      int64_t dt = t1 - t0;
      LP_COUNT_ADD(llvm_compile_time, dt);
      LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */

      /* Put the new variant into the list */
      if (variant) {
         list_add(&variant->list_item_local.list, &shader->variants.list);
         list_add(&variant->list_item_global.list, &lp->fs_variants_list.list);
         lp->nr_fs_variants++;
         lp->nr_fs_instrs += variant->nr_instrs;
         shader->variants_cached++;
      }
   }

   /* Bind this variant */
   lp_setup_set_fs_variant(lp->setup, variant);
}


void
llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
{
   llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
   llvmpipe->pipe.bind_fs_state = llvmpipe_bind_fs_state;
   llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
   llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
   llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers;
   llvmpipe->pipe.set_shader_images = llvmpipe_set_shader_images;
}