1 /**************************************************************************
2 *
3 * Copyright 2010-2018 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
24 * of the Software.
25 *
26 **************************************************************************/
27
28
29 /**
30 * @file
31 * s3tc pixel format manipulation.
32 *
33 * @author Roland Scheidegger <[email protected]>
34 */
35
36
37 #include <llvm/Config/llvm-config.h>
38
39 #include "util/format/u_format.h"
40 #include "util/u_math.h"
41 #include "util/u_string.h"
42 #include "util/u_cpu_detect.h"
43 #include "util/u_debug.h"
44
45 #include "lp_bld_arit.h"
46 #include "lp_bld_type.h"
47 #include "lp_bld_const.h"
48 #include "lp_bld_conv.h"
49 #include "lp_bld_gather.h"
50 #include "lp_bld_format.h"
51 #include "lp_bld_logic.h"
52 #include "lp_bld_pack.h"
53 #include "lp_bld_flow.h"
54 #include "lp_bld_printf.h"
55 #include "lp_bld_struct.h"
56 #include "lp_bld_swizzle.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_debug.h"
59 #include "lp_bld_intr.h"
60
61
62 /**
63 * Reverse an interleave2_half
64 * (ie. pick every second element, independent lower/upper halfs)
65 * sse2 can only do that with 32bit (shufps) or larger elements
66 * natively. (Otherwise, and/pack (even) or shift/pack (odd)
67 * could be used, ideally llvm would do that for us.)
68 * XXX: Unfortunately, this does NOT translate to a shufps if those
69 * are int vectors (and casting will not help, llvm needs to recognize it
70 * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
71 * sequence which I'm pretty sure is a lot worse despite domain transition
72 * penalties with shufps (except maybe on Nehalem).
73 */
74 static LLVMValueRef
lp_build_uninterleave2_half(struct gallivm_state * gallivm,struct lp_type type,LLVMValueRef a,LLVMValueRef b,unsigned lo_hi)75 lp_build_uninterleave2_half(struct gallivm_state *gallivm,
76 struct lp_type type,
77 LLVMValueRef a,
78 LLVMValueRef b,
79 unsigned lo_hi)
80 {
81 LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
82 unsigned i;
83
84 assert(type.length <= LP_MAX_VECTOR_LENGTH);
85 assert(lo_hi < 2);
86
87 if (type.length * type.width == 256) {
88 assert(type.length == 8);
89 assert(type.width == 32);
90 static const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14};
91 for (i = 0; i < type.length; ++i) {
92 elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi);
93 }
94 } else {
95 for (i = 0; i < type.length; ++i) {
96 elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
97 }
98 }
99
100 shuffle = LLVMConstVector(elems, type.length);
101
102 return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
103
104 }
105
106
107 /**
108 * Build shuffle for extending vectors.
109 */
110 static LLVMValueRef
lp_build_const_extend_shuffle(struct gallivm_state * gallivm,unsigned n,unsigned length)111 lp_build_const_extend_shuffle(struct gallivm_state *gallivm,
112 unsigned n, unsigned length)
113 {
114 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
115 unsigned i;
116
117 assert(n <= length);
118 assert(length <= LP_MAX_VECTOR_LENGTH);
119
120 /* TODO: cache results in a static table */
121
122 for(i = 0; i < n; i++) {
123 elems[i] = lp_build_const_int32(gallivm, i);
124 }
125 for (i = n; i < length; i++) {
126 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
127 }
128
129 return LLVMConstVector(elems, length);
130 }
131
132 static LLVMValueRef
lp_build_const_unpackx2_shuffle(struct gallivm_state * gallivm,unsigned n)133 lp_build_const_unpackx2_shuffle(struct gallivm_state *gallivm, unsigned n)
134 {
135 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
136 unsigned i, j;
137
138 assert(n <= LP_MAX_VECTOR_LENGTH);
139
140 /* TODO: cache results in a static table */
141
142 for(i = 0, j = 0; i < n; i += 2, ++j) {
143 elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
144 elems[i + 1] = lp_build_const_int32(gallivm, n + j);
145 elems[n + i + 0] = lp_build_const_int32(gallivm, 0 + n/2 + j);
146 elems[n + i + 1] = lp_build_const_int32(gallivm, n + n/2 + j);
147 }
148
149 return LLVMConstVector(elems, n * 2);
150 }
151
152 /*
153 * broadcast 1 element to all elements
154 */
155 static LLVMValueRef
lp_build_const_shuffle1(struct gallivm_state * gallivm,unsigned index,unsigned n)156 lp_build_const_shuffle1(struct gallivm_state *gallivm,
157 unsigned index, unsigned n)
158 {
159 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
160 unsigned i;
161
162 assert(n <= LP_MAX_VECTOR_LENGTH);
163
164 /* TODO: cache results in a static table */
165
166 for (i = 0; i < n; i++) {
167 elems[i] = lp_build_const_int32(gallivm, index);
168 }
169
170 return LLVMConstVector(elems, n);
171 }
172
173 /*
174 * move 1 element to pos 0, rest undef
175 */
176 static LLVMValueRef
lp_build_shuffle1undef(struct gallivm_state * gallivm,LLVMValueRef a,unsigned index,unsigned n)177 lp_build_shuffle1undef(struct gallivm_state *gallivm,
178 LLVMValueRef a, unsigned index, unsigned n)
179 {
180 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH], shuf;
181 unsigned i;
182
183 assert(n <= LP_MAX_VECTOR_LENGTH);
184
185 elems[0] = lp_build_const_int32(gallivm, index);
186
187 for (i = 1; i < n; i++) {
188 elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
189 }
190 shuf = LLVMConstVector(elems, n);
191
192 return LLVMBuildShuffleVector(gallivm->builder, a, a, shuf, "");
193 }
194
195 static bool
format_dxt1_variant(enum pipe_format format)196 format_dxt1_variant(enum pipe_format format)
197 {
198 return format == PIPE_FORMAT_DXT1_RGB ||
199 format == PIPE_FORMAT_DXT1_RGBA ||
200 format == PIPE_FORMAT_DXT1_SRGB ||
201 format == PIPE_FORMAT_DXT1_SRGBA;
202
203 }
204
205 /**
206 * Gather elements from scatter positions in memory into vectors.
207 * This is customised for fetching texels from s3tc textures.
208 * For SSE, typical value is length=4.
209 *
210 * @param length length of the offsets
211 * @param colors the stored colors of the blocks will be extracted into this.
212 * @param codewords the codewords of the blocks will be extracted into this.
213 * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
214 * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
215 * @param base_ptr base pointer, should be a i8 pointer type.
216 * @param offsets vector with offsets
217 */
218 static void
lp_build_gather_s3tc(struct gallivm_state * gallivm,unsigned length,const struct util_format_description * format_desc,LLVMValueRef * colors,LLVMValueRef * codewords,LLVMValueRef * alpha_lo,LLVMValueRef * alpha_hi,LLVMValueRef base_ptr,LLVMValueRef offsets)219 lp_build_gather_s3tc(struct gallivm_state *gallivm,
220 unsigned length,
221 const struct util_format_description *format_desc,
222 LLVMValueRef *colors,
223 LLVMValueRef *codewords,
224 LLVMValueRef *alpha_lo,
225 LLVMValueRef *alpha_hi,
226 LLVMValueRef base_ptr,
227 LLVMValueRef offsets)
228 {
229 LLVMBuilderRef builder = gallivm->builder;
230 unsigned block_bits = format_desc->block.bits;
231 unsigned i;
232 LLVMValueRef elems[8];
233 LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
234 LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
235 LLVMTypeRef type32dxt;
236 struct lp_type lp_type32dxt;
237
238 memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
239 lp_type32dxt.width = 32;
240 lp_type32dxt.length = block_bits / 32;
241 type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
242
243 assert(block_bits == 64 || block_bits == 128);
244 assert(length == 1 || length == 4 || length == 8);
245
246 for (i = 0; i < length; ++i) {
247 elems[i] = lp_build_gather_elem(gallivm, length,
248 block_bits, block_bits, true,
249 base_ptr, offsets, i, false);
250 elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
251 }
252 if (length == 1) {
253 LLVMValueRef elem = elems[0];
254 if (block_bits == 128) {
255 *alpha_lo = LLVMBuildExtractElement(builder, elem,
256 lp_build_const_int32(gallivm, 0), "");
257 *alpha_hi = LLVMBuildExtractElement(builder, elem,
258 lp_build_const_int32(gallivm, 1), "");
259 *colors = LLVMBuildExtractElement(builder, elem,
260 lp_build_const_int32(gallivm, 2), "");
261 *codewords = LLVMBuildExtractElement(builder, elem,
262 lp_build_const_int32(gallivm, 3), "");
263 }
264 else {
265 *alpha_lo = LLVMGetUndef(type32);
266 *alpha_hi = LLVMGetUndef(type32);
267 *colors = LLVMBuildExtractElement(builder, elem,
268 lp_build_const_int32(gallivm, 0), "");
269 *codewords = LLVMBuildExtractElement(builder, elem,
270 lp_build_const_int32(gallivm, 1), "");
271 }
272 }
273 else {
274 LLVMValueRef tmp[4], cc01, cc23;
275 struct lp_type lp_type32, lp_type64;
276 memset(&lp_type32, 0, sizeof lp_type32);
277 lp_type32.width = 32;
278 lp_type32.length = length;
279 memset(&lp_type64, 0, sizeof lp_type64);
280 lp_type64.width = 64;
281 lp_type64.length = length/2;
282
283 if (block_bits == 128) {
284 if (length == 8) {
285 for (i = 0; i < 4; ++i) {
286 tmp[0] = elems[i];
287 tmp[1] = elems[i+4];
288 elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
289 }
290 }
291 lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
292 *colors = tmp[2];
293 *codewords = tmp[3];
294 *alpha_lo = tmp[0];
295 *alpha_hi = tmp[1];
296 } else {
297 LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
298 LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
299
300 for (i = 0; i < length; ++i) {
301 /* no-op shuffle */
302 elems[i] = LLVMBuildShuffleVector(builder, elems[i],
303 LLVMGetUndef(type32dxt),
304 lp_build_const_extend_shuffle(gallivm, 2, 4), "");
305 }
306 if (length == 8) {
307 struct lp_type lp_type32_4 = {0};
308 lp_type32_4.width = 32;
309 lp_type32_4.length = 4;
310 for (i = 0; i < 4; ++i) {
311 tmp[0] = elems[i];
312 tmp[1] = elems[i+4];
313 elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
314 }
315 }
316 cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
317 cc23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
318 cc01 = LLVMBuildBitCast(builder, cc01, type64_vec, "");
319 cc23 = LLVMBuildBitCast(builder, cc23, type64_vec, "");
320 *colors = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 0);
321 *codewords = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 1);
322 *colors = LLVMBuildBitCast(builder, *colors, type32_vec, "");
323 *codewords = LLVMBuildBitCast(builder, *codewords, type32_vec, "");
324 }
325 }
326 }
327
328 /** Convert from <n x i32> containing 2 x n rgb565 colors
329 * to 2 <n x i32> rgba8888 colors
330 * This is the most optimized version I can think of
331 * should be nearly as fast as decoding only one color
332 * NOTE: alpha channel will be set to 0
333 * @param colors is a <n x i32> vector containing the rgb565 colors
334 */
335 static void
color_expand2_565_to_8888(struct gallivm_state * gallivm,unsigned n,LLVMValueRef colors,LLVMValueRef * color0,LLVMValueRef * color1)336 color_expand2_565_to_8888(struct gallivm_state *gallivm,
337 unsigned n,
338 LLVMValueRef colors,
339 LLVMValueRef *color0,
340 LLVMValueRef *color1)
341 {
342 LLVMBuilderRef builder = gallivm->builder;
343 LLVMValueRef r, g, b, rblo, glo;
344 LLVMValueRef rgblomask, rb, rgb0, rgb1;
345 struct lp_type type, type16, type8;
346
347 assert(n > 1);
348
349 memset(&type, 0, sizeof type);
350 type.width = 32;
351 type.length = n;
352
353 memset(&type16, 0, sizeof type16);
354 type16.width = 16;
355 type16.length = 2 * n;
356
357 memset(&type8, 0, sizeof type8);
358 type8.width = 8;
359 type8.length = 4 * n;
360
361 rgblomask = lp_build_const_int_vec(gallivm, type16, 0x0707);
362 colors = LLVMBuildBitCast(builder, colors,
363 lp_build_vec_type(gallivm, type16), "");
364 /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits)
365 * make sure low bits of r are zero - could use AND but requires constant */
366 r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
367 r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type16, 3), "");
368 b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
369 rb = LLVMBuildOr(builder, r, b, "");
370 rblo = LLVMBuildLShr(builder, rb, lp_build_const_int_vec(gallivm, type16, 5), "");
371 /* don't have byte shift hence need mask */
372 rblo = LLVMBuildAnd(builder, rblo, rgblomask, "");
373 rb = LLVMBuildOr(builder, rb, rblo, "");
374
375 /* make sure low bits of g are zero */
376 g = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type16, 0x07e0), "");
377 g = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 3), "");
378 glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 6), "");
379 g = LLVMBuildOr(builder, g, glo, "");
380
381 rb = LLVMBuildBitCast(builder, rb, lp_build_vec_type(gallivm, type8), "");
382 g = LLVMBuildBitCast(builder, g, lp_build_vec_type(gallivm, type8), "");
383 rgb0 = lp_build_interleave2_half(gallivm, type8, rb, g, 0);
384 rgb1 = lp_build_interleave2_half(gallivm, type8, rb, g, 1);
385
386 rgb0 = LLVMBuildBitCast(builder, rgb0, lp_build_vec_type(gallivm, type), "");
387 rgb1 = LLVMBuildBitCast(builder, rgb1, lp_build_vec_type(gallivm, type), "");
388
389 /* rgb0 is rgb00, rgb01, rgb10, rgb11
390 * instead of rgb00, rgb10, rgb20, rgb30 hence need reshuffle
391 * on x86 this _should_ just generate one shufps...
392 */
393 *color0 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 0);
394 *color1 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 1);
395 }
396
397
398 /** Convert from <n x i32> containing rgb565 colors
399 * (in first 16 bits) to <n x i32> rgba8888 colors
400 * bits 16-31 MBZ
401 * NOTE: alpha channel will be set to 0
402 * @param colors is a <n x i32> vector containing the rgb565 colors
403 */
404 static LLVMValueRef
color_expand_565_to_8888(struct gallivm_state * gallivm,unsigned n,LLVMValueRef colors)405 color_expand_565_to_8888(struct gallivm_state *gallivm,
406 unsigned n,
407 LLVMValueRef colors)
408 {
409 LLVMBuilderRef builder = gallivm->builder;
410 LLVMValueRef rgba, r, g, b, rgblo, glo;
411 LLVMValueRef rbhimask, g6mask, rgblomask;
412 struct lp_type type;
413 memset(&type, 0, sizeof type);
414 type.width = 32;
415 type.length = n;
416
417 /* color expansion:
418 * first extract and shift colors into their final locations
419 * (high bits - low bits zero at this point)
420 * then replicate highest bits to the lowest bits
421 * note rb replication can be done in parallel but not g
422 * (different shift)
423 * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f
424 * rhigh = 8, ghigh = 5, bhigh = 19
425 * rblow = 5, glow = 6
426 * rgblowmask = 0x00070307
427 * r = colors >> rhigh
428 * b = colors << bhigh
429 * g = (colors & g6mask) << ghigh
430 * rb = (r | b) rbhimask
431 * rbtmp = rb >> rblow
432 * gtmp = rb >> glow
433 * rbtmp = rbtmp | gtmp
434 * rbtmp = rbtmp & rgblowmask
435 * rgb = rb | g | rbtmp
436 */
437 g6mask = lp_build_const_int_vec(gallivm, type, 0x07e0);
438 rbhimask = lp_build_const_int_vec(gallivm, type, 0x00f800f8);
439 rgblomask = lp_build_const_int_vec(gallivm, type, 0x00070307);
440
441 r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 8), "");
442 b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type, 19), "");
443 g = LLVMBuildAnd(builder, colors, g6mask, "");
444 g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 5), "");
445 rgba = LLVMBuildOr(builder, r, b, "");
446 rgba = LLVMBuildAnd(builder, rgba, rbhimask, "");
447 rgblo = LLVMBuildLShr(builder, rgba, lp_build_const_int_vec(gallivm, type, 5), "");
448 glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type, 6), "");
449 rgblo = LLVMBuildOr(builder, rgblo, glo, "");
450 rgblo = LLVMBuildAnd(builder, rgblo, rgblomask, "");
451 rgba = LLVMBuildOr(builder, rgba, g, "");
452 rgba = LLVMBuildOr(builder, rgba, rgblo, "");
453
454 return rgba;
455 }
456
457
458 /*
459 * Average two byte vectors. (Will always round up.)
460 */
461 static LLVMValueRef
lp_build_pavgb(struct lp_build_context * bld8,LLVMValueRef v0,LLVMValueRef v1)462 lp_build_pavgb(struct lp_build_context *bld8,
463 LLVMValueRef v0,
464 LLVMValueRef v1)
465 {
466 struct gallivm_state *gallivm = bld8->gallivm;
467 LLVMBuilderRef builder = gallivm->builder;
468 assert(bld8->type.width == 8);
469 assert(bld8->type.length == 16 || bld8->type.length == 32);
470 if (LLVM_VERSION_MAJOR < 6) {
471 LLVMValueRef intrargs[2];
472 char *intr_name = bld8->type.length == 32 ? "llvm.x86.avx2.pavg.b" :
473 "llvm.x86.sse2.pavg.b";
474 intrargs[0] = v0;
475 intrargs[1] = v1;
476 return lp_build_intrinsic(builder, intr_name,
477 bld8->vec_type, intrargs, 2, 0);
478 } else {
479 /*
480 * Must match llvm's autoupgrade of pavg.b intrinsic to be useful.
481 * You better hope the backend code manages to detect the pattern, and
482 * the pattern doesn't change there...
483 */
484 struct lp_type type_ext = bld8->type;
485 LLVMTypeRef vec_type_ext;
486 LLVMValueRef res;
487 LLVMValueRef ext_one;
488 type_ext.width = 16;
489 vec_type_ext = lp_build_vec_type(gallivm, type_ext);
490 ext_one = lp_build_const_vec(gallivm, type_ext, 1);
491
492 v0 = LLVMBuildZExt(builder, v0, vec_type_ext, "");
493 v1 = LLVMBuildZExt(builder, v1, vec_type_ext, "");
494 res = LLVMBuildAdd(builder, v0, v1, "");
495 res = LLVMBuildAdd(builder, res, ext_one, "");
496 res = LLVMBuildLShr(builder, res, ext_one, "");
497 res = LLVMBuildTrunc(builder, res, bld8->vec_type, "");
498 return res;
499 }
500 }
501
502 /**
503 * Calculate 1/3(v1-v0) + v0
504 * and 2*1/3(v1-v0) + v0
505 */
506 static void
lp_build_lerp23(struct lp_build_context * bld,LLVMValueRef v0,LLVMValueRef v1,LLVMValueRef * res0,LLVMValueRef * res1)507 lp_build_lerp23(struct lp_build_context *bld,
508 LLVMValueRef v0,
509 LLVMValueRef v1,
510 LLVMValueRef *res0,
511 LLVMValueRef *res1)
512 {
513 struct gallivm_state *gallivm = bld->gallivm;
514 LLVMValueRef x, x_lo, x_hi, delta_lo, delta_hi;
515 LLVMValueRef mul_lo, mul_hi, v0_lo, v0_hi, v1_lo, v1_hi, tmp;
516 const struct lp_type type = bld->type;
517 LLVMBuilderRef builder = bld->gallivm->builder;
518 struct lp_type i16_type = lp_wider_type(type);
519 struct lp_build_context bld2;
520
521 assert(lp_check_value(type, v0));
522 assert(lp_check_value(type, v1));
523 assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
524
525 lp_build_context_init(&bld2, gallivm, i16_type);
526 bld2.type.sign = true;
527 x = lp_build_const_int_vec(gallivm, bld->type, 255*1/3);
528
529 /* FIXME: use native avx256 unpack/pack */
530 lp_build_unpack2(gallivm, type, i16_type, x, &x_lo, &x_hi);
531 lp_build_unpack2(gallivm, type, i16_type, v0, &v0_lo, &v0_hi);
532 lp_build_unpack2(gallivm, type, i16_type, v1, &v1_lo, &v1_hi);
533 delta_lo = lp_build_sub(&bld2, v1_lo, v0_lo);
534 delta_hi = lp_build_sub(&bld2, v1_hi, v0_hi);
535
536 mul_lo = LLVMBuildMul(builder, x_lo, delta_lo, "");
537 mul_hi = LLVMBuildMul(builder, x_hi, delta_hi, "");
538
539 x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 8), "");
540 x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 8), "");
541 /* lerp optimization: pack now, do add afterwards */
542 tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
543 *res0 = lp_build_add(bld, tmp, v0);
544
545 x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 7), "");
546 x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 7), "");
547 /* unlike above still need mask (but add still afterwards). */
548 x_lo = LLVMBuildAnd(builder, x_lo, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
549 x_hi = LLVMBuildAnd(builder, x_hi, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
550 tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
551 *res1 = lp_build_add(bld, tmp, v0);
552 }
553
554 /**
555 * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
556 * @param colors is a <n x i32> vector with n x 2x16bit colors
557 * @param codewords is a <n x i32> vector containing the codewords
558 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
559 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
560 */
561 static LLVMValueRef
s3tc_dxt1_full_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef colors,LLVMValueRef codewords,LLVMValueRef i,LLVMValueRef j)562 s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
563 unsigned n,
564 enum pipe_format format,
565 LLVMValueRef colors,
566 LLVMValueRef codewords,
567 LLVMValueRef i,
568 LLVMValueRef j)
569 {
570 LLVMBuilderRef builder = gallivm->builder;
571 LLVMValueRef color0, color1, color2, color3, color2_2, color3_2;
572 LLVMValueRef rgba, a, colors0, colors1, col0, col1, const2;
573 LLVMValueRef bit_pos, sel_mask, sel_lo, sel_hi, indices;
574 struct lp_type type, type8;
575 struct lp_build_context bld8, bld32;
576 bool is_dxt1_variant = format_dxt1_variant(format);
577
578 memset(&type, 0, sizeof type);
579 type.width = 32;
580 type.length = n;
581
582 memset(&type8, 0, sizeof type8);
583 type8.width = 8;
584 type8.length = 4*n;
585
586 assert(lp_check_value(type, i));
587 assert(lp_check_value(type, j));
588
589 a = lp_build_const_int_vec(gallivm, type, 0xff000000);
590
591 lp_build_context_init(&bld32, gallivm, type);
592 lp_build_context_init(&bld8, gallivm, type8);
593
594 /*
595 * works as follows:
596 * - expand color0/color1 to rgba8888
597 * - calculate color2/3 (interpolation) according to color0 < color1 rules
598 * - calculate color2/3 according to color0 >= color1 rules
599 * - do selection of color2/3 according to comparison of color0/1
600 * - extract indices (vector shift).
601 * - use compare/select to select the correct color. Since we have 2bit
602 * indices (and 4 colors), needs at least three compare/selects.
603 */
604 /*
605 * expand the two colors
606 */
607 col0 = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type, 0x0000ffff), "");
608 col1 = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 16), "");
609 if (n > 1) {
610 color_expand2_565_to_8888(gallivm, n, colors, &color0, &color1);
611 }
612 else {
613 color0 = color_expand_565_to_8888(gallivm, n, col0);
614 color1 = color_expand_565_to_8888(gallivm, n, col1);
615 }
616
617 /*
618 * interpolate colors
619 * color2_1 is 2/3 color0 + 1/3 color1
620 * color3_1 is 1/3 color0 + 2/3 color1
621 * color2_2 is 1/2 color0 + 1/2 color1
622 * color3_2 is 0
623 */
624
625 colors0 = LLVMBuildBitCast(builder, color0, bld8.vec_type, "");
626 colors1 = LLVMBuildBitCast(builder, color1, bld8.vec_type, "");
627 /* can combine 2 lerps into one mostly - still looks expensive enough. */
628 lp_build_lerp23(&bld8, colors0, colors1, &color2, &color3);
629 color2 = LLVMBuildBitCast(builder, color2, bld32.vec_type, "");
630 color3 = LLVMBuildBitCast(builder, color3, bld32.vec_type, "");
631
632 /* dxt3/5 always use 4-color encoding */
633 if (is_dxt1_variant) {
634 /* fix up alpha */
635 if (format == PIPE_FORMAT_DXT1_RGBA ||
636 format == PIPE_FORMAT_DXT1_SRGBA) {
637 color0 = LLVMBuildOr(builder, color0, a, "");
638 color1 = LLVMBuildOr(builder, color1, a, "");
639 color3 = LLVMBuildOr(builder, color3, a, "");
640 }
641 /*
642 * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
643 * Much cheaper (but we don't care that much if n == 1).
644 */
645 if ((util_get_cpu_caps()->has_sse2 && n == 4) ||
646 (util_get_cpu_caps()->has_avx2 && n == 8)) {
647 color2_2 = lp_build_pavgb(&bld8, colors0, colors1);
648 color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
649 }
650 else {
651 struct lp_type i16_type = lp_wider_type(type8);
652 struct lp_build_context bld2;
653 LLVMValueRef v0_lo, v0_hi, v1_lo, v1_hi, addlo, addhi;
654
655 lp_build_context_init(&bld2, gallivm, i16_type);
656 bld2.type.sign = true;
657
658 /*
659 * This isn't as expensive as it looks (the unpack is the same as
660 * for lerp23), with correct rounding.
661 * (Note that while rounding is correct, this will always round down,
662 * whereas pavgb will always round up.)
663 */
664 /* FIXME: use native avx256 unpack/pack */
665 lp_build_unpack2(gallivm, type8, i16_type, colors0, &v0_lo, &v0_hi);
666 lp_build_unpack2(gallivm, type8, i16_type, colors1, &v1_lo, &v1_hi);
667
668 addlo = lp_build_add(&bld2, v0_lo, v1_lo);
669 addhi = lp_build_add(&bld2, v0_hi, v1_hi);
670 addlo = LLVMBuildLShr(builder, addlo,
671 lp_build_const_int_vec(gallivm, i16_type, 1), "");
672 addhi = LLVMBuildLShr(builder, addhi,
673 lp_build_const_int_vec(gallivm, i16_type, 1), "");
674 color2_2 = lp_build_pack2(gallivm, i16_type, type8, addlo, addhi);
675 color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
676 }
677 color3_2 = lp_build_const_int_vec(gallivm, type, 0);
678
679 /* select between colors2/3 */
680 /* signed compare is faster saves some xors */
681 type.sign = true;
682 sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, col0, col1);
683 color2 = lp_build_select(&bld32, sel_mask, color2, color2_2);
684 color3 = lp_build_select(&bld32, sel_mask, color3, color3_2);
685 type.sign = false;
686
687 if (format == PIPE_FORMAT_DXT1_RGBA ||
688 format == PIPE_FORMAT_DXT1_SRGBA) {
689 color2 = LLVMBuildOr(builder, color2, a, "");
690 }
691 }
692
693 const2 = lp_build_const_int_vec(gallivm, type, 2);
694 /* extract 2-bit index values */
695 bit_pos = LLVMBuildShl(builder, j, const2, "");
696 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
697 bit_pos = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
698 /*
699 * NOTE: This innocent looking shift is very expensive with x86/ssex.
700 * Shifts with per-elemnent shift count get roughly translated to
701 * extract (count), extract (value), shift, move (back to xmm), unpack
702 * per element!
703 * So about 20 instructions here for 4xi32.
704 * Newer llvm versions (3.7+) will not do extract/insert but use a
705 * a couple constant count vector shifts plus shuffles. About same
706 * amount of instructions unfortunately...
707 * Would get much worse with 8xi16 even...
708 * We could actually do better here:
709 * - subtract bit_pos from 128+30, shl 23, convert float to int...
710 * - now do mul with codewords followed by shr 30...
711 * But requires 32bit->32bit mul, sse41 only (well that's emulatable
712 * with 2 32bit->64bit muls...) and not exactly cheap
713 * AVX2, of course, fixes this nonsense.
714 */
715 indices = LLVMBuildLShr(builder, codewords, bit_pos, "");
716
717 /* finally select the colors */
718 sel_lo = LLVMBuildAnd(builder, indices, bld32.one, "");
719 sel_lo = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_lo, bld32.one);
720 color0 = lp_build_select(&bld32, sel_lo, color1, color0);
721 color2 = lp_build_select(&bld32, sel_lo, color3, color2);
722 sel_hi = LLVMBuildAnd(builder, indices, const2, "");
723 sel_hi = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_hi, const2);
724 rgba = lp_build_select(&bld32, sel_hi, color2, color0);
725
726 /* fix up alpha */
727 if (format == PIPE_FORMAT_DXT1_RGB ||
728 format == PIPE_FORMAT_DXT1_SRGB) {
729 rgba = LLVMBuildOr(builder, rgba, a, "");
730 }
731 return LLVMBuildBitCast(builder, rgba, bld8.vec_type, "");
732 }
733
734
735 static LLVMValueRef
s3tc_dxt1_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef colors,LLVMValueRef codewords,LLVMValueRef i,LLVMValueRef j)736 s3tc_dxt1_to_rgba_aos(struct gallivm_state *gallivm,
737 unsigned n,
738 enum pipe_format format,
739 LLVMValueRef colors,
740 LLVMValueRef codewords,
741 LLVMValueRef i,
742 LLVMValueRef j)
743 {
744 return s3tc_dxt1_full_to_rgba_aos(gallivm, n, format,
745 colors, codewords, i, j);
746 }
747
748
749 /**
750 * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
751 * @param colors is a <n x i32> vector with n x 2x16bit colors
752 * @param codewords is a <n x i32> vector containing the codewords
753 * @param alphas is a <n x i64> vector containing the alpha values
754 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
755 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
756 */
757 static LLVMValueRef
s3tc_dxt3_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef colors,LLVMValueRef codewords,LLVMValueRef alpha_low,LLVMValueRef alpha_hi,LLVMValueRef i,LLVMValueRef j)758 s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
759 unsigned n,
760 enum pipe_format format,
761 LLVMValueRef colors,
762 LLVMValueRef codewords,
763 LLVMValueRef alpha_low,
764 LLVMValueRef alpha_hi,
765 LLVMValueRef i,
766 LLVMValueRef j)
767 {
768 LLVMBuilderRef builder = gallivm->builder;
769 LLVMValueRef rgba, tmp, tmp2;
770 LLVMValueRef bit_pos, sel_mask;
771 struct lp_type type, type8;
772 struct lp_build_context bld;
773
774 memset(&type, 0, sizeof type);
775 type.width = 32;
776 type.length = n;
777
778 memset(&type8, 0, sizeof type8);
779 type8.width = 8;
780 type8.length = n*4;
781
782 assert(lp_check_value(type, i));
783 assert(lp_check_value(type, j));
784
785 lp_build_context_init(&bld, gallivm, type);
786
787 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
788 colors, codewords, i, j);
789
790 rgba = LLVMBuildBitCast(builder, rgba, bld.vec_type, "");
791
792 /*
793 * Extract alpha values. Since we now need to select from
794 * which 32bit vector values are fetched, construct selection
795 * mask from highest bit of bit_pos, and use select, then shift
796 * according to the bit_pos (without the highest bit).
797 * Note this is pointless for n == 1 case. Could just
798 * directly use 64bit arithmetic if we'd extract 64bit
799 * alpha value instead of 2x32...
800 */
801 /* pos = 4*(4j+i) */
802 bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
803 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
804 bit_pos = LLVMBuildShl(builder, bit_pos,
805 lp_build_const_int_vec(gallivm, type, 2), "");
806 sel_mask = LLVMBuildLShr(builder, bit_pos,
807 lp_build_const_int_vec(gallivm, type, 5), "");
808 sel_mask = LLVMBuildSub(builder, sel_mask, bld.one, "");
809 tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
810 bit_pos = LLVMBuildAnd(builder, bit_pos,
811 lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
812 /* Warning: slow shift with per element count (without avx2) */
813 /*
814 * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
815 * to select the right byte with pshufb. Then for the remaining one bit
816 * just do shift/select.
817 */
818 tmp = LLVMBuildLShr(builder, tmp, bit_pos, "");
819
820 /* combined expand from a4 to a8 and shift into position */
821 tmp = LLVMBuildShl(builder, tmp, lp_build_const_int_vec(gallivm, type, 28), "");
822 tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 4), "");
823 tmp = LLVMBuildOr(builder, tmp, tmp2, "");
824
825 rgba = LLVMBuildOr(builder, tmp, rgba, "");
826
827 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
828 }
829
830 static LLVMValueRef
lp_build_lerpdxta(struct gallivm_state * gallivm,LLVMValueRef alpha0,LLVMValueRef alpha1,LLVMValueRef code,LLVMValueRef sel_mask,unsigned n)831 lp_build_lerpdxta(struct gallivm_state *gallivm,
832 LLVMValueRef alpha0,
833 LLVMValueRef alpha1,
834 LLVMValueRef code,
835 LLVMValueRef sel_mask,
836 unsigned n)
837 {
838 /*
839 * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41
840 * (plus pmullw is actually faster...)
841 * we just pretend our 32bit values (which are really only 8bit) are 16bits.
842 * Note that this is obviously a disaster for the scalar case.
843 */
844 LLVMBuilderRef builder = gallivm->builder;
845 LLVMValueRef delta, ainterp;
846 LLVMValueRef weight5, weight7, weight;
847 struct lp_type type32, type16, type8;
848 struct lp_build_context bld16;
849
850 memset(&type32, 0, sizeof type32);
851 type32.width = 32;
852 type32.length = n;
853 memset(&type16, 0, sizeof type16);
854 type16.width = 16;
855 type16.length = 2*n;
856 type16.sign = true;
857 memset(&type8, 0, sizeof type8);
858 type8.width = 8;
859 type8.length = 4*n;
860
861 lp_build_context_init(&bld16, gallivm, type16);
862 /* 255/7 is a bit off - increase accuracy at the expense of shift later */
863 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
864 weight5 = lp_build_const_int_vec(gallivm, type16, 255*64/5);
865 weight7 = lp_build_const_int_vec(gallivm, type16, 255*64/7);
866 weight = lp_build_select(&bld16, sel_mask, weight7, weight5);
867
868 alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
869 alpha1 = LLVMBuildBitCast(builder, alpha1, bld16.vec_type, "");
870 code = LLVMBuildBitCast(builder, code, bld16.vec_type, "");
871 /* we'll get garbage in the elements which had code 0 (or larger than 5 or 7)
872 but we don't care */
873 code = LLVMBuildSub(builder, code, bld16.one, "");
874
875 weight = LLVMBuildMul(builder, weight, code, "");
876 weight = LLVMBuildLShr(builder, weight,
877 lp_build_const_int_vec(gallivm, type16, 6), "");
878
879 delta = LLVMBuildSub(builder, alpha1, alpha0, "");
880
881 ainterp = LLVMBuildMul(builder, delta, weight, "");
882 ainterp = LLVMBuildLShr(builder, ainterp,
883 lp_build_const_int_vec(gallivm, type16, 8), "");
884
885 ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type8), "");
886 alpha0 = LLVMBuildBitCast(builder, alpha0, lp_build_vec_type(gallivm, type8), "");
887 ainterp = LLVMBuildAdd(builder, alpha0, ainterp, "");
888 ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type32), "");
889
890 return ainterp;
891 }
892
893 static LLVMValueRef
s3tc_dxt5_alpha_channel(struct gallivm_state * gallivm,bool is_signed,unsigned n,LLVMValueRef alpha_hi,LLVMValueRef alpha_lo,LLVMValueRef i,LLVMValueRef j)894 s3tc_dxt5_alpha_channel(struct gallivm_state *gallivm,
895 bool is_signed,
896 unsigned n,
897 LLVMValueRef alpha_hi, LLVMValueRef alpha_lo,
898 LLVMValueRef i, LLVMValueRef j)
899 {
900 LLVMBuilderRef builder = gallivm->builder;
901 struct lp_type type, type8;
902 LLVMValueRef tmp, alpha0, alpha1, alphac, alphac0, bit_pos, shift;
903 LLVMValueRef sel_mask, tmp_mask, alpha, alpha64, code_s;
904 LLVMValueRef mask6, mask7, ainterp;
905 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
906 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
907 struct lp_build_context bld32;
908
909 memset(&type, 0, sizeof type);
910 type.width = 32;
911 type.length = n;
912
913 memset(&type8, 0, sizeof type8);
914 type8.width = 8;
915 type8.length = n;
916 type8.sign = is_signed;
917
918 lp_build_context_init(&bld32, gallivm, type);
919 /* this looks pretty complex for vectorization:
920 * extract a0/a1 values
921 * extract code
922 * select weights for interpolation depending on a0 > a1
923 * mul weights by code - 1
924 * lerp a0/a1/weights
925 * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
926 */
927
928 alpha0 = LLVMBuildAnd(builder, alpha_lo,
929 lp_build_const_int_vec(gallivm, type, 0xff), "");
930 if (is_signed) {
931 alpha0 = LLVMBuildTrunc(builder, alpha0, lp_build_vec_type(gallivm, type8), "");
932 alpha0 = LLVMBuildSExt(builder, alpha0, lp_build_vec_type(gallivm, type), "");
933 }
934
935 alpha1 = LLVMBuildLShr(builder, alpha_lo,
936 lp_build_const_int_vec(gallivm, type, 8), "");
937 alpha1 = LLVMBuildAnd(builder, alpha1,
938 lp_build_const_int_vec(gallivm, type, 0xff), "");
939 if (is_signed) {
940 alpha1 = LLVMBuildTrunc(builder, alpha1, lp_build_vec_type(gallivm, type8), "");
941 alpha1 = LLVMBuildSExt(builder, alpha1, lp_build_vec_type(gallivm, type), "");
942 }
943
944 /* pos = 3*(4j+i) */
945 bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
946 bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
947 tmp = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
948 bit_pos = LLVMBuildAdd(builder, bit_pos, tmp, "");
949 /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
950 bit_pos = LLVMBuildAdd(builder, bit_pos,
951 lp_build_const_int_vec(gallivm, type, 16), "");
952
953 if (n == 1) {
954 struct lp_type type64;
955 memset(&type64, 0, sizeof type64);
956 type64.width = 64;
957 type64.length = 1;
958 /* This is pretty pointless could avoid by just directly extracting
959 64bit in the first place but makes it more complicated elsewhere */
960 alpha_lo = LLVMBuildZExt(builder, alpha_lo, i64t, "");
961 alpha_hi = LLVMBuildZExt(builder, alpha_hi, i64t, "");
962 alphac0 = LLVMBuildShl(builder, alpha_hi,
963 lp_build_const_int_vec(gallivm, type64, 32), "");
964 alphac0 = LLVMBuildOr(builder, alpha_lo, alphac0, "");
965
966 shift = LLVMBuildZExt(builder, bit_pos, i64t, "");
967 alphac0 = LLVMBuildLShr(builder, alphac0, shift, "");
968 alphac0 = LLVMBuildTrunc(builder, alphac0, i32t, "");
969 alphac = LLVMBuildAnd(builder, alphac0,
970 lp_build_const_int_vec(gallivm, type, 0x7), "");
971 }
972 else {
973 /*
974 * Using non-native vector length here (actually, with avx2 and
975 * n == 4 llvm will indeed expand to ymm regs...)
976 * At least newer llvm versions handle that ok.
977 * llvm 3.7+ will even handle the emulated 64bit shift with variable
978 * shift count without extraction (and it's actually easier to
979 * emulate than the 32bit one).
980 */
981 alpha64 = LLVMBuildShuffleVector(builder, alpha_lo, alpha_hi,
982 lp_build_const_unpackx2_shuffle(gallivm, n), "");
983
984 alpha64 = LLVMBuildBitCast(builder, alpha64, LLVMVectorType(i64t, n), "");
985 shift = LLVMBuildZExt(builder, bit_pos, LLVMVectorType(i64t, n), "");
986 alphac = LLVMBuildLShr(builder, alpha64, shift, "");
987 alphac = LLVMBuildTrunc(builder, alphac, bld32.vec_type, "");
988
989 alphac = LLVMBuildAnd(builder, alphac,
990 lp_build_const_int_vec(gallivm, type, 0x7), "");
991 }
992
993 /* signed compare is faster saves some xors */
994 type.sign = true;
995 /* alpha0 > alpha1 selection */
996 sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
997 alpha0, alpha1);
998 ainterp = lp_build_lerpdxta(gallivm, alpha0, alpha1, alphac, sel_mask, n);
999
1000 /*
1001 * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
1002 * else we select a0 for case 0, a1 for case 1,
1003 * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
1004 * a = (c == 0) ? a0 : a1
1005 * a = (c > 1) ? ainterp : a
1006 * Finally handle case 6/7 for !(a0 > a1)
1007 * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
1008 * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
1009 */
1010 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1011 alphac, bld32.zero);
1012 alpha = lp_build_select(&bld32, tmp_mask, alpha0, alpha1);
1013 tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
1014 alphac, bld32.one);
1015 alpha = lp_build_select(&bld32, tmp_mask, ainterp, alpha);
1016
1017 code_s = LLVMBuildAnd(builder, alphac,
1018 LLVMBuildNot(builder, sel_mask, ""), "");
1019 mask6 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1020 code_s, lp_build_const_int_vec(gallivm, type, 6));
1021 mask7 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1022 code_s, lp_build_const_int_vec(gallivm, type, 7));
1023 if (is_signed) {
1024 alpha = lp_build_select(&bld32, mask6, lp_build_const_int_vec(gallivm, type, -127), alpha);
1025 alpha = lp_build_select(&bld32, mask7, lp_build_const_int_vec(gallivm, type, 127), alpha);
1026 } else {
1027 alpha = LLVMBuildAnd(builder, alpha, LLVMBuildNot(builder, mask6, ""), "");
1028 alpha = LLVMBuildOr(builder, alpha, mask7, "");
1029 }
1030 /* There can be garbage in upper bits, mask them off for rgtc formats */
1031 alpha = LLVMBuildAnd(builder, alpha, lp_build_const_int_vec(gallivm, type, 0xff), "");
1032
1033 return alpha;
1034 }
1035
1036 /**
1037 * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
1038 * @param colors is a <n x i32> vector with n x 2x16bit colors
1039 * @param codewords is a <n x i32> vector containing the codewords
1040 * @param alphas is a <n x i64> vector containing the alpha values
1041 * @param i is a <n x i32> vector with the x pixel coordinate (0 to 3)
1042 * @param j is a <n x i32> vector with the y pixel coordinate (0 to 3)
1043 */
1044 static LLVMValueRef
s3tc_dxt5_full_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef colors,LLVMValueRef codewords,LLVMValueRef alpha_lo,LLVMValueRef alpha_hi,LLVMValueRef i,LLVMValueRef j)1045 s3tc_dxt5_full_to_rgba_aos(struct gallivm_state *gallivm,
1046 unsigned n,
1047 enum pipe_format format,
1048 LLVMValueRef colors,
1049 LLVMValueRef codewords,
1050 LLVMValueRef alpha_lo,
1051 LLVMValueRef alpha_hi,
1052 LLVMValueRef i,
1053 LLVMValueRef j)
1054 {
1055 LLVMBuilderRef builder = gallivm->builder;
1056 LLVMValueRef rgba, alpha;
1057 struct lp_type type, type8;
1058 struct lp_build_context bld32;
1059
1060 memset(&type, 0, sizeof type);
1061 type.width = 32;
1062 type.length = n;
1063
1064 memset(&type8, 0, sizeof type8);
1065 type8.width = 8;
1066 type8.length = n*4;
1067
1068 assert(lp_check_value(type, i));
1069 assert(lp_check_value(type, j));
1070
1071 lp_build_context_init(&bld32, gallivm, type);
1072
1073 assert(lp_check_value(type, i));
1074 assert(lp_check_value(type, j));
1075
1076 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
1077 colors, codewords, i, j);
1078
1079 rgba = LLVMBuildBitCast(builder, rgba, bld32.vec_type, "");
1080
1081 alpha = s3tc_dxt5_alpha_channel(gallivm, false, n, alpha_hi, alpha_lo, i, j);
1082 alpha = LLVMBuildShl(builder, alpha, lp_build_const_int_vec(gallivm, type, 24), "");
1083 rgba = LLVMBuildOr(builder, alpha, rgba, "");
1084
1085 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
1086 }
1087
1088
1089 static void
lp_build_gather_s3tc_simple_scalar(struct gallivm_state * gallivm,const struct util_format_description * format_desc,LLVMValueRef * dxt_block,LLVMValueRef ptr)1090 lp_build_gather_s3tc_simple_scalar(struct gallivm_state *gallivm,
1091 const struct util_format_description *format_desc,
1092 LLVMValueRef *dxt_block,
1093 LLVMValueRef ptr)
1094 {
1095 LLVMBuilderRef builder = gallivm->builder;
1096 unsigned block_bits = format_desc->block.bits;
1097 LLVMValueRef elem, shuf;
1098 LLVMTypeRef type32 = LLVMIntTypeInContext(gallivm->context, 32);
1099 LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, block_bits);
1100 LLVMTypeRef type32_4 = LLVMVectorType(type32, 4);
1101
1102 assert(block_bits == 64 || block_bits == 128);
1103
1104 ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(src_type, 0), "");
1105 elem = LLVMBuildLoad2(builder, src_type, ptr, "");
1106
1107 if (block_bits == 128) {
1108 /* just return block as is */
1109 *dxt_block = LLVMBuildBitCast(builder, elem, type32_4, "");
1110 }
1111 else {
1112 LLVMTypeRef type32_2 = LLVMVectorType(type32, 2);
1113 shuf = lp_build_const_extend_shuffle(gallivm, 2, 4);
1114 elem = LLVMBuildBitCast(builder, elem, type32_2, "");
1115 *dxt_block = LLVMBuildShuffleVector(builder, elem,
1116 LLVMGetUndef(type32_2), shuf, "");
1117 }
1118 }
1119
1120
1121 static void
s3tc_store_cached_block(struct gallivm_state * gallivm,LLVMValueRef * col,LLVMValueRef tag_value,LLVMValueRef hash_index,LLVMValueRef cache)1122 s3tc_store_cached_block(struct gallivm_state *gallivm,
1123 LLVMValueRef *col,
1124 LLVMValueRef tag_value,
1125 LLVMValueRef hash_index,
1126 LLVMValueRef cache)
1127 {
1128 LLVMBuilderRef builder = gallivm->builder;
1129 LLVMValueRef ptr, indices[3];
1130 LLVMTypeRef type_ptr4x32;
1131 unsigned count;
1132
1133 type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
1134 indices[0] = lp_build_const_int32(gallivm, 0);
1135 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1136 indices[2] = hash_index;
1137 LLVMTypeRef cache_type = lp_build_format_cache_type(gallivm);
1138 ptr = LLVMBuildGEP2(builder, cache_type, cache, indices, ARRAY_SIZE(indices), "");
1139 LLVMBuildStore(builder, tag_value, ptr);
1140
1141 indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1142 hash_index = LLVMBuildMul(builder, hash_index, lp_build_const_int32(gallivm, 16), "");
1143 for (count = 0; count < 4; count++) {
1144 indices[2] = hash_index;
1145 ptr = LLVMBuildGEP2(builder, cache_type, cache, indices, ARRAY_SIZE(indices), "");
1146 ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
1147 LLVMBuildStore(builder, col[count], ptr);
1148 hash_index = LLVMBuildAdd(builder, hash_index, lp_build_const_int32(gallivm, 4), "");
1149 }
1150 }
1151
1152 static LLVMValueRef
lookup_cache_member(struct gallivm_state * gallivm,LLVMValueRef cache,enum cache_member member,LLVMValueRef index)1153 lookup_cache_member(struct gallivm_state *gallivm, LLVMValueRef cache, enum cache_member member, LLVMValueRef index) {
1154 assert(member == LP_BUILD_FORMAT_CACHE_MEMBER_DATA || member == LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1155 LLVMBuilderRef builder = gallivm->builder;
1156 LLVMValueRef member_ptr, indices[3];
1157
1158 indices[0] = lp_build_const_int32(gallivm, 0);
1159 indices[1] = lp_build_const_int32(gallivm, member);
1160 indices[2] = index;
1161
1162 const char *name =
1163 member == LP_BUILD_FORMAT_CACHE_MEMBER_DATA ? "cache_data" :
1164 member == LP_BUILD_FORMAT_CACHE_MEMBER_TAGS ? "tag_data" : "";
1165
1166 member_ptr = LLVMBuildGEP2(builder, lp_build_format_cache_type(gallivm),
1167 cache, indices, ARRAY_SIZE(indices), "cache_gep");
1168
1169 return LLVMBuildLoad2(builder, lp_build_format_cache_elem_type(gallivm, member), member_ptr, name);
1170 }
1171
1172 static LLVMValueRef
s3tc_lookup_cached_pixel(struct gallivm_state * gallivm,LLVMValueRef cache,LLVMValueRef index)1173 s3tc_lookup_cached_pixel(struct gallivm_state *gallivm,
1174 LLVMValueRef cache,
1175 LLVMValueRef index)
1176 {
1177 return lookup_cache_member(gallivm, cache, LP_BUILD_FORMAT_CACHE_MEMBER_DATA, index);
1178 }
1179
1180 static LLVMValueRef
s3tc_lookup_tag_data(struct gallivm_state * gallivm,LLVMValueRef cache,LLVMValueRef index)1181 s3tc_lookup_tag_data(struct gallivm_state *gallivm,
1182 LLVMValueRef cache,
1183 LLVMValueRef index)
1184 {
1185 return lookup_cache_member(gallivm, cache, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS, index);
1186 }
1187
1188 #if LP_BUILD_FORMAT_CACHE_DEBUG
1189 static void
s3tc_update_cache_access(struct gallivm_state * gallivm,LLVMValueRef ptr,unsigned count,unsigned index)1190 s3tc_update_cache_access(struct gallivm_state *gallivm,
1191 LLVMValueRef ptr,
1192 unsigned count,
1193 unsigned index)
1194 {
1195 LLVMBuilderRef builder = gallivm->builder;
1196 LLVMValueRef member_ptr, cache_access;
1197
1198 assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
1199 index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
1200 LLVMTypeRef cache_type = lp_build_format_cache_type(gallivm);
1201 member_ptr = lp_build_struct_get_ptr2(gallivm, cache_type, ptr, index, "");
1202 cache_access = LLVMBuildLoad2(builder, LLVMInt64TypeInContext(gallivm->context), member_ptr, "cache_access");
1203 cache_access = LLVMBuildAdd(builder, cache_access,
1204 LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), count, 0), "");
1205 LLVMBuildStore(builder, cache_access, member_ptr);
1206 }
1207 #endif
1208
1209 /**
1210 * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0.
1211 * The lerp is performed between the first 2 32bit colors
1212 * in the source vector, both results are returned packed in result vector.
1213 */
1214 static LLVMValueRef
lp_build_lerp23_single(struct lp_build_context * bld,LLVMValueRef v01)1215 lp_build_lerp23_single(struct lp_build_context *bld,
1216 LLVMValueRef v01)
1217 {
1218 struct gallivm_state *gallivm = bld->gallivm;
1219 LLVMValueRef x, mul, delta, res, v0, v1, elems[8];
1220 const struct lp_type type = bld->type;
1221 LLVMBuilderRef builder = bld->gallivm->builder;
1222 struct lp_type i16_type = lp_wider_type(type);
1223 struct lp_type i32_type = lp_wider_type(i16_type);
1224 struct lp_build_context bld2;
1225
1226 assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
1227
1228 lp_build_context_init(&bld2, gallivm, i16_type);
1229 bld2.type.sign = true;
1230
1231 /* weights 256/3, 256*2/3, with correct rounding */
1232 elems[0] = elems[1] = elems[2] = elems[3] =
1233 lp_build_const_elem(gallivm, i16_type, 255*1/3);
1234 elems[4] = elems[5] = elems[6] = elems[7] =
1235 lp_build_const_elem(gallivm, i16_type, 171);
1236 x = LLVMConstVector(elems, 8);
1237
1238 /*
1239 * v01 has col0 in 32bit elem 0, col1 in elem 1.
1240 * Interleave/unpack will give us separate v0/v1 vectors.
1241 */
1242 v01 = lp_build_interleave2(gallivm, i32_type, v01, v01, 0);
1243 v01 = LLVMBuildBitCast(builder, v01, bld->vec_type, "");
1244
1245 lp_build_unpack2(gallivm, type, i16_type, v01, &v0, &v1);
1246 delta = lp_build_sub(&bld2, v1, v0);
1247
1248 mul = LLVMBuildMul(builder, x, delta, "");
1249
1250 mul = LLVMBuildLShr(builder, mul, lp_build_const_int_vec(gallivm, i16_type, 8), "");
1251 /* lerp optimization: pack now, do add afterwards */
1252 res = lp_build_pack2(gallivm, i16_type, type, mul, bld2.undef);
1253 /* only lower 2 elems are valid - for these v0 is really v0 */
1254 return lp_build_add(bld, res, v01);
1255 }
1256
1257 /*
1258 * decode one dxt1 block.
1259 */
1260 static void
s3tc_decode_block_dxt1(struct gallivm_state * gallivm,enum pipe_format format,LLVMValueRef dxt_block,LLVMValueRef * col)1261 s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
1262 enum pipe_format format,
1263 LLVMValueRef dxt_block,
1264 LLVMValueRef *col)
1265 {
1266 LLVMBuilderRef builder = gallivm->builder;
1267 LLVMValueRef color01, color23, color01_16, color0123;
1268 LLVMValueRef rgba, tmp, a, sel_mask, indices, code, const2;
1269 struct lp_type type8, type32, type16, type64;
1270 struct lp_build_context bld8, bld32, bld16, bld64;
1271 unsigned i;
1272 bool is_dxt1_variant = format_dxt1_variant(format);
1273
1274 memset(&type32, 0, sizeof type32);
1275 type32.width = 32;
1276 type32.length = 4;
1277 type32.sign = true;
1278
1279 memset(&type8, 0, sizeof type8);
1280 type8.width = 8;
1281 type8.length = 16;
1282
1283 memset(&type16, 0, sizeof type16);
1284 type16.width = 16;
1285 type16.length = 8;
1286
1287 memset(&type64, 0, sizeof type64);
1288 type64.width = 64;
1289 type64.length = 2;
1290
1291 a = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1292 const2 = lp_build_const_int_vec(gallivm, type32, 2);
1293
1294 lp_build_context_init(&bld32, gallivm, type32);
1295 lp_build_context_init(&bld16, gallivm, type16);
1296 lp_build_context_init(&bld8, gallivm, type8);
1297 lp_build_context_init(&bld64, gallivm, type64);
1298
1299 if (is_dxt1_variant) {
1300 color01 = lp_build_shuffle1undef(gallivm, dxt_block, 0, 4);
1301 code = lp_build_shuffle1undef(gallivm, dxt_block, 1, 4);
1302 } else {
1303 color01 = lp_build_shuffle1undef(gallivm, dxt_block, 2, 4);
1304 code = lp_build_shuffle1undef(gallivm, dxt_block, 3, 4);
1305 }
1306 code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
1307 /* expand bytes to dwords */
1308 code = lp_build_interleave2(gallivm, type8, code, code, 0);
1309 code = lp_build_interleave2(gallivm, type8, code, code, 0);
1310
1311
1312 /*
1313 * works as follows:
1314 * - expand color0/color1 to rgba8888
1315 * - calculate color2/3 (interpolation) according to color0 < color1 rules
1316 * - calculate color2/3 according to color0 >= color1 rules
1317 * - do selection of color2/3 according to comparison of color0/1
1318 * - extract indices.
1319 * - use compare/select to select the correct color. Since we have 2bit
1320 * indices (and 4 colors), needs at least three compare/selects.
1321 */
1322
1323 /*
1324 * expand the two colors
1325 */
1326 color01 = LLVMBuildBitCast(builder, color01, bld16.vec_type, "");
1327 color01 = lp_build_interleave2(gallivm, type16, color01,
1328 bld16.zero, 0);
1329 color01_16 = LLVMBuildBitCast(builder, color01, bld32.vec_type, "");
1330 color01 = color_expand_565_to_8888(gallivm, 4, color01_16);
1331
1332 /*
1333 * interpolate colors
1334 * color2_1 is 2/3 color0 + 1/3 color1
1335 * color3_1 is 1/3 color0 + 2/3 color1
1336 * color2_2 is 1/2 color0 + 1/2 color1
1337 * color3_2 is 0
1338 */
1339
1340 /* TODO: since this is now always scalar, should
1341 * probably just use control flow here instead of calculating
1342 * both cases and then selection
1343 */
1344 if (format == PIPE_FORMAT_DXT1_RGBA ||
1345 format == PIPE_FORMAT_DXT1_SRGBA) {
1346 color01 = LLVMBuildOr(builder, color01, a, "");
1347 }
1348 /* can combine 2 lerps into one mostly */
1349 color23 = lp_build_lerp23_single(&bld8, color01);
1350 color23 = LLVMBuildBitCast(builder, color23, bld32.vec_type, "");
1351
1352 /* dxt3/5 always use 4-color encoding */
1353 if (is_dxt1_variant) {
1354 LLVMValueRef color23_2, color2_2;
1355
1356 if (util_get_cpu_caps()->has_sse2) {
1357 LLVMValueRef intrargs[2];
1358 intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
1359 /* same interleave as for lerp23 - correct result in 2nd element */
1360 intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0);
1361 intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, "");
1362 color2_2 = lp_build_pavgb(&bld8, intrargs[0], intrargs[1]);
1363 }
1364 else {
1365 LLVMValueRef v01, v0, v1, vhalf;
1366 /*
1367 * This isn't as expensive as it looks (the unpack is the same as
1368 * for lerp23, which is the reason why we do the pointless
1369 * interleave2 too), with correct rounding (the two lower elements
1370 * will be the same).
1371 */
1372 v01 = lp_build_interleave2(gallivm, type32, color01, color01, 0);
1373 v01 = LLVMBuildBitCast(builder, v01, bld8.vec_type, "");
1374 lp_build_unpack2(gallivm, type8, type16, v01, &v0, &v1);
1375 vhalf = lp_build_add(&bld16, v0, v1);
1376 vhalf = LLVMBuildLShr(builder, vhalf, bld16.one, "");
1377 color2_2 = lp_build_pack2(gallivm, type16, type8, vhalf, bld16.undef);
1378 }
1379 /* shuffle in color 3 as elem 2 zero, color 2 elem 1 */
1380 color23_2 = LLVMBuildBitCast(builder, color2_2, bld64.vec_type, "");
1381 color23_2 = LLVMBuildLShr(builder, color23_2,
1382 lp_build_const_int_vec(gallivm, type64, 32), "");
1383 color23_2 = LLVMBuildBitCast(builder, color23_2, bld32.vec_type, "");
1384
1385 tmp = LLVMBuildBitCast(builder, color01_16, bld64.vec_type, "");
1386 tmp = LLVMBuildLShr(builder, tmp,
1387 lp_build_const_int_vec(gallivm, type64, 32), "");
1388 tmp = LLVMBuildBitCast(builder, tmp, bld32.vec_type, "");
1389 sel_mask = lp_build_compare(gallivm, type32, PIPE_FUNC_GREATER,
1390 color01_16, tmp);
1391 sel_mask = lp_build_interleave2(gallivm, type32, sel_mask, sel_mask, 0);
1392 color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
1393 }
1394
1395 if (util_get_cpu_caps()->has_ssse3) {
1396 /*
1397 * Use pshufb as mini-lut. (Only doable with intrinsics as the
1398 * final shuffles are non-constant. pshufb is awesome!)
1399 */
1400 LLVMValueRef shuf[16], low2mask;
1401 LLVMValueRef intrargs[2], lut_ind, lut_adj;
1402
1403 color01 = LLVMBuildBitCast(builder, color01, bld64.vec_type, "");
1404 color23 = LLVMBuildBitCast(builder, color23, bld64.vec_type, "");
1405 color0123 = lp_build_interleave2(gallivm, type64, color01, color23, 0);
1406 color0123 = LLVMBuildBitCast(builder, color0123, bld32.vec_type, "");
1407
1408 if (format == PIPE_FORMAT_DXT1_RGB ||
1409 format == PIPE_FORMAT_DXT1_SRGB) {
1410 color0123 = LLVMBuildOr(builder, color0123, a, "");
1411 }
1412
1413 /* shuffle as r0r1r2r3g0g1... */
1414 for (i = 0; i < 4; i++) {
1415 shuf[4*i] = lp_build_const_int32(gallivm, 0 + i);
1416 shuf[4*i+1] = lp_build_const_int32(gallivm, 4 + i);
1417 shuf[4*i+2] = lp_build_const_int32(gallivm, 8 + i);
1418 shuf[4*i+3] = lp_build_const_int32(gallivm, 12 + i);
1419 }
1420 color0123 = LLVMBuildBitCast(builder, color0123, bld8.vec_type, "");
1421 color0123 = LLVMBuildShuffleVector(builder, color0123, bld8.undef,
1422 LLVMConstVector(shuf, 16), "");
1423
1424 /* lowest 2 bits of each 8 bit value contain index into "LUT" */
1425 low2mask = lp_build_const_int_vec(gallivm, type8, 3);
1426 /* add 0/4/8/12 for r/g/b/a */
1427 lut_adj = lp_build_const_int_vec(gallivm, type32, 0x0c080400);
1428 lut_adj = LLVMBuildBitCast(builder, lut_adj, bld8.vec_type, "");
1429 intrargs[0] = color0123;
1430 for (i = 0; i < 4; i++) {
1431 lut_ind = LLVMBuildAnd(builder, code, low2mask, "");
1432 lut_ind = LLVMBuildOr(builder, lut_ind, lut_adj, "");
1433 intrargs[1] = lut_ind;
1434 col[i] = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1435 bld8.vec_type, intrargs, 2, 0);
1436 col[i] = LLVMBuildBitCast(builder, col[i], bld32.vec_type, "");
1437 code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
1438 code = LLVMBuildLShr(builder, code, const2, "");
1439 code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
1440 }
1441 }
1442 else {
1443 /* Thanks to vectorization can do 4 texels in parallel */
1444 LLVMValueRef color0, color1, color2, color3;
1445 if (format == PIPE_FORMAT_DXT1_RGB ||
1446 format == PIPE_FORMAT_DXT1_SRGB) {
1447 color01 = LLVMBuildOr(builder, color01, a, "");
1448 color23 = LLVMBuildOr(builder, color23, a, "");
1449 }
1450 color0 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
1451 lp_build_const_shuffle1(gallivm, 0, 4), "");
1452 color1 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
1453 lp_build_const_shuffle1(gallivm, 1, 4), "");
1454 color2 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
1455 lp_build_const_shuffle1(gallivm, 0, 4), "");
1456 color3 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
1457 lp_build_const_shuffle1(gallivm, 1, 4), "");
1458 code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
1459
1460 for (i = 0; i < 4; i++) {
1461 /* select the colors */
1462 LLVMValueRef selmasklo, rgba01, rgba23, bitlo;
1463 bitlo = bld32.one;
1464 indices = LLVMBuildAnd(builder, code, bitlo, "");
1465 selmasklo = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
1466 indices, bitlo);
1467 rgba01 = lp_build_select(&bld32, selmasklo, color1, color0);
1468
1469 LLVMValueRef selmaskhi;
1470 indices = LLVMBuildAnd(builder, code, const2, "");
1471 selmaskhi = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
1472 indices, const2);
1473 rgba23 = lp_build_select(&bld32, selmasklo, color3, color2);
1474 rgba = lp_build_select(&bld32, selmaskhi, rgba23, rgba01);
1475
1476 /*
1477 * Note that this will give "wrong" order.
1478 * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
1479 * This would be easily fixable by using different shuffle, bitlo/hi
1480 * vectors above (and different shift), but seems slightly easier to
1481 * deal with for dxt3/dxt5 alpha too. So instead change lookup.
1482 */
1483 col[i] = rgba;
1484 code = LLVMBuildLShr(builder, code, const2, "");
1485 }
1486 }
1487 }
1488
1489 /*
1490 * decode one dxt3 block.
1491 */
1492 static void
s3tc_decode_block_dxt3(struct gallivm_state * gallivm,enum pipe_format format,LLVMValueRef dxt_block,LLVMValueRef * col)1493 s3tc_decode_block_dxt3(struct gallivm_state *gallivm,
1494 enum pipe_format format,
1495 LLVMValueRef dxt_block,
1496 LLVMValueRef *col)
1497 {
1498 LLVMBuilderRef builder = gallivm->builder;
1499 LLVMValueRef alpha, alphas0, alphas1, shift4_16, a[4], mask8hi;
1500 struct lp_type type32, type8, type16;
1501 unsigned i;
1502
1503 memset(&type32, 0, sizeof type32);
1504 type32.width = 32;
1505 type32.length = 4;
1506
1507 memset(&type8, 0, sizeof type8);
1508 type8.width = 8;
1509 type8.length = 16;
1510
1511 memset(&type16, 0, sizeof type16);
1512 type16.width = 16;
1513 type16.length = 8;
1514
1515 s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1516
1517 shift4_16 = lp_build_const_int_vec(gallivm, type16, 4);
1518 mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1519
1520 alpha = LLVMBuildBitCast(builder, dxt_block,
1521 lp_build_vec_type(gallivm, type8), "");
1522 alpha = lp_build_interleave2(gallivm, type8, alpha, alpha, 0);
1523 alpha = LLVMBuildBitCast(builder, alpha,
1524 lp_build_vec_type(gallivm, type16), "");
1525 alpha = LLVMBuildAnd(builder, alpha,
1526 lp_build_const_int_vec(gallivm, type16, 0xf00f), "");
1527 alphas0 = LLVMBuildLShr(builder, alpha, shift4_16, "");
1528 alphas1 = LLVMBuildShl(builder, alpha, shift4_16, "");
1529 alpha = LLVMBuildOr(builder, alphas0, alpha, "");
1530 alpha = LLVMBuildOr(builder, alphas1, alpha, "");
1531 alpha = LLVMBuildBitCast(builder, alpha,
1532 lp_build_vec_type(gallivm, type32), "");
1533 /*
1534 * alpha now contains elems 0,1,2,3,... (ubytes)
1535 * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which
1536 * is just as easy as "natural" order - 3 shift/and instead of 6 unpack).
1537 */
1538 a[0] = LLVMBuildShl(builder, alpha,
1539 lp_build_const_int_vec(gallivm, type32, 24), "");
1540 a[1] = LLVMBuildShl(builder, alpha,
1541 lp_build_const_int_vec(gallivm, type32, 16), "");
1542 a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1543 a[2] = LLVMBuildShl(builder, alpha,
1544 lp_build_const_int_vec(gallivm, type32, 8), "");
1545 a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1546 a[3] = LLVMBuildAnd(builder, alpha, mask8hi, "");
1547
1548 for (i = 0; i < 4; i++) {
1549 col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1550 }
1551 }
1552
1553
1554 static LLVMValueRef
lp_build_lerpdxta_block(struct gallivm_state * gallivm,LLVMValueRef alpha0,LLVMValueRef alpha1,LLVMValueRef code,LLVMValueRef sel_mask)1555 lp_build_lerpdxta_block(struct gallivm_state *gallivm,
1556 LLVMValueRef alpha0,
1557 LLVMValueRef alpha1,
1558 LLVMValueRef code,
1559 LLVMValueRef sel_mask)
1560 {
1561 LLVMBuilderRef builder = gallivm->builder;
1562 LLVMValueRef delta, ainterp;
1563 LLVMValueRef weight5, weight7, weight;
1564 struct lp_type type16;
1565 struct lp_build_context bld;
1566
1567 memset(&type16, 0, sizeof type16);
1568 type16.width = 16;
1569 type16.length = 8;
1570 type16.sign = true;
1571
1572 lp_build_context_init(&bld, gallivm, type16);
1573 /*
1574 * 256/7 is only 36.57 so we'd lose quite some precision. Since it would
1575 * actually be desirable to do this here with even higher accuracy than
1576 * even 8 bit (more or less required for rgtc, albeit that's not handled
1577 * here right now), shift the weights after multiplication by code.
1578 */
1579 weight5 = lp_build_const_int_vec(gallivm, type16, 256*64/5);
1580 weight7 = lp_build_const_int_vec(gallivm, type16, 256*64/7);
1581 weight = lp_build_select(&bld, sel_mask, weight7, weight5);
1582
1583 /*
1584 * we'll get garbage in the elements which had code 0 (or larger than
1585 * 5 or 7) but we don't care (or rather, need to fix up anyway).
1586 */
1587 code = LLVMBuildSub(builder, code, bld.one, "");
1588
1589 weight = LLVMBuildMul(builder, weight, code, "");
1590 weight = LLVMBuildLShr(builder, weight,
1591 lp_build_const_int_vec(gallivm, type16, 6), "");
1592
1593 delta = LLVMBuildSub(builder, alpha1, alpha0, "");
1594
1595 ainterp = LLVMBuildMul(builder, delta, weight, "");
1596 ainterp = LLVMBuildLShr(builder, ainterp,
1597 lp_build_const_int_vec(gallivm, type16, 8), "");
1598
1599 /* lerp is done later (with packed values) */
1600
1601 return ainterp;
1602 }
1603
1604
1605 /*
1606 * decode one dxt5 block.
1607 */
1608 static void
s3tc_decode_block_dxt5(struct gallivm_state * gallivm,enum pipe_format format,LLVMValueRef dxt_block,LLVMValueRef * col)1609 s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
1610 enum pipe_format format,
1611 LLVMValueRef dxt_block,
1612 LLVMValueRef *col)
1613 {
1614 LLVMBuilderRef builder = gallivm->builder;
1615 LLVMValueRef alpha, alpha0, alpha1, ares;
1616 LLVMValueRef ainterp, ainterp0, ainterp1, shuffle1, sel_mask, sel_mask2;
1617 LLVMValueRef a[4], acode, tmp0, tmp1;
1618 LLVMTypeRef i64t, i32t;
1619 struct lp_type type32, type64, type8, type16;
1620 struct lp_build_context bld16, bld8;
1621 unsigned i;
1622
1623 memset(&type32, 0, sizeof type32);
1624 type32.width = 32;
1625 type32.length = 4;
1626
1627 memset(&type64, 0, sizeof type64);
1628 type64.width = 64;
1629 type64.length = 2;
1630
1631 memset(&type8, 0, sizeof type8);
1632 type8.width = 8;
1633 type8.length = 16;
1634
1635 memset(&type16, 0, sizeof type16);
1636 type16.width = 16;
1637 type16.length = 8;
1638
1639 lp_build_context_init(&bld16, gallivm, type16);
1640 lp_build_context_init(&bld8, gallivm, type8);
1641
1642 i64t = lp_build_vec_type(gallivm, type64);
1643 i32t = lp_build_vec_type(gallivm, type32);
1644
1645 s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1646
1647 /*
1648 * three possible strategies for vectorizing alpha:
1649 * 1) compute all 8 values then use scalar extraction
1650 * (i.e. have all 8 alpha values packed in one 64bit scalar
1651 * and do something like ax = vals >> (codex * 8) followed
1652 * by inserting these values back into color)
1653 * 2) same as 8 but just use pshufb as a mini-LUT for selection.
1654 * (without pshufb would need boatloads of cmp/selects trying to
1655 * keep things vectorized for essentially scalar selection).
1656 * 3) do something similar to the uncached case
1657 * needs more calculations (need to calc 16 values instead of 8 though
1658 * that's only an issue for the lerp which we need to do twice otherwise
1659 * everything still fits into 128bit) but keeps things vectorized mostly.
1660 * Trying 3) here though not sure it's really faster...
1661 * With pshufb, we try 2) (cheaper and more accurate)
1662 */
1663
1664 /*
1665 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1666 * help since code crosses 8bit boundaries). But variable shifts are
1667 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1668 * shifts!). Instead, emulate by 16bit muls.
1669 * Also, the required byte shuffles are essentially non-emulatable, so
1670 * require ssse3 (albeit other archs might do them fine).
1671 * This is not directly tied to ssse3 - just need sane byte shuffles.
1672 * But ordering is going to be different below so use same condition.
1673 */
1674
1675
1676 /* vectorize alpha */
1677 alpha = LLVMBuildBitCast(builder, dxt_block, i64t, "");
1678 alpha0 = LLVMBuildAnd(builder, alpha,
1679 lp_build_const_int_vec(gallivm, type64, 0xff), "");
1680 alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
1681 alpha = LLVMBuildBitCast(builder, alpha, bld16.vec_type, "");
1682 alpha1 = LLVMBuildLShr(builder, alpha,
1683 lp_build_const_int_vec(gallivm, type16, 8), "");
1684 alpha = LLVMBuildBitCast(builder, alpha, i64t, "");
1685 shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
1686 alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
1687 alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
1688
1689 type16.sign = true;
1690 sel_mask = lp_build_compare(gallivm, type16, PIPE_FUNC_GREATER,
1691 alpha0, alpha1);
1692 type16.sign = false;
1693 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1694
1695 if (!util_get_cpu_caps()->has_ssse3) {
1696 LLVMValueRef acodeg, mask1, acode0, acode1;
1697
1698 /* extraction of the 3 bit values into something more useful is HARD */
1699 /* first steps are actually scalar */
1700 acode = LLVMBuildLShr(builder, alpha,
1701 lp_build_const_int_vec(gallivm, type64, 16), "");
1702 tmp0 = LLVMBuildAnd(builder, acode,
1703 lp_build_const_int_vec(gallivm, type64, 0xffffff), "");
1704 tmp1 = LLVMBuildLShr(builder, acode,
1705 lp_build_const_int_vec(gallivm, type64, 24), "");
1706 tmp0 = LLVMBuildBitCast(builder, tmp0, i32t, "");
1707 tmp1 = LLVMBuildBitCast(builder, tmp1, i32t, "");
1708 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1709 /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
1710 tmp0 = LLVMBuildAnd(builder, acode,
1711 lp_build_const_int_vec(gallivm, type32, 0xfff), "");
1712 tmp1 = LLVMBuildLShr(builder, acode,
1713 lp_build_const_int_vec(gallivm, type32, 12), "");
1714 acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1715 /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
1716 tmp0 = LLVMBuildAnd(builder, acode,
1717 lp_build_const_int_vec(gallivm, type32, 0x3f), "");
1718 tmp1 = LLVMBuildLShr(builder, acode,
1719 lp_build_const_int_vec(gallivm, type32, 6), "");
1720 /* use signed pack doesn't matter and otherwise need sse41 */
1721 type32.sign = type16.sign = true;
1722 acode = lp_build_pack2(gallivm, type32, type16, tmp0, tmp1);
1723 type32.sign = type16.sign = false;
1724 /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
1725 acode0 = LLVMBuildAnd(builder, acode,
1726 lp_build_const_int_vec(gallivm, type16, 0x7), "");
1727 acode1 = LLVMBuildLShr(builder, acode,
1728 lp_build_const_int_vec(gallivm, type16, 3), "");
1729 acode = lp_build_pack2(gallivm, type16, type8, acode0, acode1);
1730 /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
1731
1732 acodeg = LLVMBuildAnd(builder, acode,
1733 LLVMBuildNot(builder, sel_mask, ""), "");
1734 mask1 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1735 acode, bld8.one);
1736
1737 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
1738 ainterp0 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode0, sel_mask);
1739 ainterp1 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode1, sel_mask);
1740 sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1741 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp0, ainterp1);
1742 alpha0 = lp_build_pack2(gallivm, type16, type8, alpha0, alpha0);
1743 alpha1 = lp_build_pack2(gallivm, type16, type8, alpha1, alpha1);
1744 ainterp = LLVMBuildAdd(builder, ainterp, alpha0, "");
1745 /* Fix up val01 */
1746 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1747 acode, bld8.zero);
1748 ainterp = lp_build_select(&bld8, sel_mask2, alpha0, ainterp);
1749 ainterp = lp_build_select(&bld8, mask1, alpha1, ainterp);
1750
1751 /* fix up val67 if a0 <= a1 */
1752 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1753 acodeg, lp_build_const_int_vec(gallivm, type8, 6));
1754 ares = LLVMBuildAnd(builder, ainterp, LLVMBuildNot(builder, sel_mask2, ""), "");
1755 sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1756 acodeg, lp_build_const_int_vec(gallivm, type8, 7));
1757 ares = LLVMBuildOr(builder, ares, sel_mask2, "");
1758
1759 /* unpack in right order (0,4,8,12,1,5,..) */
1760 /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
1761 tmp0 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 0);
1762 tmp1 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 1);
1763 tmp0 = LLVMBuildBitCast(builder, tmp0, bld16.vec_type, "");
1764 tmp1 = LLVMBuildBitCast(builder, tmp1, bld16.vec_type, "");
1765
1766 a[0] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 0);
1767 a[1] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 0);
1768 a[2] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 1);
1769 a[3] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 1);
1770 }
1771 else {
1772 LLVMValueRef elems[16], intrargs[2], shufa, mulclo, mulchi, mask8hi;
1773 LLVMTypeRef type16s = LLVMInt16TypeInContext(gallivm->context);
1774 LLVMTypeRef type8s = LLVMInt8TypeInContext(gallivm->context);
1775 unsigned i, j;
1776 /*
1777 * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1778 * help since code crosses 8bit boundaries). But variable shifts are
1779 * AVX2 only, and even then only dword/quadword (intel _really_ hates
1780 * shifts!). Instead, emulate by 16bit muls.
1781 * Also, the required byte shuffles are essentially non-emulatable, so
1782 * require ssse3 (albeit other archs might do them fine, but the
1783 * complete path is ssse3 only for now).
1784 */
1785 for (i = 0, j = 0; i < 16; i += 8, j += 3) {
1786 elems[i+0] = elems[i+1] = elems[i+2] = lp_build_const_int32(gallivm, j+2);
1787 elems[i+3] = elems[i+4] = lp_build_const_int32(gallivm, j+3);
1788 elems[i+5] = elems[i+6] = elems[i+7] = lp_build_const_int32(gallivm, j+4);
1789 }
1790 shufa = LLVMConstVector(elems, 16);
1791 alpha = LLVMBuildBitCast(builder, alpha, bld8.vec_type, "");
1792 acode = LLVMBuildShuffleVector(builder, alpha, bld8.undef, shufa, "");
1793 acode = LLVMBuildBitCast(builder, acode, bld16.vec_type, "");
1794 /*
1795 * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
1796 * Do the same for 1/3/5/7 (albeit still need mask there - ideally
1797 * we'd place them into bits 4-7 so could save shift but impossible.)
1798 */
1799 for (i = 0; i < 8; i += 4) {
1800 elems[i+0] = LLVMConstInt(type16s, 1 << (13-0), 0);
1801 elems[i+1] = LLVMConstInt(type16s, 1 << (13-6), 0);
1802 elems[i+2] = LLVMConstInt(type16s, 1 << (13-4), 0);
1803 elems[i+3] = LLVMConstInt(type16s, 1 << (13-2), 0);
1804 }
1805 mulclo = LLVMConstVector(elems, 8);
1806 for (i = 0; i < 8; i += 4) {
1807 elems[i+0] = LLVMConstInt(type16s, 1 << (13-3), 0);
1808 elems[i+1] = LLVMConstInt(type16s, 1 << (13-9), 0);
1809 elems[i+2] = LLVMConstInt(type16s, 1 << (13-7), 0);
1810 elems[i+3] = LLVMConstInt(type16s, 1 << (13-5), 0);
1811 }
1812 mulchi = LLVMConstVector(elems, 8);
1813
1814 tmp0 = LLVMBuildMul(builder, acode, mulclo, "");
1815 tmp1 = LLVMBuildMul(builder, acode, mulchi, "");
1816 tmp0 = LLVMBuildLShr(builder, tmp0,
1817 lp_build_const_int_vec(gallivm, type16, 13), "");
1818 tmp1 = LLVMBuildLShr(builder, tmp1,
1819 lp_build_const_int_vec(gallivm, type16, 5), "");
1820 tmp1 = LLVMBuildAnd(builder, tmp1,
1821 lp_build_const_int_vec(gallivm, type16, 0x700), "");
1822 acode = LLVMBuildOr(builder, tmp0, tmp1, "");
1823 acode = LLVMBuildBitCast(builder, acode, bld8.vec_type, "");
1824
1825 /*
1826 * Note that ordering is different here to non-ssse3 path:
1827 * 0/1/2/3/4/5...
1828 */
1829
1830 LLVMValueRef weight0, weight1, weight, delta;
1831 LLVMValueRef constff_elem7, const0_elem6;
1832 /* weights, correctly rounded (round(256*x/7)) */
1833 elems[0] = LLVMConstInt(type16s, 256, 0);
1834 elems[1] = LLVMConstInt(type16s, 0, 0);
1835 elems[2] = LLVMConstInt(type16s, 219, 0);
1836 elems[3] = LLVMConstInt(type16s, 183, 0);
1837 elems[4] = LLVMConstInt(type16s, 146, 0);
1838 elems[5] = LLVMConstInt(type16s, 110, 0);
1839 elems[6] = LLVMConstInt(type16s, 73, 0);
1840 elems[7] = LLVMConstInt(type16s, 37, 0);
1841 weight0 = LLVMConstVector(elems, 8);
1842
1843 elems[0] = LLVMConstInt(type16s, 256, 0);
1844 elems[1] = LLVMConstInt(type16s, 0, 0);
1845 elems[2] = LLVMConstInt(type16s, 205, 0);
1846 elems[3] = LLVMConstInt(type16s, 154, 0);
1847 elems[4] = LLVMConstInt(type16s, 102, 0);
1848 elems[5] = LLVMConstInt(type16s, 51, 0);
1849 elems[6] = LLVMConstInt(type16s, 0, 0);
1850 elems[7] = LLVMConstInt(type16s, 0, 0);
1851 weight1 = LLVMConstVector(elems, 8);
1852
1853 weight0 = LLVMBuildBitCast(builder, weight0, bld8.vec_type, "");
1854 weight1 = LLVMBuildBitCast(builder, weight1, bld8.vec_type, "");
1855 weight = lp_build_select(&bld8, sel_mask, weight0, weight1);
1856 weight = LLVMBuildBitCast(builder, weight, bld16.vec_type, "");
1857
1858 for (i = 0; i < 16; i++) {
1859 elems[i] = LLVMConstNull(type8s);
1860 }
1861 elems[7] = LLVMConstInt(type8s, 255, 0);
1862 constff_elem7 = LLVMConstVector(elems, 16);
1863
1864 for (i = 0; i < 16; i++) {
1865 elems[i] = LLVMConstInt(type8s, 255, 0);
1866 }
1867 elems[6] = LLVMConstInt(type8s, 0, 0);
1868 const0_elem6 = LLVMConstVector(elems, 16);
1869
1870 /* standard simple lerp - but the version we need isn't available */
1871 delta = LLVMBuildSub(builder, alpha0, alpha1, "");
1872 ainterp = LLVMBuildMul(builder, delta, weight, "");
1873 ainterp = LLVMBuildLShr(builder, ainterp,
1874 lp_build_const_int_vec(gallivm, type16, 8), "");
1875 ainterp = LLVMBuildBitCast(builder, ainterp, bld8.vec_type, "");
1876 alpha1 = LLVMBuildBitCast(builder, alpha1, bld8.vec_type, "");
1877 ainterp = LLVMBuildAdd(builder, ainterp, alpha1, "");
1878 ainterp = LLVMBuildBitCast(builder, ainterp, bld16.vec_type, "");
1879 ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef);
1880
1881 /* fixing 0/0xff case is slightly more complex */
1882 constff_elem7 = LLVMBuildAnd(builder, constff_elem7,
1883 LLVMBuildNot(builder, sel_mask, ""), "");
1884 const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, "");
1885 ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, "");
1886 ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, "");
1887
1888 /* now pick all 16 elements at once! */
1889 intrargs[0] = ainterp;
1890 intrargs[1] = acode;
1891 ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1892 bld8.vec_type, intrargs, 2, 0);
1893
1894 ares = LLVMBuildBitCast(builder, ares, i32t, "");
1895 mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1896 a[0] = LLVMBuildShl(builder, ares,
1897 lp_build_const_int_vec(gallivm, type32, 24), "");
1898 a[1] = LLVMBuildShl(builder, ares,
1899 lp_build_const_int_vec(gallivm, type32, 16), "");
1900 a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1901 a[2] = LLVMBuildShl(builder, ares,
1902 lp_build_const_int_vec(gallivm, type32, 8), "");
1903 a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1904 a[3] = LLVMBuildAnd(builder, ares, mask8hi, "");
1905 }
1906
1907 for (i = 0; i < 4; i++) {
1908 a[i] = LLVMBuildBitCast(builder, a[i], i32t, "");
1909 col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1910 }
1911 }
1912
1913
1914 static void
generate_update_cache_one_block(struct gallivm_state * gallivm,LLVMValueRef function,const struct util_format_description * format_desc)1915 generate_update_cache_one_block(struct gallivm_state *gallivm,
1916 LLVMValueRef function,
1917 const struct util_format_description *format_desc)
1918 {
1919 LLVMBasicBlockRef block;
1920 LLVMBuilderRef old_builder;
1921 LLVMValueRef ptr_addr;
1922 LLVMValueRef hash_index;
1923 LLVMValueRef cache;
1924 LLVMValueRef dxt_block, tag_value;
1925 LLVMValueRef col[LP_MAX_VECTOR_LENGTH];
1926
1927 ptr_addr = LLVMGetParam(function, 0);
1928 hash_index = LLVMGetParam(function, 1);
1929 cache = LLVMGetParam(function, 2);
1930
1931 lp_build_name(ptr_addr, "ptr_addr" );
1932 lp_build_name(hash_index, "hash_index");
1933 lp_build_name(cache, "cache_addr");
1934
1935 /*
1936 * Function body
1937 */
1938
1939 old_builder = gallivm->builder;
1940 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
1941 gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
1942 LLVMPositionBuilderAtEnd(gallivm->builder, block);
1943
1944 lp_build_gather_s3tc_simple_scalar(gallivm, format_desc, &dxt_block,
1945 ptr_addr);
1946
1947 switch (format_desc->format) {
1948 case PIPE_FORMAT_DXT1_RGB:
1949 case PIPE_FORMAT_DXT1_RGBA:
1950 case PIPE_FORMAT_DXT1_SRGB:
1951 case PIPE_FORMAT_DXT1_SRGBA:
1952 s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1953 break;
1954 case PIPE_FORMAT_DXT3_RGBA:
1955 case PIPE_FORMAT_DXT3_SRGBA:
1956 s3tc_decode_block_dxt3(gallivm, format_desc->format, dxt_block, col);
1957 break;
1958 case PIPE_FORMAT_DXT5_RGBA:
1959 case PIPE_FORMAT_DXT5_SRGBA:
1960 s3tc_decode_block_dxt5(gallivm, format_desc->format, dxt_block, col);
1961 break;
1962 default:
1963 assert(0);
1964 s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1965 break;
1966 }
1967
1968 tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
1969 LLVMInt64TypeInContext(gallivm->context), "");
1970 s3tc_store_cached_block(gallivm, col, tag_value, hash_index, cache);
1971
1972 LLVMBuildRetVoid(gallivm->builder);
1973
1974 LLVMDisposeBuilder(gallivm->builder);
1975 gallivm->builder = old_builder;
1976
1977 gallivm_verify_function(gallivm, function);
1978 }
1979
1980
1981 static void
update_cached_block(struct gallivm_state * gallivm,const struct util_format_description * format_desc,LLVMValueRef ptr_addr,LLVMValueRef hash_index,LLVMValueRef cache)1982 update_cached_block(struct gallivm_state *gallivm,
1983 const struct util_format_description *format_desc,
1984 LLVMValueRef ptr_addr,
1985 LLVMValueRef hash_index,
1986 LLVMValueRef cache)
1987
1988 {
1989 LLVMBuilderRef builder = gallivm->builder;
1990 LLVMModuleRef module = gallivm->module;
1991 char name[256];
1992 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
1993 LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
1994 LLVMValueRef function, inst;
1995 LLVMBasicBlockRef bb;
1996 LLVMValueRef args[3];
1997
1998 snprintf(name, sizeof name, "%s_update_cache_one_block",
1999 format_desc->short_name);
2000 function = LLVMGetNamedFunction(module, name);
2001
2002 LLVMTypeRef ret_type = LLVMVoidTypeInContext(gallivm->context);
2003 LLVMTypeRef arg_types[3];
2004 arg_types[0] = pi8t;
2005 arg_types[1] = LLVMInt32TypeInContext(gallivm->context);
2006 arg_types[2] = LLVMTypeOf(cache); // XXX: put right type here
2007 LLVMTypeRef function_type = LLVMFunctionType(ret_type, arg_types, ARRAY_SIZE(arg_types), 0);
2008
2009 if (!function) {
2010 function = LLVMAddFunction(module, name, function_type);
2011
2012 for (unsigned arg = 0; arg < ARRAY_SIZE(arg_types); ++arg)
2013 if (LLVMGetTypeKind(arg_types[arg]) == LLVMPointerTypeKind)
2014 lp_add_function_attr(function, arg + 1, LP_FUNC_ATTR_NOALIAS);
2015
2016 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
2017 LLVMSetVisibility(function, LLVMHiddenVisibility);
2018 generate_update_cache_one_block(gallivm, function, format_desc);
2019 }
2020
2021 args[0] = ptr_addr;
2022 args[1] = hash_index;
2023 args[2] = cache;
2024
2025 LLVMBuildCall2(builder, function_type, function, args, ARRAY_SIZE(args), "");
2026 bb = LLVMGetInsertBlock(builder);
2027 inst = LLVMGetLastInstruction(bb);
2028 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
2029 }
2030
2031 /*
2032 * cached lookup
2033 */
2034 static LLVMValueRef
compressed_fetch_cached(struct gallivm_state * gallivm,const struct util_format_description * format_desc,unsigned n,LLVMValueRef base_ptr,LLVMValueRef offset,LLVMValueRef i,LLVMValueRef j,LLVMValueRef cache)2035 compressed_fetch_cached(struct gallivm_state *gallivm,
2036 const struct util_format_description *format_desc,
2037 unsigned n,
2038 LLVMValueRef base_ptr,
2039 LLVMValueRef offset,
2040 LLVMValueRef i,
2041 LLVMValueRef j,
2042 LLVMValueRef cache)
2043
2044 {
2045 LLVMBuilderRef builder = gallivm->builder;
2046 unsigned count, low_bit, log2size;
2047 LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
2048 LLVMValueRef ij_index, hash_index, hash_mask, block_index;
2049 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2050 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2051 LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
2052 struct lp_type type;
2053 struct lp_build_context bld32;
2054 memset(&type, 0, sizeof type);
2055 type.width = 32;
2056 type.length = n;
2057
2058 lp_build_context_init(&bld32, gallivm, type);
2059
2060 /*
2061 * compute hash - we use direct mapped cache, the hash function could
2062 * be better but it needs to be simple
2063 * per-element:
2064 * compare offset with offset stored at tag (hash)
2065 * if not equal extract block, store block, update tag
2066 * extract color from cache
2067 * assemble colors
2068 */
2069
2070 low_bit = util_logbase2(format_desc->block.bits / 8);
2071 log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
2072 addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
2073 ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
2074 ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
2075 /* For the hash function, first mask off the unused lowest bits. Then just
2076 do some xor with address bits - only use lower 32bits */
2077 ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
2078 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2079 lp_build_const_int_vec(gallivm, type, low_bit), "");
2080 /* This only really makes sense for size 64,128,256 */
2081 hash_index = ptr_addrtrunc;
2082 ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2083 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
2084 hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
2085 tmp = LLVMBuildLShr(builder, hash_index,
2086 lp_build_const_int_vec(gallivm, type, log2size), "");
2087 hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
2088
2089 hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
2090 hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
2091 ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
2092 ij_index = LLVMBuildAdd(builder, ij_index, j, "");
2093 block_index = LLVMBuildShl(builder, hash_index,
2094 lp_build_const_int_vec(gallivm, type, 4), "");
2095 block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
2096
2097 if (n > 1) {
2098 color = bld32.undef;
2099 for (count = 0; count < n; count++) {
2100 LLVMValueRef index, cond, colorx;
2101 LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
2102 struct lp_build_if_state if_ctx;
2103
2104 index = lp_build_const_int32(gallivm, count);
2105 offsetx = LLVMBuildExtractElement(builder, offset, index, "");
2106 addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
2107 addrx = LLVMBuildAdd(builder, addrx, addr, "");
2108 block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
2109 hash_indexx = LLVMBuildLShr(builder, block_indexx,
2110 lp_build_const_int32(gallivm, 4), "");
2111 offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_indexx);
2112 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
2113
2114 lp_build_if(&if_ctx, gallivm, cond);
2115 {
2116 ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
2117 LLVMPointerType(i8t, 0), "");
2118 update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
2119 #if LP_BUILD_FORMAT_CACHE_DEBUG
2120 s3tc_update_cache_access(gallivm, cache, 1,
2121 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2122 #endif
2123 }
2124 lp_build_endif(&if_ctx);
2125
2126 colorx = s3tc_lookup_cached_pixel(gallivm, cache, block_indexx);
2127
2128 color = LLVMBuildInsertElement(builder, color, colorx,
2129 lp_build_const_int32(gallivm, count), "");
2130 }
2131 }
2132 else {
2133 LLVMValueRef cond;
2134 struct lp_build_if_state if_ctx;
2135
2136 tmp = LLVMBuildZExt(builder, offset, i64t, "");
2137 addr = LLVMBuildAdd(builder, tmp, addr, "");
2138 offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_index);
2139 cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
2140
2141 lp_build_if(&if_ctx, gallivm, cond);
2142 {
2143 tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
2144 update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
2145 #if LP_BUILD_FORMAT_CACHE_DEBUG
2146 s3tc_update_cache_access(gallivm, cache, 1,
2147 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2148 #endif
2149 }
2150 lp_build_endif(&if_ctx);
2151
2152 color = s3tc_lookup_cached_pixel(gallivm, cache, block_index);
2153 }
2154 #if LP_BUILD_FORMAT_CACHE_DEBUG
2155 s3tc_update_cache_access(gallivm, cache, n,
2156 LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
2157 #endif
2158 return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
2159 }
2160
2161
2162 static LLVMValueRef
s3tc_dxt5_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef colors,LLVMValueRef codewords,LLVMValueRef alpha_lo,LLVMValueRef alpha_hi,LLVMValueRef i,LLVMValueRef j)2163 s3tc_dxt5_to_rgba_aos(struct gallivm_state *gallivm,
2164 unsigned n,
2165 enum pipe_format format,
2166 LLVMValueRef colors,
2167 LLVMValueRef codewords,
2168 LLVMValueRef alpha_lo,
2169 LLVMValueRef alpha_hi,
2170 LLVMValueRef i,
2171 LLVMValueRef j)
2172 {
2173 return s3tc_dxt5_full_to_rgba_aos(gallivm, n, format, colors,
2174 codewords, alpha_lo, alpha_hi, i, j);
2175 }
2176
2177
2178 /**
2179 * @param n number of pixels processed (usually n=4, but it should also work with n=1
2180 * and multiples of 4)
2181 * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
2182 * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2183 * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
2184 * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
2185 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
2186 */
2187 LLVMValueRef
lp_build_fetch_s3tc_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * format_desc,unsigned n,LLVMValueRef base_ptr,LLVMValueRef offset,LLVMValueRef i,LLVMValueRef j,LLVMValueRef cache)2188 lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
2189 const struct util_format_description *format_desc,
2190 unsigned n,
2191 LLVMValueRef base_ptr,
2192 LLVMValueRef offset,
2193 LLVMValueRef i,
2194 LLVMValueRef j,
2195 LLVMValueRef cache)
2196 {
2197 LLVMValueRef rgba;
2198 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2199 LLVMBuilderRef builder = gallivm->builder;
2200
2201 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
2202 assert(format_desc->block.width == 4);
2203 assert(format_desc->block.height == 4);
2204
2205 assert((n == 1) || (n % 4 == 0));
2206
2207 /* debug_printf("format = %d\n", format_desc->format);*/
2208 if (cache) {
2209 rgba = compressed_fetch_cached(gallivm, format_desc, n,
2210 base_ptr, offset, i, j, cache);
2211 return rgba;
2212 }
2213
2214 /*
2215 * Could use n > 8 here with avx2, but doesn't seem faster.
2216 */
2217 if (n > 4) {
2218 unsigned count;
2219 LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
2220 LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
2221 LLVMTypeRef i128_vectype = LLVMVectorType(i128_type, n / 4);
2222 LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
2223 gallivm->context), 4);
2224 LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
2225 struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
2226
2227 assert(n / 4 <= ARRAY_SIZE(rgba4));
2228
2229 rgba = LLVMGetUndef(i128_vectype);
2230
2231 for (count = 0; count < n / 4; count++) {
2232 LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2233
2234 i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
2235 j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
2236 offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
2237
2238 lp_build_gather_s3tc(gallivm, 4, format_desc, &colors, &codewords,
2239 &alpha_lo, &alpha_hi, base_ptr, offset4);
2240
2241 switch (format_desc->format) {
2242 case PIPE_FORMAT_DXT1_RGB:
2243 case PIPE_FORMAT_DXT1_RGBA:
2244 case PIPE_FORMAT_DXT1_SRGB:
2245 case PIPE_FORMAT_DXT1_SRGBA:
2246 rgba4[count] = s3tc_dxt1_to_rgba_aos(gallivm, 4, format_desc->format,
2247 colors, codewords, i4, j4);
2248 break;
2249 case PIPE_FORMAT_DXT3_RGBA:
2250 case PIPE_FORMAT_DXT3_SRGBA:
2251 rgba4[count] = s3tc_dxt3_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2252 codewords, alpha_lo, alpha_hi, i4, j4);
2253 break;
2254 case PIPE_FORMAT_DXT5_RGBA:
2255 case PIPE_FORMAT_DXT5_SRGBA:
2256 rgba4[count] = s3tc_dxt5_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2257 codewords, alpha_lo, alpha_hi, i4, j4);
2258 break;
2259 default:
2260 assert(0);
2261 rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
2262 break;
2263 }
2264 /* shuffles typically give best results with dword elements...*/
2265 rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
2266 }
2267 rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
2268 rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
2269 }
2270 else {
2271 LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2272
2273 lp_build_gather_s3tc(gallivm, n, format_desc, &colors, &codewords,
2274 &alpha_lo, &alpha_hi, base_ptr, offset);
2275
2276 switch (format_desc->format) {
2277 case PIPE_FORMAT_DXT1_RGB:
2278 case PIPE_FORMAT_DXT1_RGBA:
2279 case PIPE_FORMAT_DXT1_SRGB:
2280 case PIPE_FORMAT_DXT1_SRGBA:
2281 rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format_desc->format,
2282 colors, codewords, i, j);
2283 break;
2284 case PIPE_FORMAT_DXT3_RGBA:
2285 case PIPE_FORMAT_DXT3_SRGBA:
2286 rgba = s3tc_dxt3_to_rgba_aos(gallivm, n, format_desc->format, colors,
2287 codewords, alpha_lo, alpha_hi, i, j);
2288 break;
2289 case PIPE_FORMAT_DXT5_RGBA:
2290 case PIPE_FORMAT_DXT5_SRGBA:
2291 rgba = s3tc_dxt5_to_rgba_aos(gallivm, n, format_desc->format, colors,
2292 codewords, alpha_lo, alpha_hi, i, j);
2293 break;
2294 default:
2295 assert(0);
2296 rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
2297 break;
2298 }
2299 }
2300
2301 /* always return just decompressed values - srgb conversion is done later */
2302
2303 return rgba;
2304 }
2305
/**
 * Gather elements from scatter positions in memory into vectors.
 * This is customised for fetching texels from rgtc textures.
 * For SSE, typical value is length=4.
 *
 * @param length length of the offsets
 * @param red_lo used for storing the first 32bit of the gathered blocks.
 * @param red_hi used for storing the second 32bit of the gathered blocks.
 * @param green_lo used for storing the third 32bit of the gathered blocks
 *                 (128bit blocks only)
 * @param green_hi used for storing the fourth 32bit of the gathered blocks
 *                 (128bit blocks only)
 * @param base_ptr base pointer, should be a i8 pointer type.
 * @param offsets vector with offsets
 */
2319 static void
lp_build_gather_rgtc(struct gallivm_state * gallivm,unsigned length,const struct util_format_description * format_desc,LLVMValueRef * red_lo,LLVMValueRef * red_hi,LLVMValueRef * green_lo,LLVMValueRef * green_hi,LLVMValueRef base_ptr,LLVMValueRef offsets)2320 lp_build_gather_rgtc(struct gallivm_state *gallivm,
2321 unsigned length,
2322 const struct util_format_description *format_desc,
2323 LLVMValueRef *red_lo, LLVMValueRef *red_hi,
2324 LLVMValueRef *green_lo, LLVMValueRef *green_hi,
2325 LLVMValueRef base_ptr,
2326 LLVMValueRef offsets)
2327 {
2328 LLVMBuilderRef builder = gallivm->builder;
2329 unsigned block_bits = format_desc->block.bits;
2330 unsigned i;
2331 LLVMValueRef elems[8];
2332 LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
2333 LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
2334 LLVMTypeRef type32dxt;
2335 struct lp_type lp_type32dxt;
2336
2337 memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
2338 lp_type32dxt.width = 32;
2339 lp_type32dxt.length = block_bits / 32;
2340 type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
2341
2342 assert(block_bits == 64 || block_bits == 128);
2343 assert(length == 1 || length == 4 || length == 8);
2344
2345 for (i = 0; i < length; ++i) {
2346 elems[i] = lp_build_gather_elem(gallivm, length,
2347 block_bits, block_bits, true,
2348 base_ptr, offsets, i, false);
2349 elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
2350 }
2351 if (length == 1) {
2352 LLVMValueRef elem = elems[0];
2353
2354 *red_lo = LLVMBuildExtractElement(builder, elem,
2355 lp_build_const_int32(gallivm, 0), "");
2356 *red_hi = LLVMBuildExtractElement(builder, elem,
2357 lp_build_const_int32(gallivm, 1), "");
2358
2359 if (block_bits == 128) {
2360 *green_lo = LLVMBuildExtractElement(builder, elem,
2361 lp_build_const_int32(gallivm, 2), "");
2362 *green_hi = LLVMBuildExtractElement(builder, elem,
2363 lp_build_const_int32(gallivm, 3), "");
2364 } else {
2365 *green_lo = NULL;
2366 *green_hi = NULL;
2367 }
2368 } else {
2369 LLVMValueRef tmp[4];
2370 struct lp_type lp_type32, lp_type64;
2371 memset(&lp_type32, 0, sizeof lp_type32);
2372 lp_type32.width = 32;
2373 lp_type32.length = length;
2374 lp_type32.sign = lp_type32dxt.sign;
2375 memset(&lp_type64, 0, sizeof lp_type64);
2376 lp_type64.width = 64;
2377 lp_type64.length = length/2;
2378 if (block_bits == 128) {
2379 if (length == 8) {
2380 for (i = 0; i < 4; ++i) {
2381 tmp[0] = elems[i];
2382 tmp[1] = elems[i+4];
2383 elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
2384 }
2385 }
2386 lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
2387 *green_lo = tmp[2];
2388 *green_hi = tmp[3];
2389 *red_lo = tmp[0];
2390 *red_hi = tmp[1];
2391 } else {
2392 LLVMValueRef red01, red23;
2393 LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
2394 LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
2395
2396 for (i = 0; i < length; ++i) {
2397 /* no-op shuffle */
2398 elems[i] = LLVMBuildShuffleVector(builder, elems[i],
2399 LLVMGetUndef(type32dxt),
2400 lp_build_const_extend_shuffle(gallivm, 2, 4), "");
2401 }
2402 if (length == 8) {
2403 struct lp_type lp_type32_4 = {0};
2404 lp_type32_4.width = 32;
2405 lp_type32_4.length = 4;
2406 for (i = 0; i < 4; ++i) {
2407 tmp[0] = elems[i];
2408 tmp[1] = elems[i+4];
2409 elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
2410 }
2411 }
2412 red01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
2413 red23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
2414 red01 = LLVMBuildBitCast(builder, red01, type64_vec, "");
2415 red23 = LLVMBuildBitCast(builder, red23, type64_vec, "");
2416 *red_lo = lp_build_interleave2_half(gallivm, lp_type64, red01, red23, 0);
2417 *red_hi = lp_build_interleave2_half(gallivm, lp_type64, red01, red23, 1);
2418 *red_lo = LLVMBuildBitCast(builder, *red_lo, type32_vec, "");
2419 *red_hi = LLVMBuildBitCast(builder, *red_hi, type32_vec, "");
2420 *green_lo = NULL;
2421 *green_hi = NULL;
2422 }
2423 }
2424 }
2425
2426 static LLVMValueRef
rgtc1_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef red_lo,LLVMValueRef red_hi,LLVMValueRef i,LLVMValueRef j)2427 rgtc1_to_rgba_aos(struct gallivm_state *gallivm,
2428 unsigned n,
2429 enum pipe_format format,
2430 LLVMValueRef red_lo,
2431 LLVMValueRef red_hi,
2432 LLVMValueRef i,
2433 LLVMValueRef j)
2434 {
2435 LLVMBuilderRef builder = gallivm->builder;
2436 bool is_signed = (format == PIPE_FORMAT_RGTC1_SNORM);
2437 LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2438 LLVMValueRef rgba;
2439 struct lp_type type, type8;
2440 memset(&type, 0, sizeof type);
2441 type.width = 32;
2442 type.length = n;
2443 memset(&type8, 0, sizeof type8);
2444 type8.width = 8;
2445 type8.length = n*4;
2446 rgba = lp_build_const_int_vec(gallivm, type, is_signed ? (0x7f << 24) : (0xffu << 24));
2447 rgba = LLVMBuildOr(builder, rgba, red, "");
2448 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2449 }
2450
2451 static LLVMValueRef
rgtc2_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef red_lo,LLVMValueRef red_hi,LLVMValueRef green_lo,LLVMValueRef green_hi,LLVMValueRef i,LLVMValueRef j)2452 rgtc2_to_rgba_aos(struct gallivm_state *gallivm,
2453 unsigned n,
2454 enum pipe_format format,
2455 LLVMValueRef red_lo,
2456 LLVMValueRef red_hi,
2457 LLVMValueRef green_lo,
2458 LLVMValueRef green_hi,
2459 LLVMValueRef i,
2460 LLVMValueRef j)
2461 {
2462 LLVMBuilderRef builder = gallivm->builder;
2463 bool is_signed = (format == PIPE_FORMAT_RGTC2_SNORM);
2464 LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2465 LLVMValueRef green = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, green_hi, green_lo, i, j);
2466 LLVMValueRef rgba;
2467 struct lp_type type, type8;
2468 memset(&type, 0, sizeof type);
2469 type.width = 32;
2470 type.length = n;
2471 memset(&type8, 0, sizeof type8);
2472 type8.width = 8;
2473 type8.length = n*4;
2474 rgba = lp_build_const_int_vec(gallivm, type, is_signed ? (0x7f << 24) : (0xffu << 24));
2475 rgba = LLVMBuildOr(builder, rgba, red, "");
2476 green = LLVMBuildShl(builder, green, lp_build_const_int_vec(gallivm, type, 8), "");
2477 rgba = LLVMBuildOr(builder, rgba, green, "");
2478 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2479 }
2480
2481 static LLVMValueRef
latc1_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef red_lo,LLVMValueRef red_hi,LLVMValueRef i,LLVMValueRef j)2482 latc1_to_rgba_aos(struct gallivm_state *gallivm,
2483 unsigned n,
2484 enum pipe_format format,
2485 LLVMValueRef red_lo,
2486 LLVMValueRef red_hi,
2487 LLVMValueRef i,
2488 LLVMValueRef j)
2489 {
2490 LLVMBuilderRef builder = gallivm->builder;
2491 bool is_signed = (format == PIPE_FORMAT_LATC1_SNORM);
2492 LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2493 LLVMValueRef rgba, temp;
2494 struct lp_type type, type8;
2495 memset(&type, 0, sizeof type);
2496 type.width = 32;
2497 type.length = n;
2498 memset(&type8, 0, sizeof type8);
2499 type8.width = 8;
2500 type8.length = n*4;
2501 rgba = lp_build_const_int_vec(gallivm, type, is_signed ? (0x7f << 24) : (0xffu << 24));
2502 rgba = LLVMBuildOr(builder, rgba, red, "");
2503 temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 8), "");
2504 rgba = LLVMBuildOr(builder, rgba, temp, "");
2505 temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 16), "");
2506 rgba = LLVMBuildOr(builder, rgba, temp, "");
2507 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2508 }
2509
2510 static LLVMValueRef
latc2_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef red_lo,LLVMValueRef red_hi,LLVMValueRef green_lo,LLVMValueRef green_hi,LLVMValueRef i,LLVMValueRef j)2511 latc2_to_rgba_aos(struct gallivm_state *gallivm,
2512 unsigned n,
2513 enum pipe_format format,
2514 LLVMValueRef red_lo,
2515 LLVMValueRef red_hi,
2516 LLVMValueRef green_lo,
2517 LLVMValueRef green_hi,
2518 LLVMValueRef i,
2519 LLVMValueRef j)
2520 {
2521 LLVMBuilderRef builder = gallivm->builder;
2522 bool is_signed = (format == PIPE_FORMAT_LATC2_SNORM);
2523 LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2524 LLVMValueRef green = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, green_hi, green_lo, i, j);
2525 LLVMValueRef rgba, temp;
2526 struct lp_type type, type8;
2527 memset(&type, 0, sizeof type);
2528 type.width = 32;
2529 type.length = n;
2530 memset(&type8, 0, sizeof type8);
2531 type8.width = 8;
2532 type8.length = n*4;
2533
2534 temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 8), "");
2535 rgba = LLVMBuildOr(builder, red, temp, "");
2536 temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 16), "");
2537 rgba = LLVMBuildOr(builder, rgba, temp, "");
2538 temp = LLVMBuildShl(builder, green, lp_build_const_int_vec(gallivm, type, 24), "");
2539 rgba = LLVMBuildOr(builder, rgba, temp, "");
2540 return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2541 }
2542
2543 /**
2544 * @param n number of pixels processed (usually n=4, but it should also work with n=1
2545 * and multiples of 4)
2546 * @param base_ptr base pointer (32bit or 64bit pointer depending on the architecture)
2547 * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2548 * @param i is a <n x i32> vector with the x subpixel coordinate (0..3)
2549 * @param j is a <n x i32> vector with the y subpixel coordinate (0..3)
2550 * @return a <4*n x i8> vector with the pixel RGBA values in AoS
2551 */
2552 LLVMValueRef
lp_build_fetch_rgtc_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * format_desc,unsigned n,LLVMValueRef base_ptr,LLVMValueRef offset,LLVMValueRef i,LLVMValueRef j,LLVMValueRef cache)2553 lp_build_fetch_rgtc_rgba_aos(struct gallivm_state *gallivm,
2554 const struct util_format_description *format_desc,
2555 unsigned n,
2556 LLVMValueRef base_ptr,
2557 LLVMValueRef offset,
2558 LLVMValueRef i,
2559 LLVMValueRef j,
2560 LLVMValueRef cache)
2561 {
2562 LLVMValueRef rgba;
2563 LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2564 LLVMBuilderRef builder = gallivm->builder;
2565 LLVMValueRef red_lo, red_hi, green_lo, green_hi;
2566 assert(format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC);
2567 assert(format_desc->block.width == 4);
2568 assert(format_desc->block.height == 4);
2569
2570 assert((n == 1) || (n % 4 == 0));
2571
2572 if (n > 4) {
2573 unsigned count;
2574 LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
2575 LLVMTypeRef i128_vectype = LLVMVectorType(i128_type, n / 4);
2576 LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
2577 LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
2578 gallivm->context), 4);
2579 LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
2580 struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
2581
2582 rgba = LLVMGetUndef(i128_vectype);
2583
2584 for (count = 0; count < n / 4; count++) {
2585
2586 i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
2587 j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
2588 offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
2589
2590 lp_build_gather_rgtc(gallivm, 4, format_desc, &red_lo, &red_hi,
2591 &green_lo, &green_hi, base_ptr, offset4);
2592
2593 switch (format_desc->format) {
2594 case PIPE_FORMAT_RGTC1_UNORM:
2595 case PIPE_FORMAT_RGTC1_SNORM:
2596 rgba4[count] = rgtc1_to_rgba_aos(gallivm, 4, format_desc->format,
2597 red_lo, red_hi, i4, j4);
2598 break;
2599 case PIPE_FORMAT_RGTC2_UNORM:
2600 case PIPE_FORMAT_RGTC2_SNORM:
2601 rgba4[count] = rgtc2_to_rgba_aos(gallivm, 4, format_desc->format,
2602 red_lo, red_hi, green_lo, green_hi, i4, j4);
2603 break;
2604 case PIPE_FORMAT_LATC1_UNORM:
2605 case PIPE_FORMAT_LATC1_SNORM:
2606 rgba4[count] = latc1_to_rgba_aos(gallivm, 4, format_desc->format,
2607 red_lo, red_hi, i4, j4);
2608 break;
2609 case PIPE_FORMAT_LATC2_UNORM:
2610 case PIPE_FORMAT_LATC2_SNORM:
2611 rgba4[count] = latc2_to_rgba_aos(gallivm, 4, format_desc->format,
2612 red_lo, red_hi, green_lo, green_hi, i4, j4);
2613 break;
2614 default:
2615 assert(0);
2616 rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
2617 break;
2618 }
2619 /* shuffles typically give best results with dword elements...*/
2620 rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
2621 }
2622 rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
2623 rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
2624 } else {
2625 LLVMValueRef red_lo, red_hi, green_lo, green_hi;
2626
2627 lp_build_gather_rgtc(gallivm, n, format_desc, &red_lo, &red_hi,
2628 &green_lo, &green_hi, base_ptr, offset);
2629
2630 switch (format_desc->format) {
2631 case PIPE_FORMAT_RGTC1_UNORM:
2632 case PIPE_FORMAT_RGTC1_SNORM:
2633 rgba = rgtc1_to_rgba_aos(gallivm, n, format_desc->format,
2634 red_lo, red_hi, i, j);
2635 break;
2636 case PIPE_FORMAT_RGTC2_UNORM:
2637 case PIPE_FORMAT_RGTC2_SNORM:
2638 rgba = rgtc2_to_rgba_aos(gallivm, n, format_desc->format,
2639 red_lo, red_hi, green_lo, green_hi, i, j);
2640 break;
2641 case PIPE_FORMAT_LATC1_UNORM:
2642 case PIPE_FORMAT_LATC1_SNORM:
2643 rgba = latc1_to_rgba_aos(gallivm, n, format_desc->format,
2644 red_lo, red_hi, i, j);
2645 break;
2646 case PIPE_FORMAT_LATC2_UNORM:
2647 case PIPE_FORMAT_LATC2_SNORM:
2648 rgba = latc2_to_rgba_aos(gallivm, n, format_desc->format,
2649 red_lo, red_hi, green_lo, green_hi, i, j);
2650 break;
2651 default:
2652 assert(0);
2653 rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
2654 break;
2655 }
2656 }
2657 return rgba;
2658 }
2659