xref: /aosp_15_r20/external/mesa3d/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /**************************************************************************
2  *
3  * Copyright 2010-2018 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20  * USE OR OTHER DEALINGS IN THE SOFTWARE.
21  *
22  * The above copyright notice and this permission notice (including the
23  * next paragraph) shall be included in all copies or substantial portions
24  * of the Software.
25  *
26  **************************************************************************/
27 
28 
29 /**
30  * @file
31  * s3tc pixel format manipulation.
32  *
33  * @author Roland Scheidegger <[email protected]>
34  */
35 
36 
37 #include <llvm/Config/llvm-config.h>
38 
39 #include "util/format/u_format.h"
40 #include "util/u_math.h"
41 #include "util/u_string.h"
42 #include "util/u_cpu_detect.h"
43 #include "util/u_debug.h"
44 
45 #include "lp_bld_arit.h"
46 #include "lp_bld_type.h"
47 #include "lp_bld_const.h"
48 #include "lp_bld_conv.h"
49 #include "lp_bld_gather.h"
50 #include "lp_bld_format.h"
51 #include "lp_bld_logic.h"
52 #include "lp_bld_pack.h"
53 #include "lp_bld_flow.h"
54 #include "lp_bld_printf.h"
55 #include "lp_bld_struct.h"
56 #include "lp_bld_swizzle.h"
57 #include "lp_bld_init.h"
58 #include "lp_bld_debug.h"
59 #include "lp_bld_intr.h"
60 
61 
62 /**
63  * Reverse an interleave2_half
64  * (ie. pick every second element, independent lower/upper halfs)
65  * sse2 can only do that with 32bit (shufps) or larger elements
66  * natively. (Otherwise, and/pack (even) or shift/pack (odd)
67  * could be used, ideally llvm would do that for us.)
68  * XXX: Unfortunately, this does NOT translate to a shufps if those
69  * are int vectors (and casting will not help, llvm needs to recognize it
70  * as "real" float). Instead, llvm will use a pshufd/pshufd/punpcklqdq
71  * sequence which I'm pretty sure is a lot worse despite domain transition
72  * penalties with shufps (except maybe on Nehalem).
73  */
74 static LLVMValueRef
lp_build_uninterleave2_half(struct gallivm_state * gallivm,struct lp_type type,LLVMValueRef a,LLVMValueRef b,unsigned lo_hi)75 lp_build_uninterleave2_half(struct gallivm_state *gallivm,
76                             struct lp_type type,
77                             LLVMValueRef a,
78                             LLVMValueRef b,
79                             unsigned lo_hi)
80 {
81    LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
82    unsigned i;
83 
84    assert(type.length <= LP_MAX_VECTOR_LENGTH);
85    assert(lo_hi < 2);
86 
87    if (type.length * type.width == 256) {
88       assert(type.length == 8);
89       assert(type.width == 32);
90       static const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14};
91       for (i = 0; i < type.length; ++i) {
92          elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi);
93       }
94    } else {
95       for (i = 0; i < type.length; ++i) {
96          elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
97       }
98    }
99 
100    shuffle = LLVMConstVector(elems, type.length);
101 
102    return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
103 
104 }
105 
106 
107 /**
108  * Build shuffle for extending vectors.
109  */
110 static LLVMValueRef
lp_build_const_extend_shuffle(struct gallivm_state * gallivm,unsigned n,unsigned length)111 lp_build_const_extend_shuffle(struct gallivm_state *gallivm,
112                               unsigned n, unsigned length)
113 {
114    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
115    unsigned i;
116 
117    assert(n <= length);
118    assert(length <= LP_MAX_VECTOR_LENGTH);
119 
120    /* TODO: cache results in a static table */
121 
122    for(i = 0; i < n; i++) {
123       elems[i] = lp_build_const_int32(gallivm, i);
124    }
125    for (i = n; i < length; i++) {
126       elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
127    }
128 
129    return LLVMConstVector(elems, length);
130 }
131 
132 static LLVMValueRef
lp_build_const_unpackx2_shuffle(struct gallivm_state * gallivm,unsigned n)133 lp_build_const_unpackx2_shuffle(struct gallivm_state *gallivm, unsigned n)
134 {
135    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
136    unsigned i, j;
137 
138    assert(n <= LP_MAX_VECTOR_LENGTH);
139 
140    /* TODO: cache results in a static table */
141 
142    for(i = 0, j = 0; i < n; i += 2, ++j) {
143       elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
144       elems[i + 1] = lp_build_const_int32(gallivm, n + j);
145       elems[n + i + 0] = lp_build_const_int32(gallivm, 0 + n/2 + j);
146       elems[n + i + 1] = lp_build_const_int32(gallivm, n + n/2 + j);
147    }
148 
149    return LLVMConstVector(elems, n * 2);
150 }
151 
152 /*
153  * broadcast 1 element to all elements
154  */
155 static LLVMValueRef
lp_build_const_shuffle1(struct gallivm_state * gallivm,unsigned index,unsigned n)156 lp_build_const_shuffle1(struct gallivm_state *gallivm,
157                         unsigned index, unsigned n)
158 {
159    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
160    unsigned i;
161 
162    assert(n <= LP_MAX_VECTOR_LENGTH);
163 
164    /* TODO: cache results in a static table */
165 
166    for (i = 0; i < n; i++) {
167       elems[i] = lp_build_const_int32(gallivm, index);
168    }
169 
170    return LLVMConstVector(elems, n);
171 }
172 
173 /*
174  * move 1 element to pos 0, rest undef
175  */
176 static LLVMValueRef
lp_build_shuffle1undef(struct gallivm_state * gallivm,LLVMValueRef a,unsigned index,unsigned n)177 lp_build_shuffle1undef(struct gallivm_state *gallivm,
178                        LLVMValueRef a, unsigned index, unsigned n)
179 {
180    LLVMValueRef elems[LP_MAX_VECTOR_LENGTH], shuf;
181    unsigned i;
182 
183    assert(n <= LP_MAX_VECTOR_LENGTH);
184 
185    elems[0] = lp_build_const_int32(gallivm, index);
186 
187    for (i = 1; i < n; i++) {
188       elems[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
189    }
190    shuf = LLVMConstVector(elems, n);
191 
192    return LLVMBuildShuffleVector(gallivm->builder, a, a, shuf, "");
193 }
194 
195 static bool
format_dxt1_variant(enum pipe_format format)196 format_dxt1_variant(enum pipe_format format)
197 {
198   return format == PIPE_FORMAT_DXT1_RGB ||
199          format == PIPE_FORMAT_DXT1_RGBA ||
200          format == PIPE_FORMAT_DXT1_SRGB ||
201          format == PIPE_FORMAT_DXT1_SRGBA;
202 
203 }
204 
205 /**
206  * Gather elements from scatter positions in memory into vectors.
207  * This is customised for fetching texels from s3tc textures.
208  * For SSE, typical value is length=4.
209  *
210  * @param length length of the offsets
211  * @param colors the stored colors of the blocks will be extracted into this.
212  * @param codewords the codewords of the blocks will be extracted into this.
213  * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
214  * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
215  * @param base_ptr base pointer, should be a i8 pointer type.
216  * @param offsets vector with offsets
217  */
218 static void
lp_build_gather_s3tc(struct gallivm_state * gallivm,unsigned length,const struct util_format_description * format_desc,LLVMValueRef * colors,LLVMValueRef * codewords,LLVMValueRef * alpha_lo,LLVMValueRef * alpha_hi,LLVMValueRef base_ptr,LLVMValueRef offsets)219 lp_build_gather_s3tc(struct gallivm_state *gallivm,
220                      unsigned length,
221                      const struct util_format_description *format_desc,
222                      LLVMValueRef *colors,
223                      LLVMValueRef *codewords,
224                      LLVMValueRef *alpha_lo,
225                      LLVMValueRef *alpha_hi,
226                      LLVMValueRef base_ptr,
227                      LLVMValueRef offsets)
228 {
229    LLVMBuilderRef builder = gallivm->builder;
230    unsigned block_bits = format_desc->block.bits;
231    unsigned i;
232    LLVMValueRef elems[8];
233    LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
234    LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
235    LLVMTypeRef type32dxt;
236    struct lp_type lp_type32dxt;
237 
238    memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
239    lp_type32dxt.width = 32;
240    lp_type32dxt.length = block_bits / 32;
241    type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
242 
243    assert(block_bits == 64 || block_bits == 128);
244    assert(length == 1 || length == 4 || length == 8);
245 
246    for (i = 0; i < length; ++i) {
247       elems[i] = lp_build_gather_elem(gallivm, length,
248                                       block_bits, block_bits, true,
249                                       base_ptr, offsets, i, false);
250       elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
251    }
252    if (length == 1) {
253       LLVMValueRef elem = elems[0];
254       if (block_bits == 128) {
255          *alpha_lo = LLVMBuildExtractElement(builder, elem,
256                                              lp_build_const_int32(gallivm, 0), "");
257          *alpha_hi = LLVMBuildExtractElement(builder, elem,
258                                              lp_build_const_int32(gallivm, 1), "");
259          *colors = LLVMBuildExtractElement(builder, elem,
260                                            lp_build_const_int32(gallivm, 2), "");
261          *codewords = LLVMBuildExtractElement(builder, elem,
262                                               lp_build_const_int32(gallivm, 3), "");
263       }
264       else {
265          *alpha_lo = LLVMGetUndef(type32);
266          *alpha_hi = LLVMGetUndef(type32);
267          *colors = LLVMBuildExtractElement(builder, elem,
268                                            lp_build_const_int32(gallivm, 0), "");
269          *codewords = LLVMBuildExtractElement(builder, elem,
270                                               lp_build_const_int32(gallivm, 1), "");
271       }
272    }
273    else {
274       LLVMValueRef tmp[4], cc01, cc23;
275       struct lp_type lp_type32, lp_type64;
276       memset(&lp_type32, 0, sizeof lp_type32);
277       lp_type32.width = 32;
278       lp_type32.length = length;
279       memset(&lp_type64, 0, sizeof lp_type64);
280       lp_type64.width = 64;
281       lp_type64.length = length/2;
282 
283       if (block_bits == 128) {
284          if (length == 8) {
285             for (i = 0; i < 4; ++i) {
286                tmp[0] = elems[i];
287                tmp[1] = elems[i+4];
288                elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
289             }
290          }
291          lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
292          *colors = tmp[2];
293          *codewords = tmp[3];
294          *alpha_lo = tmp[0];
295          *alpha_hi = tmp[1];
296       } else {
297          LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
298          LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
299 
300          for (i = 0; i < length; ++i) {
301             /* no-op shuffle */
302             elems[i] = LLVMBuildShuffleVector(builder, elems[i],
303                                               LLVMGetUndef(type32dxt),
304                                               lp_build_const_extend_shuffle(gallivm, 2, 4), "");
305          }
306          if (length == 8) {
307             struct lp_type lp_type32_4 = {0};
308             lp_type32_4.width = 32;
309             lp_type32_4.length = 4;
310             for (i = 0; i < 4; ++i) {
311                tmp[0] = elems[i];
312                tmp[1] = elems[i+4];
313                elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
314             }
315          }
316          cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
317          cc23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
318          cc01 = LLVMBuildBitCast(builder, cc01, type64_vec, "");
319          cc23 = LLVMBuildBitCast(builder, cc23, type64_vec, "");
320          *colors = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 0);
321          *codewords = lp_build_interleave2_half(gallivm, lp_type64, cc01, cc23, 1);
322          *colors = LLVMBuildBitCast(builder, *colors, type32_vec, "");
323          *codewords = LLVMBuildBitCast(builder, *codewords, type32_vec, "");
324       }
325    }
326 }
327 
328 /** Convert from <n x i32> containing 2 x n rgb565 colors
329  * to 2 <n x i32> rgba8888 colors
330  * This is the most optimized version I can think of
331  * should be nearly as fast as decoding only one color
332  * NOTE: alpha channel will be set to 0
333  * @param colors  is a <n x i32> vector containing the rgb565 colors
334  */
335 static void
color_expand2_565_to_8888(struct gallivm_state * gallivm,unsigned n,LLVMValueRef colors,LLVMValueRef * color0,LLVMValueRef * color1)336 color_expand2_565_to_8888(struct gallivm_state *gallivm,
337                           unsigned n,
338                           LLVMValueRef colors,
339                           LLVMValueRef *color0,
340                           LLVMValueRef *color1)
341 {
342    LLVMBuilderRef builder = gallivm->builder;
343    LLVMValueRef r, g, b, rblo, glo;
344    LLVMValueRef rgblomask, rb, rgb0, rgb1;
345    struct lp_type type, type16, type8;
346 
347    assert(n > 1);
348 
349    memset(&type, 0, sizeof type);
350    type.width = 32;
351    type.length = n;
352 
353    memset(&type16, 0, sizeof type16);
354    type16.width = 16;
355    type16.length = 2 * n;
356 
357    memset(&type8, 0, sizeof type8);
358    type8.width = 8;
359    type8.length = 4 * n;
360 
361    rgblomask = lp_build_const_int_vec(gallivm, type16, 0x0707);
362    colors = LLVMBuildBitCast(builder, colors,
363                              lp_build_vec_type(gallivm, type16), "");
364    /* move r into low 8 bits, b into high 8 bits, g into another reg (low bits)
365     * make sure low bits of r are zero - could use AND but requires constant */
366    r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
367    r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type16, 3), "");
368    b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type16, 11), "");
369    rb = LLVMBuildOr(builder, r, b, "");
370    rblo = LLVMBuildLShr(builder, rb, lp_build_const_int_vec(gallivm, type16, 5), "");
371    /* don't have byte shift hence need mask */
372    rblo = LLVMBuildAnd(builder, rblo, rgblomask, "");
373    rb = LLVMBuildOr(builder, rb, rblo, "");
374 
375    /* make sure low bits of g are zero */
376    g = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type16, 0x07e0), "");
377    g = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 3), "");
378    glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type16, 6), "");
379    g = LLVMBuildOr(builder, g, glo, "");
380 
381    rb = LLVMBuildBitCast(builder, rb, lp_build_vec_type(gallivm, type8), "");
382    g = LLVMBuildBitCast(builder, g, lp_build_vec_type(gallivm, type8), "");
383    rgb0 = lp_build_interleave2_half(gallivm, type8, rb, g, 0);
384    rgb1 = lp_build_interleave2_half(gallivm, type8, rb, g, 1);
385 
386    rgb0 = LLVMBuildBitCast(builder, rgb0, lp_build_vec_type(gallivm, type), "");
387    rgb1 = LLVMBuildBitCast(builder, rgb1, lp_build_vec_type(gallivm, type), "");
388 
389    /* rgb0 is rgb00, rgb01, rgb10, rgb11
390     * instead of rgb00, rgb10, rgb20, rgb30 hence need reshuffle
391     * on x86 this _should_ just generate one shufps...
392     */
393    *color0 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 0);
394    *color1 = lp_build_uninterleave2_half(gallivm, type, rgb0, rgb1, 1);
395 }
396 
397 
398 /** Convert from <n x i32> containing rgb565 colors
399  * (in first 16 bits) to <n x i32> rgba8888 colors
400  * bits 16-31 MBZ
401  * NOTE: alpha channel will be set to 0
402  * @param colors  is a <n x i32> vector containing the rgb565 colors
403  */
404 static LLVMValueRef
color_expand_565_to_8888(struct gallivm_state * gallivm,unsigned n,LLVMValueRef colors)405 color_expand_565_to_8888(struct gallivm_state *gallivm,
406                          unsigned n,
407                          LLVMValueRef colors)
408 {
409    LLVMBuilderRef builder = gallivm->builder;
410    LLVMValueRef rgba, r, g, b, rgblo, glo;
411    LLVMValueRef rbhimask, g6mask, rgblomask;
412    struct lp_type type;
413    memset(&type, 0, sizeof type);
414    type.width = 32;
415    type.length = n;
416 
417    /* color expansion:
418     * first extract and shift colors into their final locations
419     * (high bits - low bits zero at this point)
420     * then replicate highest bits to the lowest bits
421     * note rb replication can be done in parallel but not g
422     * (different shift)
423     * r5mask = 0xf800, g6mask = 0x07e0, b5mask = 0x001f
424     * rhigh = 8, ghigh = 5, bhigh = 19
425     * rblow = 5, glow = 6
426     * rgblowmask = 0x00070307
427     * r = colors >> rhigh
428     * b = colors << bhigh
429     * g = (colors & g6mask) << ghigh
430     * rb = (r | b) rbhimask
431     * rbtmp = rb >> rblow
432     * gtmp = rb >> glow
433     * rbtmp = rbtmp | gtmp
434     * rbtmp = rbtmp & rgblowmask
435     * rgb = rb | g | rbtmp
436     */
437    g6mask = lp_build_const_int_vec(gallivm, type, 0x07e0);
438    rbhimask = lp_build_const_int_vec(gallivm, type, 0x00f800f8);
439    rgblomask = lp_build_const_int_vec(gallivm, type, 0x00070307);
440 
441    r = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 8), "");
442    b = LLVMBuildShl(builder, colors, lp_build_const_int_vec(gallivm, type, 19), "");
443    g = LLVMBuildAnd(builder, colors, g6mask, "");
444    g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 5), "");
445    rgba = LLVMBuildOr(builder, r, b, "");
446    rgba = LLVMBuildAnd(builder, rgba, rbhimask, "");
447    rgblo = LLVMBuildLShr(builder, rgba, lp_build_const_int_vec(gallivm, type, 5), "");
448    glo = LLVMBuildLShr(builder, g, lp_build_const_int_vec(gallivm, type, 6), "");
449    rgblo = LLVMBuildOr(builder, rgblo, glo, "");
450    rgblo = LLVMBuildAnd(builder, rgblo, rgblomask, "");
451    rgba = LLVMBuildOr(builder, rgba, g, "");
452    rgba = LLVMBuildOr(builder, rgba, rgblo, "");
453 
454    return rgba;
455 }
456 
457 
458 /*
459  * Average two byte vectors. (Will always round up.)
460  */
461 static LLVMValueRef
lp_build_pavgb(struct lp_build_context * bld8,LLVMValueRef v0,LLVMValueRef v1)462 lp_build_pavgb(struct lp_build_context *bld8,
463                LLVMValueRef v0,
464                LLVMValueRef v1)
465 {
466    struct gallivm_state *gallivm = bld8->gallivm;
467    LLVMBuilderRef builder = gallivm->builder;
468    assert(bld8->type.width == 8);
469    assert(bld8->type.length == 16 || bld8->type.length == 32);
470    if (LLVM_VERSION_MAJOR < 6) {
471       LLVMValueRef intrargs[2];
472       char *intr_name = bld8->type.length == 32 ? "llvm.x86.avx2.pavg.b" :
473                                                   "llvm.x86.sse2.pavg.b";
474       intrargs[0] = v0;
475       intrargs[1] = v1;
476       return lp_build_intrinsic(builder, intr_name,
477                                 bld8->vec_type, intrargs, 2, 0);
478    } else {
479       /*
480        * Must match llvm's autoupgrade of pavg.b intrinsic to be useful.
481        * You better hope the backend code manages to detect the pattern, and
482        * the pattern doesn't change there...
483        */
484       struct lp_type type_ext = bld8->type;
485       LLVMTypeRef vec_type_ext;
486       LLVMValueRef res;
487       LLVMValueRef ext_one;
488       type_ext.width = 16;
489       vec_type_ext = lp_build_vec_type(gallivm, type_ext);
490       ext_one = lp_build_const_vec(gallivm, type_ext, 1);
491 
492       v0 = LLVMBuildZExt(builder, v0, vec_type_ext, "");
493       v1 = LLVMBuildZExt(builder, v1, vec_type_ext, "");
494       res = LLVMBuildAdd(builder, v0, v1, "");
495       res = LLVMBuildAdd(builder, res, ext_one, "");
496       res = LLVMBuildLShr(builder, res, ext_one, "");
497       res = LLVMBuildTrunc(builder, res, bld8->vec_type, "");
498       return res;
499    }
500 }
501 
502 /**
503  * Calculate 1/3(v1-v0) + v0
504  * and 2*1/3(v1-v0) + v0
505  */
506 static void
lp_build_lerp23(struct lp_build_context * bld,LLVMValueRef v0,LLVMValueRef v1,LLVMValueRef * res0,LLVMValueRef * res1)507 lp_build_lerp23(struct lp_build_context *bld,
508                 LLVMValueRef v0,
509                 LLVMValueRef v1,
510                 LLVMValueRef *res0,
511                 LLVMValueRef *res1)
512 {
513    struct gallivm_state *gallivm = bld->gallivm;
514    LLVMValueRef x, x_lo, x_hi, delta_lo, delta_hi;
515    LLVMValueRef mul_lo, mul_hi, v0_lo, v0_hi, v1_lo, v1_hi, tmp;
516    const struct lp_type type = bld->type;
517    LLVMBuilderRef builder = bld->gallivm->builder;
518    struct lp_type i16_type = lp_wider_type(type);
519    struct lp_build_context bld2;
520 
521    assert(lp_check_value(type, v0));
522    assert(lp_check_value(type, v1));
523    assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
524 
525    lp_build_context_init(&bld2, gallivm, i16_type);
526    bld2.type.sign = true;
527    x = lp_build_const_int_vec(gallivm, bld->type, 255*1/3);
528 
529    /* FIXME: use native avx256 unpack/pack */
530    lp_build_unpack2(gallivm, type, i16_type, x, &x_lo, &x_hi);
531    lp_build_unpack2(gallivm, type, i16_type, v0, &v0_lo, &v0_hi);
532    lp_build_unpack2(gallivm, type, i16_type, v1, &v1_lo, &v1_hi);
533    delta_lo = lp_build_sub(&bld2, v1_lo, v0_lo);
534    delta_hi = lp_build_sub(&bld2, v1_hi, v0_hi);
535 
536    mul_lo = LLVMBuildMul(builder, x_lo, delta_lo, "");
537    mul_hi = LLVMBuildMul(builder, x_hi, delta_hi, "");
538 
539    x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 8), "");
540    x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 8), "");
541    /* lerp optimization: pack now, do add afterwards */
542    tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
543    *res0 = lp_build_add(bld, tmp, v0);
544 
545    x_lo = LLVMBuildLShr(builder, mul_lo, lp_build_const_int_vec(gallivm, i16_type, 7), "");
546    x_hi = LLVMBuildLShr(builder, mul_hi, lp_build_const_int_vec(gallivm, i16_type, 7), "");
547    /* unlike above still need mask (but add still afterwards). */
548    x_lo = LLVMBuildAnd(builder, x_lo, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
549    x_hi = LLVMBuildAnd(builder, x_hi, lp_build_const_int_vec(gallivm, i16_type, 0xff), "");
550    tmp = lp_build_pack2(gallivm, i16_type, type, x_lo, x_hi);
551    *res1 = lp_build_add(bld, tmp, v0);
552 }
553 
554 /**
555  * Convert from <n x i64> s3tc dxt1 to <4n x i8> RGBA AoS
556  * @param colors  is a <n x i32> vector with n x 2x16bit colors
557  * @param codewords  is a <n x i32> vector containing the codewords
558  * @param i  is a <n x i32> vector with the x pixel coordinate (0 to 3)
559  * @param j  is a <n x i32> vector with the y pixel coordinate (0 to 3)
560  */
561 static LLVMValueRef
s3tc_dxt1_full_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef colors,LLVMValueRef codewords,LLVMValueRef i,LLVMValueRef j)562 s3tc_dxt1_full_to_rgba_aos(struct gallivm_state *gallivm,
563                            unsigned n,
564                            enum pipe_format format,
565                            LLVMValueRef colors,
566                            LLVMValueRef codewords,
567                            LLVMValueRef i,
568                            LLVMValueRef j)
569 {
570    LLVMBuilderRef builder = gallivm->builder;
571    LLVMValueRef color0, color1, color2, color3, color2_2, color3_2;
572    LLVMValueRef rgba, a, colors0, colors1, col0, col1, const2;
573    LLVMValueRef bit_pos, sel_mask, sel_lo, sel_hi, indices;
574    struct lp_type type, type8;
575    struct lp_build_context bld8, bld32;
576    bool is_dxt1_variant = format_dxt1_variant(format);
577 
578    memset(&type, 0, sizeof type);
579    type.width = 32;
580    type.length = n;
581 
582    memset(&type8, 0, sizeof type8);
583    type8.width = 8;
584    type8.length = 4*n;
585 
586    assert(lp_check_value(type, i));
587    assert(lp_check_value(type, j));
588 
589    a = lp_build_const_int_vec(gallivm, type, 0xff000000);
590 
591    lp_build_context_init(&bld32, gallivm, type);
592    lp_build_context_init(&bld8, gallivm, type8);
593 
594    /*
595     * works as follows:
596     * - expand color0/color1 to rgba8888
597     * - calculate color2/3 (interpolation) according to color0 < color1 rules
598     * - calculate color2/3 according to color0 >= color1 rules
599     * - do selection of color2/3 according to comparison of color0/1
600     * - extract indices (vector shift).
601     * - use compare/select to select the correct color. Since we have 2bit
602     *   indices (and 4 colors), needs at least three compare/selects.
603     */
604    /*
605     * expand the two colors
606     */
607    col0 = LLVMBuildAnd(builder, colors, lp_build_const_int_vec(gallivm, type, 0x0000ffff), "");
608    col1 = LLVMBuildLShr(builder, colors, lp_build_const_int_vec(gallivm, type, 16), "");
609    if (n > 1) {
610       color_expand2_565_to_8888(gallivm, n, colors, &color0, &color1);
611    }
612    else {
613       color0 = color_expand_565_to_8888(gallivm, n, col0);
614       color1 = color_expand_565_to_8888(gallivm, n, col1);
615    }
616 
617    /*
618     * interpolate colors
619     * color2_1 is 2/3 color0 + 1/3 color1
620     * color3_1 is 1/3 color0 + 2/3 color1
621     * color2_2 is 1/2 color0 + 1/2 color1
622     * color3_2 is 0
623     */
624 
625    colors0 = LLVMBuildBitCast(builder, color0, bld8.vec_type, "");
626    colors1 = LLVMBuildBitCast(builder, color1, bld8.vec_type, "");
627    /* can combine 2 lerps into one mostly - still looks expensive enough. */
628    lp_build_lerp23(&bld8, colors0, colors1, &color2, &color3);
629    color2 = LLVMBuildBitCast(builder, color2, bld32.vec_type, "");
630    color3 = LLVMBuildBitCast(builder, color3, bld32.vec_type, "");
631 
632    /* dxt3/5 always use 4-color encoding */
633    if (is_dxt1_variant) {
634       /* fix up alpha */
635       if (format == PIPE_FORMAT_DXT1_RGBA ||
636           format == PIPE_FORMAT_DXT1_SRGBA) {
637          color0 = LLVMBuildOr(builder, color0, a, "");
638          color1 = LLVMBuildOr(builder, color1, a, "");
639          color3 = LLVMBuildOr(builder, color3, a, "");
640       }
641       /*
642        * XXX with sse2 and 16x8 vectors, should use pavgb even when n == 1.
643        * Much cheaper (but we don't care that much if n == 1).
644        */
645       if ((util_get_cpu_caps()->has_sse2 && n == 4) ||
646           (util_get_cpu_caps()->has_avx2 && n == 8)) {
647          color2_2 = lp_build_pavgb(&bld8, colors0, colors1);
648          color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
649       }
650       else {
651          struct lp_type i16_type = lp_wider_type(type8);
652          struct lp_build_context bld2;
653          LLVMValueRef v0_lo, v0_hi, v1_lo, v1_hi, addlo, addhi;
654 
655          lp_build_context_init(&bld2, gallivm, i16_type);
656          bld2.type.sign = true;
657 
658          /*
659           * This isn't as expensive as it looks (the unpack is the same as
660           * for lerp23), with correct rounding.
661           * (Note that while rounding is correct, this will always round down,
662           * whereas pavgb will always round up.)
663           */
664          /* FIXME: use native avx256 unpack/pack */
665          lp_build_unpack2(gallivm, type8, i16_type, colors0, &v0_lo, &v0_hi);
666          lp_build_unpack2(gallivm, type8, i16_type, colors1, &v1_lo, &v1_hi);
667 
668          addlo = lp_build_add(&bld2, v0_lo, v1_lo);
669          addhi = lp_build_add(&bld2, v0_hi, v1_hi);
670          addlo = LLVMBuildLShr(builder, addlo,
671                                lp_build_const_int_vec(gallivm, i16_type, 1), "");
672          addhi = LLVMBuildLShr(builder, addhi,
673                                lp_build_const_int_vec(gallivm, i16_type, 1), "");
674          color2_2 = lp_build_pack2(gallivm, i16_type, type8, addlo, addhi);
675          color2_2 = LLVMBuildBitCast(builder, color2_2, bld32.vec_type, "");
676       }
677       color3_2 = lp_build_const_int_vec(gallivm, type, 0);
678 
679       /* select between colors2/3 */
680       /* signed compare is faster saves some xors */
681       type.sign = true;
682       sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, col0, col1);
683       color2 = lp_build_select(&bld32, sel_mask, color2, color2_2);
684       color3 = lp_build_select(&bld32, sel_mask, color3, color3_2);
685       type.sign = false;
686 
687       if (format == PIPE_FORMAT_DXT1_RGBA ||
688           format == PIPE_FORMAT_DXT1_SRGBA) {
689          color2 = LLVMBuildOr(builder, color2, a, "");
690       }
691    }
692 
693    const2 = lp_build_const_int_vec(gallivm, type, 2);
694    /* extract 2-bit index values */
695    bit_pos = LLVMBuildShl(builder, j, const2, "");
696    bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
697    bit_pos = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
698    /*
699     * NOTE: This innocent looking shift is very expensive with x86/ssex.
700     * Shifts with per-elemnent shift count get roughly translated to
701     * extract (count), extract (value), shift, move (back to xmm), unpack
702     * per element!
703     * So about 20 instructions here for 4xi32.
704     * Newer llvm versions (3.7+) will not do extract/insert but use a
705     * a couple constant count vector shifts plus shuffles. About same
706     * amount of instructions unfortunately...
707     * Would get much worse with 8xi16 even...
708     * We could actually do better here:
709     * - subtract bit_pos from 128+30, shl 23, convert float to int...
710     * - now do mul with codewords followed by shr 30...
711     * But requires 32bit->32bit mul, sse41 only (well that's emulatable
712     * with 2 32bit->64bit muls...) and not exactly cheap
713     * AVX2, of course, fixes this nonsense.
714     */
715    indices = LLVMBuildLShr(builder, codewords, bit_pos, "");
716 
717    /* finally select the colors */
718    sel_lo = LLVMBuildAnd(builder, indices, bld32.one, "");
719    sel_lo = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_lo, bld32.one);
720    color0 = lp_build_select(&bld32, sel_lo, color1, color0);
721    color2 = lp_build_select(&bld32, sel_lo, color3, color2);
722    sel_hi = LLVMBuildAnd(builder, indices, const2, "");
723    sel_hi = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, sel_hi, const2);
724    rgba = lp_build_select(&bld32, sel_hi, color2, color0);
725 
726    /* fix up alpha */
727    if (format == PIPE_FORMAT_DXT1_RGB ||
728        format == PIPE_FORMAT_DXT1_SRGB) {
729       rgba = LLVMBuildOr(builder, rgba, a, "");
730    }
731    return LLVMBuildBitCast(builder, rgba, bld8.vec_type, "");
732 }
733 
734 
735 static LLVMValueRef
s3tc_dxt1_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef colors,LLVMValueRef codewords,LLVMValueRef i,LLVMValueRef j)736 s3tc_dxt1_to_rgba_aos(struct gallivm_state *gallivm,
737                       unsigned n,
738                       enum pipe_format format,
739                       LLVMValueRef colors,
740                       LLVMValueRef codewords,
741                       LLVMValueRef i,
742                       LLVMValueRef j)
743 {
744    return s3tc_dxt1_full_to_rgba_aos(gallivm, n, format,
745                                      colors, codewords, i, j);
746 }
747 
748 
749 /**
750  * Convert from <n x i128> s3tc dxt3 to <4n x i8> RGBA AoS
751  * @param colors  is a <n x i32> vector with n x 2x16bit colors
752  * @param codewords  is a <n x i32> vector containing the codewords
753  * @param alphas  is a <n x i64> vector containing the alpha values
754  * @param i  is a <n x i32> vector with the x pixel coordinate (0 to 3)
755  * @param j  is a <n x i32> vector with the y pixel coordinate (0 to 3)
756  */
757 static LLVMValueRef
s3tc_dxt3_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef colors,LLVMValueRef codewords,LLVMValueRef alpha_low,LLVMValueRef alpha_hi,LLVMValueRef i,LLVMValueRef j)758 s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,
759                       unsigned n,
760                       enum pipe_format format,
761                       LLVMValueRef colors,
762                       LLVMValueRef codewords,
763                       LLVMValueRef alpha_low,
764                       LLVMValueRef alpha_hi,
765                       LLVMValueRef i,
766                       LLVMValueRef j)
767 {
768    LLVMBuilderRef builder = gallivm->builder;
769    LLVMValueRef rgba, tmp, tmp2;
770    LLVMValueRef bit_pos, sel_mask;
771    struct lp_type type, type8;
772    struct lp_build_context bld;
773 
774    memset(&type, 0, sizeof type);
775    type.width = 32;
776    type.length = n;
777 
778    memset(&type8, 0, sizeof type8);
779    type8.width = 8;
780    type8.length = n*4;
781 
782    assert(lp_check_value(type, i));
783    assert(lp_check_value(type, j));
784 
785    lp_build_context_init(&bld, gallivm, type);
786 
787    rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
788                                 colors, codewords, i, j);
789 
790    rgba = LLVMBuildBitCast(builder, rgba, bld.vec_type, "");
791 
792    /*
793     * Extract alpha values. Since we now need to select from
794     * which 32bit vector values are fetched, construct selection
795     * mask from highest bit of bit_pos, and use select, then shift
796     * according to the bit_pos (without the highest bit).
797     * Note this is pointless for n == 1 case. Could just
798     * directly use 64bit arithmetic if we'd extract 64bit
799     * alpha value instead of 2x32...
800     */
801    /* pos = 4*(4j+i) */
802    bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
803    bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
804    bit_pos = LLVMBuildShl(builder, bit_pos,
805                           lp_build_const_int_vec(gallivm, type, 2), "");
806    sel_mask = LLVMBuildLShr(builder, bit_pos,
807                             lp_build_const_int_vec(gallivm, type, 5), "");
808    sel_mask = LLVMBuildSub(builder, sel_mask, bld.one, "");
809    tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);
810    bit_pos = LLVMBuildAnd(builder, bit_pos,
811                           lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");
812    /* Warning: slow shift with per element count (without avx2) */
813    /*
814     * Could do pshufb here as well - just use appropriate 2 bits in bit_pos
815     * to select the right byte with pshufb. Then for the remaining one bit
816     * just do shift/select.
817     */
818    tmp = LLVMBuildLShr(builder, tmp, bit_pos, "");
819 
820    /* combined expand from a4 to a8 and shift into position */
821    tmp = LLVMBuildShl(builder, tmp, lp_build_const_int_vec(gallivm, type, 28), "");
822    tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(gallivm, type, 4), "");
823    tmp = LLVMBuildOr(builder, tmp, tmp2, "");
824 
825    rgba = LLVMBuildOr(builder, tmp, rgba, "");
826 
827    return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
828 }
829 
830 static LLVMValueRef
lp_build_lerpdxta(struct gallivm_state * gallivm,LLVMValueRef alpha0,LLVMValueRef alpha1,LLVMValueRef code,LLVMValueRef sel_mask,unsigned n)831 lp_build_lerpdxta(struct gallivm_state *gallivm,
832                   LLVMValueRef alpha0,
833                   LLVMValueRef alpha1,
834                   LLVMValueRef code,
835                   LLVMValueRef sel_mask,
836                   unsigned n)
837 {
838    /*
839     * note we're doing lerp in 16bit since 32bit pmulld is only available in sse41
840     * (plus pmullw is actually faster...)
841     * we just pretend our 32bit values (which are really only 8bit) are 16bits.
842     * Note that this is obviously a disaster for the scalar case.
843     */
844    LLVMBuilderRef builder = gallivm->builder;
845    LLVMValueRef delta, ainterp;
846    LLVMValueRef weight5, weight7, weight;
847    struct lp_type type32, type16, type8;
848    struct lp_build_context bld16;
849 
850    memset(&type32, 0, sizeof type32);
851    type32.width = 32;
852    type32.length = n;
853    memset(&type16, 0, sizeof type16);
854    type16.width = 16;
855    type16.length = 2*n;
856    type16.sign = true;
857    memset(&type8, 0, sizeof type8);
858    type8.width = 8;
859    type8.length = 4*n;
860 
861    lp_build_context_init(&bld16, gallivm, type16);
862    /* 255/7 is a bit off - increase accuracy at the expense of shift later */
863    sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
864    weight5 = lp_build_const_int_vec(gallivm, type16, 255*64/5);
865    weight7 = lp_build_const_int_vec(gallivm, type16, 255*64/7);
866    weight = lp_build_select(&bld16, sel_mask, weight7, weight5);
867 
868    alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
869    alpha1 = LLVMBuildBitCast(builder, alpha1, bld16.vec_type, "");
870    code = LLVMBuildBitCast(builder, code, bld16.vec_type, "");
871    /* we'll get garbage in the elements which had code 0 (or larger than 5 or 7)
872       but we don't care */
873    code = LLVMBuildSub(builder, code, bld16.one, "");
874 
875    weight = LLVMBuildMul(builder, weight, code, "");
876    weight = LLVMBuildLShr(builder, weight,
877                           lp_build_const_int_vec(gallivm, type16, 6), "");
878 
879    delta = LLVMBuildSub(builder, alpha1, alpha0, "");
880 
881    ainterp = LLVMBuildMul(builder, delta, weight, "");
882    ainterp = LLVMBuildLShr(builder, ainterp,
883                            lp_build_const_int_vec(gallivm, type16, 8), "");
884 
885    ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type8), "");
886    alpha0 = LLVMBuildBitCast(builder, alpha0, lp_build_vec_type(gallivm, type8), "");
887    ainterp = LLVMBuildAdd(builder, alpha0, ainterp, "");
888    ainterp = LLVMBuildBitCast(builder, ainterp, lp_build_vec_type(gallivm, type32), "");
889 
890    return ainterp;
891 }
892 
893 static LLVMValueRef
s3tc_dxt5_alpha_channel(struct gallivm_state * gallivm,bool is_signed,unsigned n,LLVMValueRef alpha_hi,LLVMValueRef alpha_lo,LLVMValueRef i,LLVMValueRef j)894 s3tc_dxt5_alpha_channel(struct gallivm_state *gallivm,
895                         bool is_signed,
896                         unsigned n,
897                         LLVMValueRef alpha_hi, LLVMValueRef alpha_lo,
898                         LLVMValueRef i, LLVMValueRef j)
899 {
900    LLVMBuilderRef builder = gallivm->builder;
901    struct lp_type type, type8;
902    LLVMValueRef tmp, alpha0, alpha1, alphac, alphac0, bit_pos, shift;
903    LLVMValueRef sel_mask, tmp_mask, alpha, alpha64, code_s;
904    LLVMValueRef mask6, mask7, ainterp;
905    LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
906    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
907    struct lp_build_context bld32;
908 
909    memset(&type, 0, sizeof type);
910    type.width = 32;
911    type.length = n;
912 
913    memset(&type8, 0, sizeof type8);
914    type8.width = 8;
915    type8.length = n;
916    type8.sign = is_signed;
917 
918    lp_build_context_init(&bld32, gallivm, type);
919    /* this looks pretty complex for vectorization:
920     * extract a0/a1 values
921     * extract code
922     * select weights for interpolation depending on a0 > a1
923     * mul weights by code - 1
924     * lerp a0/a1/weights
925     * use selects for getting either a0, a1, interp a, interp a/0.0, interp a/1.0
926     */
927 
928    alpha0 = LLVMBuildAnd(builder, alpha_lo,
929                          lp_build_const_int_vec(gallivm, type, 0xff), "");
930    if (is_signed) {
931       alpha0 = LLVMBuildTrunc(builder, alpha0, lp_build_vec_type(gallivm, type8), "");
932       alpha0 = LLVMBuildSExt(builder, alpha0, lp_build_vec_type(gallivm, type), "");
933    }
934 
935    alpha1 = LLVMBuildLShr(builder, alpha_lo,
936                           lp_build_const_int_vec(gallivm, type, 8), "");
937    alpha1 = LLVMBuildAnd(builder, alpha1,
938                          lp_build_const_int_vec(gallivm, type, 0xff), "");
939    if (is_signed) {
940       alpha1 = LLVMBuildTrunc(builder, alpha1, lp_build_vec_type(gallivm, type8), "");
941       alpha1 = LLVMBuildSExt(builder, alpha1, lp_build_vec_type(gallivm, type), "");
942    }
943 
944    /* pos = 3*(4j+i) */
945    bit_pos = LLVMBuildShl(builder, j, lp_build_const_int_vec(gallivm, type, 2), "");
946    bit_pos = LLVMBuildAdd(builder, bit_pos, i, "");
947    tmp = LLVMBuildAdd(builder, bit_pos, bit_pos, "");
948    bit_pos = LLVMBuildAdd(builder, bit_pos, tmp, "");
949    /* get rid of first 2 bytes - saves shifts of alpha_lo/hi */
950    bit_pos = LLVMBuildAdd(builder, bit_pos,
951                           lp_build_const_int_vec(gallivm, type, 16), "");
952 
953    if (n == 1) {
954       struct lp_type type64;
955       memset(&type64, 0, sizeof type64);
956       type64.width = 64;
957       type64.length = 1;
958       /* This is pretty pointless could avoid by just directly extracting
959          64bit in the first place but makes it more complicated elsewhere */
960       alpha_lo = LLVMBuildZExt(builder, alpha_lo, i64t, "");
961       alpha_hi = LLVMBuildZExt(builder, alpha_hi, i64t, "");
962       alphac0 = LLVMBuildShl(builder, alpha_hi,
963                              lp_build_const_int_vec(gallivm, type64, 32), "");
964       alphac0 = LLVMBuildOr(builder, alpha_lo, alphac0, "");
965 
966       shift = LLVMBuildZExt(builder, bit_pos, i64t, "");
967       alphac0 = LLVMBuildLShr(builder, alphac0, shift, "");
968       alphac0 = LLVMBuildTrunc(builder, alphac0, i32t, "");
969       alphac = LLVMBuildAnd(builder, alphac0,
970                             lp_build_const_int_vec(gallivm, type, 0x7), "");
971    }
972    else {
973       /*
974        * Using non-native vector length here (actually, with avx2 and
975        * n == 4 llvm will indeed expand to ymm regs...)
976        * At least newer llvm versions handle that ok.
977        * llvm 3.7+ will even handle the emulated 64bit shift with variable
978        * shift count without extraction (and it's actually easier to
979        * emulate than the 32bit one).
980        */
981       alpha64 = LLVMBuildShuffleVector(builder, alpha_lo, alpha_hi,
982                                        lp_build_const_unpackx2_shuffle(gallivm, n), "");
983 
984       alpha64 = LLVMBuildBitCast(builder, alpha64, LLVMVectorType(i64t, n), "");
985       shift = LLVMBuildZExt(builder, bit_pos, LLVMVectorType(i64t, n), "");
986       alphac = LLVMBuildLShr(builder, alpha64, shift, "");
987       alphac = LLVMBuildTrunc(builder, alphac, bld32.vec_type, "");
988 
989       alphac = LLVMBuildAnd(builder, alphac,
990                             lp_build_const_int_vec(gallivm, type, 0x7), "");
991    }
992 
993    /* signed compare is faster saves some xors */
994    type.sign = true;
995    /* alpha0 > alpha1 selection */
996    sel_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
997                                alpha0, alpha1);
998    ainterp = lp_build_lerpdxta(gallivm, alpha0, alpha1, alphac, sel_mask, n);
999 
1000    /*
1001     * if a0 > a1 then we select a0 for case 0, a1 for case 1, interp otherwise.
1002     * else we select a0 for case 0, a1 for case 1,
1003     * interp for case 2-5, 00 for 6 and 0xff(ffffff) for 7
1004     * a = (c == 0) ? a0 : a1
1005     * a = (c > 1) ? ainterp : a
1006     * Finally handle case 6/7 for !(a0 > a1)
1007     * a = (!(a0 > a1) && c == 6) ? 0 : a (andnot with mask)
1008     * a = (!(a0 > a1) && c == 7) ? 0xffffffff : a (or with mask)
1009     */
1010    tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1011                                alphac, bld32.zero);
1012    alpha = lp_build_select(&bld32, tmp_mask, alpha0, alpha1);
1013    tmp_mask = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER,
1014                                alphac, bld32.one);
1015    alpha = lp_build_select(&bld32, tmp_mask, ainterp, alpha);
1016 
1017    code_s = LLVMBuildAnd(builder, alphac,
1018                          LLVMBuildNot(builder, sel_mask, ""), "");
1019    mask6 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1020                             code_s, lp_build_const_int_vec(gallivm, type, 6));
1021    mask7 = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL,
1022                             code_s, lp_build_const_int_vec(gallivm, type, 7));
1023    if (is_signed) {
1024       alpha = lp_build_select(&bld32, mask6, lp_build_const_int_vec(gallivm, type, -127), alpha);
1025       alpha = lp_build_select(&bld32, mask7, lp_build_const_int_vec(gallivm, type, 127), alpha);
1026    } else {
1027       alpha = LLVMBuildAnd(builder, alpha, LLVMBuildNot(builder, mask6, ""), "");
1028       alpha = LLVMBuildOr(builder, alpha, mask7, "");
1029    }
1030    /* There can be garbage in upper bits, mask them off for rgtc formats */
1031    alpha = LLVMBuildAnd(builder, alpha, lp_build_const_int_vec(gallivm, type, 0xff), "");
1032 
1033    return alpha;
1034 }
1035 
1036 /**
1037  * Convert from <n x i128> s3tc dxt5 to <4n x i8> RGBA AoS
1038  * @param colors  is a <n x i32> vector with n x 2x16bit colors
1039  * @param codewords  is a <n x i32> vector containing the codewords
1040  * @param alphas  is a <n x i64> vector containing the alpha values
1041  * @param i  is a <n x i32> vector with the x pixel coordinate (0 to 3)
1042  * @param j  is a <n x i32> vector with the y pixel coordinate (0 to 3)
1043  */
1044 static LLVMValueRef
s3tc_dxt5_full_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef colors,LLVMValueRef codewords,LLVMValueRef alpha_lo,LLVMValueRef alpha_hi,LLVMValueRef i,LLVMValueRef j)1045 s3tc_dxt5_full_to_rgba_aos(struct gallivm_state *gallivm,
1046                            unsigned n,
1047                            enum pipe_format format,
1048                            LLVMValueRef colors,
1049                            LLVMValueRef codewords,
1050                            LLVMValueRef alpha_lo,
1051                            LLVMValueRef alpha_hi,
1052                            LLVMValueRef i,
1053                            LLVMValueRef j)
1054 {
1055    LLVMBuilderRef builder = gallivm->builder;
1056    LLVMValueRef rgba, alpha;
1057    struct lp_type type, type8;
1058    struct lp_build_context bld32;
1059 
1060    memset(&type, 0, sizeof type);
1061    type.width = 32;
1062    type.length = n;
1063 
1064    memset(&type8, 0, sizeof type8);
1065    type8.width = 8;
1066    type8.length = n*4;
1067 
1068    assert(lp_check_value(type, i));
1069    assert(lp_check_value(type, j));
1070 
1071    lp_build_context_init(&bld32, gallivm, type);
1072 
1073    assert(lp_check_value(type, i));
1074    assert(lp_check_value(type, j));
1075 
1076    rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format,
1077                                 colors, codewords, i, j);
1078 
1079    rgba = LLVMBuildBitCast(builder, rgba, bld32.vec_type, "");
1080 
1081    alpha = s3tc_dxt5_alpha_channel(gallivm, false, n, alpha_hi, alpha_lo, i, j);
1082    alpha = LLVMBuildShl(builder, alpha, lp_build_const_int_vec(gallivm, type, 24), "");
1083    rgba = LLVMBuildOr(builder, alpha, rgba, "");
1084 
1085    return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
1086 }
1087 
1088 
1089 static void
lp_build_gather_s3tc_simple_scalar(struct gallivm_state * gallivm,const struct util_format_description * format_desc,LLVMValueRef * dxt_block,LLVMValueRef ptr)1090 lp_build_gather_s3tc_simple_scalar(struct gallivm_state *gallivm,
1091                                    const struct util_format_description *format_desc,
1092                                    LLVMValueRef *dxt_block,
1093                                    LLVMValueRef ptr)
1094 {
1095    LLVMBuilderRef builder = gallivm->builder;
1096    unsigned block_bits = format_desc->block.bits;
1097    LLVMValueRef elem, shuf;
1098    LLVMTypeRef type32 = LLVMIntTypeInContext(gallivm->context, 32);
1099    LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, block_bits);
1100    LLVMTypeRef type32_4 = LLVMVectorType(type32, 4);
1101 
1102    assert(block_bits == 64 || block_bits == 128);
1103 
1104    ptr = LLVMBuildBitCast(builder, ptr, LLVMPointerType(src_type, 0), "");
1105    elem = LLVMBuildLoad2(builder, src_type, ptr, "");
1106 
1107    if (block_bits == 128) {
1108       /* just return block as is */
1109       *dxt_block = LLVMBuildBitCast(builder, elem, type32_4, "");
1110    }
1111    else {
1112       LLVMTypeRef type32_2 = LLVMVectorType(type32, 2);
1113       shuf = lp_build_const_extend_shuffle(gallivm, 2, 4);
1114       elem = LLVMBuildBitCast(builder, elem, type32_2, "");
1115       *dxt_block = LLVMBuildShuffleVector(builder, elem,
1116                                           LLVMGetUndef(type32_2), shuf, "");
1117    }
1118 }
1119 
1120 
1121 static void
s3tc_store_cached_block(struct gallivm_state * gallivm,LLVMValueRef * col,LLVMValueRef tag_value,LLVMValueRef hash_index,LLVMValueRef cache)1122 s3tc_store_cached_block(struct gallivm_state *gallivm,
1123                         LLVMValueRef *col,
1124                         LLVMValueRef tag_value,
1125                         LLVMValueRef hash_index,
1126                         LLVMValueRef cache)
1127 {
1128    LLVMBuilderRef builder = gallivm->builder;
1129    LLVMValueRef ptr, indices[3];
1130    LLVMTypeRef type_ptr4x32;
1131    unsigned count;
1132 
1133    type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
1134    indices[0] = lp_build_const_int32(gallivm, 0);
1135    indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1136    indices[2] = hash_index;
1137    LLVMTypeRef cache_type = lp_build_format_cache_type(gallivm);
1138    ptr = LLVMBuildGEP2(builder, cache_type, cache, indices, ARRAY_SIZE(indices), "");
1139    LLVMBuildStore(builder, tag_value, ptr);
1140 
1141    indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
1142    hash_index = LLVMBuildMul(builder, hash_index, lp_build_const_int32(gallivm, 16), "");
1143    for (count = 0; count < 4; count++) {
1144       indices[2] = hash_index;
1145       ptr = LLVMBuildGEP2(builder, cache_type, cache, indices, ARRAY_SIZE(indices), "");
1146       ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
1147       LLVMBuildStore(builder, col[count], ptr);
1148       hash_index = LLVMBuildAdd(builder, hash_index, lp_build_const_int32(gallivm, 4), "");
1149    }
1150 }
1151 
1152 static LLVMValueRef
lookup_cache_member(struct gallivm_state * gallivm,LLVMValueRef cache,enum cache_member member,LLVMValueRef index)1153 lookup_cache_member(struct gallivm_state *gallivm, LLVMValueRef cache, enum cache_member member, LLVMValueRef index) {
1154    assert(member == LP_BUILD_FORMAT_CACHE_MEMBER_DATA || member == LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
1155    LLVMBuilderRef builder = gallivm->builder;
1156    LLVMValueRef member_ptr, indices[3];
1157 
1158    indices[0] = lp_build_const_int32(gallivm, 0);
1159    indices[1] = lp_build_const_int32(gallivm, member);
1160    indices[2] = index;
1161 
1162    const char *name =
1163          member == LP_BUILD_FORMAT_CACHE_MEMBER_DATA ? "cache_data" :
1164          member == LP_BUILD_FORMAT_CACHE_MEMBER_TAGS ? "tag_data" : "";
1165 
1166    member_ptr = LLVMBuildGEP2(builder, lp_build_format_cache_type(gallivm),
1167                               cache, indices, ARRAY_SIZE(indices), "cache_gep");
1168 
1169    return LLVMBuildLoad2(builder, lp_build_format_cache_elem_type(gallivm, member), member_ptr, name);
1170 }
1171 
1172 static LLVMValueRef
s3tc_lookup_cached_pixel(struct gallivm_state * gallivm,LLVMValueRef cache,LLVMValueRef index)1173 s3tc_lookup_cached_pixel(struct gallivm_state *gallivm,
1174                          LLVMValueRef cache,
1175                          LLVMValueRef index)
1176 {
1177    return lookup_cache_member(gallivm, cache, LP_BUILD_FORMAT_CACHE_MEMBER_DATA, index);
1178 }
1179 
1180 static LLVMValueRef
s3tc_lookup_tag_data(struct gallivm_state * gallivm,LLVMValueRef cache,LLVMValueRef index)1181 s3tc_lookup_tag_data(struct gallivm_state *gallivm,
1182                      LLVMValueRef cache,
1183                      LLVMValueRef index)
1184 {
1185    return lookup_cache_member(gallivm, cache, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS, index);
1186 }
1187 
#if LP_BUILD_FORMAT_CACHE_DEBUG
/**
 * Add count to one of the 64bit access counters of the format cache.
 *
 * @param ptr  pointer to the cache structure
 * @param count  amount added to the counter
 * @param index  struct member to update; must be the total or the miss
 *               access counter
 */
static void
s3tc_update_cache_access(struct gallivm_state *gallivm,
                         LLVMValueRef ptr,
                         unsigned count,
                         unsigned index)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef i64_type = LLVMInt64TypeInContext(gallivm->context);
   LLVMTypeRef cache_type;
   LLVMValueRef counter_ptr, counter, increment;

   assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
          index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);

   cache_type = lp_build_format_cache_type(gallivm);
   counter_ptr = lp_build_struct_get_ptr2(gallivm, cache_type, ptr, index, "");

   /* load, add, store back */
   counter = LLVMBuildLoad2(builder, i64_type, counter_ptr, "cache_access");
   increment = LLVMConstInt(i64_type, count, 0);
   counter = LLVMBuildAdd(builder, counter, increment, "");
   LLVMBuildStore(builder, counter, counter_ptr);
}
#endif
1208 
1209 /**
1210  * Calculate 1/3(v1-v0) + v0 and 2*1/3(v1-v0) + v0.
1211  * The lerp is performed between the first 2 32bit colors
1212  * in the source vector, both results are returned packed in result vector.
1213  */
1214 static LLVMValueRef
lp_build_lerp23_single(struct lp_build_context * bld,LLVMValueRef v01)1215 lp_build_lerp23_single(struct lp_build_context *bld,
1216                        LLVMValueRef v01)
1217 {
1218    struct gallivm_state *gallivm = bld->gallivm;
1219    LLVMValueRef x, mul, delta, res, v0, v1, elems[8];
1220    const struct lp_type type = bld->type;
1221    LLVMBuilderRef builder = bld->gallivm->builder;
1222    struct lp_type i16_type = lp_wider_type(type);
1223    struct lp_type i32_type = lp_wider_type(i16_type);
1224    struct lp_build_context bld2;
1225 
1226    assert(!type.floating && !type.fixed && !type.norm && type.width == 8);
1227 
1228    lp_build_context_init(&bld2, gallivm, i16_type);
1229    bld2.type.sign = true;
1230 
1231    /* weights 256/3, 256*2/3, with correct rounding */
1232    elems[0] = elems[1] = elems[2] = elems[3] =
1233       lp_build_const_elem(gallivm, i16_type, 255*1/3);
1234    elems[4] = elems[5] = elems[6] = elems[7] =
1235       lp_build_const_elem(gallivm, i16_type, 171);
1236    x = LLVMConstVector(elems, 8);
1237 
1238    /*
1239     * v01 has col0 in 32bit elem 0, col1 in elem 1.
1240     * Interleave/unpack will give us separate v0/v1 vectors.
1241     */
1242    v01 = lp_build_interleave2(gallivm, i32_type, v01, v01, 0);
1243    v01 = LLVMBuildBitCast(builder, v01, bld->vec_type, "");
1244 
1245    lp_build_unpack2(gallivm, type, i16_type, v01, &v0, &v1);
1246    delta = lp_build_sub(&bld2, v1, v0);
1247 
1248    mul = LLVMBuildMul(builder, x, delta, "");
1249 
1250    mul = LLVMBuildLShr(builder, mul, lp_build_const_int_vec(gallivm, i16_type, 8), "");
1251    /* lerp optimization: pack now, do add afterwards */
1252    res = lp_build_pack2(gallivm, i16_type, type, mul, bld2.undef);
1253    /* only lower 2 elems are valid - for these v0 is really v0 */
1254    return lp_build_add(bld, res, v01);
1255 }
1256 
1257 /*
1258  * decode one dxt1 block.
1259  */
1260 static void
s3tc_decode_block_dxt1(struct gallivm_state * gallivm,enum pipe_format format,LLVMValueRef dxt_block,LLVMValueRef * col)1261 s3tc_decode_block_dxt1(struct gallivm_state *gallivm,
1262                        enum pipe_format format,
1263                        LLVMValueRef dxt_block,
1264                        LLVMValueRef *col)
1265 {
1266    LLVMBuilderRef builder = gallivm->builder;
1267    LLVMValueRef color01, color23, color01_16, color0123;
1268    LLVMValueRef rgba, tmp, a, sel_mask, indices, code, const2;
1269    struct lp_type type8, type32, type16, type64;
1270    struct lp_build_context bld8, bld32, bld16, bld64;
1271    unsigned i;
1272    bool is_dxt1_variant = format_dxt1_variant(format);
1273 
1274    memset(&type32, 0, sizeof type32);
1275    type32.width = 32;
1276    type32.length = 4;
1277    type32.sign = true;
1278 
1279    memset(&type8, 0, sizeof type8);
1280    type8.width = 8;
1281    type8.length = 16;
1282 
1283    memset(&type16, 0, sizeof type16);
1284    type16.width = 16;
1285    type16.length = 8;
1286 
1287    memset(&type64, 0, sizeof type64);
1288    type64.width = 64;
1289    type64.length = 2;
1290 
1291    a = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1292    const2 = lp_build_const_int_vec(gallivm, type32, 2);
1293 
1294    lp_build_context_init(&bld32, gallivm, type32);
1295    lp_build_context_init(&bld16, gallivm, type16);
1296    lp_build_context_init(&bld8, gallivm, type8);
1297    lp_build_context_init(&bld64, gallivm, type64);
1298 
1299    if (is_dxt1_variant) {
1300       color01 = lp_build_shuffle1undef(gallivm, dxt_block, 0, 4);
1301       code = lp_build_shuffle1undef(gallivm, dxt_block, 1, 4);
1302    } else {
1303       color01 = lp_build_shuffle1undef(gallivm, dxt_block, 2, 4);
1304       code = lp_build_shuffle1undef(gallivm, dxt_block, 3, 4);
1305    }
1306    code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
1307    /* expand bytes to dwords */
1308    code = lp_build_interleave2(gallivm, type8, code, code, 0);
1309    code = lp_build_interleave2(gallivm, type8, code, code, 0);
1310 
1311 
1312    /*
1313     * works as follows:
1314     * - expand color0/color1 to rgba8888
1315     * - calculate color2/3 (interpolation) according to color0 < color1 rules
1316     * - calculate color2/3 according to color0 >= color1 rules
1317     * - do selection of color2/3 according to comparison of color0/1
1318     * - extract indices.
1319     * - use compare/select to select the correct color. Since we have 2bit
1320     *   indices (and 4 colors), needs at least three compare/selects.
1321     */
1322 
1323    /*
1324     * expand the two colors
1325     */
1326    color01 = LLVMBuildBitCast(builder, color01, bld16.vec_type, "");
1327    color01 = lp_build_interleave2(gallivm, type16, color01,
1328                                   bld16.zero, 0);
1329    color01_16 = LLVMBuildBitCast(builder, color01, bld32.vec_type, "");
1330    color01 = color_expand_565_to_8888(gallivm, 4, color01_16);
1331 
1332    /*
1333     * interpolate colors
1334     * color2_1 is 2/3 color0 + 1/3 color1
1335     * color3_1 is 1/3 color0 + 2/3 color1
1336     * color2_2 is 1/2 color0 + 1/2 color1
1337     * color3_2 is 0
1338     */
1339 
1340    /* TODO: since this is now always scalar, should
1341     * probably just use control flow here instead of calculating
1342     * both cases and then selection
1343     */
1344    if (format == PIPE_FORMAT_DXT1_RGBA ||
1345        format == PIPE_FORMAT_DXT1_SRGBA) {
1346       color01 = LLVMBuildOr(builder, color01, a, "");
1347    }
1348    /* can combine 2 lerps into one mostly */
1349    color23 = lp_build_lerp23_single(&bld8, color01);
1350    color23 = LLVMBuildBitCast(builder, color23, bld32.vec_type, "");
1351 
1352    /* dxt3/5 always use 4-color encoding */
1353    if (is_dxt1_variant) {
1354       LLVMValueRef color23_2, color2_2;
1355 
1356       if (util_get_cpu_caps()->has_sse2) {
1357          LLVMValueRef intrargs[2];
1358          intrargs[0] = LLVMBuildBitCast(builder, color01, bld8.vec_type, "");
1359          /* same interleave as for lerp23 - correct result in 2nd element */
1360          intrargs[1] = lp_build_interleave2(gallivm, type32, color01, color01, 0);
1361          intrargs[1] = LLVMBuildBitCast(builder, intrargs[1], bld8.vec_type, "");
1362          color2_2 = lp_build_pavgb(&bld8, intrargs[0], intrargs[1]);
1363       }
1364       else {
1365          LLVMValueRef v01, v0, v1, vhalf;
1366          /*
1367           * This isn't as expensive as it looks (the unpack is the same as
1368           * for lerp23, which is the reason why we do the pointless
1369           * interleave2 too), with correct rounding (the two lower elements
1370           * will be the same).
1371           */
1372          v01 = lp_build_interleave2(gallivm, type32, color01, color01, 0);
1373          v01 = LLVMBuildBitCast(builder, v01, bld8.vec_type, "");
1374          lp_build_unpack2(gallivm, type8, type16, v01, &v0, &v1);
1375          vhalf = lp_build_add(&bld16, v0, v1);
1376          vhalf = LLVMBuildLShr(builder, vhalf, bld16.one, "");
1377          color2_2 = lp_build_pack2(gallivm, type16, type8, vhalf, bld16.undef);
1378       }
1379       /* shuffle in color 3 as elem 2 zero, color 2 elem 1 */
1380       color23_2 = LLVMBuildBitCast(builder, color2_2, bld64.vec_type, "");
1381       color23_2 = LLVMBuildLShr(builder, color23_2,
1382                                 lp_build_const_int_vec(gallivm, type64, 32), "");
1383       color23_2 = LLVMBuildBitCast(builder, color23_2, bld32.vec_type, "");
1384 
1385       tmp = LLVMBuildBitCast(builder, color01_16, bld64.vec_type, "");
1386       tmp = LLVMBuildLShr(builder, tmp,
1387                           lp_build_const_int_vec(gallivm, type64, 32), "");
1388       tmp = LLVMBuildBitCast(builder, tmp, bld32.vec_type, "");
1389       sel_mask = lp_build_compare(gallivm, type32, PIPE_FUNC_GREATER,
1390                                   color01_16, tmp);
1391       sel_mask = lp_build_interleave2(gallivm, type32, sel_mask, sel_mask, 0);
1392       color23 = lp_build_select(&bld32, sel_mask, color23, color23_2);
1393    }
1394 
1395    if (util_get_cpu_caps()->has_ssse3) {
1396       /*
1397        * Use pshufb as mini-lut. (Only doable with intrinsics as the
1398        * final shuffles are non-constant. pshufb is awesome!)
1399        */
1400       LLVMValueRef shuf[16], low2mask;
1401       LLVMValueRef intrargs[2], lut_ind, lut_adj;
1402 
1403       color01 = LLVMBuildBitCast(builder, color01, bld64.vec_type, "");
1404       color23 = LLVMBuildBitCast(builder, color23, bld64.vec_type, "");
1405       color0123 = lp_build_interleave2(gallivm, type64, color01, color23, 0);
1406       color0123 = LLVMBuildBitCast(builder, color0123, bld32.vec_type, "");
1407 
1408       if (format == PIPE_FORMAT_DXT1_RGB ||
1409           format == PIPE_FORMAT_DXT1_SRGB) {
1410          color0123 = LLVMBuildOr(builder, color0123, a, "");
1411       }
1412 
1413       /* shuffle as r0r1r2r3g0g1... */
1414       for (i = 0; i < 4; i++) {
1415          shuf[4*i] = lp_build_const_int32(gallivm, 0 + i);
1416          shuf[4*i+1] = lp_build_const_int32(gallivm, 4 + i);
1417          shuf[4*i+2] = lp_build_const_int32(gallivm, 8 + i);
1418          shuf[4*i+3] = lp_build_const_int32(gallivm, 12 + i);
1419       }
1420       color0123 = LLVMBuildBitCast(builder, color0123, bld8.vec_type, "");
1421       color0123 = LLVMBuildShuffleVector(builder, color0123, bld8.undef,
1422                                          LLVMConstVector(shuf, 16), "");
1423 
1424       /* lowest 2 bits of each 8 bit value contain index into "LUT" */
1425       low2mask = lp_build_const_int_vec(gallivm, type8, 3);
1426       /* add 0/4/8/12 for r/g/b/a */
1427       lut_adj = lp_build_const_int_vec(gallivm, type32, 0x0c080400);
1428       lut_adj = LLVMBuildBitCast(builder, lut_adj, bld8.vec_type, "");
1429       intrargs[0] = color0123;
1430       for (i = 0; i < 4; i++) {
1431          lut_ind = LLVMBuildAnd(builder, code, low2mask, "");
1432          lut_ind = LLVMBuildOr(builder, lut_ind, lut_adj, "");
1433          intrargs[1] = lut_ind;
1434          col[i] = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1435                                      bld8.vec_type, intrargs, 2, 0);
1436          col[i] = LLVMBuildBitCast(builder, col[i], bld32.vec_type, "");
1437          code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
1438          code = LLVMBuildLShr(builder, code, const2, "");
1439          code = LLVMBuildBitCast(builder, code, bld8.vec_type, "");
1440       }
1441    }
1442    else {
1443       /* Thanks to vectorization can do 4 texels in parallel */
1444       LLVMValueRef color0, color1, color2, color3;
1445       if (format == PIPE_FORMAT_DXT1_RGB ||
1446           format == PIPE_FORMAT_DXT1_SRGB) {
1447          color01 = LLVMBuildOr(builder, color01, a, "");
1448          color23 = LLVMBuildOr(builder, color23, a, "");
1449       }
1450       color0 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
1451                                       lp_build_const_shuffle1(gallivm, 0, 4), "");
1452       color1 = LLVMBuildShuffleVector(builder, color01, bld32.undef,
1453                                       lp_build_const_shuffle1(gallivm, 1, 4), "");
1454       color2 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
1455                                       lp_build_const_shuffle1(gallivm, 0, 4), "");
1456       color3 = LLVMBuildShuffleVector(builder, color23, bld32.undef,
1457                                       lp_build_const_shuffle1(gallivm, 1, 4), "");
1458       code = LLVMBuildBitCast(builder, code, bld32.vec_type, "");
1459 
1460       for (i = 0; i < 4; i++) {
1461          /* select the colors */
1462          LLVMValueRef selmasklo, rgba01, rgba23, bitlo;
1463          bitlo = bld32.one;
1464          indices = LLVMBuildAnd(builder, code, bitlo, "");
1465          selmasklo = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
1466                                       indices, bitlo);
1467          rgba01 = lp_build_select(&bld32, selmasklo, color1, color0);
1468 
1469          LLVMValueRef selmaskhi;
1470          indices = LLVMBuildAnd(builder, code, const2, "");
1471          selmaskhi = lp_build_compare(gallivm, type32, PIPE_FUNC_EQUAL,
1472                                       indices, const2);
1473          rgba23 = lp_build_select(&bld32, selmasklo, color3, color2);
1474          rgba = lp_build_select(&bld32, selmaskhi, rgba23, rgba01);
1475 
1476          /*
1477           * Note that this will give "wrong" order.
1478           * col0 will be rgba0, rgba4, rgba8, rgba12, col1 rgba1, rgba5, ...
1479           * This would be easily fixable by using different shuffle, bitlo/hi
1480           * vectors above (and different shift), but seems slightly easier to
1481           * deal with for dxt3/dxt5 alpha too. So instead change lookup.
1482           */
1483          col[i] = rgba;
1484          code = LLVMBuildLShr(builder, code, const2, "");
1485       }
1486    }
1487 }
1488 
1489 /*
1490  * decode one dxt3 block.
1491  */
1492 static void
s3tc_decode_block_dxt3(struct gallivm_state * gallivm,enum pipe_format format,LLVMValueRef dxt_block,LLVMValueRef * col)1493 s3tc_decode_block_dxt3(struct gallivm_state *gallivm,
1494                        enum pipe_format format,
1495                        LLVMValueRef dxt_block,
1496                        LLVMValueRef *col)
1497 {
1498    LLVMBuilderRef builder = gallivm->builder;
1499    LLVMValueRef alpha, alphas0, alphas1, shift4_16, a[4], mask8hi;
1500    struct lp_type type32, type8, type16;
1501    unsigned i;
1502 
1503    memset(&type32, 0, sizeof type32);
1504    type32.width = 32;
1505    type32.length = 4;
1506 
1507    memset(&type8, 0, sizeof type8);
1508    type8.width = 8;
1509    type8.length = 16;
1510 
1511    memset(&type16, 0, sizeof type16);
1512    type16.width = 16;
1513    type16.length = 8;
1514 
1515    s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1516 
1517    shift4_16 = lp_build_const_int_vec(gallivm, type16, 4);
1518    mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1519 
1520    alpha = LLVMBuildBitCast(builder, dxt_block,
1521                             lp_build_vec_type(gallivm, type8), "");
1522    alpha = lp_build_interleave2(gallivm, type8, alpha, alpha, 0);
1523    alpha = LLVMBuildBitCast(builder, alpha,
1524                             lp_build_vec_type(gallivm, type16), "");
1525    alpha = LLVMBuildAnd(builder, alpha,
1526                         lp_build_const_int_vec(gallivm, type16, 0xf00f), "");
1527    alphas0 = LLVMBuildLShr(builder, alpha, shift4_16, "");
1528    alphas1 = LLVMBuildShl(builder, alpha, shift4_16, "");
1529    alpha = LLVMBuildOr(builder, alphas0, alpha, "");
1530    alpha = LLVMBuildOr(builder, alphas1, alpha, "");
1531    alpha = LLVMBuildBitCast(builder, alpha,
1532                             lp_build_vec_type(gallivm, type32), "");
1533    /*
1534     * alpha now contains elems 0,1,2,3,... (ubytes)
1535     * we need 0,4,8,12, 1,5,9,13 etc. in dwords to match color (which
1536     * is just as easy as "natural" order - 3 shift/and instead of 6 unpack).
1537     */
1538    a[0] = LLVMBuildShl(builder, alpha,
1539                        lp_build_const_int_vec(gallivm, type32, 24), "");
1540    a[1] = LLVMBuildShl(builder, alpha,
1541                        lp_build_const_int_vec(gallivm, type32, 16), "");
1542    a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1543    a[2] = LLVMBuildShl(builder, alpha,
1544                        lp_build_const_int_vec(gallivm, type32, 8), "");
1545    a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1546    a[3] = LLVMBuildAnd(builder, alpha, mask8hi, "");
1547 
1548    for (i = 0; i < 4; i++) {
1549       col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1550    }
1551 }
1552 
1553 
1554 static LLVMValueRef
lp_build_lerpdxta_block(struct gallivm_state * gallivm,LLVMValueRef alpha0,LLVMValueRef alpha1,LLVMValueRef code,LLVMValueRef sel_mask)1555 lp_build_lerpdxta_block(struct gallivm_state *gallivm,
1556                         LLVMValueRef alpha0,
1557                         LLVMValueRef alpha1,
1558                         LLVMValueRef code,
1559                         LLVMValueRef sel_mask)
1560 {
1561    LLVMBuilderRef builder = gallivm->builder;
1562    LLVMValueRef delta, ainterp;
1563    LLVMValueRef weight5, weight7, weight;
1564    struct lp_type type16;
1565    struct lp_build_context bld;
1566 
1567    memset(&type16, 0, sizeof type16);
1568    type16.width = 16;
1569    type16.length = 8;
1570    type16.sign = true;
1571 
1572    lp_build_context_init(&bld, gallivm, type16);
1573    /*
1574     * 256/7 is only 36.57 so we'd lose quite some precision. Since it would
1575     * actually be desirable to do this here with even higher accuracy than
1576     * even 8 bit (more or less required for rgtc, albeit that's not handled
1577     * here right now), shift the weights after multiplication by code.
1578     */
1579    weight5 = lp_build_const_int_vec(gallivm, type16, 256*64/5);
1580    weight7 = lp_build_const_int_vec(gallivm, type16, 256*64/7);
1581    weight = lp_build_select(&bld, sel_mask, weight7, weight5);
1582 
1583    /*
1584     * we'll get garbage in the elements which had code 0 (or larger than
1585     * 5 or 7) but we don't care (or rather, need to fix up anyway).
1586     */
1587    code = LLVMBuildSub(builder, code, bld.one, "");
1588 
1589    weight = LLVMBuildMul(builder, weight, code, "");
1590    weight = LLVMBuildLShr(builder, weight,
1591                           lp_build_const_int_vec(gallivm, type16, 6), "");
1592 
1593    delta = LLVMBuildSub(builder, alpha1, alpha0, "");
1594 
1595    ainterp = LLVMBuildMul(builder, delta, weight, "");
1596    ainterp = LLVMBuildLShr(builder, ainterp,
1597                            lp_build_const_int_vec(gallivm, type16, 8), "");
1598 
1599    /* lerp is done later (with packed values) */
1600 
1601    return ainterp;
1602 }
1603 
1604 
1605 /*
1606  * decode one dxt5 block.
      *
      * The color part is produced by s3tc_decode_block_dxt1(); the alpha
      * part is computed here and ORed into the 4 color vectors.
      *
      * gallivm    state providing the LLVM builder and context
      * format     pipe format, passed through to the dxt1 color decode
      * dxt_block  the block to decode (bitcast to 2 x 64bit below)
      * col        output, 4 vectors of 4 x 32bit
      *
      * Two code paths exist, chosen by util_get_cpu_caps()->has_ssse3. They
      * keep the intermediate alpha values in a different element order, see
      * the comments in the respective branch.
1607  */
1608 static void
s3tc_decode_block_dxt5(struct gallivm_state * gallivm,enum pipe_format format,LLVMValueRef dxt_block,LLVMValueRef * col)1609 s3tc_decode_block_dxt5(struct gallivm_state *gallivm,
1610                        enum pipe_format format,
1611                        LLVMValueRef dxt_block,
1612                        LLVMValueRef *col)
1613 {
1614    LLVMBuilderRef builder = gallivm->builder;
1615    LLVMValueRef alpha, alpha0, alpha1, ares;
1616    LLVMValueRef ainterp, ainterp0, ainterp1, shuffle1, sel_mask, sel_mask2;
1617    LLVMValueRef a[4], acode, tmp0, tmp1;
1618    LLVMTypeRef i64t, i32t;
1619    struct lp_type type32, type64, type8, type16;
1620    struct lp_build_context bld16, bld8;
1621    unsigned i;
1622 
1623    memset(&type32, 0, sizeof type32);
1624    type32.width = 32;
1625    type32.length = 4;
1626 
1627    memset(&type64, 0, sizeof type64);
1628    type64.width = 64;
1629    type64.length = 2;
1630 
1631    memset(&type8, 0, sizeof type8);
1632    type8.width = 8;
1633    type8.length = 16;
1634 
1635    memset(&type16, 0, sizeof type16);
1636    type16.width = 16;
1637    type16.length = 8;
1638 
1639    lp_build_context_init(&bld16, gallivm, type16);
1640    lp_build_context_init(&bld8, gallivm, type8);
1641 
1642    i64t = lp_build_vec_type(gallivm, type64);
1643    i32t = lp_build_vec_type(gallivm, type32);
1644 
        /* colors are decoded by the dxt1 decoder, alpha is merged in at the end */
1645    s3tc_decode_block_dxt1(gallivm, format, dxt_block, col);
1646 
1647    /*
1648     * three possible strategies for vectorizing alpha:
1649     * 1) compute all 8 values then use scalar extraction
1650     *    (i.e. have all 8 alpha values packed in one 64bit scalar
1651     *    and do something like ax = vals >> (codex * 8) followed
1652     *    by inserting these values back into color)
1653     * 2) same as 1) but just use pshufb as a mini-LUT for selection.
1654     *    (without pshufb would need boatloads of cmp/selects trying to
1655     *    keep things vectorized for essentially scalar selection).
1656     * 3) do something similar to the uncached case
1657     *    needs more calculations (need to calc 16 values instead of 8 though
1658     *    that's only an issue for the lerp which we need to do twice otherwise
1659     *    everything still fits into 128bit) but keeps things vectorized mostly.
1660     * Trying 3) here though not sure it's really faster...
1661     * With pshufb, we try 2) (cheaper and more accurate)
1662     */
1663 
1664    /*
1665     * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1666     * help since code crosses 8bit boundaries). But variable shifts are
1667     * AVX2 only, and even then only dword/quadword (intel _really_ hates
1668     * shifts!). Instead, emulate by 16bit muls.
1669     * Also, the required byte shuffles are essentially non-emulatable, so
1670     * require ssse3 (albeit other archs might do them fine).
1671     * This is not directly tied to ssse3 - just need sane byte shuffles.
1672     * But ordering is going to be different below so use same condition.
1673     */
1674 
1675 
1676    /* vectorize alpha */
1677    alpha = LLVMBuildBitCast(builder, dxt_block, i64t, "");
        /* alpha0: lowest byte of each 64bit element */
1678    alpha0 = LLVMBuildAnd(builder, alpha,
1679                          lp_build_const_int_vec(gallivm, type64, 0xff), "");
1680    alpha0 = LLVMBuildBitCast(builder, alpha0, bld16.vec_type, "");
1681    alpha = LLVMBuildBitCast(builder, alpha, bld16.vec_type, "");
        /* alpha1: every 16bit element shifted down by 8 */
1682    alpha1 = LLVMBuildLShr(builder, alpha,
1683                           lp_build_const_int_vec(gallivm, type16, 8), "");
1684    alpha = LLVMBuildBitCast(builder, alpha,  i64t, "");
        /*
         * NOTE(review): shuffle1 presumably broadcasts 16bit element 0 to all
         * 8 elements - confirm against lp_build_const_shuffle1().
         */
1685    shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);
1686    alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");
1687    alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");
1688 
        /*
         * sel_mask = alpha0 > alpha1. The compare is requested as signed
         * (type16.sign is only flipped around this one call); all elements
         * are below 256 at this point (masked with 0xff resp. shifted right
         * by 8), so none of them is negative when read as signed 16bit.
         */
1689    type16.sign = true;
1690    sel_mask = lp_build_compare(gallivm, type16, PIPE_FUNC_GREATER,
1691                                alpha0, alpha1);
1692    type16.sign = false;
1693    sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1694 
1695    if (!util_get_cpu_caps()->has_ssse3) {
1696       LLVMValueRef acodeg, mask1, acode0, acode1;
1697 
1698       /* extraction of the 3 bit values into something more useful is HARD */
1699       /* first steps are actually scalar */
1700       acode = LLVMBuildLShr(builder, alpha,
1701                             lp_build_const_int_vec(gallivm, type64, 16), "");
1702       tmp0 = LLVMBuildAnd(builder, acode,
1703                           lp_build_const_int_vec(gallivm, type64, 0xffffff), "");
1704       tmp1 =  LLVMBuildLShr(builder, acode,
1705                             lp_build_const_int_vec(gallivm, type64, 24), "");
1706       tmp0 = LLVMBuildBitCast(builder, tmp0, i32t, "");
1707       tmp1 = LLVMBuildBitCast(builder, tmp1, i32t, "");
1708       acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1709       /* now have 2x24bit in 4x32bit, order 01234567, 89..., undef, undef */
1710       tmp0 = LLVMBuildAnd(builder, acode,
1711                           lp_build_const_int_vec(gallivm, type32, 0xfff), "");
1712       tmp1 =  LLVMBuildLShr(builder, acode,
1713                             lp_build_const_int_vec(gallivm, type32, 12), "");
1714       acode = lp_build_interleave2(gallivm, type32, tmp0, tmp1, 0);
1715       /* now have 4x12bit in 4x32bit, order 0123, 4567, ,,, */
1716       tmp0 = LLVMBuildAnd(builder, acode,
1717                           lp_build_const_int_vec(gallivm, type32, 0x3f), "");
1718       tmp1 =  LLVMBuildLShr(builder, acode,
1719                             lp_build_const_int_vec(gallivm, type32, 6), "");
1720       /* use signed pack doesn't matter and otherwise need sse41 */
1721       type32.sign = type16.sign = true;
1722       acode = lp_build_pack2(gallivm, type32, type16, tmp0, tmp1);
1723       type32.sign = type16.sign = false;
1724       /* now have 8x6bit in 8x16bit, 01, 45, 89, ..., 23, 67, ... */
1725       acode0 = LLVMBuildAnd(builder, acode,
1726                             lp_build_const_int_vec(gallivm, type16, 0x7), "");
1727       acode1 =  LLVMBuildLShr(builder, acode,
1728                               lp_build_const_int_vec(gallivm, type16, 3), "");
1729       acode = lp_build_pack2(gallivm, type16, type8, acode0, acode1);
1730       /* acode0 contains elems 0,4,8,12,2,6,10,14, acode1 1,5,9,... */
1731 
           /* acodeg: the codes, zeroed wherever sel_mask is set */
1732       acodeg = LLVMBuildAnd(builder, acode,
1733                             LLVMBuildNot(builder, sel_mask, ""), "");
1734       mask1 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1735                                acode, bld8.one);
1736 
           /*
            * lp_build_lerpdxta_block() is handed 16bit vectors, so sel_mask is
            * viewed as 16bit for the two calls and as 8bit again afterwards.
            */
1737       sel_mask = LLVMBuildBitCast(builder, sel_mask, bld16.vec_type, "");
1738       ainterp0 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode0, sel_mask);
1739       ainterp1 = lp_build_lerpdxta_block(gallivm, alpha0, alpha1, acode1, sel_mask);
1740       sel_mask = LLVMBuildBitCast(builder, sel_mask, bld8.vec_type, "");
1741       ainterp = lp_build_pack2(gallivm, type16, type8, ainterp0, ainterp1);
1742       alpha0 = lp_build_pack2(gallivm, type16, type8, alpha0, alpha0);
1743       alpha1 = lp_build_pack2(gallivm, type16, type8, alpha1, alpha1);
           /* finish the lerp: add alpha0 to the weighted difference (8bit add) */
1744       ainterp = LLVMBuildAdd(builder, ainterp, alpha0, "");
1745       /* Fix up val01 */
1746       sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1747                                    acode, bld8.zero);
1748       ainterp = lp_build_select(&bld8, sel_mask2, alpha0, ainterp);
1749       ainterp = lp_build_select(&bld8, mask1, alpha1, ainterp);
1750 
1751       /* fix up val67 if a0 <= a1 */
           /*
            * acodeg can only be 6 or 7 where sel_mask is clear. Code 6 has its
            * value cleared by the AND with the inverted compare result, code 7
            * gets the compare result ORed in.
            */
1752       sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1753                                    acodeg, lp_build_const_int_vec(gallivm, type8, 6));
1754       ares = LLVMBuildAnd(builder, ainterp, LLVMBuildNot(builder, sel_mask2, ""), "");
1755       sel_mask2 = lp_build_compare(gallivm, type8, PIPE_FUNC_EQUAL,
1756                                    acodeg, lp_build_const_int_vec(gallivm, type8, 7));
1757       ares = LLVMBuildOr(builder, ares, sel_mask2, "");
1758 
1759       /* unpack in right order (0,4,8,12,1,5,..) */
1760       /* this gives us zero, a0, zero, a4, zero, a8, ... for tmp0 */
1761       tmp0 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 0);
1762       tmp1 = lp_build_interleave2(gallivm, type8, bld8.zero, ares, 1);
1763       tmp0 = LLVMBuildBitCast(builder, tmp0, bld16.vec_type, "");
1764       tmp1 = LLVMBuildBitCast(builder, tmp1, bld16.vec_type, "");
1765 
1766       a[0] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 0);
1767       a[1] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 0);
1768       a[2] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp0, 1);
1769       a[3] = lp_build_interleave2(gallivm, type16, bld16.zero, tmp1, 1);
1770    }
1771    else {
1772       LLVMValueRef elems[16], intrargs[2], shufa, mulclo, mulchi, mask8hi;
1773       LLVMTypeRef type16s = LLVMInt16TypeInContext(gallivm->context);
1774       LLVMTypeRef type8s = LLVMInt8TypeInContext(gallivm->context);
           /*
            * NOTE(review): this i shadows the function-level i. Harmless, as
            * the outer i is only used (and initialised) by the final loop of
            * the function, but it would trip -Wshadow.
            */
1775       unsigned i, j;
1776       /*
1777        * Ideally, we'd use 2 variable 16bit shifts here (byte shifts wouldn't
1778        * help since code crosses 8bit boundaries). But variable shifts are
1779        * AVX2 only, and even then only dword/quadword (intel _really_ hates
1780        * shifts!). Instead, emulate by 16bit muls.
1781        * Also, the required byte shuffles are essentially non-emulatable, so
1782        * require ssse3 (albeit other archs might do them fine, but the
1783        * complete path is ssse3 only for now).
1784        */
           /*
            * Byte shuffle indices built by this loop:
            * 2,2,2,3,3,4,4,4, 5,5,5,6,6,7,7,7
            */
1785       for (i = 0, j = 0; i < 16; i += 8, j += 3) {
1786          elems[i+0] = elems[i+1] = elems[i+2] = lp_build_const_int32(gallivm, j+2);
1787          elems[i+3] = elems[i+4] = lp_build_const_int32(gallivm, j+3);
1788          elems[i+5] = elems[i+6] = elems[i+7] = lp_build_const_int32(gallivm, j+4);
1789       }
1790       shufa = LLVMConstVector(elems, 16);
1791       alpha = LLVMBuildBitCast(builder, alpha, bld8.vec_type, "");
1792       acode = LLVMBuildShuffleVector(builder, alpha, bld8.undef, shufa, "");
1793       acode = LLVMBuildBitCast(builder, acode, bld16.vec_type, "");
1794       /*
1795        * Put 0/2/4/6 into high 3 bits of 16 bits (save AND mask)
1796        * Do the same for 1/3/5/7 (albeit still need mask there - ideally
1797        * we'd place them into bits 4-7 so could save shift but impossible.)
1798        */
1799       for (i = 0; i < 8; i += 4) {
1800          elems[i+0] = LLVMConstInt(type16s, 1 << (13-0), 0);
1801          elems[i+1] = LLVMConstInt(type16s, 1 << (13-6), 0);
1802          elems[i+2] = LLVMConstInt(type16s, 1 << (13-4), 0);
1803          elems[i+3] = LLVMConstInt(type16s, 1 << (13-2), 0);
1804       }
1805       mulclo = LLVMConstVector(elems, 8);
1806       for (i = 0; i < 8; i += 4) {
1807          elems[i+0] = LLVMConstInt(type16s, 1 << (13-3), 0);
1808          elems[i+1] = LLVMConstInt(type16s, 1 << (13-9), 0);
1809          elems[i+2] = LLVMConstInt(type16s, 1 << (13-7), 0);
1810          elems[i+3] = LLVMConstInt(type16s, 1 << (13-5), 0);
1811       }
1812       mulchi = LLVMConstVector(elems, 8);
1813 
           /*
            * The multiplies act as per-element left shifts. tmp0 ends up with
            * a 3 bit code in bits 0-2, tmp1 (after the 0x700 mask) with one in
            * bits 8-10, so after the OR every byte holds one code.
            */
1814       tmp0 = LLVMBuildMul(builder, acode, mulclo, "");
1815       tmp1 = LLVMBuildMul(builder, acode, mulchi, "");
1816       tmp0 = LLVMBuildLShr(builder, tmp0,
1817                            lp_build_const_int_vec(gallivm, type16, 13), "");
1818       tmp1 = LLVMBuildLShr(builder, tmp1,
1819                            lp_build_const_int_vec(gallivm, type16, 5), "");
1820       tmp1 = LLVMBuildAnd(builder, tmp1,
1821                           lp_build_const_int_vec(gallivm, type16, 0x700), "");
1822       acode = LLVMBuildOr(builder, tmp0, tmp1, "");
1823       acode = LLVMBuildBitCast(builder, acode, bld8.vec_type, "");
1824 
1825       /*
1826        * Note that ordering is different here to non-ssse3 path:
1827        * 0/1/2/3/4/5...
1828        */
1829 
1830       LLVMValueRef weight0, weight1, weight, delta;
1831       LLVMValueRef constff_elem7, const0_elem6;
1832       /* weights, correctly rounded (round(256*x/7)) */
1833       elems[0] = LLVMConstInt(type16s, 256, 0);
1834       elems[1] = LLVMConstInt(type16s, 0, 0);
1835       elems[2] = LLVMConstInt(type16s, 219, 0);
1836       elems[3] =  LLVMConstInt(type16s, 183, 0);
1837       elems[4] =  LLVMConstInt(type16s, 146, 0);
1838       elems[5] =  LLVMConstInt(type16s, 110, 0);
1839       elems[6] =  LLVMConstInt(type16s, 73, 0);
1840       elems[7] =  LLVMConstInt(type16s, 37, 0);
1841       weight0 = LLVMConstVector(elems, 8);
1842 
           /*
            * same with 5 steps (round(256*x/5)); elems 6 and 7 are not
            * interpolated, they get their constant values further down
            */
1843       elems[0] = LLVMConstInt(type16s, 256, 0);
1844       elems[1] = LLVMConstInt(type16s, 0, 0);
1845       elems[2] = LLVMConstInt(type16s, 205, 0);
1846       elems[3] =  LLVMConstInt(type16s, 154, 0);
1847       elems[4] =  LLVMConstInt(type16s, 102, 0);
1848       elems[5] =  LLVMConstInt(type16s, 51, 0);
1849       elems[6] =  LLVMConstInt(type16s, 0, 0);
1850       elems[7] =  LLVMConstInt(type16s, 0, 0);
1851       weight1 = LLVMConstVector(elems, 8);
1852 
           /*
            * sel_mask is a byte vector at this point, hence the casts around
            * the select (presumably weight0 is taken where the mask is set).
            */
1853       weight0 = LLVMBuildBitCast(builder, weight0, bld8.vec_type, "");
1854       weight1 = LLVMBuildBitCast(builder, weight1, bld8.vec_type, "");
1855       weight = lp_build_select(&bld8, sel_mask, weight0, weight1);
1856       weight = LLVMBuildBitCast(builder, weight, bld16.vec_type, "");
1857 
           /* byte vector: 0xff in elem 7, zero everywhere else */
1858       for (i = 0; i < 16; i++) {
1859          elems[i] = LLVMConstNull(type8s);
1860       }
1861       elems[7] = LLVMConstInt(type8s, 255, 0);
1862       constff_elem7 = LLVMConstVector(elems, 16);
1863 
           /* byte vector: zero in elem 6, 0xff everywhere else */
1864       for (i = 0; i < 16; i++) {
1865          elems[i] = LLVMConstInt(type8s, 255, 0);
1866       }
1867       elems[6] = LLVMConstInt(type8s, 0, 0);
1868       const0_elem6 = LLVMConstVector(elems, 16);
1869 
1870       /* standard simple lerp - but the version we need isn't available */
           /*
            * ((alpha0 - alpha1) * weight >> 8) + alpha1, with the add done on
            * bytes. Weight 256 (elem 0) thus yields alpha0 and weight 0
            * (elem 1) yields alpha1.
            */
1871       delta = LLVMBuildSub(builder, alpha0, alpha1, "");
1872       ainterp = LLVMBuildMul(builder, delta, weight, "");
1873       ainterp = LLVMBuildLShr(builder, ainterp,
1874                               lp_build_const_int_vec(gallivm, type16, 8), "");
1875       ainterp = LLVMBuildBitCast(builder, ainterp, bld8.vec_type, "");
1876       alpha1 = LLVMBuildBitCast(builder, alpha1, bld8.vec_type, "");
1877       ainterp = LLVMBuildAdd(builder, ainterp, alpha1, "");
1878       ainterp = LLVMBuildBitCast(builder, ainterp, bld16.vec_type, "");
1879       ainterp = lp_build_pack2(gallivm, type16, type8, ainterp, bld16.undef);
1880 
1881       /* fixing 0/0xff case is slightly more complex */
           /*
            * Where sel_mask is set both constants degenerate (all zero for the
            * OR, all ones for the AND) and nothing changes. Where it is clear,
            * elem 7 is forced to 0xff and elem 6 to 0.
            */
1882       constff_elem7 = LLVMBuildAnd(builder, constff_elem7,
1883                                    LLVMBuildNot(builder, sel_mask, ""), "");
1884       const0_elem6 = LLVMBuildOr(builder, const0_elem6, sel_mask, "");
1885       ainterp = LLVMBuildOr(builder, ainterp, constff_elem7, "");
1886       ainterp = LLVMBuildAnd(builder, ainterp, const0_elem6, "");
1887 
1888       /* now pick all 16 elements at once! */
1889       intrargs[0] = ainterp;
1890       intrargs[1] = acode;
1891       ares = lp_build_intrinsic(builder, "llvm.x86.ssse3.pshuf.b.128",
1892                                 bld8.vec_type, intrargs, 2, 0);
1893 
           /*
            * Move byte 0/1/2/3 of every dword into the top byte, giving one
            * vector per byte position.
            */
1894       ares = LLVMBuildBitCast(builder, ares, i32t, "");
1895       mask8hi = lp_build_const_int_vec(gallivm, type32, 0xff000000);
1896       a[0] = LLVMBuildShl(builder, ares,
1897                           lp_build_const_int_vec(gallivm, type32, 24), "");
1898       a[1] = LLVMBuildShl(builder, ares,
1899                           lp_build_const_int_vec(gallivm, type32, 16), "");
1900       a[1] = LLVMBuildAnd(builder, a[1], mask8hi, "");
1901       a[2] = LLVMBuildShl(builder, ares,
1902                           lp_build_const_int_vec(gallivm, type32, 8), "");
1903       a[2] = LLVMBuildAnd(builder, a[2], mask8hi, "");
1904       a[3] = LLVMBuildAnd(builder, ares, mask8hi, "");
1905    }
1906 
        /*
         * Merge alpha into the decoded colors. The bitcast is there because
         * the non-ssse3 branch builds a[] with 16bit interleaves.
         */
1907    for (i = 0; i < 4; i++) {
1908       a[i] = LLVMBuildBitCast(builder, a[i], i32t, "");
1909       col[i] = LLVMBuildOr(builder, col[i], a[i], "");
1910    }
1911 }
1912 
1913 
1914 static void
generate_update_cache_one_block(struct gallivm_state * gallivm,LLVMValueRef function,const struct util_format_description * format_desc)1915 generate_update_cache_one_block(struct gallivm_state *gallivm,
1916                                 LLVMValueRef function,
1917                                 const struct util_format_description *format_desc)
1918 {
1919    LLVMBasicBlockRef block;
1920    LLVMBuilderRef old_builder;
1921    LLVMValueRef ptr_addr;
1922    LLVMValueRef hash_index;
1923    LLVMValueRef cache;
1924    LLVMValueRef dxt_block, tag_value;
1925    LLVMValueRef col[LP_MAX_VECTOR_LENGTH];
1926 
1927    ptr_addr     = LLVMGetParam(function, 0);
1928    hash_index   = LLVMGetParam(function, 1);
1929    cache        = LLVMGetParam(function, 2);
1930 
1931    lp_build_name(ptr_addr,   "ptr_addr"  );
1932    lp_build_name(hash_index, "hash_index");
1933    lp_build_name(cache,      "cache_addr");
1934 
1935    /*
1936     * Function body
1937     */
1938 
1939    old_builder = gallivm->builder;
1940    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
1941    gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
1942    LLVMPositionBuilderAtEnd(gallivm->builder, block);
1943 
1944    lp_build_gather_s3tc_simple_scalar(gallivm, format_desc, &dxt_block,
1945                                       ptr_addr);
1946 
1947    switch (format_desc->format) {
1948    case PIPE_FORMAT_DXT1_RGB:
1949    case PIPE_FORMAT_DXT1_RGBA:
1950    case PIPE_FORMAT_DXT1_SRGB:
1951    case PIPE_FORMAT_DXT1_SRGBA:
1952       s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1953       break;
1954    case PIPE_FORMAT_DXT3_RGBA:
1955    case PIPE_FORMAT_DXT3_SRGBA:
1956       s3tc_decode_block_dxt3(gallivm, format_desc->format, dxt_block, col);
1957       break;
1958    case PIPE_FORMAT_DXT5_RGBA:
1959    case PIPE_FORMAT_DXT5_SRGBA:
1960       s3tc_decode_block_dxt5(gallivm, format_desc->format, dxt_block, col);
1961       break;
1962    default:
1963       assert(0);
1964       s3tc_decode_block_dxt1(gallivm, format_desc->format, dxt_block, col);
1965       break;
1966    }
1967 
1968    tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
1969                                  LLVMInt64TypeInContext(gallivm->context), "");
1970    s3tc_store_cached_block(gallivm, col, tag_value, hash_index, cache);
1971 
1972    LLVMBuildRetVoid(gallivm->builder);
1973 
1974    LLVMDisposeBuilder(gallivm->builder);
1975    gallivm->builder = old_builder;
1976 
1977    gallivm_verify_function(gallivm, function);
1978 }
1979 
1980 
1981 static void
update_cached_block(struct gallivm_state * gallivm,const struct util_format_description * format_desc,LLVMValueRef ptr_addr,LLVMValueRef hash_index,LLVMValueRef cache)1982 update_cached_block(struct gallivm_state *gallivm,
1983                     const struct util_format_description *format_desc,
1984                     LLVMValueRef ptr_addr,
1985                     LLVMValueRef hash_index,
1986                     LLVMValueRef cache)
1987 
1988 {
1989    LLVMBuilderRef builder = gallivm->builder;
1990    LLVMModuleRef module = gallivm->module;
1991    char name[256];
1992    LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
1993    LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
1994    LLVMValueRef function, inst;
1995    LLVMBasicBlockRef bb;
1996    LLVMValueRef args[3];
1997 
1998    snprintf(name, sizeof name, "%s_update_cache_one_block",
1999             format_desc->short_name);
2000    function = LLVMGetNamedFunction(module, name);
2001 
2002    LLVMTypeRef ret_type = LLVMVoidTypeInContext(gallivm->context);
2003    LLVMTypeRef arg_types[3];
2004    arg_types[0] = pi8t;
2005    arg_types[1] = LLVMInt32TypeInContext(gallivm->context);
2006    arg_types[2] = LLVMTypeOf(cache); // XXX: put right type here
2007    LLVMTypeRef function_type = LLVMFunctionType(ret_type, arg_types, ARRAY_SIZE(arg_types), 0);
2008 
2009    if (!function) {
2010       function = LLVMAddFunction(module, name, function_type);
2011 
2012       for (unsigned arg = 0; arg < ARRAY_SIZE(arg_types); ++arg)
2013          if (LLVMGetTypeKind(arg_types[arg]) == LLVMPointerTypeKind)
2014             lp_add_function_attr(function, arg + 1, LP_FUNC_ATTR_NOALIAS);
2015 
2016       LLVMSetFunctionCallConv(function, LLVMFastCallConv);
2017       LLVMSetVisibility(function, LLVMHiddenVisibility);
2018       generate_update_cache_one_block(gallivm, function, format_desc);
2019    }
2020 
2021    args[0] = ptr_addr;
2022    args[1] = hash_index;
2023    args[2] = cache;
2024 
2025    LLVMBuildCall2(builder, function_type, function, args, ARRAY_SIZE(args), "");
2026    bb = LLVMGetInsertBlock(builder);
2027    inst = LLVMGetLastInstruction(bb);
2028    LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
2029 }
2030 
2031 /*
2032  * cached lookup
2033  */
2034 static LLVMValueRef
compressed_fetch_cached(struct gallivm_state * gallivm,const struct util_format_description * format_desc,unsigned n,LLVMValueRef base_ptr,LLVMValueRef offset,LLVMValueRef i,LLVMValueRef j,LLVMValueRef cache)2035 compressed_fetch_cached(struct gallivm_state *gallivm,
2036                         const struct util_format_description *format_desc,
2037                         unsigned n,
2038                         LLVMValueRef base_ptr,
2039                         LLVMValueRef offset,
2040                         LLVMValueRef i,
2041                         LLVMValueRef j,
2042                         LLVMValueRef cache)
2043 
2044 {
2045    LLVMBuilderRef builder = gallivm->builder;
2046    unsigned count, low_bit, log2size;
2047    LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
2048    LLVMValueRef ij_index, hash_index, hash_mask, block_index;
2049    LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2050    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2051    LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
2052    struct lp_type type;
2053    struct lp_build_context bld32;
2054    memset(&type, 0, sizeof type);
2055    type.width = 32;
2056    type.length = n;
2057 
2058    lp_build_context_init(&bld32, gallivm, type);
2059 
2060    /*
2061     * compute hash - we use direct mapped cache, the hash function could
2062     *                be better but it needs to be simple
2063     * per-element:
2064     *    compare offset with offset stored at tag (hash)
2065     *    if not equal extract block, store block, update tag
2066     *    extract color from cache
2067     *    assemble colors
2068     */
2069 
2070    low_bit = util_logbase2(format_desc->block.bits / 8);
2071    log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
2072    addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
2073    ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
2074    ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
2075    /* For the hash function, first mask off the unused lowest bits. Then just
2076       do some xor with address bits - only use lower 32bits */
2077    ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
2078    ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2079                                  lp_build_const_int_vec(gallivm, type, low_bit), "");
2080    /* This only really makes sense for size 64,128,256 */
2081    hash_index = ptr_addrtrunc;
2082    ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
2083                                  lp_build_const_int_vec(gallivm, type, 2*log2size), "");
2084    hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
2085    tmp = LLVMBuildLShr(builder, hash_index,
2086                        lp_build_const_int_vec(gallivm, type, log2size), "");
2087    hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
2088 
2089    hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
2090    hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
2091    ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
2092    ij_index = LLVMBuildAdd(builder, ij_index, j, "");
2093    block_index = LLVMBuildShl(builder, hash_index,
2094                               lp_build_const_int_vec(gallivm, type, 4), "");
2095    block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
2096 
2097    if (n > 1) {
2098       color = bld32.undef;
2099       for (count = 0; count < n; count++) {
2100          LLVMValueRef index, cond, colorx;
2101          LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
2102          struct lp_build_if_state if_ctx;
2103 
2104          index = lp_build_const_int32(gallivm, count);
2105          offsetx = LLVMBuildExtractElement(builder, offset, index, "");
2106          addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
2107          addrx = LLVMBuildAdd(builder, addrx, addr, "");
2108          block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
2109          hash_indexx = LLVMBuildLShr(builder, block_indexx,
2110                                      lp_build_const_int32(gallivm, 4), "");
2111          offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_indexx);
2112          cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
2113 
2114          lp_build_if(&if_ctx, gallivm, cond);
2115          {
2116             ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
2117                                           LLVMPointerType(i8t, 0), "");
2118             update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
2119 #if LP_BUILD_FORMAT_CACHE_DEBUG
2120             s3tc_update_cache_access(gallivm, cache, 1,
2121                                      LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2122 #endif
2123          }
2124          lp_build_endif(&if_ctx);
2125 
2126          colorx = s3tc_lookup_cached_pixel(gallivm, cache, block_indexx);
2127 
2128          color = LLVMBuildInsertElement(builder, color, colorx,
2129                                         lp_build_const_int32(gallivm, count), "");
2130       }
2131    }
2132    else {
2133       LLVMValueRef cond;
2134       struct lp_build_if_state if_ctx;
2135 
2136       tmp = LLVMBuildZExt(builder, offset, i64t, "");
2137       addr = LLVMBuildAdd(builder, tmp, addr, "");
2138       offset_stored = s3tc_lookup_tag_data(gallivm, cache, hash_index);
2139       cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
2140 
2141       lp_build_if(&if_ctx, gallivm, cond);
2142       {
2143          tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
2144          update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
2145 #if LP_BUILD_FORMAT_CACHE_DEBUG
2146          s3tc_update_cache_access(gallivm, cache, 1,
2147                                   LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
2148 #endif
2149       }
2150       lp_build_endif(&if_ctx);
2151 
2152       color = s3tc_lookup_cached_pixel(gallivm, cache, block_index);
2153    }
2154 #if LP_BUILD_FORMAT_CACHE_DEBUG
2155    s3tc_update_cache_access(gallivm, cache, n,
2156                             LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
2157 #endif
2158    return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
2159 }
2160 
2161 
2162 static LLVMValueRef
s3tc_dxt5_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef colors,LLVMValueRef codewords,LLVMValueRef alpha_lo,LLVMValueRef alpha_hi,LLVMValueRef i,LLVMValueRef j)2163 s3tc_dxt5_to_rgba_aos(struct gallivm_state *gallivm,
2164                       unsigned n,
2165                       enum pipe_format format,
2166                       LLVMValueRef colors,
2167                       LLVMValueRef codewords,
2168                       LLVMValueRef alpha_lo,
2169                       LLVMValueRef alpha_hi,
2170                       LLVMValueRef i,
2171                       LLVMValueRef j)
2172 {
2173    return s3tc_dxt5_full_to_rgba_aos(gallivm, n, format, colors,
2174                                      codewords, alpha_lo, alpha_hi, i, j);
2175 }
2176 
2177 
2178 /**
2179  * @param n  number of pixels processed (usually n=4, but it should also work with n=1
2180  *           and multiples of 4)
2181  * @param base_ptr  base pointer (32bit or 64bit pointer depending on the architecture)
2182  * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2183  * @param i  is a <n x i32> vector with the x subpixel coordinate (0..3)
2184  * @param j  is a <n x i32> vector with the y subpixel coordinate (0..3)
2185  * @return  a <4*n x i8> vector with the pixel RGBA values in AoS
2186  */
2187 LLVMValueRef
lp_build_fetch_s3tc_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * format_desc,unsigned n,LLVMValueRef base_ptr,LLVMValueRef offset,LLVMValueRef i,LLVMValueRef j,LLVMValueRef cache)2188 lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,
2189                              const struct util_format_description *format_desc,
2190                              unsigned n,
2191                              LLVMValueRef base_ptr,
2192                              LLVMValueRef offset,
2193                              LLVMValueRef i,
2194                              LLVMValueRef j,
2195                              LLVMValueRef cache)
2196 {
2197    LLVMValueRef rgba;
2198    LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2199    LLVMBuilderRef builder = gallivm->builder;
2200 
2201    assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
2202    assert(format_desc->block.width == 4);
2203    assert(format_desc->block.height == 4);
2204 
2205    assert((n == 1) || (n % 4 == 0));
2206 
2207 /*   debug_printf("format = %d\n", format_desc->format);*/
2208    if (cache) {
2209       rgba = compressed_fetch_cached(gallivm, format_desc, n,
2210                                      base_ptr, offset, i, j, cache);
2211       return rgba;
2212    }
2213 
2214    /*
2215     * Could use n > 8 here with avx2, but doesn't seem faster.
2216     */
2217    if (n > 4) {
2218       unsigned count;
2219       LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
2220       LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
2221       LLVMTypeRef i128_vectype =  LLVMVectorType(i128_type, n / 4);
2222       LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
2223                                                 gallivm->context), 4);
2224       LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
2225       struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
2226 
2227       assert(n / 4 <= ARRAY_SIZE(rgba4));
2228 
2229       rgba = LLVMGetUndef(i128_vectype);
2230 
2231       for (count = 0; count < n / 4; count++) {
2232          LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2233 
2234          i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
2235          j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
2236          offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
2237 
2238          lp_build_gather_s3tc(gallivm, 4, format_desc, &colors, &codewords,
2239                               &alpha_lo, &alpha_hi, base_ptr, offset4);
2240 
2241          switch (format_desc->format) {
2242          case PIPE_FORMAT_DXT1_RGB:
2243          case PIPE_FORMAT_DXT1_RGBA:
2244          case PIPE_FORMAT_DXT1_SRGB:
2245          case PIPE_FORMAT_DXT1_SRGBA:
2246             rgba4[count] = s3tc_dxt1_to_rgba_aos(gallivm, 4, format_desc->format,
2247                                                  colors, codewords, i4, j4);
2248             break;
2249          case PIPE_FORMAT_DXT3_RGBA:
2250          case PIPE_FORMAT_DXT3_SRGBA:
2251             rgba4[count] = s3tc_dxt3_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2252                                                  codewords, alpha_lo, alpha_hi, i4, j4);
2253             break;
2254          case PIPE_FORMAT_DXT5_RGBA:
2255          case PIPE_FORMAT_DXT5_SRGBA:
2256             rgba4[count] = s3tc_dxt5_to_rgba_aos(gallivm, 4, format_desc->format, colors,
2257                                                  codewords, alpha_lo, alpha_hi, i4, j4);
2258             break;
2259          default:
2260             assert(0);
2261             rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
2262             break;
2263          }
2264          /* shuffles typically give best results with dword elements...*/
2265          rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
2266       }
2267       rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
2268       rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
2269    }
2270    else {
2271       LLVMValueRef colors, codewords, alpha_lo = NULL, alpha_hi = NULL;
2272 
2273       lp_build_gather_s3tc(gallivm, n, format_desc, &colors, &codewords,
2274                            &alpha_lo, &alpha_hi, base_ptr, offset);
2275 
2276       switch (format_desc->format) {
2277       case PIPE_FORMAT_DXT1_RGB:
2278       case PIPE_FORMAT_DXT1_RGBA:
2279       case PIPE_FORMAT_DXT1_SRGB:
2280       case PIPE_FORMAT_DXT1_SRGBA:
2281          rgba = s3tc_dxt1_to_rgba_aos(gallivm, n, format_desc->format,
2282                                       colors, codewords, i, j);
2283          break;
2284       case PIPE_FORMAT_DXT3_RGBA:
2285       case PIPE_FORMAT_DXT3_SRGBA:
2286          rgba = s3tc_dxt3_to_rgba_aos(gallivm, n, format_desc->format, colors,
2287                                       codewords, alpha_lo, alpha_hi, i, j);
2288          break;
2289       case PIPE_FORMAT_DXT5_RGBA:
2290       case PIPE_FORMAT_DXT5_SRGBA:
2291          rgba = s3tc_dxt5_to_rgba_aos(gallivm, n, format_desc->format, colors,
2292                                       codewords, alpha_lo, alpha_hi, i, j);
2293          break;
2294       default:
2295          assert(0);
2296          rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
2297          break;
2298       }
2299    }
2300 
2301    /* always return just decompressed values - srgb conversion is done later */
2302 
2303    return rgba;
2304 }
2305 
2306 /**
2307  * Gather elements from scatter positions in memory into vectors.
2308  * This is customised for fetching texels from s3tc textures.
2309  * For SSE, typical value is length=4.
2310  *
2311  * @param length length of the offsets
2312  * @param colors the stored colors of the blocks will be extracted into this.
2313  * @param codewords the codewords of the blocks will be extracted into this.
2314  * @param alpha_lo used for storing lower 32bit of alpha components for dxt3/5
2315  * @param alpha_hi used for storing higher 32bit of alpha components for dxt3/5
2316  * @param base_ptr base pointer, should be a i8 pointer type.
2317  * @param offsets vector with offsets
2318  */
2319 static void
lp_build_gather_rgtc(struct gallivm_state * gallivm,unsigned length,const struct util_format_description * format_desc,LLVMValueRef * red_lo,LLVMValueRef * red_hi,LLVMValueRef * green_lo,LLVMValueRef * green_hi,LLVMValueRef base_ptr,LLVMValueRef offsets)2320 lp_build_gather_rgtc(struct gallivm_state *gallivm,
2321                      unsigned length,
2322                      const struct util_format_description *format_desc,
2323                      LLVMValueRef *red_lo, LLVMValueRef *red_hi,
2324                      LLVMValueRef *green_lo, LLVMValueRef *green_hi,
2325                      LLVMValueRef base_ptr,
2326                      LLVMValueRef offsets)
2327 {
2328    LLVMBuilderRef builder = gallivm->builder;
2329    unsigned block_bits = format_desc->block.bits;
2330    unsigned i;
2331    LLVMValueRef elems[8];
2332    LLVMTypeRef type32 = LLVMInt32TypeInContext(gallivm->context);
2333    LLVMTypeRef type64 = LLVMInt64TypeInContext(gallivm->context);
2334    LLVMTypeRef type32dxt;
2335    struct lp_type lp_type32dxt;
2336 
2337    memset(&lp_type32dxt, 0, sizeof lp_type32dxt);
2338    lp_type32dxt.width = 32;
2339    lp_type32dxt.length = block_bits / 32;
2340    type32dxt = lp_build_vec_type(gallivm, lp_type32dxt);
2341 
2342    assert(block_bits == 64 || block_bits == 128);
2343    assert(length == 1 || length == 4 || length == 8);
2344 
2345    for (i = 0; i < length; ++i) {
2346       elems[i] = lp_build_gather_elem(gallivm, length,
2347                                       block_bits, block_bits, true,
2348                                       base_ptr, offsets, i, false);
2349       elems[i] = LLVMBuildBitCast(builder, elems[i], type32dxt, "");
2350    }
2351    if (length == 1) {
2352       LLVMValueRef elem = elems[0];
2353 
2354       *red_lo = LLVMBuildExtractElement(builder, elem,
2355                                         lp_build_const_int32(gallivm, 0), "");
2356       *red_hi = LLVMBuildExtractElement(builder, elem,
2357                                         lp_build_const_int32(gallivm, 1), "");
2358 
2359       if (block_bits == 128) {
2360          *green_lo = LLVMBuildExtractElement(builder, elem,
2361                                              lp_build_const_int32(gallivm, 2), "");
2362          *green_hi = LLVMBuildExtractElement(builder, elem,
2363                                              lp_build_const_int32(gallivm, 3), "");
2364       } else {
2365          *green_lo = NULL;
2366          *green_hi = NULL;
2367       }
2368    } else {
2369       LLVMValueRef tmp[4];
2370       struct lp_type lp_type32, lp_type64;
2371       memset(&lp_type32, 0, sizeof lp_type32);
2372       lp_type32.width = 32;
2373       lp_type32.length = length;
2374       lp_type32.sign = lp_type32dxt.sign;
2375       memset(&lp_type64, 0, sizeof lp_type64);
2376       lp_type64.width = 64;
2377       lp_type64.length = length/2;
2378       if (block_bits == 128) {
2379          if (length == 8) {
2380             for (i = 0; i < 4; ++i) {
2381                tmp[0] = elems[i];
2382                tmp[1] = elems[i+4];
2383                elems[i] = lp_build_concat(gallivm, tmp, lp_type32dxt, 2);
2384             }
2385          }
2386          lp_build_transpose_aos(gallivm, lp_type32, elems, tmp);
2387          *green_lo = tmp[2];
2388          *green_hi = tmp[3];
2389          *red_lo = tmp[0];
2390          *red_hi = tmp[1];
2391       } else {
2392          LLVMValueRef red01, red23;
2393          LLVMTypeRef type64_vec = LLVMVectorType(type64, length/2);
2394          LLVMTypeRef type32_vec = LLVMVectorType(type32, length);
2395 
2396          for (i = 0; i < length; ++i) {
2397             /* no-op shuffle */
2398             elems[i] = LLVMBuildShuffleVector(builder, elems[i],
2399                                               LLVMGetUndef(type32dxt),
2400                                               lp_build_const_extend_shuffle(gallivm, 2, 4), "");
2401          }
2402          if (length == 8) {
2403             struct lp_type lp_type32_4 = {0};
2404             lp_type32_4.width = 32;
2405             lp_type32_4.length = 4;
2406             for (i = 0; i < 4; ++i) {
2407                tmp[0] = elems[i];
2408                tmp[1] = elems[i+4];
2409                elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);
2410             }
2411          }
2412          red01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);
2413          red23 = lp_build_interleave2_half(gallivm, lp_type32, elems[2], elems[3], 0);
2414          red01 = LLVMBuildBitCast(builder, red01, type64_vec, "");
2415          red23 = LLVMBuildBitCast(builder, red23, type64_vec, "");
2416          *red_lo = lp_build_interleave2_half(gallivm, lp_type64, red01, red23, 0);
2417          *red_hi = lp_build_interleave2_half(gallivm, lp_type64, red01, red23, 1);
2418          *red_lo = LLVMBuildBitCast(builder, *red_lo, type32_vec, "");
2419          *red_hi = LLVMBuildBitCast(builder, *red_hi, type32_vec, "");
2420          *green_lo = NULL;
2421          *green_hi = NULL;
2422       }
2423    }
2424 }
2425 
2426 static LLVMValueRef
rgtc1_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef red_lo,LLVMValueRef red_hi,LLVMValueRef i,LLVMValueRef j)2427 rgtc1_to_rgba_aos(struct gallivm_state *gallivm,
2428                   unsigned n,
2429                   enum pipe_format format,
2430                   LLVMValueRef red_lo,
2431                   LLVMValueRef red_hi,
2432                   LLVMValueRef i,
2433                   LLVMValueRef j)
2434 {
2435    LLVMBuilderRef builder = gallivm->builder;
2436    bool is_signed = (format == PIPE_FORMAT_RGTC1_SNORM);
2437    LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2438    LLVMValueRef rgba;
2439    struct lp_type type, type8;
2440    memset(&type, 0, sizeof type);
2441    type.width = 32;
2442    type.length = n;
2443    memset(&type8, 0, sizeof type8);
2444    type8.width = 8;
2445    type8.length = n*4;
2446    rgba = lp_build_const_int_vec(gallivm, type, is_signed ? (0x7f << 24) : (0xffu << 24));
2447    rgba = LLVMBuildOr(builder, rgba, red, "");
2448    return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2449 }
2450 
2451 static LLVMValueRef
rgtc2_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef red_lo,LLVMValueRef red_hi,LLVMValueRef green_lo,LLVMValueRef green_hi,LLVMValueRef i,LLVMValueRef j)2452 rgtc2_to_rgba_aos(struct gallivm_state *gallivm,
2453                   unsigned n,
2454                   enum pipe_format format,
2455                   LLVMValueRef red_lo,
2456                   LLVMValueRef red_hi,
2457                   LLVMValueRef green_lo,
2458                   LLVMValueRef green_hi,
2459                   LLVMValueRef i,
2460                   LLVMValueRef j)
2461 {
2462    LLVMBuilderRef builder = gallivm->builder;
2463    bool is_signed = (format == PIPE_FORMAT_RGTC2_SNORM);
2464    LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2465    LLVMValueRef green = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, green_hi, green_lo, i, j);
2466    LLVMValueRef rgba;
2467    struct lp_type type, type8;
2468    memset(&type, 0, sizeof type);
2469    type.width = 32;
2470    type.length = n;
2471    memset(&type8, 0, sizeof type8);
2472    type8.width = 8;
2473    type8.length = n*4;
2474    rgba = lp_build_const_int_vec(gallivm, type, is_signed ? (0x7f << 24) : (0xffu << 24));
2475    rgba = LLVMBuildOr(builder, rgba, red, "");
2476    green = LLVMBuildShl(builder, green, lp_build_const_int_vec(gallivm, type, 8), "");
2477    rgba = LLVMBuildOr(builder, rgba, green, "");
2478    return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2479 }
2480 
2481 static LLVMValueRef
latc1_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef red_lo,LLVMValueRef red_hi,LLVMValueRef i,LLVMValueRef j)2482 latc1_to_rgba_aos(struct gallivm_state *gallivm,
2483                   unsigned n,
2484                   enum pipe_format format,
2485                   LLVMValueRef red_lo,
2486                   LLVMValueRef red_hi,
2487                   LLVMValueRef i,
2488                   LLVMValueRef j)
2489 {
2490    LLVMBuilderRef builder = gallivm->builder;
2491    bool is_signed = (format == PIPE_FORMAT_LATC1_SNORM);
2492    LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2493    LLVMValueRef rgba, temp;
2494    struct lp_type type, type8;
2495    memset(&type, 0, sizeof type);
2496    type.width = 32;
2497    type.length = n;
2498    memset(&type8, 0, sizeof type8);
2499    type8.width = 8;
2500    type8.length = n*4;
2501    rgba = lp_build_const_int_vec(gallivm, type, is_signed ? (0x7f << 24) : (0xffu << 24));
2502    rgba = LLVMBuildOr(builder, rgba, red, "");
2503    temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 8), "");
2504    rgba = LLVMBuildOr(builder, rgba, temp, "");
2505    temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 16), "");
2506    rgba = LLVMBuildOr(builder, rgba, temp, "");
2507    return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2508 }
2509 
2510 static LLVMValueRef
latc2_to_rgba_aos(struct gallivm_state * gallivm,unsigned n,enum pipe_format format,LLVMValueRef red_lo,LLVMValueRef red_hi,LLVMValueRef green_lo,LLVMValueRef green_hi,LLVMValueRef i,LLVMValueRef j)2511 latc2_to_rgba_aos(struct gallivm_state *gallivm,
2512                   unsigned n,
2513                   enum pipe_format format,
2514                   LLVMValueRef red_lo,
2515                   LLVMValueRef red_hi,
2516                   LLVMValueRef green_lo,
2517                   LLVMValueRef green_hi,
2518                   LLVMValueRef i,
2519                   LLVMValueRef j)
2520 {
2521    LLVMBuilderRef builder = gallivm->builder;
2522    bool is_signed = (format == PIPE_FORMAT_LATC2_SNORM);
2523    LLVMValueRef red = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, red_hi, red_lo, i, j);
2524    LLVMValueRef green = s3tc_dxt5_alpha_channel(gallivm, is_signed, n, green_hi, green_lo, i, j);
2525    LLVMValueRef rgba, temp;
2526    struct lp_type type, type8;
2527    memset(&type, 0, sizeof type);
2528    type.width = 32;
2529    type.length = n;
2530    memset(&type8, 0, sizeof type8);
2531    type8.width = 8;
2532    type8.length = n*4;
2533 
2534    temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 8), "");
2535    rgba = LLVMBuildOr(builder, red, temp, "");
2536    temp = LLVMBuildShl(builder, red, lp_build_const_int_vec(gallivm, type, 16), "");
2537    rgba = LLVMBuildOr(builder, rgba, temp, "");
2538    temp = LLVMBuildShl(builder, green, lp_build_const_int_vec(gallivm, type, 24), "");
2539    rgba = LLVMBuildOr(builder, rgba, temp, "");
2540    return LLVMBuildBitCast(builder, rgba, lp_build_vec_type(gallivm, type8), "");
2541 }
2542 
2543 /**
2544  * @param n  number of pixels processed (usually n=4, but it should also work with n=1
2545  *           and multiples of 4)
2546  * @param base_ptr  base pointer (32bit or 64bit pointer depending on the architecture)
2547  * @param offset <n x i32> vector with the relative offsets of the S3TC blocks
2548  * @param i  is a <n x i32> vector with the x subpixel coordinate (0..3)
2549  * @param j  is a <n x i32> vector with the y subpixel coordinate (0..3)
2550  * @return  a <4*n x i8> vector with the pixel RGBA values in AoS
2551  */
2552 LLVMValueRef
lp_build_fetch_rgtc_rgba_aos(struct gallivm_state * gallivm,const struct util_format_description * format_desc,unsigned n,LLVMValueRef base_ptr,LLVMValueRef offset,LLVMValueRef i,LLVMValueRef j,LLVMValueRef cache)2553 lp_build_fetch_rgtc_rgba_aos(struct gallivm_state *gallivm,
2554                              const struct util_format_description *format_desc,
2555                              unsigned n,
2556                              LLVMValueRef base_ptr,
2557                              LLVMValueRef offset,
2558                              LLVMValueRef i,
2559                              LLVMValueRef j,
2560                              LLVMValueRef cache)
2561 {
2562    LLVMValueRef rgba;
2563    LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
2564    LLVMBuilderRef builder = gallivm->builder;
2565    LLVMValueRef red_lo, red_hi, green_lo, green_hi;
2566    assert(format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC);
2567    assert(format_desc->block.width == 4);
2568    assert(format_desc->block.height == 4);
2569 
2570    assert((n == 1) || (n % 4 == 0));
2571 
2572    if (n > 4) {
2573       unsigned count;
2574       LLVMTypeRef i128_type = LLVMIntTypeInContext(gallivm->context, 128);
2575       LLVMTypeRef i128_vectype =  LLVMVectorType(i128_type, n / 4);
2576       LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);
2577       LLVMTypeRef i324_vectype = LLVMVectorType(LLVMInt32TypeInContext(
2578                                                    gallivm->context), 4);
2579       LLVMValueRef offset4, i4, j4, rgba4[LP_MAX_VECTOR_LENGTH/16];
2580       struct lp_type lp_324_vectype = lp_type_uint_vec(32, 128);
2581 
2582       rgba = LLVMGetUndef(i128_vectype);
2583 
2584       for (count = 0; count < n / 4; count++) {
2585 
2586          i4 = lp_build_extract_range(gallivm, i, count * 4, 4);
2587          j4 = lp_build_extract_range(gallivm, j, count * 4, 4);
2588          offset4 = lp_build_extract_range(gallivm, offset, count * 4, 4);
2589 
2590          lp_build_gather_rgtc(gallivm, 4, format_desc, &red_lo, &red_hi,
2591                               &green_lo, &green_hi, base_ptr, offset4);
2592 
2593          switch (format_desc->format) {
2594          case PIPE_FORMAT_RGTC1_UNORM:
2595          case PIPE_FORMAT_RGTC1_SNORM:
2596             rgba4[count] = rgtc1_to_rgba_aos(gallivm, 4, format_desc->format,
2597                                              red_lo, red_hi, i4, j4);
2598             break;
2599          case PIPE_FORMAT_RGTC2_UNORM:
2600          case PIPE_FORMAT_RGTC2_SNORM:
2601             rgba4[count] = rgtc2_to_rgba_aos(gallivm, 4, format_desc->format,
2602                                              red_lo, red_hi, green_lo, green_hi, i4, j4);
2603             break;
2604          case PIPE_FORMAT_LATC1_UNORM:
2605          case PIPE_FORMAT_LATC1_SNORM:
2606             rgba4[count] = latc1_to_rgba_aos(gallivm, 4, format_desc->format,
2607                                              red_lo, red_hi, i4, j4);
2608             break;
2609          case PIPE_FORMAT_LATC2_UNORM:
2610          case PIPE_FORMAT_LATC2_SNORM:
2611             rgba4[count] = latc2_to_rgba_aos(gallivm, 4, format_desc->format,
2612                                              red_lo, red_hi, green_lo, green_hi, i4, j4);
2613             break;
2614          default:
2615             assert(0);
2616             rgba4[count] = LLVMGetUndef(LLVMVectorType(i8t, 4));
2617             break;
2618          }
2619          /* shuffles typically give best results with dword elements...*/
2620          rgba4[count] = LLVMBuildBitCast(builder, rgba4[count], i324_vectype, "");
2621       }
2622       rgba = lp_build_concat(gallivm, rgba4, lp_324_vectype, n / 4);
2623       rgba = LLVMBuildBitCast(builder, rgba, i8_vectype, "");
2624    } else {
2625       LLVMValueRef red_lo, red_hi, green_lo, green_hi;
2626 
2627       lp_build_gather_rgtc(gallivm, n, format_desc, &red_lo, &red_hi,
2628                            &green_lo, &green_hi, base_ptr, offset);
2629 
2630       switch (format_desc->format) {
2631       case PIPE_FORMAT_RGTC1_UNORM:
2632       case PIPE_FORMAT_RGTC1_SNORM:
2633          rgba = rgtc1_to_rgba_aos(gallivm, n, format_desc->format,
2634                                   red_lo, red_hi, i, j);
2635          break;
2636       case PIPE_FORMAT_RGTC2_UNORM:
2637       case PIPE_FORMAT_RGTC2_SNORM:
2638          rgba = rgtc2_to_rgba_aos(gallivm, n, format_desc->format,
2639                                   red_lo, red_hi, green_lo, green_hi, i, j);
2640          break;
2641       case PIPE_FORMAT_LATC1_UNORM:
2642       case PIPE_FORMAT_LATC1_SNORM:
2643          rgba = latc1_to_rgba_aos(gallivm, n, format_desc->format,
2644                                   red_lo, red_hi, i, j);
2645          break;
2646       case PIPE_FORMAT_LATC2_UNORM:
2647       case PIPE_FORMAT_LATC2_SNORM:
2648          rgba = latc2_to_rgba_aos(gallivm, n, format_desc->format,
2649                                   red_lo, red_hi, green_lo, green_hi, i, j);
2650          break;
2651       default:
2652          assert(0);
2653          rgba = LLVMGetUndef(LLVMVectorType(i8t, 4*n));
2654          break;
2655       }
2656    }
2657    return rgba;
2658 }
2659