xref: /aosp_15_r20/external/mesa3d/src/gallium/auxiliary/gallivm/lp_bld_sample.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 /**
29  * @file
30  * Texture sampling -- common code.
31  *
32  * @author Jose Fonseca <jfonseca@vmware.com>
33  */
34 
35 #include "pipe/p_defines.h"
36 #include "pipe/p_state.h"
37 #include "util/format/u_format.h"
38 #include "util/u_math.h"
39 #include "util/u_cpu_detect.h"
40 #include "lp_bld_arit.h"
41 #include "lp_bld_const.h"
42 #include "lp_bld_debug.h"
43 #include "lp_bld_printf.h"
44 #include "lp_bld_flow.h"
45 #include "lp_bld_sample.h"
46 #include "lp_bld_swizzle.h"
47 #include "lp_bld_type.h"
48 #include "lp_bld_logic.h"
49 #include "lp_bld_pack.h"
50 #include "lp_bld_quad.h"
51 #include "lp_bld_bitarit.h"
52 
53 
54 /*
55  * Bri-linear factor. Should be greater than one.
    *
    * NOTE(review): presumably this is the value passed as the 'factor'
    * (steepness) argument of lp_build_brilinear_lod() and
    * lp_build_brilinear_rho(); the call sites are not visible here -- confirm.
56  */
57 #define BRILINEAR_FACTOR 2
58 
59 
60 /**
61  * Does the given texture wrap mode allow sampling the texture border color?
62  * XXX maybe move this into gallium util code.
63  */
64 bool
lp_sampler_wrap_mode_uses_border_color(enum pipe_tex_wrap mode,enum pipe_tex_filter min_img_filter,enum pipe_tex_filter mag_img_filter)65 lp_sampler_wrap_mode_uses_border_color(enum pipe_tex_wrap mode,
66                                        enum pipe_tex_filter min_img_filter,
67                                        enum pipe_tex_filter mag_img_filter)
68 {
69    switch (mode) {
70    case PIPE_TEX_WRAP_REPEAT:
71    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
72    case PIPE_TEX_WRAP_MIRROR_REPEAT:
73    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
74       return false;
75    case PIPE_TEX_WRAP_CLAMP:
76    case PIPE_TEX_WRAP_MIRROR_CLAMP:
77       if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
78           mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
79          return false;
80       } else {
81          return true;
82       }
83    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
84    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
85       return true;
86    default:
87       assert(0 && "unexpected wrap mode");
88       return false;
89    }
90 }
91 
92 
93 /**
94  * Initialize lp_sampler_static_texture_state object with the gallium
95  * texture/sampler_view state (this contains the parts which are
96  * considered static).
97  */
98 void
lp_sampler_static_texture_state(struct lp_static_texture_state * state,const struct pipe_sampler_view * view)99 lp_sampler_static_texture_state(struct lp_static_texture_state *state,
100                                 const struct pipe_sampler_view *view)
101 {
102    memset(state, 0, sizeof *state);
103 
104    if (!view || !view->texture)
105       return;
106 
107    const struct pipe_resource *texture = view->texture;
108 
109    state->format = view->format;
110    state->res_format = texture->format;
111    state->swizzle_r = view->swizzle_r;
112    state->swizzle_g = view->swizzle_g;
113    state->swizzle_b = view->swizzle_b;
114    state->swizzle_a = view->swizzle_a;
115    assert(state->swizzle_r < PIPE_SWIZZLE_NONE);
116    assert(state->swizzle_g < PIPE_SWIZZLE_NONE);
117    assert(state->swizzle_b < PIPE_SWIZZLE_NONE);
118    assert(state->swizzle_a < PIPE_SWIZZLE_NONE);
119 
120    /* check if it is a tex2d created from buf */
121    if (view->is_tex2d_from_buf)
122       state->target = PIPE_TEXTURE_2D;
123    else
124       state->target = view->target;
125 
126    state->res_target = texture->target;
127 
128    state->pot_width = util_is_power_of_two_or_zero(texture->width0);
129    state->pot_height = util_is_power_of_two_or_zero(texture->height0);
130    state->pot_depth = util_is_power_of_two_or_zero(texture->depth0);
131    state->level_zero_only = !view->u.tex.last_level;
132    state->tiled = !!(texture->flags & PIPE_RESOURCE_FLAG_SPARSE);
133    if (state->tiled)
134       state->tiled_samples = texture->nr_samples;
135 
136    /*
137     * the layer / element / level parameters are all either dynamic
138     * state or handled transparently wrt execution.
139     */
140 }
141 
142 
143 /**
144  * Initialize lp_sampler_static_texture_state object with the gallium
145  * texture/sampler_view state (this contains the parts which are
146  * considered static).
147  */
148 void
lp_sampler_static_texture_state_image(struct lp_static_texture_state * state,const struct pipe_image_view * view)149 lp_sampler_static_texture_state_image(struct lp_static_texture_state *state,
150                                       const struct pipe_image_view *view)
151 {
152    memset(state, 0, sizeof *state);
153 
154    if (!view || !view->resource)
155       return;
156 
157    const struct pipe_resource *resource = view->resource;
158 
159    state->format = view->format;
160    state->res_format = resource->format;
161    state->swizzle_r = PIPE_SWIZZLE_X;
162    state->swizzle_g = PIPE_SWIZZLE_Y;
163    state->swizzle_b = PIPE_SWIZZLE_Z;
164    state->swizzle_a = PIPE_SWIZZLE_W;
165    assert(state->swizzle_r < PIPE_SWIZZLE_NONE);
166    assert(state->swizzle_g < PIPE_SWIZZLE_NONE);
167    assert(state->swizzle_b < PIPE_SWIZZLE_NONE);
168    assert(state->swizzle_a < PIPE_SWIZZLE_NONE);
169 
170    state->target = resource->target;
171    state->res_target = resource->target;
172    state->pot_width = util_is_power_of_two_or_zero(resource->width0);
173    state->pot_height = util_is_power_of_two_or_zero(resource->height0);
174    state->pot_depth = util_is_power_of_two_or_zero(resource->depth0);
175    state->level_zero_only = view->u.tex.level == 0;
176    state->tiled = !!(resource->flags & PIPE_RESOURCE_FLAG_SPARSE);
177    if (state->tiled) {
178       state->tiled_samples = resource->nr_samples;
179       if (view->u.tex.is_2d_view_of_3d)
180          state->target = PIPE_TEXTURE_2D;
181    }
182 
183    /*
184     * the layer / element / level parameters are all either dynamic
185     * state or handled transparently wrt execution.
186     */
187 }
188 
189 
190 /**
191  * Initialize lp_sampler_static_sampler_state object with the gallium sampler
192  * state (this contains the parts which are considered static).
193  */
194 void
lp_sampler_static_sampler_state(struct lp_static_sampler_state * state,const struct pipe_sampler_state * sampler)195 lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
196                                 const struct pipe_sampler_state *sampler)
197 {
198    memset(state, 0, sizeof *state);
199 
200    if (!sampler)
201       return;
202 
203    /*
204     * We don't copy sampler state over unless it is actually enabled, to avoid
205     * spurious recompiles, as the sampler static state is part of the shader
206     * key.
207     *
208     * Ideally gallium frontends or cso_cache module would make all state
209     * canonical, but until that happens it's better to be safe than sorry here.
210     *
211     * XXX: Actually there's much more than can be done here, especially
212     * regarding 1D/2D/3D/CUBE textures, wrap modes, etc.
213     */
214 
215    state->wrap_s            = sampler->wrap_s;
216    state->wrap_t            = sampler->wrap_t;
217    state->wrap_r            = sampler->wrap_r;
218    state->min_img_filter    = sampler->min_img_filter;
219    state->mag_img_filter    = sampler->mag_img_filter;
220    state->min_mip_filter    = sampler->min_mip_filter;
221    state->seamless_cube_map = sampler->seamless_cube_map;
222    state->reduction_mode    = sampler->reduction_mode;
223    state->aniso = sampler->max_anisotropy > 1.0f;
224 
225    if (sampler->max_lod > 0.0f) {
226       state->max_lod_pos = 1;
227    }
228 
229    if (sampler->lod_bias != 0.0f) {
230       state->lod_bias_non_zero = 1;
231    }
232 
233    if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE ||
234        state->min_img_filter != state->mag_img_filter) {
235 
236       /* If min_lod == max_lod we can greatly simplify mipmap selection.
237        * This is a case that occurs during automatic mipmap generation.
238        */
239       if (sampler->min_lod == sampler->max_lod) {
240          state->min_max_lod_equal = 1;
241       } else {
242          if (sampler->min_lod > 0.0f) {
243             state->apply_min_lod = 1;
244          }
245 
246          /*
247           * XXX this won't do anything with the mesa state tracker which always
248           * sets max_lod to not more than actually present mip maps...
249           */
250          if (sampler->max_lod < (PIPE_MAX_TEXTURE_LEVELS - 1)) {
251             state->apply_max_lod = 1;
252          }
253       }
254    }
255 
256    state->compare_mode      = sampler->compare_mode;
257    if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
258       state->compare_func   = sampler->compare_func;
259    }
260 
261    state->normalized_coords = !sampler->unnormalized_coords;
262 }
263 
264 
265 /* build aniso pmin value */
266 static LLVMValueRef
lp_build_pmin(struct lp_build_sample_context * bld,LLVMValueRef first_level,LLVMValueRef s,LLVMValueRef t,LLVMValueRef max_aniso)267 lp_build_pmin(struct lp_build_sample_context *bld,
268               LLVMValueRef first_level,
269               LLVMValueRef s,
270               LLVMValueRef t,
271               LLVMValueRef max_aniso)
272 {
273    struct gallivm_state *gallivm = bld->gallivm;
274    LLVMBuilderRef builder = bld->gallivm->builder;
275    struct lp_build_context *coord_bld = &bld->coord_bld;
276    struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
277    struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
278    struct lp_build_context *pmin_bld = &bld->lodf_bld;
279    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
280    LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
281    LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
282    LLVMValueRef ddx_ddy = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
283    LLVMValueRef int_size, float_size;
284    const unsigned length = coord_bld->type.length;
285    const unsigned num_quads = length / 4;
286    const bool pmin_per_quad = pmin_bld->type.length != length;
287 
288    int_size = lp_build_minify(int_size_bld, bld->int_size, first_level, true);
289    float_size = lp_build_int_to_float(float_size_bld, int_size);
290    max_aniso = lp_build_broadcast_scalar(coord_bld, max_aniso);
291    max_aniso = lp_build_mul(coord_bld, max_aniso, max_aniso);
292 
293    static const unsigned char swizzle01[] = { /* no-op swizzle */
294       0, 1,
295       LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
296    };
297    static const unsigned char swizzle23[] = {
298       2, 3,
299       LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
300    };
301    LLVMValueRef ddx_ddys, ddx_ddyt, floatdim, shuffles[LP_MAX_VECTOR_LENGTH / 4];
302 
303    for (unsigned i = 0; i < num_quads; i++) {
304       shuffles[i*4+0] = shuffles[i*4+1] = index0;
305       shuffles[i*4+2] = shuffles[i*4+3] = index1;
306    }
307    floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
308                                      LLVMConstVector(shuffles, length), "");
309    ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, floatdim);
310 
311    ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, ddx_ddy);
312 
313    ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle01);
314    ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle23);
315 
316    LLVMValueRef px2_py2 = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);
317 
318    static const unsigned char swizzle0[] = { /* no-op swizzle */
319      0, LP_BLD_SWIZZLE_DONTCARE,
320      LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
321    };
322    static const unsigned char swizzle1[] = {
323      1, LP_BLD_SWIZZLE_DONTCARE,
324      LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
325    };
326    LLVMValueRef px2 = lp_build_swizzle_aos(coord_bld, px2_py2, swizzle0);
327    LLVMValueRef py2 = lp_build_swizzle_aos(coord_bld, px2_py2, swizzle1);
328 
329    LLVMValueRef pmax2 = lp_build_max(coord_bld, px2, py2);
330    LLVMValueRef pmin2 = lp_build_min(coord_bld, px2, py2);
331 
332    LLVMValueRef temp = lp_build_mul(coord_bld, pmin2, max_aniso);
333 
334    LLVMValueRef comp = lp_build_compare(gallivm, coord_bld->type, PIPE_FUNC_GREATER,
335                                         pmin2, temp);
336 
337    LLVMValueRef pmin2_alt = lp_build_div(coord_bld, pmax2, max_aniso);
338 
339    pmin2 = lp_build_select(coord_bld, comp, pmin2_alt, pmin2);
340 
341    if (pmin_per_quad)
342       pmin2 = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
343                                         pmin_bld->type, pmin2, 0);
344    else
345       pmin2 = lp_build_swizzle_scalar_aos(pmin_bld, pmin2, 0, 4);
346    return pmin2;
347 }
348 
349 
350 /**
351  * Generate code to compute coordinate gradient (rho).
352  * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
353  *
354  * The resulting rho has bld->levelf format (per quad or per element).
355  */
356 static LLVMValueRef
lp_build_rho(struct lp_build_sample_context * bld,LLVMValueRef first_level,LLVMValueRef s,LLVMValueRef t,LLVMValueRef r,const struct lp_derivatives * derivs)357 lp_build_rho(struct lp_build_sample_context *bld,
358              LLVMValueRef first_level,
359              LLVMValueRef s,
360              LLVMValueRef t,
361              LLVMValueRef r,
362              const struct lp_derivatives *derivs)
363 {
364    struct gallivm_state *gallivm = bld->gallivm;
365    struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
366    struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
367    struct lp_build_context *float_bld = &bld->float_bld;
368    struct lp_build_context *coord_bld = &bld->coord_bld;
369    struct lp_build_context *rho_bld = &bld->lodf_bld;
370    const unsigned dims = bld->dims;
371    LLVMValueRef ddx_ddy[2] = {NULL};
372    LLVMBuilderRef builder = bld->gallivm->builder;
373    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
374    LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
375    LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
376    LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
377    LLVMValueRef rho_vec;
378    LLVMValueRef rho;
379    unsigned length = coord_bld->type.length;
380    unsigned num_quads = length / 4;
381    bool rho_per_quad = rho_bld->type.length != length;
382    bool no_rho_opt = bld->no_rho_approx && (dims > 1);
383    LLVMValueRef i32undef = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
384    LLVMValueRef rho_xvec, rho_yvec;
385 
386    /* Note that all simplified calculations will only work for isotropic
387     * filtering
388     */
389 
390    /*
391     * rho calcs are always per quad except for explicit derivs (excluding
392     * the messy cube maps for now) when requested.
393     */
394 
395    LLVMValueRef int_size =
396       lp_build_minify(int_size_bld, bld->int_size, first_level, true);
397    LLVMValueRef float_size = lp_build_int_to_float(float_size_bld, int_size);
398 
399    if (derivs) {
400       LLVMValueRef ddmax[3] = { NULL }, ddx[3] = { NULL }, ddy[3] = { NULL };
401       for (unsigned i = 0; i < dims; i++) {
402          LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
403 
404          LLVMValueRef floatdim =
405             lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
406                                        coord_bld->type, float_size, indexi);
407 
408          /*
409           * note that for rho_per_quad case could reduce math (at some shuffle
410           * cost), but for now use same code to per-pixel lod case.
411           */
412          if (no_rho_opt) {
413             ddx[i] = lp_build_mul(coord_bld, floatdim, derivs->ddx[i]);
414             ddy[i] = lp_build_mul(coord_bld, floatdim, derivs->ddy[i]);
415             ddx[i] = lp_build_mul(coord_bld, ddx[i], ddx[i]);
416             ddy[i] = lp_build_mul(coord_bld, ddy[i], ddy[i]);
417          } else {
418             LLVMValueRef tmpx = lp_build_abs(coord_bld, derivs->ddx[i]);
419             LLVMValueRef tmpy = lp_build_abs(coord_bld, derivs->ddy[i]);
420             ddmax[i] = lp_build_max(coord_bld, tmpx, tmpy);
421             ddmax[i] = lp_build_mul(coord_bld, floatdim, ddmax[i]);
422          }
423       }
424       if (no_rho_opt) {
425          rho_xvec = lp_build_add(coord_bld, ddx[0], ddx[1]);
426          rho_yvec = lp_build_add(coord_bld, ddy[0], ddy[1]);
427          if (dims > 2) {
428             rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
429             rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
430          }
431          rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
432          /* skipping sqrt hence returning rho squared */
433       } else {
434          rho = ddmax[0];
435          if (dims > 1) {
436             rho = lp_build_max(coord_bld, rho, ddmax[1]);
437             if (dims > 2) {
438                rho = lp_build_max(coord_bld, rho, ddmax[2]);
439             }
440          }
441       }
442 
443       LLVMValueRef rho_is_inf = lp_build_is_inf_or_nan(gallivm,
444                                                        coord_bld->type, rho);
445       rho = lp_build_select(coord_bld, rho_is_inf, coord_bld->zero, rho);
446 
447       if (rho_per_quad) {
448          /*
449           * rho_vec contains per-pixel rho, convert to scalar per quad.
450           */
451          rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
452                                          rho_bld->type, rho, 0);
453       }
454    } else {
455       /*
456        * This looks all a bit complex, but it's not that bad
457        * (the shuffle code makes it look worse than it is).
458        * Still, might not be ideal for all cases.
459        */
460       static const unsigned char swizzle0[] = { /* no-op swizzle */
461          0, LP_BLD_SWIZZLE_DONTCARE,
462          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
463       };
464       static const unsigned char swizzle1[] = {
465          1, LP_BLD_SWIZZLE_DONTCARE,
466          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
467       };
468       static const unsigned char swizzle2[] = {
469          2, LP_BLD_SWIZZLE_DONTCARE,
470          LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
471       };
472 
473       if (dims < 2) {
474          ddx_ddy[0] = lp_build_packed_ddx_ddy_onecoord(coord_bld, s);
475       } else if (dims >= 2) {
476          ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
477          if (dims > 2) {
478             ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
479          }
480       }
481 
482       if (no_rho_opt) {
483          static const unsigned char swizzle01[] = { /* no-op swizzle */
484             0, 1,
485             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
486          };
487          static const unsigned char swizzle23[] = {
488             2, 3,
489             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
490          };
491          LLVMValueRef ddx_ddys, ddx_ddyt, floatdim;
492          LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
493 
494          for (unsigned i = 0; i < num_quads; i++) {
495             shuffles[i*4+0] = shuffles[i*4+1] = index0;
496             shuffles[i*4+2] = shuffles[i*4+3] = index1;
497          }
498          floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
499                                            LLVMConstVector(shuffles, length),
500                                            "");
501          ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], floatdim);
502          ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
503          ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
504          ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
505          rho_vec = lp_build_add(coord_bld, ddx_ddys, ddx_ddyt);
506 
507          if (dims > 2) {
508             static const unsigned char swizzle02[] = {
509                0, 2,
510                LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
511             };
512             floatdim = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
513                                                   coord_bld->type, float_size, index2);
514             ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], floatdim);
515             ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
516             ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
517             rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
518          }
519 
520          rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
521          rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
522          rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
523 
524          if (rho_per_quad) {
525             rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
526                                             rho_bld->type, rho, 0);
527          } else {
528             rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
529          }
530          /* skipping sqrt hence returning rho squared */
531       } else {
532          ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
533          if (dims > 2) {
534             ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
535          } else {
536             ddx_ddy[1] = NULL; /* silence compiler warning */
537          }
538 
539          if (dims < 2) {
540             rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle0);
541             rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle2);
542          } else if (dims == 2) {
543             static const unsigned char swizzle02[] = {
544                0, 2,
545                LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
546             };
547             static const unsigned char swizzle13[] = {
548                1, 3,
549                LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
550             };
551             rho_xvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle02);
552             rho_yvec = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle13);
553          } else {
554             LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH];
555             LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH];
556             assert(dims == 3);
557             for (unsigned i = 0; i < num_quads; i++) {
558                shuffles1[4*i + 0] = lp_build_const_int32(gallivm, 4*i);
559                shuffles1[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 2);
560                shuffles1[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i);
561                shuffles1[4*i + 3] = i32undef;
562                shuffles2[4*i + 0] = lp_build_const_int32(gallivm, 4*i + 1);
563                shuffles2[4*i + 1] = lp_build_const_int32(gallivm, 4*i + 3);
564                shuffles2[4*i + 2] = lp_build_const_int32(gallivm, length + 4*i + 2);
565                shuffles2[4*i + 3] = i32undef;
566             }
567             rho_xvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
568                                               LLVMConstVector(shuffles1, length), "");
569             rho_yvec = LLVMBuildShuffleVector(builder, ddx_ddy[0], ddx_ddy[1],
570                                               LLVMConstVector(shuffles2, length), "");
571          }
572 
573          rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
574 
575          if (bld->coord_type.length > 4) {
576             /* expand size to each quad */
577             if (dims > 1) {
578                /* could use some broadcast_vector helper for this? */
579                LLVMValueRef src[LP_MAX_VECTOR_LENGTH/4];
580                for (unsigned i = 0; i < num_quads; i++) {
581                   src[i] = float_size;
582                }
583                float_size = lp_build_concat(bld->gallivm, src,
584                                             float_size_bld->type, num_quads);
585             } else {
586                float_size = lp_build_broadcast_scalar(coord_bld, float_size);
587             }
588             rho_vec = lp_build_mul(coord_bld, rho_vec, float_size);
589 
590             if (dims <= 1) {
591                rho = rho_vec;
592             } else {
593                if (dims >= 2) {
594                   LLVMValueRef rho_s, rho_t, rho_r;
595 
596                   rho_s = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
597                   rho_t = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
598 
599                   rho = lp_build_max(coord_bld, rho_s, rho_t);
600 
601                   if (dims >= 3) {
602                      rho_r = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle2);
603                      rho = lp_build_max(coord_bld, rho, rho_r);
604                   }
605                }
606             }
607             if (rho_per_quad) {
608                rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
609                                                rho_bld->type, rho, 0);
610             } else {
611                rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
612             }
613          } else {
614             if (dims <= 1) {
615                rho_vec = LLVMBuildExtractElement(builder, rho_vec, index0, "");
616             }
617             rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
618 
619             if (dims <= 1) {
620                rho = rho_vec;
621             } else {
622                if (dims >= 2) {
623                   LLVMValueRef rho_s, rho_t, rho_r;
624 
625                   rho_s = LLVMBuildExtractElement(builder, rho_vec, index0, "");
626                   rho_t = LLVMBuildExtractElement(builder, rho_vec, index1, "");
627 
628                   rho = lp_build_max(float_bld, rho_s, rho_t);
629 
630                   if (dims >= 3) {
631                      rho_r = LLVMBuildExtractElement(builder, rho_vec, index2, "");
632                      rho = lp_build_max(float_bld, rho, rho_r);
633                   }
634                }
635             }
636             if (!rho_per_quad) {
637                rho = lp_build_broadcast_scalar(rho_bld, rho);
638             }
639          }
640       }
641    }
642 
643    return rho;
644 }
645 
646 
647 /*
648  * Bri-linear lod computation
649  *
650  * Use a piece-wise linear approximation of log2 such that:
651  * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
652  * - linear approximation for values in the neighborhood of 0.5, 1.5., etc,
653  *   with the steepness specified in 'factor'
654  * - exact result for 0.5, 1.5, etc.
655  *
656  *
657  *   1.0 -              /----*
658  *                     /
659  *                    /
660  *                   /
661  *   0.5 -          *
662  *                 /
663  *                /
664  *               /
665  *   0.0 - *----/
666  *
667  *         |                 |
668  *        2^0               2^1
669  *
670  * This is a technique also commonly used in hardware:
671  * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
672  *
673  * TODO: For correctness, this should only be applied when texture is known to
674  * have regular mipmaps, i.e., mipmaps derived from the base level.
675  *
676  * TODO: This could be done in fixed point, where applicable.
677  */
678 static void
lp_build_brilinear_lod(struct lp_build_context * bld,LLVMValueRef lod,double factor,LLVMValueRef * out_lod_ipart,LLVMValueRef * out_lod_fpart)679 lp_build_brilinear_lod(struct lp_build_context *bld,
680                        LLVMValueRef lod,
681                        double factor,
682                        LLVMValueRef *out_lod_ipart,
683                        LLVMValueRef *out_lod_fpart)
684 {
685    LLVMValueRef lod_fpart;
686    double pre_offset = (factor - 0.5)/factor - 0.5;
687    double post_offset = 1 - factor;
688 
689    if (0) {
690       lp_build_printf(bld->gallivm, "lod = %f\n", lod);
691    }
692 
693    lod = lp_build_add(bld, lod,
694                       lp_build_const_vec(bld->gallivm, bld->type, pre_offset));
695 
696    lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);
697 
698    lod_fpart = lp_build_mad(bld, lod_fpart,
699                             lp_build_const_vec(bld->gallivm, bld->type, factor),
700                             lp_build_const_vec(bld->gallivm, bld->type, post_offset));
701 
702    /*
703     * It's not necessary to clamp lod_fpart since:
704     * - the above expression will never produce numbers greater than one.
705     * - the mip filtering branch is only taken if lod_fpart is positive
706     */
707 
708    *out_lod_fpart = lod_fpart;
709 
710    if (0) {
711       lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
712       lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
713    }
714 }
715 
716 
717 /*
718  * Combined log2 and brilinear lod computation.
719  *
720  * It's in all identical to calling lp_build_fast_log2() and
721  * lp_build_brilinear_lod() above, but by combining we can compute the integer
722  * and fractional part independently.
723  */
724 static void
lp_build_brilinear_rho(struct lp_build_context * bld,LLVMValueRef rho,double factor,LLVMValueRef * out_lod_ipart,LLVMValueRef * out_lod_fpart)725 lp_build_brilinear_rho(struct lp_build_context *bld,
726                        LLVMValueRef rho,
727                        double factor,
728                        LLVMValueRef *out_lod_ipart,
729                        LLVMValueRef *out_lod_fpart)
730 {
731    const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
732    const double post_offset = 1 - 2*factor;
733 
734    assert(bld->type.floating);
735 
736    assert(lp_check_value(bld->type, rho));
737 
738    /*
739     * The pre factor will make the intersections with the exact powers of two
740     * happen precisely where we want them to be, which means that the integer
741     * part will not need any post adjustments.
742     */
743    rho = lp_build_mul(bld, rho,
744                       lp_build_const_vec(bld->gallivm, bld->type, pre_factor));
745 
746    /* ipart = ifloor(log2(rho)) */
747    LLVMValueRef lod_ipart = lp_build_extract_exponent(bld, rho, 0);
748 
749    /* fpart = rho / 2**ipart */
750    LLVMValueRef lod_fpart = lp_build_extract_mantissa(bld, rho);
751 
752    lod_fpart =
753       lp_build_mad(bld, lod_fpart,
754                    lp_build_const_vec(bld->gallivm, bld->type, factor),
755                    lp_build_const_vec(bld->gallivm, bld->type, post_offset));
756 
757    /*
758     * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
759     * - the above expression will never produce numbers greater than one.
760     * - the mip filtering branch is only taken if lod_fpart is positive
761     */
762 
763    *out_lod_ipart = lod_ipart;
764    *out_lod_fpart = lod_fpart;
765 }
766 
767 
768 /**
769  * Fast implementation of iround(log2(sqrt(x))), based on
770  * log2(x^n) == n*log2(x).
771  *
772  * Gives accurate results all the time.
773  * (Could be trivially extended to handle other power-of-two roots.)
774  */
775 static LLVMValueRef
lp_build_ilog2_sqrt(struct lp_build_context * bld,LLVMValueRef x)776 lp_build_ilog2_sqrt(struct lp_build_context *bld,
777                     LLVMValueRef x)
778 {
779    LLVMBuilderRef builder = bld->gallivm->builder;
780    struct lp_type i_type = lp_int_type(bld->type);
781    LLVMValueRef one = lp_build_const_int_vec(bld->gallivm, i_type, 1);
782 
783    assert(bld->type.floating);
784 
785    assert(lp_check_value(bld->type, x));
786 
787    /* ipart = log2(x) + 0.5 = 0.5*(log2(x^2) + 1.0) */
788    LLVMValueRef ipart = lp_build_extract_exponent(bld, x, 1);
789    ipart = LLVMBuildAShr(builder, ipart, one, "");
790 
791    return ipart;
792 }
793 
794 
795 /**
796  * Generate code to compute texture level of detail (lambda).
797  * \param derivs  partial derivatives of (s, t, r, q) with respect to X and Y
798  * \param lod_bias  optional float vector with the shader lod bias
799  * \param explicit_lod  optional float vector with the explicit lod
800  * \param out_lod_ipart  integer part of lod
801  * \param out_lod_fpart  float part of lod (never larger than 1 but may be negative)
802  * \param out_lod_positive  (mask) if lod is positive (i.e. texture is minified)
803  *
804  * The resulting lod can be scalar per quad or be per element.
805  */
806 void
lp_build_lod_selector(struct lp_build_sample_context * bld,bool is_lodq,unsigned sampler_unit,LLVMValueRef first_level,LLVMValueRef s,LLVMValueRef t,LLVMValueRef r,const struct lp_derivatives * derivs,LLVMValueRef lod_bias,LLVMValueRef explicit_lod,enum pipe_tex_mipfilter mip_filter,LLVMValueRef max_aniso,LLVMValueRef * out_lod,LLVMValueRef * out_lod_ipart,LLVMValueRef * out_lod_fpart,LLVMValueRef * out_lod_positive)807 lp_build_lod_selector(struct lp_build_sample_context *bld,
808                       bool is_lodq,
809                       unsigned sampler_unit,
810                       LLVMValueRef first_level,
811                       LLVMValueRef s,
812                       LLVMValueRef t,
813                       LLVMValueRef r,
814                       const struct lp_derivatives *derivs,
815                       LLVMValueRef lod_bias, /* optional */
816                       LLVMValueRef explicit_lod, /* optional */
817                       enum pipe_tex_mipfilter mip_filter,
818                       LLVMValueRef max_aniso,
819                       LLVMValueRef *out_lod,
820                       LLVMValueRef *out_lod_ipart,
821                       LLVMValueRef *out_lod_fpart,
822                       LLVMValueRef *out_lod_positive)
823 
824 {
825    LLVMBuilderRef builder = bld->gallivm->builder;
826    struct lp_sampler_dynamic_state *dynamic_state = bld->dynamic_state;
827    struct lp_build_context *lodf_bld = &bld->lodf_bld;
828    LLVMValueRef lod;
829 
830    *out_lod_ipart = bld->lodi_bld.zero;
831    *out_lod_positive = bld->lodi_bld.zero;
832    *out_lod_fpart = lodf_bld->zero;
833 
834    /*
835     * For determining min/mag, we follow GL 4.1 spec, 3.9.12 Texture
836     * Magnification: "Implementations may either unconditionally assume c = 0
837     * for the minification vs. magnification switch-over point, or may choose
838     * to make c depend on the combination of minification and magnification
839     * modes as follows: if the magnification filter is given by LINEAR and the
840     * minification filter is given by NEAREST_MIPMAP_NEAREST or
841     * NEAREST_MIPMAP_LINEAR, then c = 0.5. This is done to ensure that a
842     * minified texture does not appear "sharper" than a magnified
843     * texture. Otherwise c = 0."  And 3.9.11 Texture Minification: "If lod is
844     * less than or equal to the constant c (see section 3.9.12) the texture is
845     * said to be magnified; if it is greater, the texture is minified."  So,
846     * using 0 as switchover point always, and using magnification for lod ==
847     * 0.  Note that the always c = 0 behavior is new (first appearing in GL
848     * 3.1 spec), old GL versions required 0.5 for the modes listed above.  I
849     * have no clue about the (undocumented) wishes of d3d9/d3d10 here!
850     */
851 
852    if (bld->static_sampler_state->min_max_lod_equal && !is_lodq) {
853       /* User is forcing sampling from a particular mipmap level.
854        * This is hit during mipmap generation.
855        */
856       LLVMValueRef min_lod =
857          dynamic_state->min_lod(bld->gallivm, bld->resources_type,
858                                 bld->resources_ptr, sampler_unit);
859 
860       lod = lp_build_broadcast_scalar(lodf_bld, min_lod);
861    } else {
862       if (explicit_lod) {
863          if (bld->num_lods != bld->coord_type.length)
864             lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
865                                             lodf_bld->type, explicit_lod, 0);
866          else
867             lod = explicit_lod;
868       } else {
869          LLVMValueRef rho;
870          bool rho_squared = bld->no_rho_approx && (bld->dims > 1);
871 
872          if (bld->static_sampler_state->aniso &&
873              !explicit_lod) {
874             rho = lp_build_pmin(bld, first_level, s, t, max_aniso);
875             rho_squared = true;
876          } else {
877             rho = lp_build_rho(bld, first_level, s, t, r, derivs);
878          }
879 
880          /*
881           * Compute lod = log2(rho)
882           */
883 
884          if (!lod_bias && !is_lodq &&
885              !bld->static_sampler_state->aniso &&
886              !bld->static_sampler_state->lod_bias_non_zero &&
887              !bld->static_sampler_state->apply_max_lod &&
888              !bld->static_sampler_state->apply_min_lod) {
889             /*
890              * Special case when there are no post-log2 adjustments, which
891              * saves instructions but keeping the integer and fractional lod
892              * computations separate from the start.
893              */
894 
895             if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
896                 mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
897                /*
898                 * Don't actually need both values all the time, lod_ipart is
899                 * needed for nearest mipfilter, lod_positive if min != mag.
900                 */
901                if (rho_squared) {
902                   *out_lod_ipart = lp_build_ilog2_sqrt(lodf_bld, rho);
903                } else {
904                   *out_lod_ipart = lp_build_ilog2(lodf_bld, rho);
905                }
906                *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
907                                                 rho, lodf_bld->one);
908                return;
909             }
910             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
911                 !bld->no_brilinear && !rho_squared &&
912                 !bld->static_sampler_state->aniso) {
913                /*
914                 * This can't work if rho is squared. Not sure if it could be
915                 * fixed while keeping it worthwile, could also do sqrt here
916                 * but brilinear and no_rho_opt seems like a combination not
917                 * making much sense anyway so just use ordinary path below.
918                 */
919                lp_build_brilinear_rho(lodf_bld, rho, BRILINEAR_FACTOR,
920                                       out_lod_ipart, out_lod_fpart);
921                *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
922                                                 rho, lodf_bld->one);
923                return;
924             }
925          }
926 
927          if (0) {
928             lod = lp_build_log2(lodf_bld, rho);
929          } else {
930             /* get more accurate results if we just sqaure rho always */
931             if (!rho_squared)
932                rho = lp_build_mul(lodf_bld, rho, rho);
933             lod = lp_build_fast_log2(lodf_bld, rho);
934          }
935 
936          /* log2(x^2) == 0.5*log2(x) */
937          lod = lp_build_mul(lodf_bld, lod,
938                             lp_build_const_vec(bld->gallivm,
939                                                lodf_bld->type, 0.5F));
940 
941          /* add shader lod bias */
942          if (lod_bias) {
943             if (bld->num_lods != bld->coord_type.length)
944                lod_bias = lp_build_pack_aos_scalars(bld->gallivm,
945                                                     bld->coord_bld.type,
946                                                     lodf_bld->type,
947                                                     lod_bias, 0);
948             lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
949          }
950       }
951 
952       /* add sampler lod bias */
953       if (bld->static_sampler_state->lod_bias_non_zero) {
954          LLVMValueRef sampler_lod_bias =
955             dynamic_state->lod_bias(bld->gallivm, bld->resources_type,
956                                     bld->resources_ptr, sampler_unit);
957          sampler_lod_bias = lp_build_broadcast_scalar(lodf_bld,
958                                                       sampler_lod_bias);
959          lod = LLVMBuildFAdd(builder, lod, sampler_lod_bias, "sampler_lod_bias");
960       }
961 
962       if (is_lodq) {
963          *out_lod = lod;
964       }
965 
966       /* clamp lod */
967       if (bld->static_sampler_state->apply_max_lod) {
968          LLVMValueRef max_lod =
969             dynamic_state->max_lod(bld->gallivm, bld->resources_type,
970                                    bld->resources_ptr, sampler_unit);
971          max_lod = lp_build_broadcast_scalar(lodf_bld, max_lod);
972 
973          lod = lp_build_min(lodf_bld, lod, max_lod);
974       }
975       if (bld->static_sampler_state->apply_min_lod) {
976          LLVMValueRef min_lod =
977             dynamic_state->min_lod(bld->gallivm, bld->resources_type,
978                                    bld->resources_ptr, sampler_unit);
979          min_lod = lp_build_broadcast_scalar(lodf_bld, min_lod);
980 
981          lod = lp_build_max(lodf_bld, lod, min_lod);
982       }
983 
984       if (is_lodq) {
985          *out_lod_fpart = lod;
986          return;
987       }
988    }
989 
990    *out_lod_positive = lp_build_cmp(lodf_bld, PIPE_FUNC_GREATER,
991                                     lod, lodf_bld->zero);
992 
993    if (bld->static_sampler_state->aniso) {
994       *out_lod_ipart = lp_build_itrunc(lodf_bld, lod);
995    } else if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
996       if (!bld->no_brilinear) {
997          lp_build_brilinear_lod(lodf_bld, lod, BRILINEAR_FACTOR,
998                                 out_lod_ipart, out_lod_fpart);
999       } else {
1000          lp_build_ifloor_fract(lodf_bld, lod, out_lod_ipart, out_lod_fpart);
1001       }
1002 
1003       lp_build_name(*out_lod_fpart, "lod_fpart");
1004    } else {
1005       *out_lod_ipart = lp_build_iround(lodf_bld, lod);
1006    }
1007 
1008    lp_build_name(*out_lod_ipart, "lod_ipart");
1009 
1010    return;
1011 }
1012 
1013 
1014 /**
1015  * For PIPE_TEX_MIPFILTER_NEAREST, convert int part of lod
1016  * to actual mip level.
1017  * Note: this is all scalar per quad code.
1018  * \param lod_ipart  int texture level of detail
1019  * \param level_out  returns integer
1020  * \param out_of_bounds returns per coord out_of_bounds mask if provided
1021  */
1022 void
lp_build_nearest_mip_level(struct lp_build_sample_context * bld,LLVMValueRef first_level,LLVMValueRef last_level,LLVMValueRef lod_ipart,LLVMValueRef * level_out,LLVMValueRef * out_of_bounds)1023 lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
1024                            LLVMValueRef first_level,
1025                            LLVMValueRef last_level,
1026                            LLVMValueRef lod_ipart,
1027                            LLVMValueRef *level_out,
1028                            LLVMValueRef *out_of_bounds)
1029 {
1030    struct lp_build_context *leveli_bld = &bld->leveli_bld;
1031    LLVMValueRef level = lp_build_add(leveli_bld, lod_ipart, first_level);
1032 
1033    if (out_of_bounds) {
1034       LLVMValueRef out, out1;
1035       out = lp_build_cmp(leveli_bld, PIPE_FUNC_LESS, level, first_level);
1036       out1 = lp_build_cmp(leveli_bld, PIPE_FUNC_GREATER, level, last_level);
1037       out = lp_build_or(leveli_bld, out, out1);
1038       if (bld->num_mips == bld->coord_bld.type.length) {
1039          *out_of_bounds = out;
1040       } else if (bld->num_mips == 1) {
1041          *out_of_bounds = lp_build_broadcast_scalar(&bld->int_coord_bld, out);
1042       } else {
1043          assert(bld->num_mips == bld->coord_bld.type.length / 4);
1044          *out_of_bounds =
1045             lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1046                                                   leveli_bld->type,
1047                                                   bld->int_coord_bld.type,
1048                                                   out);
1049       }
1050       level = lp_build_andnot(&bld->int_coord_bld, level, *out_of_bounds);
1051       *level_out = level;
1052    } else {
1053       /* clamp level to legal range of levels */
1054       *level_out = lp_build_clamp(leveli_bld, level, first_level, last_level);
1055 
1056    }
1057 }
1058 
1059 
1060 /**
1061  * For PIPE_TEX_MIPFILTER_LINEAR, convert per-quad (or per element) int LOD(s)
1062  * to two (per-quad) (adjacent) mipmap level indexes, and fix up float lod
1063  * part accordingly.
1064  * Later, we'll sample from those two mipmap levels and interpolate between
1065  * them.
1066  */
1067 void
lp_build_linear_mip_levels(struct lp_build_sample_context * bld,unsigned texture_unit,LLVMValueRef first_level,LLVMValueRef last_level,LLVMValueRef lod_ipart,LLVMValueRef * lod_fpart_inout,LLVMValueRef * level0_out,LLVMValueRef * level1_out)1068 lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
1069                            unsigned texture_unit,
1070                            LLVMValueRef first_level,
1071                            LLVMValueRef last_level,
1072                            LLVMValueRef lod_ipart,
1073                            LLVMValueRef *lod_fpart_inout,
1074                            LLVMValueRef *level0_out,
1075                            LLVMValueRef *level1_out)
1076 {
1077    LLVMBuilderRef builder = bld->gallivm->builder;
1078    struct lp_build_context *leveli_bld = &bld->leveli_bld;
1079    struct lp_build_context *levelf_bld = &bld->levelf_bld;
1080    LLVMValueRef clamp_min;
1081    LLVMValueRef clamp_max;
1082 
1083    assert(bld->num_lods == bld->num_mips);
1084 
1085    *level0_out = lp_build_add(leveli_bld, lod_ipart, first_level);
1086    *level1_out = lp_build_add(leveli_bld, *level0_out, leveli_bld->one);
1087 
1088    /*
1089     * Clamp both *level0_out and *level1_out to [first_level, last_level],
1090     * with the minimum number of comparisons, and zeroing lod_fpart in the
1091     * extreme ends in the process.
1092     */
1093 
1094    /* *level0_out < first_level */
1095    clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
1096                              *level0_out, first_level,
1097                              "clamp_lod_to_first");
1098 
1099    *level0_out = LLVMBuildSelect(builder, clamp_min,
1100                                  first_level, *level0_out, "");
1101 
1102    *level1_out = LLVMBuildSelect(builder, clamp_min,
1103                                  first_level, *level1_out, "");
1104 
1105    *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
1106                                       levelf_bld->zero, *lod_fpart_inout, "");
1107 
1108    /* *level0_out >= last_level */
1109    clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
1110                              *level0_out, last_level,
1111                              "clamp_lod_to_last");
1112 
1113    *level0_out = LLVMBuildSelect(builder, clamp_max,
1114                                  last_level, *level0_out, "");
1115 
1116    *level1_out = LLVMBuildSelect(builder, clamp_max,
1117                                  last_level, *level1_out, "");
1118 
1119    *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
1120                                       levelf_bld->zero, *lod_fpart_inout, "");
1121 
1122    lp_build_name(*level0_out, "texture%u_miplevel0", texture_unit);
1123    lp_build_name(*level1_out, "texture%u_miplevel1", texture_unit);
1124    lp_build_name(*lod_fpart_inout, "texture%u_mipweight", texture_unit);
1125 }
1126 
1127 
1128 /**
1129  * A helper function that factorizes this common pattern.
1130  */
1131 LLVMValueRef
lp_sample_load_mip_value(struct gallivm_state * gallivm,LLVMTypeRef ptr_type,LLVMValueRef offsets,LLVMValueRef index1)1132 lp_sample_load_mip_value(struct gallivm_state *gallivm,
1133                          LLVMTypeRef ptr_type,
1134                          LLVMValueRef offsets,
1135                          LLVMValueRef index1)
1136 {
1137    LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
1138    LLVMValueRef indexes[2] = {zero, index1};
1139    LLVMValueRef ptr = LLVMBuildGEP2(gallivm->builder, ptr_type, offsets,
1140                                     indexes, ARRAY_SIZE(indexes), "");
1141    return LLVMBuildLoad2(gallivm->builder,
1142                          LLVMInt32TypeInContext(gallivm->context), ptr, "");
1143 }
1144 
1145 
1146 /**
1147  * Return pointer to a single mipmap level.
1148  * \param level  integer mipmap level
1149  */
1150 LLVMValueRef
lp_build_get_mipmap_level(struct lp_build_sample_context * bld,LLVMValueRef level)1151 lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
1152                           LLVMValueRef level)
1153 {
1154    LLVMValueRef mip_offset = lp_sample_load_mip_value(bld->gallivm, bld->mip_offsets_type,
1155                                                       bld->mip_offsets, level);
1156    LLVMBuilderRef builder = bld->gallivm->builder;
1157    LLVMValueRef data_ptr =
1158       LLVMBuildGEP2(builder,
1159                     LLVMInt8TypeInContext(bld->gallivm->context),
1160                     bld->base_ptr, &mip_offset, 1, "");
1161    return data_ptr;
1162 }
1163 
1164 
1165 /**
1166  * Return (per-pixel) offsets to mip levels.
1167  * \param level  integer mipmap level
1168  */
1169 LLVMValueRef
lp_build_get_mip_offsets(struct lp_build_sample_context * bld,LLVMValueRef level)1170 lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
1171                          LLVMValueRef level)
1172 {
1173    LLVMBuilderRef builder = bld->gallivm->builder;
1174    LLVMValueRef offsets, offset1;
1175 
1176    if (bld->num_mips == 1) {
1177       offset1 = lp_sample_load_mip_value(bld->gallivm, bld->mip_offsets_type, bld->mip_offsets, level);
1178       offsets = lp_build_broadcast_scalar(&bld->int_coord_bld, offset1);
1179    } else if (bld->num_mips == bld->coord_bld.type.length / 4) {
1180       offsets = bld->int_coord_bld.undef;
1181       for (unsigned i = 0; i < bld->num_mips; i++) {
1182          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1183          offset1 = lp_sample_load_mip_value(bld->gallivm, bld->mip_offsets_type,
1184                                             bld->mip_offsets,
1185                                             LLVMBuildExtractElement(builder, level,
1186                                                                     indexi, ""));
1187          LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
1188          offsets = LLVMBuildInsertElement(builder, offsets, offset1,
1189                                           indexo, "");
1190       }
1191       offsets = lp_build_swizzle_scalar_aos(&bld->int_coord_bld,
1192                                             offsets, 0, 4);
1193    } else {
1194       assert (bld->num_mips == bld->coord_bld.type.length);
1195 
1196       offsets = bld->int_coord_bld.undef;
1197       for (unsigned i = 0; i < bld->num_mips; i++) {
1198          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1199          offset1 = lp_sample_load_mip_value(bld->gallivm, bld->mip_offsets_type,
1200                                             bld->mip_offsets,
1201                                             LLVMBuildExtractElement(builder, level,
1202                                                                     indexi, ""));
1203          offsets = LLVMBuildInsertElement(builder, offsets, offset1,
1204                                           indexi, "");
1205       }
1206    }
1207    return offsets;
1208 }
1209 
1210 
1211 /**
1212  * Codegen equivalent for u_minify().
1213  * @param lod_scalar  if lod is a (broadcasted) scalar
1214  * Return max(1, base_size >> level);
1215  */
1216 LLVMValueRef
lp_build_minify(struct lp_build_context * bld,LLVMValueRef base_size,LLVMValueRef level,bool lod_scalar)1217 lp_build_minify(struct lp_build_context *bld,
1218                 LLVMValueRef base_size,
1219                 LLVMValueRef level,
1220                 bool lod_scalar)
1221 {
1222    LLVMBuilderRef builder = bld->gallivm->builder;
1223    assert(lp_check_value(bld->type, base_size));
1224    assert(lp_check_value(bld->type, level));
1225 
1226    if (level == bld->zero) {
1227       /* if we're using mipmap level zero, no minification is needed */
1228       return base_size;
1229    } else {
1230       LLVMValueRef size;
1231       assert(bld->type.sign);
1232       if (lod_scalar ||
1233          (util_get_cpu_caps()->has_avx2 || !util_get_cpu_caps()->has_sse)) {
1234          size = LLVMBuildLShr(builder, base_size, level, "minify");
1235          size = lp_build_max(bld, size, bld->one);
1236       } else {
1237          /*
1238           * emulate shift with float mul, since intel "forgot" shifts with
1239           * per-element shift count until avx2, which results in terrible
1240           * scalar extraction (both count and value), scalar shift,
1241           * vector reinsertion. Should not be an issue on any non-x86 cpu
1242           * with a vector instruction set.
1243           * On cpus with AMD's XOP this should also be unnecessary but I'm
1244           * not sure if llvm would emit this with current flags.
1245           */
1246          LLVMValueRef const127, const23, lf;
1247          struct lp_type ftype;
1248          struct lp_build_context fbld;
1249          ftype = lp_type_float_vec(32, bld->type.length * bld->type.width);
1250          lp_build_context_init(&fbld, bld->gallivm, ftype);
1251          const127 = lp_build_const_int_vec(bld->gallivm, bld->type, 127);
1252          const23 = lp_build_const_int_vec(bld->gallivm, bld->type, 23);
1253 
1254          /* calculate 2^(-level) float */
1255          lf = lp_build_sub(bld, const127, level);
1256          lf = lp_build_shl(bld, lf, const23);
1257          lf = LLVMBuildBitCast(builder, lf, fbld.vec_type, "");
1258 
1259          /* finish shift operation by doing float mul */
1260          base_size = lp_build_int_to_float(&fbld, base_size);
1261          size = lp_build_mul(&fbld, base_size, lf);
1262          /*
1263           * do the max also with floats because
1264           * a) non-emulated int max requires sse41
1265           *    (this is actually a lie as we could cast to 16bit values
1266           *    as 16bit is sufficient and 16bit int max is sse2)
1267           * b) with avx we can do int max 4-wide but float max 8-wide
1268           */
1269          size = lp_build_max(&fbld, size, fbld.one);
1270          size = lp_build_itrunc(&fbld, size);
1271       }
1272       return size;
1273    }
1274 }
1275 
1276 
1277 /*
1278  * Scale image dimensions with block sizes.
1279  *
1280  * tex_blocksize is the resource format blocksize
1281  * view_blocksize is the view format blocksize
1282  *
1283  * This must be applied post-minification, but
1284  * only when blocksizes are different.
1285  *
1286  * ret = (size + (tex_blocksize - 1)) >> log2(tex_blocksize);
1287  * ret *= blocksize;
1288  */
1289 LLVMValueRef
lp_build_scale_view_dims(struct lp_build_context * bld,LLVMValueRef size,LLVMValueRef tex_blocksize,LLVMValueRef tex_blocksize_log2,LLVMValueRef view_blocksize)1290 lp_build_scale_view_dims(struct lp_build_context *bld, LLVMValueRef size,
1291                          LLVMValueRef tex_blocksize,
1292                          LLVMValueRef tex_blocksize_log2,
1293                          LLVMValueRef view_blocksize)
1294 {
1295    LLVMBuilderRef builder = bld->gallivm->builder;
1296    LLVMValueRef ret =
1297       LLVMBuildAdd(builder, size,
1298                    LLVMBuildSub(builder, tex_blocksize,
1299                                 lp_build_const_int_vec(bld->gallivm,
1300                                                        bld->type, 1), ""),
1301                    "");
1302    ret = LLVMBuildLShr(builder, ret, tex_blocksize_log2, "");
1303    ret = LLVMBuildMul(builder, ret, view_blocksize, "");
1304    return ret;
1305 }
1306 
1307 
1308 /*
1309  * Scale a single image dimension.
1310  *
1311  * Scale one image between resource and view blocksizes.
1312  * noop if sizes are the same.
1313  */
1314 LLVMValueRef
lp_build_scale_view_dim(struct gallivm_state * gallivm,LLVMValueRef size,unsigned tex_blocksize,unsigned view_blocksize)1315 lp_build_scale_view_dim(struct gallivm_state *gallivm, LLVMValueRef size,
1316                         unsigned tex_blocksize, unsigned view_blocksize)
1317 {
1318    if (tex_blocksize == view_blocksize)
1319       return size;
1320 
1321    LLVMBuilderRef builder = gallivm->builder;
1322    LLVMValueRef ret =
1323       LLVMBuildAdd(builder, size,
1324                    lp_build_const_int32(gallivm, tex_blocksize - 1), "");
1325    ret = LLVMBuildLShr(builder, ret,
1326                        lp_build_const_int32(gallivm,
1327                                             util_logbase2(tex_blocksize)), "");
1328    ret = LLVMBuildMul(builder, ret,
1329                       lp_build_const_int32(gallivm, view_blocksize), "");
1330    return ret;
1331 }
1332 
1333 
1334 /**
1335  * Dereference stride_array[mipmap_level] array to get a stride.
1336  * Return stride as a vector.
1337  */
1338 static LLVMValueRef
lp_build_get_level_stride_vec(struct lp_build_sample_context * bld,LLVMTypeRef stride_type,LLVMValueRef stride_array,LLVMValueRef level)1339 lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
1340                               LLVMTypeRef stride_type,
1341                               LLVMValueRef stride_array, LLVMValueRef level)
1342 {
1343    LLVMBuilderRef builder = bld->gallivm->builder;
1344    LLVMValueRef stride, stride1;
1345 
1346    if (bld->num_mips == 1) {
1347       stride1 = lp_sample_load_mip_value(bld->gallivm, stride_type, stride_array, level);
1348       stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
1349    } else if (bld->num_mips == bld->coord_bld.type.length / 4) {
1350       LLVMValueRef stride1;
1351 
1352       stride = bld->int_coord_bld.undef;
1353       for (unsigned i = 0; i < bld->num_mips; i++) {
1354          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1355          stride1 = lp_sample_load_mip_value(bld->gallivm, stride_type, stride_array,
1356                                             LLVMBuildExtractElement(builder, level,
1357                                                                     indexi, ""));
1358          LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
1359          stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
1360       }
1361       stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0, 4);
1362    } else {
1363       LLVMValueRef stride1;
1364 
1365       assert (bld->num_mips == bld->coord_bld.type.length);
1366 
1367       stride = bld->int_coord_bld.undef;
1368       for (unsigned i = 0; i < bld->coord_bld.type.length; i++) {
1369          LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1370          stride1 = lp_sample_load_mip_value(bld->gallivm, stride_type, stride_array,
1371                                             LLVMBuildExtractElement(builder, level,
1372                                                                     indexi, ""));
1373          stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
1374       }
1375    }
1376    return stride;
1377 }
1378 
1379 
1380 /**
1381  * When sampling a mipmap, we need to compute the width, height, depth
1382  * of the source levels from the level indexes.  This helper function
1383  * does that.
1384  */
1385 void
lp_build_mipmap_level_sizes(struct lp_build_sample_context * bld,LLVMValueRef ilevel,LLVMValueRef * out_size,LLVMValueRef * row_stride_vec,LLVMValueRef * img_stride_vec)1386 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
1387                             LLVMValueRef ilevel,
1388                             LLVMValueRef *out_size,
1389                             LLVMValueRef *row_stride_vec,
1390                             LLVMValueRef *img_stride_vec)
1391 {
1392    const unsigned dims = bld->dims;
1393    LLVMValueRef ilevel_vec;
1394 
1395    /*
1396     * Compute width, height, depth at mipmap level 'ilevel'
1397     */
1398    if (bld->num_mips == 1) {
1399       ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
1400       *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size,
1401                                   ilevel_vec, true);
1402       *out_size = lp_build_scale_view_dims(&bld->int_size_bld, *out_size,
1403                                            bld->int_tex_blocksize,
1404                                            bld->int_tex_blocksize_log2,
1405                                            bld->int_view_blocksize);
1406    } else {
1407       LLVMValueRef int_size_vec;
1408       LLVMValueRef int_tex_blocksize_vec, int_tex_blocksize_log2_vec;
1409       LLVMValueRef int_view_blocksize_vec;
1410       LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
1411       const unsigned num_quads = bld->coord_bld.type.length / 4;
1412 
1413       if (bld->num_mips == num_quads) {
1414          /*
1415           * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
1416           * intel "forgot" the variable shift count instruction until avx2.
1417           * A harmless 8x32 shift gets translated into 32 instructions
1418           * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
1419           * unable to recognize if there are really just 2 different shift
1420           * count values. So do the shift 4-wide before expansion.
1421           */
1422          struct lp_build_context bld4;
1423          struct lp_type type4;
1424 
1425          type4 = bld->int_coord_bld.type;
1426          type4.length = 4;
1427 
1428          lp_build_context_init(&bld4, bld->gallivm, type4);
1429 
1430          if (bld->dims == 1) {
1431             assert(bld->int_size_in_bld.type.length == 1);
1432             int_size_vec = lp_build_broadcast_scalar(&bld4,
1433                                                      bld->int_size);
1434             int_tex_blocksize_vec =
1435                lp_build_broadcast_scalar(&bld4, bld->int_tex_blocksize);
1436             int_tex_blocksize_log2_vec =
1437                lp_build_broadcast_scalar(&bld4, bld->int_tex_blocksize_log2);
1438             int_view_blocksize_vec =
1439                lp_build_broadcast_scalar(&bld4, bld->int_view_blocksize);
1440          } else {
1441             assert(bld->int_size_in_bld.type.length == 4);
1442             int_size_vec = bld->int_size;
1443             int_tex_blocksize_vec = bld->int_tex_blocksize;
1444             int_tex_blocksize_log2_vec = bld->int_tex_blocksize_log2;
1445             int_view_blocksize_vec = bld->int_view_blocksize;
1446          }
1447 
1448          for (unsigned i = 0; i < num_quads; i++) {
1449             LLVMValueRef ileveli;
1450             LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1451 
1452             ileveli = lp_build_extract_broadcast(bld->gallivm,
1453                                                  bld->leveli_bld.type,
1454                                                  bld4.type,
1455                                                  ilevel,
1456                                                  indexi);
1457             tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli, true);
1458             tmp[i] = lp_build_scale_view_dims(&bld4, tmp[i],
1459                                               int_tex_blocksize_vec,
1460                                               int_tex_blocksize_log2_vec,
1461                                               int_view_blocksize_vec);
1462          }
1463          /*
1464           * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for
1465           * dims > 1, [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
1466           */
1467          *out_size = lp_build_concat(bld->gallivm,
1468                                      tmp,
1469                                      bld4.type,
1470                                      num_quads);
1471       } else {
1472          /* FIXME: this is terrible and results in _huge_ vector
1473           * (for the dims > 1 case).
1474           * Should refactor this (together with extract_image_sizes) and do
1475           * something more useful. Could for instance if we have width,height
1476           * with 4-wide vector pack all elements into a 8xi16 vector
1477           * (on which we can still do useful math) instead of using a 16xi32
1478           * vector.
1479           * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
1480           * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...]
1481           * vector.
1482           */
1483          assert(bld->num_mips == bld->coord_bld.type.length);
1484          if (bld->dims == 1) {
1485             assert(bld->int_size_in_bld.type.length == 1);
1486             int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
1487                                                      bld->int_size);
1488             int_tex_blocksize_vec =
1489                lp_build_broadcast_scalar(&bld->int_coord_bld,
1490                                          bld->int_tex_blocksize);
1491             int_tex_blocksize_log2_vec =
1492                lp_build_broadcast_scalar(&bld->int_coord_bld,
1493                                          bld->int_tex_blocksize_log2);
1494             int_view_blocksize_vec =
1495                lp_build_broadcast_scalar(&bld->int_coord_bld,
1496                                          bld->int_view_blocksize);
1497             *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec,
1498                                         ilevel, false);
1499             *out_size = lp_build_scale_view_dims(&bld->int_coord_bld,
1500                                                  *out_size,
1501                                                  int_tex_blocksize_vec,
1502                                                  int_tex_blocksize_log2_vec,
1503                                                  int_view_blocksize_vec);
1504          } else {
1505             LLVMValueRef ilevel1;
1506             for (unsigned i = 0; i < bld->num_mips; i++) {
1507                LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
1508                ilevel1 = lp_build_extract_broadcast(bld->gallivm,
1509                                                     bld->int_coord_type,
1510                                                     bld->int_size_in_bld.type,
1511                                                     ilevel, indexi);
1512                tmp[i] = bld->int_size;
1513                tmp[i] = lp_build_minify(&bld->int_size_in_bld, tmp[i],
1514                                         ilevel1, true);
1515                tmp[i] = lp_build_scale_view_dims(&bld->int_size_in_bld,
1516                                                  tmp[i],
1517                                                  bld->int_tex_blocksize,
1518                                                  bld->int_tex_blocksize_log2,
1519                                                  bld->int_view_blocksize);
1520             }
1521             *out_size = lp_build_concat(bld->gallivm, tmp,
1522                                         bld->int_size_in_bld.type,
1523                                         bld->num_mips);
1524          }
1525       }
1526    }
1527 
1528    if (dims >= 2) {
1529       *row_stride_vec = lp_build_get_level_stride_vec(bld,
1530                                                       bld->row_stride_type,
1531                                                       bld->row_stride_array,
1532                                                       ilevel);
1533    }
1534    if (dims == 3 || has_layer_coord(bld->static_texture_state->target)) {
1535       *img_stride_vec = lp_build_get_level_stride_vec(bld,
1536                                                       bld->img_stride_type,
1537                                                       bld->img_stride_array,
1538                                                       ilevel);
1539    }
1540 }
1541 
1542 
1543 /**
1544  * Extract and broadcast texture size.
1545  *
1546  * @param size_type   type of the texture size vector (either
1547  *                    bld->int_size_type or bld->float_size_type)
1548  * @param coord_type  type of the texture size vector (either
1549  *                    bld->int_coord_type or bld->coord_type)
1550  * @param size        vector with the texture size (width, height, depth)
1551  */
1552 void
lp_build_extract_image_sizes(struct lp_build_sample_context * bld,struct lp_build_context * size_bld,struct lp_type coord_type,LLVMValueRef size,LLVMValueRef * out_width,LLVMValueRef * out_height,LLVMValueRef * out_depth)1553 lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
1554                              struct lp_build_context *size_bld,
1555                              struct lp_type coord_type,
1556                              LLVMValueRef size,
1557                              LLVMValueRef *out_width,
1558                              LLVMValueRef *out_height,
1559                              LLVMValueRef *out_depth)
1560 {
1561    const unsigned dims = bld->dims;
1562    LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1563    struct lp_type size_type = size_bld->type;
1564 
1565    if (bld->num_mips == 1) {
1566       *out_width = lp_build_extract_broadcast(bld->gallivm,
1567                                               size_type,
1568                                               coord_type,
1569                                               size,
1570                                               LLVMConstInt(i32t, 0, 0));
1571       if (dims >= 2) {
1572          *out_height = lp_build_extract_broadcast(bld->gallivm,
1573                                                   size_type,
1574                                                   coord_type,
1575                                                   size,
1576                                                   LLVMConstInt(i32t, 1, 0));
1577          if (dims == 3) {
1578             *out_depth = lp_build_extract_broadcast(bld->gallivm,
1579                                                     size_type,
1580                                                     coord_type,
1581                                                     size,
1582                                                     LLVMConstInt(i32t, 2, 0));
1583          }
1584       }
1585    } else {
1586       unsigned num_quads = bld->coord_bld.type.length / 4;
1587 
1588       if (dims == 1) {
1589          *out_width = size;
1590       } else if (bld->num_mips == num_quads) {
1591          *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0, 4);
1592          if (dims >= 2) {
1593             *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1, 4);
1594             if (dims == 3) {
1595                *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2, 4);
1596             }
1597          }
1598       } else {
1599          assert(bld->num_mips == bld->coord_type.length);
1600          *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
1601                                                 coord_type, size, 0);
1602          if (dims >= 2) {
1603             *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
1604                                                     coord_type, size, 1);
1605             if (dims == 3) {
1606                *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
1607                                                       coord_type, size, 2);
1608             }
1609          }
1610       }
1611    }
1612 }
1613 
1614 
1615 /**
1616  * Unnormalize coords.
1617  *
1618  * @param flt_size  vector with the integer texture size (width, height, depth)
1619  */
1620 void
lp_build_unnormalized_coords(struct lp_build_sample_context * bld,LLVMValueRef flt_size,LLVMValueRef * s,LLVMValueRef * t,LLVMValueRef * r)1621 lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
1622                              LLVMValueRef flt_size,
1623                              LLVMValueRef *s,
1624                              LLVMValueRef *t,
1625                              LLVMValueRef *r)
1626 {
1627    const unsigned dims = bld->dims;
1628    LLVMValueRef width;
1629    LLVMValueRef height = NULL;
1630    LLVMValueRef depth = NULL;
1631 
1632    lp_build_extract_image_sizes(bld,
1633                                 &bld->float_size_bld,
1634                                 bld->coord_type,
1635                                 flt_size,
1636                                 &width,
1637                                 &height,
1638                                 &depth);
1639 
1640    /* s = s * width, t = t * height */
1641    *s = lp_build_mul(&bld->coord_bld, *s, width);
1642    if (dims >= 2) {
1643       *t = lp_build_mul(&bld->coord_bld, *t, height);
1644       if (dims >= 3) {
1645          *r = lp_build_mul(&bld->coord_bld, *r, depth);
1646       }
1647    }
1648 }
1649 
1650 
1651 /**
1652  * Generate new coords and faces for cubemap texels falling off the face.
1653  *
1654  * @param face   face (center) of the pixel
1655  * @param x0     lower x coord
1656  * @param x1     higher x coord (must be x0 + 1)
1657  * @param y0     lower y coord
1658  * @param y1     higher y coord (must be x0 + 1)
1659  * @param max_coord     texture cube (level) size - 1
1660  * @param next_faces    new face values when falling off
1661  * @param next_xcoords  new x coord values when falling off
1662  * @param next_ycoords  new y coord values when falling off
1663  *
1664  * The arrays hold the new values when under/overflow of
1665  * lower x, higher x, lower y, higher y coord would occur (in this order).
1666  * next_xcoords/next_ycoords have two entries each (for both new lower and
1667  * higher coord).
1668  */
1669 void
lp_build_cube_new_coords(struct lp_build_context * ivec_bld,LLVMValueRef face,LLVMValueRef x0,LLVMValueRef x1,LLVMValueRef y0,LLVMValueRef y1,LLVMValueRef max_coord,LLVMValueRef next_faces[4],LLVMValueRef next_xcoords[4][2],LLVMValueRef next_ycoords[4][2])1670 lp_build_cube_new_coords(struct lp_build_context *ivec_bld,
1671                         LLVMValueRef face,
1672                         LLVMValueRef x0,
1673                         LLVMValueRef x1,
1674                         LLVMValueRef y0,
1675                         LLVMValueRef y1,
1676                         LLVMValueRef max_coord,
1677                         LLVMValueRef next_faces[4],
1678                         LLVMValueRef next_xcoords[4][2],
1679                         LLVMValueRef next_ycoords[4][2])
1680 {
1681    /*
1682     * Lookup tables aren't nice for simd code hence try some logic here.
1683     * (Note that while it would not be necessary to do per-sample (4) lookups
1684     * when using a LUT as it's impossible that texels fall off of positive
1685     * and negative edges simultaneously, it would however be necessary to
1686     * do 2 lookups for corner handling as in this case texels both fall off
1687     * of x and y axes.)
1688     */
1689    /*
1690     * Next faces (for face 012345):
1691     * x < 0.0  : 451110
1692     * x >= 1.0 : 540001
1693     * y < 0.0  : 225422
1694     * y >= 1.0 : 334533
1695     * Hence nfx+ (and nfy+) == nfx- (nfy-) xor 1
1696     * nfx-: face > 1 ? (face == 5 ? 0 : 1) : (4 + face & 1)
1697     * nfy+: face & ~4 > 1 ? face + 2 : 3;
1698     * This could also use pshufb instead, but would need (manually coded)
1699     * ssse3 intrinsic (llvm won't do non-constant shuffles).
1700     */
1701    struct gallivm_state *gallivm = ivec_bld->gallivm;
1702    LLVMValueRef sel, sel_f2345, sel_f23, sel_f2, tmpsel, tmp;
1703    LLVMValueRef faceand1, sel_fand1, maxmx0, maxmx1, maxmy0, maxmy1;
1704    LLVMValueRef c2 = lp_build_const_int_vec(gallivm, ivec_bld->type, 2);
1705    LLVMValueRef c3 = lp_build_const_int_vec(gallivm, ivec_bld->type, 3);
1706    LLVMValueRef c4 = lp_build_const_int_vec(gallivm, ivec_bld->type, 4);
1707    LLVMValueRef c5 = lp_build_const_int_vec(gallivm, ivec_bld->type, 5);
1708 
1709    sel = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c5);
1710    tmpsel = lp_build_select(ivec_bld, sel, ivec_bld->zero, ivec_bld->one);
1711    sel_f2345 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, face, ivec_bld->one);
1712    faceand1 = lp_build_and(ivec_bld, face, ivec_bld->one);
1713    tmp = lp_build_add(ivec_bld, faceand1, c4);
1714    next_faces[0] = lp_build_select(ivec_bld, sel_f2345, tmpsel, tmp);
1715    next_faces[1] = lp_build_xor(ivec_bld, next_faces[0], ivec_bld->one);
1716 
1717    tmp = lp_build_andnot(ivec_bld, face, c4);
1718    sel_f23 = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, tmp, ivec_bld->one);
1719    tmp = lp_build_add(ivec_bld, face, c2);
1720    next_faces[3] = lp_build_select(ivec_bld, sel_f23, tmp, c3);
1721    next_faces[2] = lp_build_xor(ivec_bld, next_faces[3], ivec_bld->one);
1722 
1723    /*
1724     * new xcoords (for face 012345):
1725     * x < 0.0  : max   max   t     max-t max  max
1726     * x >= 1.0 : 0     0     max-t t     0    0
1727     * y < 0.0  : max   0     max-s s     s    max-s
1728     * y >= 1.0 : max   0     s     max-s s    max-s
1729     *
1730     * ncx[1] = face & ~4 > 1 ? (face == 2 ? max-t : t) : 0
1731     * ncx[0] = max - ncx[1]
1732     * ncx[3] = face > 1 ? (face & 1 ? max-s : s) : (face & 1) ? 0 : max
1733     * ncx[2] = face & ~4 > 1 ? max - ncx[3] : ncx[3]
1734     */
1735    sel_f2 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, face, c2);
1736    maxmy0 = lp_build_sub(ivec_bld, max_coord, y0);
1737    tmp = lp_build_select(ivec_bld, sel_f2, maxmy0, y0);
1738    next_xcoords[1][0] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
1739    next_xcoords[0][0] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][0]);
1740    maxmy1 = lp_build_sub(ivec_bld, max_coord, y1);
1741    tmp = lp_build_select(ivec_bld, sel_f2, maxmy1, y1);
1742    next_xcoords[1][1] = lp_build_select(ivec_bld, sel_f23, tmp, ivec_bld->zero);
1743    next_xcoords[0][1] = lp_build_sub(ivec_bld, max_coord, next_xcoords[1][1]);
1744 
1745    sel_fand1 = lp_build_cmp(ivec_bld, PIPE_FUNC_EQUAL, faceand1, ivec_bld->one);
1746 
1747    tmpsel = lp_build_select(ivec_bld, sel_fand1, ivec_bld->zero, max_coord);
1748    maxmx0 = lp_build_sub(ivec_bld, max_coord, x0);
1749    tmp = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
1750    next_xcoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
1751    tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][0]);
1752    next_xcoords[2][0] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][0]);
1753    maxmx1 = lp_build_sub(ivec_bld, max_coord, x1);
1754    tmp = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
1755    next_xcoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
1756    tmp = lp_build_sub(ivec_bld, max_coord, next_xcoords[3][1]);
1757    next_xcoords[2][1] = lp_build_select(ivec_bld, sel_f23, tmp, next_xcoords[3][1]);
1758 
1759    /*
1760     * new ycoords (for face 012345):
1761     * x < 0.0  : t     t     0     max   t    t
1762     * x >= 1.0 : t     t     0     max   t    t
1763     * y < 0.0  : max-s s     0     max   max  0
1764     * y >= 1.0 : s     max-s 0     max   0    max
1765     *
1766     * ncy[0] = face & ~4 > 1 ? (face == 2 ? 0 : max) : t
1767     * ncy[1] = ncy[0]
1768     * ncy[3] = face > 1 ? (face & 1 ? max : 0) : (face & 1) ? max-s : max
1769     * ncx[2] = face & ~4 > 1 ? max - ncx[3] : ncx[3]
1770     */
1771    tmp = lp_build_select(ivec_bld, sel_f2, ivec_bld->zero, max_coord);
1772    next_ycoords[0][0] = lp_build_select(ivec_bld, sel_f23, tmp, y0);
1773    next_ycoords[1][0] = next_ycoords[0][0];
1774    next_ycoords[0][1] = lp_build_select(ivec_bld, sel_f23, tmp, y1);
1775    next_ycoords[1][1] = next_ycoords[0][1];
1776 
1777    tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx0, x0);
1778    tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
1779    next_ycoords[3][0] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
1780    tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][0]);
1781    next_ycoords[2][0] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][0], tmp);
1782    tmpsel = lp_build_select(ivec_bld, sel_fand1, maxmx1, x1);
1783    tmp = lp_build_select(ivec_bld, sel_fand1, max_coord, ivec_bld->zero);
1784    next_ycoords[3][1] = lp_build_select(ivec_bld, sel_f2345, tmp, tmpsel);
1785    tmp = lp_build_sub(ivec_bld, max_coord, next_ycoords[3][1]);
1786    next_ycoords[2][1] = lp_build_select(ivec_bld, sel_f23, next_ycoords[3][1], tmp);
1787 }
1788 
1789 
1790 /** Helper used by lp_build_cube_lookup() */
1791 static LLVMValueRef
lp_build_cube_imapos(struct lp_build_context * coord_bld,LLVMValueRef coord)1792 lp_build_cube_imapos(struct lp_build_context *coord_bld, LLVMValueRef coord)
1793 {
1794    /* ima = +0.5 / abs(coord); */
1795    LLVMValueRef posHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5);
1796    LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
1797    /* avoid div by zero */
1798    LLVMValueRef sel = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, absCoord, coord_bld->zero);
1799    LLVMValueRef div = lp_build_div(coord_bld, posHalf, absCoord);
1800    LLVMValueRef ima = lp_build_select(coord_bld, sel, div, coord_bld->zero);
1801    return ima;
1802 }
1803 
1804 
1805 /** Helper for doing 3-wise selection.
1806  * Returns sel1 ? val2 : (sel0 ? val0 : val1).
1807  */
1808 static LLVMValueRef
lp_build_select3(struct lp_build_context * sel_bld,LLVMValueRef sel0,LLVMValueRef sel1,LLVMValueRef val0,LLVMValueRef val1,LLVMValueRef val2)1809 lp_build_select3(struct lp_build_context *sel_bld,
1810                  LLVMValueRef sel0,
1811                  LLVMValueRef sel1,
1812                  LLVMValueRef val0,
1813                  LLVMValueRef val1,
1814                  LLVMValueRef val2)
1815 {
1816    LLVMValueRef tmp = lp_build_select(sel_bld, sel0, val0, val1);
1817    return lp_build_select(sel_bld, sel1, val2, tmp);
1818 }
1819 
1820 
1821 /**
1822  * Generate code to do cube face selection and compute per-face texcoords.
1823  */
1824 void
lp_build_cube_lookup(struct lp_build_sample_context * bld,LLVMValueRef * coords,const struct lp_derivatives * derivs_in,struct lp_derivatives * derivs_out,bool need_derivs)1825 lp_build_cube_lookup(struct lp_build_sample_context *bld,
1826                      LLVMValueRef *coords,
1827                      const struct lp_derivatives *derivs_in, /* optional */
1828                      struct lp_derivatives *derivs_out, /* optional */
1829                      bool need_derivs)
1830 {
1831    struct lp_build_context *coord_bld = &bld->coord_bld;
1832    LLVMBuilderRef builder = bld->gallivm->builder;
1833    struct gallivm_state *gallivm = bld->gallivm;
1834    LLVMValueRef si, ti, ri;
1835 
1836    /*
1837     * Do per-pixel face selection. We cannot however (as we used to do)
1838     * simply calculate the derivs afterwards (which is very bogus for
1839     * explicit derivs btw) because the values would be "random" when
1840     * not all pixels lie on the same face.
1841     */
1842    struct lp_build_context *cint_bld = &bld->int_coord_bld;
1843    struct lp_type intctype = cint_bld->type;
1844    LLVMTypeRef coord_vec_type = coord_bld->vec_type;
1845    LLVMTypeRef cint_vec_type = cint_bld->vec_type;
1846    LLVMValueRef as, at, ar, face, face_s, face_t;
1847    LLVMValueRef as_ge_at, maxasat, ar_ge_as_at;
1848    LLVMValueRef snewx, tnewx, snewy, tnewy, snewz, tnewz;
1849    LLVMValueRef tnegi, rnegi;
1850    LLVMValueRef ma, mai, signma, signmabit, imahalfpos;
1851    LLVMValueRef posHalf = lp_build_const_vec(gallivm, coord_bld->type, 0.5);
1852    LLVMValueRef signmask = lp_build_const_int_vec(gallivm, intctype,
1853                                                   1LL << (intctype.width - 1));
1854    LLVMValueRef signshift = lp_build_const_int_vec(gallivm, intctype,
1855                                                    intctype.width -1);
1856    LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
1857    LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
1858    LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
1859    LLVMValueRef s = coords[0];
1860    LLVMValueRef t = coords[1];
1861    LLVMValueRef r = coords[2];
1862 
1863    assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
1864    assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
1865    assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
1866 
1867    /*
1868     * get absolute value (for x/y/z face selection) and sign bit
1869     * (for mirroring minor coords and pos/neg face selection)
1870     * of the original coords.
1871     */
1872    as = lp_build_abs(&bld->coord_bld, s);
1873    at = lp_build_abs(&bld->coord_bld, t);
1874    ar = lp_build_abs(&bld->coord_bld, r);
1875 
1876    /*
1877     * major face determination: select x if x > y else select y
1878     * select z if z >= max(x,y) else select previous result
1879     * if some axis are the same we chose z over y, y over x - the
1880     * dx10 spec seems to ask for it while OpenGL doesn't care (if we
1881     * wouldn't care could save a select or two if using different
1882     * compares and doing at_g_as_ar last since tnewx and tnewz are the
1883     * same).
1884     */
1885    as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
1886    maxasat = lp_build_max(coord_bld, as, at);
1887    ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
1888 
1889    if (need_derivs) {
1890       /*
1891        * XXX: This is really really complex.
1892        * It is a bit overkill to use this for implicit derivatives as well,
1893        * no way this is worth the cost in practice, but seems to be the
1894        * only way for getting accurate and per-pixel lod values.
1895        */
1896       LLVMValueRef ima, imahalf, tmp, ddx[3], ddy[3];
1897       LLVMValueRef madx, mady, madxdivma, madydivma;
1898       LLVMValueRef sdxi, tdxi, rdxi, sdyi, tdyi, rdyi;
1899       LLVMValueRef tdxnegi, rdxnegi, tdynegi, rdynegi;
1900       LLVMValueRef sdxnewx, sdxnewy, sdxnewz, tdxnewx, tdxnewy, tdxnewz;
1901       LLVMValueRef sdynewx, sdynewy, sdynewz, tdynewx, tdynewy, tdynewz;
1902       LLVMValueRef face_sdx, face_tdx, face_sdy, face_tdy;
1903       /*
1904        * s = 1/2 * (sc / ma + 1)
1905        * t = 1/2 * (tc / ma + 1)
1906        *
1907        * s' = 1/2 * (sc' * ma - sc * ma') / ma^2
1908        * t' = 1/2 * (tc' * ma - tc * ma') / ma^2
1909        *
1910        * dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma
1911        * dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma
1912        * dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma
1913        * dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma
1914        */
1915 
1916       /* select ma, calculate ima */
1917       ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
1918       mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
1919       signmabit = LLVMBuildAnd(builder, mai, signmask, "");
1920       ima = lp_build_div(coord_bld, coord_bld->one, ma);
1921       imahalf = lp_build_mul(coord_bld, posHalf, ima);
1922       imahalfpos = lp_build_abs(coord_bld, imahalf);
1923 
1924       if (!derivs_in) {
1925          ddx[0] = lp_build_ddx(coord_bld, s);
1926          ddx[1] = lp_build_ddx(coord_bld, t);
1927          ddx[2] = lp_build_ddx(coord_bld, r);
1928          ddy[0] = lp_build_ddy(coord_bld, s);
1929          ddy[1] = lp_build_ddy(coord_bld, t);
1930          ddy[2] = lp_build_ddy(coord_bld, r);
1931       } else {
1932          ddx[0] = derivs_in->ddx[0];
1933          ddx[1] = derivs_in->ddx[1];
1934          ddx[2] = derivs_in->ddx[2];
1935          ddy[0] = derivs_in->ddy[0];
1936          ddy[1] = derivs_in->ddy[1];
1937          ddy[2] = derivs_in->ddy[2];
1938       }
1939 
1940       /* select major derivatives */
1941       madx = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddx[0], ddx[1], ddx[2]);
1942       mady = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, ddy[0], ddy[1], ddy[2]);
1943 
1944       si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
1945       ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
1946       ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
1947 
1948       sdxi = LLVMBuildBitCast(builder, ddx[0], cint_vec_type, "");
1949       tdxi = LLVMBuildBitCast(builder, ddx[1], cint_vec_type, "");
1950       rdxi = LLVMBuildBitCast(builder, ddx[2], cint_vec_type, "");
1951 
1952       sdyi = LLVMBuildBitCast(builder, ddy[0], cint_vec_type, "");
1953       tdyi = LLVMBuildBitCast(builder, ddy[1], cint_vec_type, "");
1954       rdyi = LLVMBuildBitCast(builder, ddy[2], cint_vec_type, "");
1955 
1956       /*
1957        * compute all possible new s/t coords, which does the mirroring,
1958        * and do the same for derivs minor axes.
1959        * snewx = signma * -r;
1960        * tnewx = -t;
1961        * snewy = s;
1962        * tnewy = signma * r;
1963        * snewz = signma * s;
1964        * tnewz = -t;
1965        */
1966       tnegi = LLVMBuildXor(builder, ti, signmask, "");
1967       rnegi = LLVMBuildXor(builder, ri, signmask, "");
1968       tdxnegi = LLVMBuildXor(builder, tdxi, signmask, "");
1969       rdxnegi = LLVMBuildXor(builder, rdxi, signmask, "");
1970       tdynegi = LLVMBuildXor(builder, tdyi, signmask, "");
1971       rdynegi = LLVMBuildXor(builder, rdyi, signmask, "");
1972 
1973       snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
1974       tnewx = tnegi;
1975       sdxnewx = LLVMBuildXor(builder, signmabit, rdxnegi, "");
1976       tdxnewx = tdxnegi;
1977       sdynewx = LLVMBuildXor(builder, signmabit, rdynegi, "");
1978       tdynewx = tdynegi;
1979 
1980       snewy = si;
1981       tnewy = LLVMBuildXor(builder, signmabit, ri, "");
1982       sdxnewy = sdxi;
1983       tdxnewy = LLVMBuildXor(builder, signmabit, rdxi, "");
1984       sdynewy = sdyi;
1985       tdynewy = LLVMBuildXor(builder, signmabit, rdyi, "");
1986 
1987       snewz = LLVMBuildXor(builder, signmabit, si, "");
1988       tnewz = tnegi;
1989       sdxnewz = LLVMBuildXor(builder, signmabit, sdxi, "");
1990       tdxnewz = tdxnegi;
1991       sdynewz = LLVMBuildXor(builder, signmabit, sdyi, "");
1992       tdynewz = tdynegi;
1993 
1994       /* select the mirrored values */
1995       face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
1996       face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
1997       face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
1998       face_sdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdxnewx, sdxnewy, sdxnewz);
1999       face_tdx = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdxnewx, tdxnewy, tdxnewz);
2000       face_sdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, sdynewx, sdynewy, sdynewz);
2001       face_tdy = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tdynewx, tdynewy, tdynewz);
2002 
2003       face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
2004       face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
2005       face_sdx = LLVMBuildBitCast(builder, face_sdx, coord_vec_type, "");
2006       face_tdx = LLVMBuildBitCast(builder, face_tdx, coord_vec_type, "");
2007       face_sdy = LLVMBuildBitCast(builder, face_sdy, coord_vec_type, "");
2008       face_tdy = LLVMBuildBitCast(builder, face_tdy, coord_vec_type, "");
2009 
2010       /* deriv math, dx.s = 0.5 * (dx.sc - sc * dx.ma / ma) / ma */
2011       madxdivma = lp_build_mul(coord_bld, madx, ima);
2012       tmp = lp_build_mul(coord_bld, madxdivma, face_s);
2013       tmp = lp_build_sub(coord_bld, face_sdx, tmp);
2014       derivs_out->ddx[0] = lp_build_mul(coord_bld, tmp, imahalf);
2015 
2016       /* dx.t = 0.5 * (dx.tc - tc * dx.ma / ma) / ma */
2017       tmp = lp_build_mul(coord_bld, madxdivma, face_t);
2018       tmp = lp_build_sub(coord_bld, face_tdx, tmp);
2019       derivs_out->ddx[1] = lp_build_mul(coord_bld, tmp, imahalf);
2020 
2021       /* dy.s = 0.5 * (dy.sc - sc * dy.ma / ma) / ma */
2022       madydivma = lp_build_mul(coord_bld, mady, ima);
2023       tmp = lp_build_mul(coord_bld, madydivma, face_s);
2024       tmp = lp_build_sub(coord_bld, face_sdy, tmp);
2025       derivs_out->ddy[0] = lp_build_mul(coord_bld, tmp, imahalf);
2026 
2027       /* dy.t = 0.5 * (dy.tc - tc * dy.ma / ma) / ma */
2028       tmp = lp_build_mul(coord_bld, madydivma, face_t);
2029       tmp = lp_build_sub(coord_bld, face_tdy, tmp);
2030       derivs_out->ddy[1] = lp_build_mul(coord_bld, tmp, imahalf);
2031 
2032       signma = LLVMBuildLShr(builder, mai, signshift, "");
2033       coords[2] = LLVMBuildOr(builder, face, signma, "face");
2034 
2035       /* project coords */
2036       face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
2037       face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
2038 
2039       coords[0] = lp_build_add(coord_bld, face_s, posHalf);
2040       coords[1] = lp_build_add(coord_bld, face_t, posHalf);
2041 
2042       return;
2043    }
2044 
2045    ma = lp_build_select3(coord_bld, as_ge_at, ar_ge_as_at, s, t, r);
2046    mai = LLVMBuildBitCast(builder, ma, cint_vec_type, "");
2047    signmabit = LLVMBuildAnd(builder, mai, signmask, "");
2048 
2049    si = LLVMBuildBitCast(builder, s, cint_vec_type, "");
2050    ti = LLVMBuildBitCast(builder, t, cint_vec_type, "");
2051    ri = LLVMBuildBitCast(builder, r, cint_vec_type, "");
2052 
2053    /*
2054     * compute all possible new s/t coords, which does the mirroring
2055     * snewx = signma * -r;
2056     * tnewx = -t;
2057     * snewy = s;
2058     * tnewy = signma * r;
2059     * snewz = signma * s;
2060     * tnewz = -t;
2061     */
2062    tnegi = LLVMBuildXor(builder, ti, signmask, "");
2063    rnegi = LLVMBuildXor(builder, ri, signmask, "");
2064 
2065    snewx = LLVMBuildXor(builder, signmabit, rnegi, "");
2066    tnewx = tnegi;
2067 
2068    snewy = si;
2069    tnewy = LLVMBuildXor(builder, signmabit, ri, "");
2070 
2071    snewz = LLVMBuildXor(builder, signmabit, si, "");
2072    tnewz = tnegi;
2073 
2074    /* select the mirrored values */
2075    face_s = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, snewx, snewy, snewz);
2076    face_t = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, tnewx, tnewy, tnewz);
2077    face = lp_build_select3(cint_bld, as_ge_at, ar_ge_as_at, facex, facey, facez);
2078 
2079    face_s = LLVMBuildBitCast(builder, face_s, coord_vec_type, "");
2080    face_t = LLVMBuildBitCast(builder, face_t, coord_vec_type, "");
2081 
2082    /* add +1 for neg face */
2083    /* XXX with AVX probably want to use another select here -
2084     * as long as we ensure vblendvps gets used we can actually
2085     * skip the comparison and just use sign as a "mask" directly.
2086     */
2087    signma = LLVMBuildLShr(builder, mai, signshift, "");
2088    coords[2] = LLVMBuildOr(builder, face, signma, "face");
2089 
2090    /* project coords */
2091    imahalfpos = lp_build_cube_imapos(coord_bld, ma);
2092    face_s = lp_build_mul(coord_bld, face_s, imahalfpos);
2093    face_t = lp_build_mul(coord_bld, face_t, imahalfpos);
2094 
2095    coords[0] = lp_build_add(coord_bld, face_s, posHalf);
2096    coords[1] = lp_build_add(coord_bld, face_t, posHalf);
2097 }
2098 
2099 
2100 /**
2101  * Compute the partial offset of a pixel block along an arbitrary axis.
2102  *
2103  * @param coord   coordinate in pixels
2104  * @param stride  number of bytes between rows of successive pixel blocks
2105  * @param block_length  number of pixels in a pixels block along the coordinate
2106  *                      axis
2107  * @param out_offset    resulting relative offset of the pixel block in bytes
2108  * @param out_subcoord  resulting sub-block pixel coordinate
2109  */
2110 void
lp_build_sample_partial_offset(struct lp_build_context * bld,unsigned block_length,LLVMValueRef coord,LLVMValueRef stride,LLVMValueRef * out_offset,LLVMValueRef * out_subcoord)2111 lp_build_sample_partial_offset(struct lp_build_context *bld,
2112                                unsigned block_length,
2113                                LLVMValueRef coord,
2114                                LLVMValueRef stride,
2115                                LLVMValueRef *out_offset,
2116                                LLVMValueRef *out_subcoord)
2117 {
2118    LLVMBuilderRef builder = bld->gallivm->builder;
2119    LLVMValueRef offset;
2120    LLVMValueRef subcoord;
2121 
2122    if (block_length == 1) {
2123       subcoord = bld->zero;
2124    } else {
2125       /*
2126        * Pixel blocks have power of two dimensions. LLVM should convert the
2127        * rem/div to bit arithmetic.
2128        * TODO: Verify this.
2129        * It does indeed BUT it does transform it to scalar (and back) when doing so
2130        * (using roughly extract, shift/and, mov, unpack) (llvm 2.7).
2131        * The generated code looks seriously unfunny and is quite expensive.
2132        */
2133 #if 0
2134       LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length);
2135       subcoord = LLVMBuildURem(builder, coord, block_width, "");
2136       coord    = LLVMBuildUDiv(builder, coord, block_width, "");
2137 #else
2138       unsigned logbase2 = util_logbase2(block_length);
2139       LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type, logbase2);
2140       LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type, block_length - 1);
2141       subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
2142       coord = LLVMBuildLShr(builder, coord, block_shift, "");
2143 #endif
2144    }
2145 
2146    offset = lp_build_mul(bld, coord, stride);
2147 
2148    assert(out_offset);
2149    assert(out_subcoord);
2150 
2151    *out_offset = offset;
2152    *out_subcoord = subcoord;
2153 }
2154 
2155 
2156 /**
2157  * Compute the offset of a pixel block.
2158  *
2159  * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
2160  *
2161  * Returns the relative offset and i,j sub-block coordinates
2162  */
2163 void
lp_build_sample_offset(struct lp_build_context * bld,const struct util_format_description * format_desc,LLVMValueRef x,LLVMValueRef y,LLVMValueRef z,LLVMValueRef y_stride,LLVMValueRef z_stride,LLVMValueRef * out_offset,LLVMValueRef * out_i,LLVMValueRef * out_j)2164 lp_build_sample_offset(struct lp_build_context *bld,
2165                        const struct util_format_description *format_desc,
2166                        LLVMValueRef x,
2167                        LLVMValueRef y,
2168                        LLVMValueRef z,
2169                        LLVMValueRef y_stride,
2170                        LLVMValueRef z_stride,
2171                        LLVMValueRef *out_offset,
2172                        LLVMValueRef *out_i,
2173                        LLVMValueRef *out_j)
2174 {
2175    LLVMValueRef x_stride;
2176    LLVMValueRef offset;
2177 
2178    x_stride = lp_build_const_vec(bld->gallivm, bld->type,
2179                                  format_desc->block.bits/8);
2180 
2181    lp_build_sample_partial_offset(bld,
2182                                   format_desc->block.width,
2183                                   x, x_stride,
2184                                   &offset, out_i);
2185 
2186    if (y && y_stride) {
2187       LLVMValueRef y_offset;
2188       lp_build_sample_partial_offset(bld,
2189                                      format_desc->block.height,
2190                                      y, y_stride,
2191                                      &y_offset, out_j);
2192       offset = lp_build_add(bld, offset, y_offset);
2193    } else {
2194       *out_j = bld->zero;
2195    }
2196 
2197    if (z && z_stride) {
2198       LLVMValueRef z_offset;
2199       LLVMValueRef k;
2200       lp_build_sample_partial_offset(bld,
2201                                      1, /* pixel blocks are always 2D */
2202                                      z, z_stride,
2203                                      &z_offset, &k);
2204       offset = lp_build_add(bld, offset, z_offset);
2205    }
2206 
2207    *out_offset = offset;
2208 }
2209 
2210 
2211 
2212 void
lp_build_tiled_sample_offset(struct lp_build_context * bld,enum pipe_format format,const struct lp_static_texture_state * static_texture_state,LLVMValueRef x,LLVMValueRef y,LLVMValueRef z,LLVMValueRef width,LLVMValueRef height,LLVMValueRef z_stride,LLVMValueRef * out_offset,LLVMValueRef * out_i,LLVMValueRef * out_j)2213 lp_build_tiled_sample_offset(struct lp_build_context *bld,
2214                              enum pipe_format format,
2215                              const struct lp_static_texture_state *static_texture_state,
2216                              LLVMValueRef x,
2217                              LLVMValueRef y,
2218                              LLVMValueRef z,
2219                              LLVMValueRef width,
2220                              LLVMValueRef height,
2221                              LLVMValueRef z_stride,
2222                              LLVMValueRef *out_offset,
2223                              LLVMValueRef *out_i,
2224                              LLVMValueRef *out_j)
2225 {
2226    struct gallivm_state *gallivm = bld->gallivm;
2227    LLVMBuilderRef builder = gallivm->builder;
2228 
2229    assert(static_texture_state->tiled);
2230 
2231    uint32_t res_dimensions = 1;
2232    switch (static_texture_state->res_target) {
2233    case PIPE_TEXTURE_2D:
2234    case PIPE_TEXTURE_CUBE:
2235    case PIPE_TEXTURE_RECT:
2236    case PIPE_TEXTURE_2D_ARRAY:
2237       res_dimensions = 2;
2238       break;
2239    case PIPE_TEXTURE_3D:
2240       res_dimensions = 3;
2241       break;
2242    default:
2243       break;
2244    }
2245 
2246    uint32_t dimensions = 1;
2247    switch (static_texture_state->target) {
2248    case PIPE_TEXTURE_2D:
2249    case PIPE_TEXTURE_CUBE:
2250    case PIPE_TEXTURE_RECT:
2251    case PIPE_TEXTURE_2D_ARRAY:
2252       dimensions = 2;
2253       break;
2254    case PIPE_TEXTURE_3D:
2255       dimensions = 3;
2256       break;
2257    default:
2258       break;
2259    }
2260 
2261    uint32_t block_size[3] = {
2262       util_format_get_blockwidth(format),
2263       util_format_get_blockheight(format),
2264       util_format_get_blockdepth(format),
2265    };
2266 
2267    uint32_t sparse_tile_size[3] = {
2268       util_format_get_tilesize(format, res_dimensions, static_texture_state->tiled_samples, 0) * block_size[0],
2269       util_format_get_tilesize(format, res_dimensions, static_texture_state->tiled_samples, 1) * block_size[1],
2270       util_format_get_tilesize(format, res_dimensions, static_texture_state->tiled_samples, 2) * block_size[2],
2271    };
2272 
2273    LLVMValueRef sparse_tile_size_log2[3] = {
2274       lp_build_const_vec(gallivm, bld->type, util_logbase2(sparse_tile_size[0])),
2275       lp_build_const_vec(gallivm, bld->type, util_logbase2(sparse_tile_size[1])),
2276       lp_build_const_vec(gallivm, bld->type, util_logbase2(sparse_tile_size[2])),
2277    };
2278 
2279    LLVMValueRef tile_index = LLVMBuildLShr(builder, x, sparse_tile_size_log2[0], "");
2280 
2281    if (y && dimensions > 1) {
2282       LLVMValueRef x_tile_count = lp_build_add(bld, width, lp_build_const_vec(gallivm, bld->type, sparse_tile_size[0] - 1));
2283       x_tile_count = LLVMBuildLShr(builder, x_tile_count, sparse_tile_size_log2[0], "");
2284       LLVMValueRef y_tile = LLVMBuildLShr(builder, y, sparse_tile_size_log2[1], "");
2285       tile_index = lp_build_add(bld, tile_index, lp_build_mul(bld, y_tile, x_tile_count));
2286 
2287       if (z && dimensions > 2) {
2288          LLVMValueRef y_tile_count = lp_build_add(bld, height, lp_build_const_vec(gallivm, bld->type, sparse_tile_size[1] - 1));
2289          y_tile_count = LLVMBuildLShr(builder, y_tile_count, sparse_tile_size_log2[1], "");
2290          LLVMValueRef z_tile = LLVMBuildLShr(builder, z, sparse_tile_size_log2[2], "");
2291          tile_index = lp_build_add(bld, tile_index, lp_build_mul(bld, z_tile, lp_build_mul(bld, x_tile_count, y_tile_count)));
2292       }
2293    }
2294 
2295    LLVMValueRef offset = LLVMBuildShl(builder, tile_index, lp_build_const_vec(gallivm, bld->type, 16), "");
2296 
2297    LLVMValueRef sparse_tile_masks[3] = {
2298       lp_build_const_vec(gallivm, bld->type, sparse_tile_size[0] - 1),
2299       lp_build_const_vec(gallivm, bld->type, sparse_tile_size[1] - 1),
2300       lp_build_const_vec(gallivm, bld->type, sparse_tile_size[2] - 1),
2301    };
2302 
2303    x = LLVMBuildAnd(builder, x, sparse_tile_masks[0], "");
2304    LLVMValueRef x_stride = lp_build_const_vec(gallivm, bld->type, util_format_get_blocksize(format));
2305 
2306    LLVMValueRef x_offset;
2307    lp_build_sample_partial_offset(bld, block_size[0],
2308                                   x, x_stride, &x_offset, out_i);
2309    offset = lp_build_add(bld, offset, x_offset);
2310 
2311    if (y && dimensions > 1) {
2312       y = LLVMBuildAnd(builder, y, sparse_tile_masks[1], "");
2313       LLVMValueRef y_stride = lp_build_const_vec(gallivm, bld->type, util_format_get_blocksize(format) *
2314                                                  sparse_tile_size[0] / block_size[0]);
2315 
2316       LLVMValueRef y_offset;
2317       lp_build_sample_partial_offset(bld, block_size[1],
2318                                      y, y_stride, &y_offset, out_j);
2319       offset = lp_build_add(bld, offset, y_offset);
2320    } else {
2321       *out_j = bld->zero;
2322    }
2323 
2324    if (z && (z_stride || dimensions > 2)) {
2325       if (dimensions > 2) {
2326          z = LLVMBuildAnd(builder, z, sparse_tile_masks[2], "");
2327          z_stride = lp_build_const_vec(gallivm, bld->type, util_format_get_blocksize(format) *
2328                                        sparse_tile_size[0] / block_size[0] *
2329                                        sparse_tile_size[1] / block_size[1]);
2330       }
2331 
2332       LLVMValueRef z_offset;
2333       LLVMValueRef k;
2334       lp_build_sample_partial_offset(bld, 1, z, z_stride, &z_offset, &k);
2335       offset = lp_build_add(bld, offset, z_offset);
2336    }
2337 
2338    *out_offset = offset;
2339 }
2340 
2341 
2342 static LLVMValueRef
lp_build_sample_min(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef v0,LLVMValueRef v1)2343 lp_build_sample_min(struct lp_build_context *bld,
2344                     LLVMValueRef x,
2345                     LLVMValueRef v0,
2346                     LLVMValueRef v1)
2347 {
2348    /* if the incoming LERP weight is 0 then the min/max
2349     * should ignore that value. */
2350    LLVMValueRef mask = lp_build_compare(bld->gallivm,
2351                                         bld->type,
2352                                         PIPE_FUNC_NOTEQUAL,
2353                                         x, bld->zero);
2354    LLVMValueRef min = lp_build_min(bld, v0, v1);
2355 
2356    return lp_build_select(bld, mask, min, v0);
2357 }
2358 
2359 
2360 static LLVMValueRef
lp_build_sample_max(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef v0,LLVMValueRef v1)2361 lp_build_sample_max(struct lp_build_context *bld,
2362                     LLVMValueRef x,
2363                     LLVMValueRef v0,
2364                     LLVMValueRef v1)
2365 {
2366    /* if the incoming LERP weight is 0 then the min/max
2367     * should ignore that value. */
2368    LLVMValueRef mask = lp_build_compare(bld->gallivm,
2369                                         bld->type,
2370                                         PIPE_FUNC_NOTEQUAL,
2371                                         x, bld->zero);
2372    LLVMValueRef max = lp_build_max(bld, v0, v1);
2373 
2374    return lp_build_select(bld, mask, max, v0);
2375 }
2376 
2377 
2378 static LLVMValueRef
lp_build_sample_min_2d(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef y,LLVMValueRef a,LLVMValueRef b,LLVMValueRef c,LLVMValueRef d)2379 lp_build_sample_min_2d(struct lp_build_context *bld,
2380                        LLVMValueRef x,
2381                        LLVMValueRef y,
2382                        LLVMValueRef a,
2383                        LLVMValueRef b,
2384                        LLVMValueRef c,
2385                        LLVMValueRef d)
2386 {
2387    LLVMValueRef v0 = lp_build_sample_min(bld, x, a, b);
2388    LLVMValueRef v1 = lp_build_sample_min(bld, x, c, d);
2389    return lp_build_sample_min(bld, y, v0, v1);
2390 }
2391 
2392 
2393 static LLVMValueRef
lp_build_sample_max_2d(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef y,LLVMValueRef a,LLVMValueRef b,LLVMValueRef c,LLVMValueRef d)2394 lp_build_sample_max_2d(struct lp_build_context *bld,
2395                        LLVMValueRef x,
2396                        LLVMValueRef y,
2397                        LLVMValueRef a,
2398                        LLVMValueRef b,
2399                        LLVMValueRef c,
2400                        LLVMValueRef d)
2401 {
2402    LLVMValueRef v0 = lp_build_sample_max(bld, x, a, b);
2403    LLVMValueRef v1 = lp_build_sample_max(bld, x, c, d);
2404    return lp_build_sample_max(bld, y, v0, v1);
2405 }
2406 
2407 
2408 static LLVMValueRef
lp_build_sample_min_3d(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef y,LLVMValueRef z,LLVMValueRef a,LLVMValueRef b,LLVMValueRef c,LLVMValueRef d,LLVMValueRef e,LLVMValueRef f,LLVMValueRef g,LLVMValueRef h)2409 lp_build_sample_min_3d(struct lp_build_context *bld,
2410                 LLVMValueRef x,
2411                 LLVMValueRef y,
2412                 LLVMValueRef z,
2413                 LLVMValueRef a, LLVMValueRef b,
2414                 LLVMValueRef c, LLVMValueRef d,
2415                 LLVMValueRef e, LLVMValueRef f,
2416                 LLVMValueRef g, LLVMValueRef h)
2417 {
2418    LLVMValueRef v0 = lp_build_sample_min_2d(bld, x, y, a, b, c, d);
2419    LLVMValueRef v1 = lp_build_sample_min_2d(bld, x, y, e, f, g, h);
2420    return lp_build_sample_min(bld, z, v0, v1);
2421 }
2422 
2423 
2424 static LLVMValueRef
lp_build_sample_max_3d(struct lp_build_context * bld,LLVMValueRef x,LLVMValueRef y,LLVMValueRef z,LLVMValueRef a,LLVMValueRef b,LLVMValueRef c,LLVMValueRef d,LLVMValueRef e,LLVMValueRef f,LLVMValueRef g,LLVMValueRef h)2425 lp_build_sample_max_3d(struct lp_build_context *bld,
2426                        LLVMValueRef x,
2427                        LLVMValueRef y,
2428                        LLVMValueRef z,
2429                        LLVMValueRef a, LLVMValueRef b,
2430                        LLVMValueRef c, LLVMValueRef d,
2431                        LLVMValueRef e, LLVMValueRef f,
2432                        LLVMValueRef g, LLVMValueRef h)
2433 {
2434    LLVMValueRef v0 = lp_build_sample_max_2d(bld, x, y, a, b, c, d);
2435    LLVMValueRef v1 = lp_build_sample_max_2d(bld, x, y, e, f, g, h);
2436    return lp_build_sample_max(bld, z, v0, v1);
2437 }
2438 
2439 
2440 void
lp_build_reduce_filter(struct lp_build_context * bld,enum pipe_tex_reduction_mode mode,unsigned flags,unsigned num_chan,LLVMValueRef x,LLVMValueRef * v00,LLVMValueRef * v01,LLVMValueRef * out)2441 lp_build_reduce_filter(struct lp_build_context *bld,
2442                        enum pipe_tex_reduction_mode mode,
2443                        unsigned flags,
2444                        unsigned num_chan,
2445                        LLVMValueRef x,
2446                        LLVMValueRef *v00,
2447                        LLVMValueRef *v01,
2448                        LLVMValueRef *out)
2449 {
2450    unsigned chan;
2451    switch (mode) {
2452    case PIPE_TEX_REDUCTION_MIN:
2453       for (chan = 0; chan < num_chan; chan++)
2454          out[chan] = lp_build_sample_min(bld, x, v00[chan], v01[chan]);
2455       break;
2456    case PIPE_TEX_REDUCTION_MAX:
2457       for (chan = 0; chan < num_chan; chan++)
2458          out[chan] = lp_build_sample_max(bld, x, v00[chan], v01[chan]);
2459       break;
2460    case PIPE_TEX_REDUCTION_WEIGHTED_AVERAGE:
2461    default:
2462       for (chan = 0; chan < num_chan; chan++)
2463          out[chan] = lp_build_lerp(bld, x, v00[chan], v01[chan], flags);
2464       break;
2465    }
2466 }
2467 
2468 
2469 void
lp_build_reduce_filter_2d(struct lp_build_context * bld,enum pipe_tex_reduction_mode mode,unsigned flags,unsigned num_chan,LLVMValueRef x,LLVMValueRef y,LLVMValueRef * v00,LLVMValueRef * v01,LLVMValueRef * v10,LLVMValueRef * v11,LLVMValueRef * out)2470 lp_build_reduce_filter_2d(struct lp_build_context *bld,
2471                           enum pipe_tex_reduction_mode mode,
2472                           unsigned flags,
2473                           unsigned num_chan,
2474                           LLVMValueRef x,
2475                           LLVMValueRef y,
2476                           LLVMValueRef *v00,
2477                           LLVMValueRef *v01,
2478                           LLVMValueRef *v10,
2479                           LLVMValueRef *v11,
2480                           LLVMValueRef *out)
2481 {
2482    switch (mode) {
2483    case PIPE_TEX_REDUCTION_MIN:
2484       for (unsigned chan = 0; chan < num_chan; chan++)
2485          out[chan] = lp_build_sample_min_2d(bld, x, y, v00[chan], v01[chan],
2486                                             v10[chan], v11[chan]);
2487       break;
2488    case PIPE_TEX_REDUCTION_MAX:
2489       for (unsigned chan = 0; chan < num_chan; chan++)
2490          out[chan] = lp_build_sample_max_2d(bld, x, y, v00[chan], v01[chan],
2491                                             v10[chan], v11[chan]);
2492       break;
2493    case PIPE_TEX_REDUCTION_WEIGHTED_AVERAGE:
2494    default:
2495       for (unsigned chan = 0; chan < num_chan; chan++)
2496          out[chan] = lp_build_lerp_2d(bld, x, y, v00[chan], v01[chan],
2497                                       v10[chan], v11[chan], flags);
2498       break;
2499    }
2500 }
2501 
2502 
2503 void
lp_build_reduce_filter_3d(struct lp_build_context * bld,enum pipe_tex_reduction_mode mode,unsigned flags,unsigned num_chan,LLVMValueRef x,LLVMValueRef y,LLVMValueRef z,LLVMValueRef * v000,LLVMValueRef * v001,LLVMValueRef * v010,LLVMValueRef * v011,LLVMValueRef * v100,LLVMValueRef * v101,LLVMValueRef * v110,LLVMValueRef * v111,LLVMValueRef * out)2504 lp_build_reduce_filter_3d(struct lp_build_context *bld,
2505                           enum pipe_tex_reduction_mode mode,
2506                           unsigned flags,
2507                           unsigned num_chan,
2508                           LLVMValueRef x,
2509                           LLVMValueRef y,
2510                           LLVMValueRef z,
2511                           LLVMValueRef *v000,
2512                           LLVMValueRef *v001,
2513                           LLVMValueRef *v010,
2514                           LLVMValueRef *v011,
2515                           LLVMValueRef *v100,
2516                           LLVMValueRef *v101,
2517                           LLVMValueRef *v110,
2518                           LLVMValueRef *v111,
2519                           LLVMValueRef *out)
2520 {
2521    switch (mode) {
2522    case PIPE_TEX_REDUCTION_MIN:
2523       for (unsigned chan = 0; chan < num_chan; chan++)
2524          out[chan] = lp_build_sample_min_3d(bld, x, y, z,
2525                                      v000[chan], v001[chan], v010[chan], v011[chan],
2526                                      v100[chan], v101[chan], v110[chan], v111[chan]);
2527       break;
2528    case PIPE_TEX_REDUCTION_MAX:
2529       for (unsigned chan = 0; chan < num_chan; chan++)
2530          out[chan] = lp_build_sample_max_3d(bld, x, y, z,
2531                                      v000[chan], v001[chan], v010[chan], v011[chan],
2532                                      v100[chan], v101[chan], v110[chan], v111[chan]);
2533       break;
2534    case PIPE_TEX_REDUCTION_WEIGHTED_AVERAGE:
2535    default:
2536       for (unsigned chan = 0; chan < num_chan; chan++)
2537          out[chan] = lp_build_lerp_3d(bld, x, y, z,
2538                                       v000[chan], v001[chan], v010[chan], v011[chan],
2539                                       v100[chan], v101[chan], v110[chan], v111[chan],
2540                                       flags);
2541       break;
2542    }
2543 }
2544 
2545 
2546 /*
2547  * generated from
2548  * const float alpha = 2;
2549  * for (unsigned i = 0; i < WEIGHT_LUT_SIZE; i++) {
2550  *    const float r2 = (float) i / (float) (WEIGHT_LUT_SIZE - 1);
2551  *    const float weight = (float)expf(-alpha * r2);
2552  */
2553 static const float aniso_filter_table[1024] = {
2554    1.000000, 0.998047, 0.996098, 0.994152, 0.992210, 0.990272, 0.988338, 0.986408,
2555    0.984481, 0.982559, 0.980640, 0.978724, 0.976813, 0.974905, 0.973001, 0.971100,
2556    0.969204, 0.967311, 0.965421, 0.963536, 0.961654, 0.959776, 0.957901, 0.956030,
2557    0.954163, 0.952299, 0.950439, 0.948583, 0.946730, 0.944881, 0.943036, 0.941194,
2558    0.939356, 0.937521, 0.935690, 0.933862, 0.932038, 0.930218, 0.928401, 0.926588,
2559    0.924778, 0.922972, 0.921169, 0.919370, 0.917575, 0.915782, 0.913994, 0.912209,
2560    0.910427, 0.908649, 0.906874, 0.905103, 0.903335, 0.901571, 0.899810, 0.898052,
2561    0.896298, 0.894548, 0.892801, 0.891057, 0.889317, 0.887580, 0.885846, 0.884116,
2562    0.882389, 0.880666, 0.878946, 0.877229, 0.875516, 0.873806, 0.872099, 0.870396,
2563    0.868696, 0.866999, 0.865306, 0.863616, 0.861929, 0.860245, 0.858565, 0.856888,
2564    0.855215, 0.853544, 0.851877, 0.850213, 0.848553, 0.846896, 0.845241, 0.843591,
2565    0.841943, 0.840299, 0.838657, 0.837019, 0.835385, 0.833753, 0.832124, 0.830499,
2566    0.828877, 0.827258, 0.825643, 0.824030, 0.822421, 0.820814, 0.819211, 0.817611,
2567    0.816014, 0.814420, 0.812830, 0.811242, 0.809658, 0.808076, 0.806498, 0.804923,
2568    0.803351, 0.801782, 0.800216, 0.798653, 0.797093, 0.795536, 0.793982, 0.792432,
2569    0.790884, 0.789339, 0.787798, 0.786259, 0.784723, 0.783191, 0.781661, 0.780134,
2570    0.778610, 0.777090, 0.775572, 0.774057, 0.772545, 0.771037, 0.769531, 0.768028,
2571    0.766528, 0.765030, 0.763536, 0.762045, 0.760557, 0.759071, 0.757589, 0.756109,
2572    0.754632, 0.753158, 0.751687, 0.750219, 0.748754, 0.747291, 0.745832, 0.744375,
2573    0.742921, 0.741470, 0.740022, 0.738577, 0.737134, 0.735694, 0.734258, 0.732823,
2574    0.731392, 0.729964, 0.728538, 0.727115, 0.725695, 0.724278, 0.722863, 0.721451,
2575    0.720042, 0.718636, 0.717232, 0.715831, 0.714433, 0.713038, 0.711645, 0.710255,
2576    0.708868, 0.707483, 0.706102, 0.704723, 0.703346, 0.701972, 0.700601, 0.699233,
2577    0.697867, 0.696504, 0.695144, 0.693786, 0.692431, 0.691079, 0.689729, 0.688382,
2578    0.687037, 0.685696, 0.684356, 0.683020, 0.681686, 0.680354, 0.679025, 0.677699,
2579    0.676376, 0.675054, 0.673736, 0.672420, 0.671107, 0.669796, 0.668488, 0.667182,
2580    0.665879, 0.664579, 0.663281, 0.661985, 0.660692, 0.659402, 0.658114, 0.656828,
2581    0.655546, 0.654265, 0.652987, 0.651712, 0.650439, 0.649169, 0.647901, 0.646635,
2582    0.645372, 0.644112, 0.642854, 0.641598, 0.640345, 0.639095, 0.637846, 0.636601,
2583    0.635357, 0.634116, 0.632878, 0.631642, 0.630408, 0.629177, 0.627948, 0.626721,
2584    0.625497, 0.624276, 0.623056, 0.621839, 0.620625, 0.619413, 0.618203, 0.616996,
2585    0.615790, 0.614588, 0.613387, 0.612189, 0.610994, 0.609800, 0.608609, 0.607421,
2586    0.606234, 0.605050, 0.603868, 0.602689, 0.601512, 0.600337, 0.599165, 0.597994,
2587    0.596826, 0.595661, 0.594497, 0.593336, 0.592177, 0.591021, 0.589866, 0.588714,
2588    0.587564, 0.586417, 0.585272, 0.584128, 0.582988, 0.581849, 0.580712, 0.579578,
2589    0.578446, 0.577317, 0.576189, 0.575064, 0.573940, 0.572819, 0.571701, 0.570584,
2590    0.569470, 0.568357, 0.567247, 0.566139, 0.565034, 0.563930, 0.562829, 0.561729,
2591    0.560632, 0.559537, 0.558444, 0.557354, 0.556265, 0.555179, 0.554094, 0.553012,
2592    0.551932, 0.550854, 0.549778, 0.548704, 0.547633, 0.546563, 0.545496, 0.544430,
2593    0.543367, 0.542306, 0.541246, 0.540189, 0.539134, 0.538081, 0.537030, 0.535981,
2594    0.534935, 0.533890, 0.532847, 0.531806, 0.530768, 0.529731, 0.528696, 0.527664,
2595    0.526633, 0.525604, 0.524578, 0.523553, 0.522531, 0.521510, 0.520492, 0.519475,
2596    0.518460, 0.517448, 0.516437, 0.515429, 0.514422, 0.513417, 0.512414, 0.511414,
2597    0.510415, 0.509418, 0.508423, 0.507430, 0.506439, 0.505450, 0.504462, 0.503477,
2598    0.502494, 0.501512, 0.500533, 0.499555, 0.498580, 0.497606, 0.496634, 0.495664,
2599    0.494696, 0.493730, 0.492765, 0.491803, 0.490842, 0.489884, 0.488927, 0.487972,
2600    0.487019, 0.486068, 0.485118, 0.484171, 0.483225, 0.482281, 0.481339, 0.480399,
2601    0.479461, 0.478524, 0.477590, 0.476657, 0.475726, 0.474797, 0.473870, 0.472944,
2602    0.472020, 0.471098, 0.470178, 0.469260, 0.468343, 0.467429, 0.466516, 0.465605,
2603    0.464695, 0.463788, 0.462882, 0.461978, 0.461075, 0.460175, 0.459276, 0.458379,
2604    0.457484, 0.456590, 0.455699, 0.454809, 0.453920, 0.453034, 0.452149, 0.451266,
2605    0.450384, 0.449505, 0.448627, 0.447751, 0.446876, 0.446003, 0.445132, 0.444263,
2606    0.443395, 0.442529, 0.441665, 0.440802, 0.439941, 0.439082, 0.438224, 0.437368,
2607    0.436514, 0.435662, 0.434811, 0.433961, 0.433114, 0.432268, 0.431424, 0.430581,
2608    0.429740, 0.428901, 0.428063, 0.427227, 0.426393, 0.425560, 0.424729, 0.423899,
2609    0.423071, 0.422245, 0.421420, 0.420597, 0.419776, 0.418956, 0.418137, 0.417321,
2610    0.416506, 0.415692, 0.414880, 0.414070, 0.413261, 0.412454, 0.411648, 0.410844,
2611    0.410042, 0.409241, 0.408442, 0.407644, 0.406848, 0.406053, 0.405260, 0.404469,
2612    0.403679, 0.402890, 0.402103, 0.401318, 0.400534, 0.399752, 0.398971, 0.398192,
2613    0.397414, 0.396638, 0.395863, 0.395090, 0.394319, 0.393548, 0.392780, 0.392013,
2614    0.391247, 0.390483, 0.389720, 0.388959, 0.388199, 0.387441, 0.386684, 0.385929,
2615    0.385175, 0.384423, 0.383672, 0.382923, 0.382175, 0.381429, 0.380684, 0.379940,
2616    0.379198, 0.378457, 0.377718, 0.376980, 0.376244, 0.375509, 0.374776, 0.374044,
2617    0.373313, 0.372584, 0.371856, 0.371130, 0.370405, 0.369682, 0.368960, 0.368239,
2618    0.367520, 0.366802, 0.366086, 0.365371, 0.364657, 0.363945, 0.363234, 0.362525,
2619    0.361817, 0.361110, 0.360405, 0.359701, 0.358998, 0.358297, 0.357597, 0.356899,
2620    0.356202, 0.355506, 0.354812, 0.354119, 0.353427, 0.352737, 0.352048, 0.351360,
2621    0.350674, 0.349989, 0.349306, 0.348623, 0.347942, 0.347263, 0.346585, 0.345908,
2622    0.345232, 0.344558, 0.343885, 0.343213, 0.342543, 0.341874, 0.341206, 0.340540,
2623    0.339874, 0.339211, 0.338548, 0.337887, 0.337227, 0.336568, 0.335911, 0.335255,
2624    0.334600, 0.333947, 0.333294, 0.332643, 0.331994, 0.331345, 0.330698, 0.330052,
2625    0.329408, 0.328764, 0.328122, 0.327481, 0.326842, 0.326203, 0.325566, 0.324930,
2626    0.324296, 0.323662, 0.323030, 0.322399, 0.321770, 0.321141, 0.320514, 0.319888,
2627    0.319263, 0.318639, 0.318017, 0.317396, 0.316776, 0.316157, 0.315540, 0.314924,
2628    0.314309, 0.313695, 0.313082, 0.312470, 0.311860, 0.311251, 0.310643, 0.310036,
2629    0.309431, 0.308827, 0.308223, 0.307621, 0.307021, 0.306421, 0.305822, 0.305225,
2630    0.304629, 0.304034, 0.303440, 0.302847, 0.302256, 0.301666, 0.301076, 0.300488,
2631    0.299902, 0.299316, 0.298731, 0.298148, 0.297565, 0.296984, 0.296404, 0.295825,
2632    0.295247, 0.294671, 0.294095, 0.293521, 0.292948, 0.292375, 0.291804, 0.291234,
2633    0.290666, 0.290098, 0.289531, 0.288966, 0.288401, 0.287838, 0.287276, 0.286715,
2634    0.286155, 0.285596, 0.285038, 0.284482, 0.283926, 0.283371, 0.282818, 0.282266,
2635    0.281714, 0.281164, 0.280615, 0.280067, 0.279520, 0.278974, 0.278429, 0.277885,
2636    0.277342, 0.276801, 0.276260, 0.275721, 0.275182, 0.274645, 0.274108, 0.273573,
2637    0.273038, 0.272505, 0.271973, 0.271442, 0.270912, 0.270382, 0.269854, 0.269327,
2638    0.268801, 0.268276, 0.267752, 0.267229, 0.266707, 0.266186, 0.265667, 0.265148,
2639    0.264630, 0.264113, 0.263597, 0.263082, 0.262568, 0.262056, 0.261544, 0.261033,
2640    0.260523, 0.260014, 0.259506, 0.259000, 0.258494, 0.257989, 0.257485, 0.256982,
2641    0.256480, 0.255979, 0.255479, 0.254980, 0.254482, 0.253985, 0.253489, 0.252994,
2642    0.252500, 0.252007, 0.251515, 0.251023, 0.250533, 0.250044, 0.249555, 0.249068,
2643    0.248582, 0.248096, 0.247611, 0.247128, 0.246645, 0.246163, 0.245683, 0.245203,
2644    0.244724, 0.244246, 0.243769, 0.243293, 0.242818, 0.242343, 0.241870, 0.241398,
2645    0.240926, 0.240456, 0.239986, 0.239517, 0.239049, 0.238583, 0.238117, 0.237651,
2646    0.237187, 0.236724, 0.236262, 0.235800, 0.235340, 0.234880, 0.234421, 0.233963,
2647    0.233506, 0.233050, 0.232595, 0.232141, 0.231688, 0.231235, 0.230783, 0.230333,
2648    0.229883, 0.229434, 0.228986, 0.228538, 0.228092, 0.227647, 0.227202, 0.226758,
2649    0.226315, 0.225873, 0.225432, 0.224992, 0.224552, 0.224114, 0.223676, 0.223239,
2650    0.222803, 0.222368, 0.221934, 0.221500, 0.221068, 0.220636, 0.220205, 0.219775,
2651    0.219346, 0.218917, 0.218490, 0.218063, 0.217637, 0.217212, 0.216788, 0.216364,
2652    0.215942, 0.215520, 0.215099, 0.214679, 0.214260, 0.213841, 0.213423, 0.213007,
2653    0.212591, 0.212175, 0.211761, 0.211347, 0.210935, 0.210523, 0.210111, 0.209701,
2654    0.209291, 0.208883, 0.208475, 0.208068, 0.207661, 0.207256, 0.206851, 0.206447,
2655    0.206044, 0.205641, 0.205239, 0.204839, 0.204439, 0.204039, 0.203641, 0.203243,
2656    0.202846, 0.202450, 0.202054, 0.201660, 0.201266, 0.200873, 0.200481, 0.200089,
2657    0.199698, 0.199308, 0.198919, 0.198530, 0.198143, 0.197756, 0.197369, 0.196984,
2658    0.196599, 0.196215, 0.195832, 0.195449, 0.195068, 0.194687, 0.194306, 0.193927,
2659    0.193548, 0.193170, 0.192793, 0.192416, 0.192041, 0.191665, 0.191291, 0.190917,
2660    0.190545, 0.190172, 0.189801, 0.189430, 0.189060, 0.188691, 0.188323, 0.187955,
2661    0.187588, 0.187221, 0.186856, 0.186491, 0.186126, 0.185763, 0.185400, 0.185038,
2662    0.184676, 0.184316, 0.183956, 0.183597, 0.183238, 0.182880, 0.182523, 0.182166,
2663    0.181811, 0.181455, 0.181101, 0.180747, 0.180394, 0.180042, 0.179690, 0.179339,
2664    0.178989, 0.178640, 0.178291, 0.177942, 0.177595, 0.177248, 0.176902, 0.176556,
2665    0.176211, 0.175867, 0.175524, 0.175181, 0.174839, 0.174497, 0.174157, 0.173816,
2666    0.173477, 0.173138, 0.172800, 0.172462, 0.172126, 0.171789, 0.171454, 0.171119,
2667    0.170785, 0.170451, 0.170118, 0.169786, 0.169454, 0.169124, 0.168793, 0.168463,
2668    0.168134, 0.167806, 0.167478, 0.167151, 0.166825, 0.166499, 0.166174, 0.165849,
2669    0.165525, 0.165202, 0.164879, 0.164557, 0.164236, 0.163915, 0.163595, 0.163275,
2670    0.162957, 0.162638, 0.162321, 0.162004, 0.161687, 0.161371, 0.161056, 0.160742,
2671    0.160428, 0.160114, 0.159802, 0.159489, 0.159178, 0.158867, 0.158557, 0.158247,
2672    0.157938, 0.157630, 0.157322, 0.157014, 0.156708, 0.156402, 0.156096, 0.155791,
2673    0.155487, 0.155183, 0.154880, 0.154578, 0.154276, 0.153975, 0.153674, 0.153374,
2674    0.153074, 0.152775, 0.152477, 0.152179, 0.151882, 0.151585, 0.151289, 0.150994,
2675    0.150699, 0.150404, 0.150111, 0.149817, 0.149525, 0.149233, 0.148941, 0.148650,
2676    0.148360, 0.148070, 0.147781, 0.147492, 0.147204, 0.146917, 0.146630, 0.146344,
2677    0.146058, 0.145772, 0.145488, 0.145204, 0.144920, 0.144637, 0.144354, 0.144072,
2678    0.143791, 0.143510, 0.143230, 0.142950, 0.142671, 0.142392, 0.142114, 0.141837,
2679    0.141560, 0.141283, 0.141007, 0.140732, 0.140457, 0.140183, 0.139909, 0.139636,
2680    0.139363, 0.139091, 0.138819, 0.138548, 0.138277, 0.138007, 0.137738, 0.137469,
2681    0.137200, 0.136932, 0.136665, 0.136398, 0.136131, 0.135865, 0.135600, 0.135335,
2682 };
2683 
2684 
2685 const float *
lp_build_sample_aniso_filter_table(void)2686 lp_build_sample_aniso_filter_table(void)
2687 {
2688    return aniso_filter_table;
2689 }
2690