xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/llvmpipe/lp_linear_sampler.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /**************************************************************************
2  *
3  * Copyright 2010-2021 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20  * USE OR OTHER DEALINGS IN THE SOFTWARE.
21  *
22  * The above copyright notice and this permission notice (including the
23  * next paragraph) shall be included in all copies or substantial portions
24  * of the Software.
25  *
26  **************************************************************************/
27 
28 
29 #include "util/detect.h"
30 
31 #include "util/u_math.h"
32 #include "util/u_cpu_detect.h"
33 #include "util/u_pack_color.h"
34 #include "util/u_rect.h"
35 #include "util/u_sse.h"
36 
37 #include "lp_jit.h"
38 #include "lp_debug.h"
39 #include "lp_state_fs.h"
40 #include "lp_linear_priv.h"
41 
42 #if DETECT_ARCH_SSE
43 
44 #define FIXED16_SHIFT  16
45 #define FIXED16_ONE    (1<<16)
46 #define FIXED16_HALF   (1<<15)
47 
48 /*
49  * Color tolerance.  Allow 1 bit of error in 8 bit unorm colors.
50  */
51 #define FIXED16_TOL (FIXED16_ONE >> 7)
52 
53 /*
54  * Tolerance for texture coordinate derivatives when doing linear filtering.
55  *
56  * (Note that extra care needs to be taken when doing linear filtering as
57  * coordinates may snap up to neighbour texels inside the tile).
58  */
59 #define FIXED16_TOL_DERIV (FIXED16_TOL / TILE_SIZE)
60 
61 
62 static inline int
float_to_fixed16(float f)63 float_to_fixed16(float f)
64 {
65    return f * (float)FIXED16_ONE;
66 }
67 
68 
69 static inline int
fixed16_frac(int x)70 fixed16_frac(int x)
71 {
72    return x & (FIXED16_ONE - 1);
73 }
74 
75 
/* Return non-zero when x lies within +/- tol of y (inclusive). */
static inline int
fixed16_approx(int x, int y, int tol)
{
   if (x < y - tol)
      return 0;
   return x <= y + tol;
}
81 
82 /* set alpha channel of rgba value to 0xff. */
/* Force the alpha byte (bits 31:24) of a packed rgba value to 0xff. */
static inline uint32_t
rgbx(uint32_t src_val)
{
   const uint32_t alpha_mask = 0xff000000u;
   return src_val | alpha_mask;
}
88 
89 /* swap red/blue channels of a 32-bit rgba value. */
/* Swap the red and blue channels of a packed 32-bit rgba value,
 * leaving green and alpha untouched.
 */
static inline uint32_t
rb_swap(uint32_t src_val)
{
   const uint32_t ga = src_val & 0xff00ff00u;    /* green + alpha stay put */
   const uint32_t r  = (src_val >> 16) & 0xffu;  /* red moves to the low byte */
   const uint32_t b  = src_val & 0xffu;          /* blue moves to bits 23:16 */
   return ga | (b << 16) | r;
}
98 
99 /* swap red/blue channels and set alpha to 0xff
100  * of a 32-bit rgbx value. */
/* Swap the red and blue channels of a packed 32-bit rgbx value and
 * force the alpha byte to 0xff.
 */
static inline uint32_t
rbx_swap(uint32_t src_val)
{
   const uint32_t g = src_val & 0xff00u;
   const uint32_t r = (src_val >> 16) & 0xffu;
   const uint32_t b = (src_val & 0xffu) << 16;
   return 0xff000000u | g | r | b;
}
110 
111 /* set alpha channel of 128-bit 4xrgba values to 0xff. */
112 static inline __m128i
rgbx_128(const __m128i src_val)113 rgbx_128(const __m128i src_val)
114 {
115    const __m128i mask = _mm_set1_epi32(0xff000000);
116    __m128i bgrx = _mm_or_si128(src_val, mask);
117    return bgrx;
118 }
119 
120 /* swap red/blue channels of a 128-bit 4xrgba value. */
121 /* ssse3 could use pshufb */
/* Swap the red and blue channels of four packed rgba pixels.
 * (ssse3 could use pshufb instead of this mask-and-shift dance.)
 */
static inline __m128i
rb_swap_128(const __m128i src_val)
{
   const __m128i keep_ga  = _mm_set1_epi32(0xff00ff00);
   const __m128i low_byte = _mm_set1_epi32(0xff);

   /* Red (bits 23:16) moves down to the blue slot. */
   __m128i red = _mm_and_si128(_mm_srli_epi32(src_val, 16), low_byte);
   /* Blue (bits 7:0) moves up to the red slot. */
   __m128i blue = _mm_slli_epi32(_mm_and_si128(src_val, low_byte), 16);
   /* Green and alpha are untouched. */
   return _mm_or_si128(_mm_and_si128(src_val, keep_ga),
                       _mm_or_si128(red, blue));
}
137 
138 /* swap red/blue channels and set alpha to 0xff
139  * of a 128-bit 4xrgbx value. */
/* Swap the red and blue channels of four packed rgbx pixels and force
 * every alpha byte to 0xff.
 */
static inline __m128i
rbx_swap_128(const __m128i src_val)
{
   const __m128i opaque     = _mm_set1_epi32(0xff000000);
   const __m128i green_mask = _mm_set1_epi32(0xff00);
   const __m128i low_byte   = _mm_set1_epi32(0xff);

   /* Red down to the blue slot, blue up to the red slot. */
   __m128i red  = _mm_and_si128(_mm_srli_epi32(src_val, 16), low_byte);
   __m128i blue = _mm_slli_epi32(_mm_and_si128(src_val, low_byte), 16);

   /* Keep green, force alpha, merge the swapped channels. */
   __m128i out = _mm_or_si128(_mm_and_si128(src_val, green_mask), opaque);
   return _mm_or_si128(out, _mm_or_si128(red, blue));
}
157 
158 /*
159  * Unstretched blit of a bgra texture.
160  */
161 static const uint32_t *
fetch_memcpy_bgra(struct lp_linear_elem * elem)162 fetch_memcpy_bgra(struct lp_linear_elem *elem)
163 {
164    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
165    const struct lp_jit_texture *texture = samp->texture;
166    const uint32_t *src_row =
167       (const uint32_t *)((const uint8_t *)texture->base +
168                          (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
169    const int s     = samp->s;
170    const int width = samp->width;
171    const uint32_t *row;
172 
173    src_row = &src_row[s >> FIXED16_SHIFT];
174 
175    if (((uintptr_t)src_row & 0xf) == 0) {
176       /* The source texels are already aligned. Return them */
177       row = src_row;
178    } else {
179       memcpy(samp->row, src_row, width * sizeof *row);
180       row = samp->row;
181    }
182 
183    samp->t += samp->dtdy;
184    return row;
185 }
186 
187 /**
188  * Fetch and stretch one row.
189  */
/**
 * Fetch and stretch one row.
 *
 * Returns samp->width horizontally-stretched bgra texels for texture
 * row \p y — either straight from the texture (aligned 1:1 case) or
 * from one of the two stretched-row cache buffers held in \p samp.
 * Updates the cache's replacement index as a side effect.
 */
static inline const uint32_t *
fetch_and_stretch_bgra_row(struct lp_linear_sampler *samp,
                           int y)
{
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const int width = samp->width;

   /*
    * Search the stretched row cache first.
    */

   if (y == samp->stretched_row_y[0]) {
      /* Hit entry 0: make entry 1 the next replacement victim. */
      samp->stretched_row_index = 1;
      return samp->stretched_row[0];
   }

   if (y == samp->stretched_row_y[1]) {
      samp->stretched_row_index = 0;
      return samp->stretched_row[1];
   }

   /*
    * Replace one entry.
    */

   const uint32_t * restrict src_row = data + y * stride;
   uint32_t * restrict dst_row = samp->stretched_row[samp->stretched_row_index];

   if (fixed16_frac(samp->s) == 0 &&
       samp->dsdx == FIXED16_ONE) { // TODO: could be relaxed
      /*
       * 1:1 blit on the x direction.
       */
      src_row += samp->s >> FIXED16_SHIFT;

      if (((uintptr_t)src_row & 0xf) == 0) {
         /* The source texture is already aligned. Return it */
         return src_row;
      }

      /* Copy the source texture */
      for (int i = 0; i < width; i += 4) {
         __m128i src = _mm_loadu_si128((const __m128i *)&src_row[i]);
         *(__m128i *)&dst_row[i] = src;
      }
   } else {
      /* Fixed-point stretch of the row, rounded up to a multiple of
       * four texels for the SSE helper.
       */
      util_sse2_stretch_row_8unorm((__m128i *)dst_row,
                                   align(width, 4),
                                   src_row, samp->s, samp->dsdx);
   }

   /* Record the new cache entry and flip the replacement index. */
   samp->stretched_row_y[samp->stretched_row_index] = y;
   samp->stretched_row_index ^= 1;

   return dst_row;
}
248 
249 
/* Maximise only as we fetch unscaled pixels linearly into a size-64
 * temporary.  For minimise, we will want to either have a bigger
 * temporary or fetch sparsely.
 */
/*
 * Axis-aligned linear fetch: stretch the two bracketing texture rows
 * horizontally (via the row cache), then blend them vertically with a
 * constant per-scanline weight.  Advances samp->t as a side effect.
 */
static const uint32_t *
fetch_axis_aligned_linear_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const int width = samp->width;
   uint32_t * restrict row = samp->row;
   const int y = samp->t >> FIXED16_SHIFT;
   /* 8-bit vertical lerp weight taken from the 16.16 fraction of t. */
   const int w = (samp->t >> 8) & 0xff;

   samp->t += samp->dtdy;

   const uint32_t * restrict src_row0 = fetch_and_stretch_bgra_row(samp, y);

   if (w == 0) {
      /* Exactly on a texel row: no vertical interpolation needed. */
      return src_row0;
   }

   const uint32_t * restrict src_row1 = fetch_and_stretch_bgra_row(samp, y + 1);

   __m128i wt = _mm_set1_epi16(w);

   /* Combine the two rows using a constant weight.
    */
   for (int i = 0; i < width; i += 4) {
      __m128i srca = _mm_load_si128((const __m128i *)&src_row0[i]);
      __m128i srcb = _mm_load_si128((const __m128i *)&src_row1[i]);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed88(srca, srcb, &wt, &wt);
   }

   return row;
}
286 
287 
/* Non-axis-aligned version.  Don't try to take advantage of
 * maximize.
 */
/*
 * Bilinear fetch of one row of fragments along an arbitrary direction:
 * for each group of four fragments, gather the 2x2 texel neighbourhood
 * and lerp vertically then horizontally with 8-bit weights.
 * Advances samp->s/samp->t by the per-scanline derivatives.
 */
static const uint32_t *
fetch_linear_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const int stride     = texture->row_stride[0] / sizeof(uint32_t);
   const uint32_t *data  = (const uint32_t *)texture->base;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   for (int i = 0; i < width; i += 4) {
      union m128i si0, si1, si2, si3, ws, wt;
      __m128i si02, si13;

      /* Gather four corner texels and the fractional weights for four
       * consecutive fragments (scalar gather — no SSE gather on SSE2).
       */
      for (int j = 0; j < 4; j++) {
         const uint32_t *src = data + (t >> 16) * stride + (s >> 16);

         si0.ui[j] = src[0];
         si1.ui[j] = src[1];
         si2.ui[j] = src[stride + 0];
         si3.ui[j] = src[stride + 1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }

      /* Broadcast each 8-bit weight across all four bytes of its lane. */
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 16));
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 8));

      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 16));
      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 8));

      /* Lerp vertically (t weight), then horizontally (s weight). */
      si02 = util_sse2_lerp_epi8_fixed08(si0.m, si2.m, wt.m);
      si13 = util_sse2_lerp_epi8_fixed08(si1.m, si3.m, wt.m);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed08(si02, si13, ws.m);
   }

   /* Step to the next scanline. */
   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}
340 
341 
/* Clamped, non-axis-aligned version.  Don't try to take advantage of
 * maximize.
 */
/*
 * Like fetch_linear_bgra, but clamps every texel coordinate to the
 * texture edge (CLAMP_TO_EDGE semantics) using vectorized 16-bit
 * min/max, so it is safe when the fetch area touches the border.
 */
static const uint32_t *
fetch_clamp_linear_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data  = (const uint32_t *)texture->base;
   const int stride     = texture->row_stride[0] / sizeof(uint32_t);
   const int tex_height = texture->height - 1;
   const int tex_width  = texture->width - 1;
   const int dsdx  = samp->dsdx;
   const int dtdx  = samp->dtdx;
   const int width = samp->width;
   uint32_t *row   = samp->row;
   int s = samp->s;
   int t = samp->t;

   /* width, height, stride (in pixels) must be smaller than 32768 */
   /* (that bound is what makes the 16-bit min/max and madd below safe) */
   __m128i dsdx4, dtdx4, s4, t4, stride4, w4, h4, zero, one;
   s4 = _mm_set1_epi32(s);
   t4 = _mm_set1_epi32(t);
   /* Per-lane 16.16 coordinates for four consecutive fragments. */
   s4 = _mm_add_epi32(s4, _mm_set_epi32(3*dsdx, 2*dsdx, dsdx, 0));
   t4 =  _mm_add_epi32(t4, _mm_set_epi32(3*dtdx, 2*dtdx, dtdx, 0));
   dsdx4 = _mm_set1_epi32(4*dsdx);
   dtdx4 = _mm_set1_epi32(4*dtdx);
   stride4 = _mm_set1_epi32(stride);
   w4 = _mm_set1_epi32(tex_width);
   h4 = _mm_set1_epi32(tex_height);
   zero = _mm_setzero_si128();
   one = _mm_set1_epi32(1);

   for (int i = 0; i < width; i += 4) {
      union m128i addr[4];
      __m128i ws, wt, wsl, wsh, wtl, wth;
      __m128i s4s, t4s, cs0, cs1, ct0, ct1, tmp, si[4];

      /* Integer texel coords for both bracketing texels, clamped to
       * [0, size-1] with 16-bit min/max (valid per the bound above).
       */
      s4s = _mm_srli_epi32(s4, 16);
      t4s = _mm_srli_epi32(t4, 16);
      cs0 = _mm_min_epi16(_mm_max_epi16(s4s, zero), w4);
      cs1 = _mm_add_epi16(s4s, one);
      cs1 = _mm_min_epi16(_mm_max_epi16(cs1, zero), w4);
      ct0 = _mm_min_epi16(_mm_max_epi16(t4s, zero), h4);
      ct1 = _mm_add_epi16(t4s, one);
      ct1 = _mm_min_epi16(_mm_max_epi16(ct1, zero), h4);
      /* madd multiplies 16-bit ct by 16-bit stride into 32-bit texel
       * offsets; add the column to get the four corner addresses.
       */
      tmp = _mm_madd_epi16(ct0, stride4);
      addr[0].m = _mm_add_epi32(tmp, cs0);
      addr[1].m = _mm_add_epi32(tmp, cs1);
      tmp = _mm_madd_epi16(ct1, stride4);
      addr[2].m = _mm_add_epi32(tmp, cs0);
      addr[3].m = _mm_add_epi32(tmp, cs1);

      /* Scalar gather of each corner for the four fragments. */
      for (int j = 0; j < 4; j++) {
         __m128i ld1, ld2, ld3;
         si[j] = _mm_cvtsi32_si128(data[addr[j].ui[0]]);
         ld1 = _mm_cvtsi32_si128(data[addr[j].ui[1]]);
         si[j] = _mm_unpacklo_epi32(si[j], ld1);
         ld2 = _mm_cvtsi32_si128(data[addr[j].ui[2]]);
         ld3 = _mm_cvtsi32_si128(data[addr[j].ui[3]]);
         ld2 = _mm_unpacklo_epi32(ld2, ld3);
         si[j] =  _mm_unpacklo_epi64(si[j], ld2);
      }

      /* 8-bit lerp weights from the fixed-point fractions. */
      ws = _mm_srli_epi32(s4, 8);
      ws = _mm_and_si128(ws, _mm_set1_epi32(0xFF));
      wt = _mm_srli_epi32(t4, 8);
      wt = _mm_and_si128(wt, _mm_set1_epi32(0xFF));

      s4 = _mm_add_epi32(s4, dsdx4);
      t4 = _mm_add_epi32(t4, dtdx4);

#if 0
/* scalar code for reference */
      for (int j = 0; j < 4; j++) {
         int s0 = s >> FIXED16_SHIFT;
         int t0 = t >> FIXED16_SHIFT;
         int cs0 = CLAMP(s0    , 0, tex_width);
         int cs1 = CLAMP(s0 + 1, 0, tex_width);
         int ct0 = CLAMP(t0    , 0, tex_height);
         int ct1 = CLAMP(t0 + 1, 0, tex_height);

         si0.ui[j] = data[ct0 * stride + cs0];
         si1.ui[j] = data[ct0 * stride + cs1];
         si2.ui[j] = data[ct1 * stride + cs0];
         si3.ui[j] = data[ct1 * stride + cs1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }
#endif

      /* Expand weights to 16 bits and duplicate per pixel pair for the
       * 2D fixed-8.8 lerp helper.
       */
      ws = _mm_or_si128(ws, _mm_slli_epi32(ws, 16));
      wsl = _mm_shuffle_epi32(ws, _MM_SHUFFLE(1,1,0,0));
      wsh = _mm_shuffle_epi32(ws, _MM_SHUFFLE(3,3,2,2));

      wt = _mm_or_si128(wt, _mm_slli_epi32(wt, 16));
      wtl = _mm_shuffle_epi32(wt, _MM_SHUFFLE(1,1,0,0));
      wth = _mm_shuffle_epi32(wt, _MM_SHUFFLE(3,3,2,2));

      *(__m128i *)&row[i] = util_sse2_lerp_2d_epi8_fixed88(si[0], si[2],
                                                           &si[1], &si[3],
                                                           &wtl, &wth,
                                                           &wsl, &wsh);
   }

   /* Step to the next scanline. */
   samp->s += samp->dsdy;
   samp->t += samp->dtdy;

   return row;
}
456 
457 /* don't generate bgra 128-bits or memcpy ops they have their own path */
458 #define FETCH_TYPE bgra
459 #define OP
460 #define NO_MEMCPY
461 #include "lp_linear_sampler_tmp.h"
462 
463 #define FETCH_TYPE bgrx
464 #define OP rgbx
465 #define OP128 rgbx_128
466 #include "lp_linear_sampler_tmp.h"
467 
468 #define FETCH_TYPE bgra_swapped
469 #define OP rb_swap
470 #define OP128 rb_swap_128
471 #include "lp_linear_sampler_tmp.h"
472 
473 #define FETCH_TYPE bgrx_swapped
474 #define OP rbx_swap
475 #define OP128 rbx_swap_128
476 #include "lp_linear_sampler_tmp.h"
477 
/*
 * Decide whether this sampler can be serviced by a nearest fetch
 * routine — either because the selected filter is explicitly nearest,
 * or because an axis-aligned linear filter degenerates to nearest
 * (samples land on texel centers and step exactly one texel).
 */
static bool
sampler_is_nearest(const struct lp_linear_sampler *samp,
                   const struct lp_sampler_static_state *sampler_state,
                   bool minify)
{
   unsigned img_filter;

   if (minify)
      img_filter = sampler_state->sampler_state.min_img_filter;
   else
      img_filter = sampler_state->sampler_state.mag_img_filter;

   /* Is it obviously nearest?
    */
   if (img_filter == PIPE_TEX_FILTER_NEAREST)
      return true;

   /* Otherwise look for linear samplers which devolve to nearest.
    */

   /* Needs to be axis aligned.
    */
   if (!samp->axis_aligned)
      return false;

   if (0) {   /* disabled heuristics — see NOTE(review) comments below */
      /* For maximizing shaders, revert to nearest
       */
      /* NOTE(review): as written, "dsdx < -HALF && dsdx < HALF" makes the
       * second comparison redundant; presumably -HALF < dsdx < HALF was
       * intended.  Dead code under if (0) — confirm intent before enabling.
       */
      if (samp->dsdx < -FIXED16_HALF && samp->dsdx < FIXED16_HALF &&
          samp->dtdy < -FIXED16_HALF && samp->dtdy < FIXED16_HALF)
         return true;

      /* For severely minimising shaders, revert to nearest:
       */
      /* NOTE(review): "< 2*ONE || > 2*ONE" holds for everything except
       * exactly 2*ONE; presumably |dsdx| > 2*ONE was intended.  Dead code.
       */
      if ((samp->dsdx < 2 * FIXED16_ONE || samp->dsdx > 2 * FIXED16_ONE) &&
          (samp->dtdy < 2 * FIXED16_ONE || samp->dtdy > 2 * FIXED16_ONE))
         return true;
   }

   /*
    * Must be near a pixel center:
    */
   if (!fixed16_approx(fixed16_frac(samp->s), FIXED16_HALF, FIXED16_TOL) ||
       !fixed16_approx(fixed16_frac(samp->t), FIXED16_HALF, FIXED16_TOL))
      return false;

   /*
    * Must make a full step between pixels:
    */
   if (!fixed16_approx(samp->dsdx, FIXED16_ONE, FIXED16_TOL_DERIV) ||
       !fixed16_approx(samp->dtdy, FIXED16_ONE, FIXED16_TOL_DERIV))
      return false;

   /* Treat it as nearest!
    */
   return true;
}
535 
536 
537 /* XXX: Lots of static-state parameters being passed in here but very
538  * little info is extracted from each one.  Consolidate it all down to
539  * something succinct in the prepare phase?
540  */
/**
 * Set up \p samp for the linear rasterization path: convert the s/t
 * interpolants to 16.16 fixed point texel coordinates, decide between
 * nearest and linear filtering, determine whether edge clamping is
 * required, and select a per-format fetch routine.
 *
 * Returns false when the combination (wrap mode other than
 * CLAMP_TO_EDGE while clamping is needed, or an unhandled format)
 * cannot be serviced by this path.
 */
bool
lp_linear_init_sampler(struct lp_linear_sampler *samp,
                       const struct lp_tgsi_texture_info *info,
                       const struct lp_sampler_static_state *sampler_state,
                       const struct lp_jit_texture *texture,
                       int x0, int y0, int width, int height,
                       const float (*a0)[4],
                       const float (*dadx)[4],
                       const float (*dady)[4],
                       bool rgba_order)
{
   const struct lp_tgsi_channel_info *schan = &info->coord[0];
   const struct lp_tgsi_channel_info *tchan = &info->coord[1];

   assert(schan->file == TGSI_FILE_INPUT);
   assert(tchan->file == TGSI_FILE_INPUT);

   /* w is read from interpolant slot 0, channel 3. */
   float w0   =   a0[0][3];

   /* NOTE(review): "foo" appears to be the one-slot offset past the
    * position interpolant in a0/dadx/dady — confirm before renaming.
    */
   int foo = 1;
   float s0   =   a0[schan->u.index+foo][schan->swizzle];
   float dsdx = dadx[schan->u.index+foo][schan->swizzle];
   float dsdy = dady[schan->u.index+foo][schan->swizzle];

   float t0   =   a0[tchan->u.index+foo][tchan->swizzle];
   float dtdx = dadx[tchan->u.index+foo][tchan->swizzle];
   float dtdy = dady[tchan->u.index+foo][tchan->swizzle];

   int mins, mint, maxs, maxt;
   float oow = 1.0f / w0;
   /* Scale normalized coordinates and derivatives into texel space. */
   float width_oow = texture->width * oow;
   float height_oow = texture->height * oow;
   float fdsdx = dsdx * width_oow;
   float fdsdy = dsdy * width_oow;
   float fdtdx = dtdx * height_oow;
   float fdtdy = dtdy * height_oow;
   int fetch_width;
   int fetch_height;
   bool minify;
   bool need_wrap;
   bool is_nearest;

   samp->texture = texture;
   samp->width = width;

   /* 16.16 fixed-point texel coordinate at the tile origin (x0, y0). */
   samp->s = float_to_fixed16(fdsdx * x0 +
                              fdsdy * y0 +
                              s0 * width_oow);

   samp->t = float_to_fixed16(fdtdx * x0 +
                              fdtdy * y0 +
                              t0 * height_oow);

   samp->dsdx = float_to_fixed16(fdsdx);
   samp->dsdy = float_to_fixed16(fdsdy);
   samp->dtdx = float_to_fixed16(fdtdx);
   samp->dtdy = float_to_fixed16(fdtdy);


   samp->axis_aligned = (samp->dsdy == 0 &&
                         samp->dtdx == 0); // TODO: could be relaxed

   {
      /* rho = max |derivative|; minifying when a step exceeds one texel. */
      int dsdx = samp->dsdx >= 0 ? samp->dsdx : -samp->dsdx;
      int dsdy = samp->dsdy >= 0 ? samp->dsdy : -samp->dsdy;
      int dtdx = samp->dtdx >= 0 ? samp->dtdx : -samp->dtdx;
      int dtdy = samp->dtdy >= 0 ? samp->dtdy : -samp->dtdy;
      int rho = MAX4(dsdx, dsdy, dtdx, dtdy);

      minify = (rho > FIXED16_ONE);
   }

   is_nearest = sampler_is_nearest(samp, sampler_state, minify);

   if (!is_nearest) {
      /* Linear filtering samples relative to texel centers. */
      samp->s -= FIXED16_HALF;
      samp->t -= FIXED16_HALF;
   }

   /* Check for clamping.  This rarely happens as we're rejecting interpolants
    * which fall outside the 0..1 range.
    */

   if (is_nearest) {
      /* Nearest fetch routines don't employ SSE and always operate one pixel
       * at a time.
       */
      fetch_width = width - 1;
   } else {
      /* Linear fetch routines employ SSE, and always fetch groups of four
       * texels.
       */
      fetch_width = align(width, 4) - 1;
   }
   fetch_height = height - 1;

   if (samp->axis_aligned) {
      int s0 = samp->s;
      int s1 = samp->s + fetch_width  * samp->dsdx;
      int t0 = samp->t;
      int t1 = samp->t + fetch_height * samp->dtdy;

      mins = MIN2(s0, s1);
      mint = MIN2(t0, t1);
      maxs = MAX2(s0, s1);
      maxt = MAX2(t0, t1);
   } else {
      /* General case: bound s/t over all four corners of the fetch area. */
      int s0 = samp->s;
      int s1 = samp->s + fetch_width  * samp->dsdx;
      int s2 = samp->s + fetch_height * samp->dsdy;
      int s3 = samp->s + fetch_width  * samp->dsdx + fetch_height * samp->dsdy;
      int t0 = samp->t;
      int t1 = samp->t + fetch_width  * samp->dtdx;
      int t2 = samp->t + fetch_height * samp->dtdy;
      int t3 = samp->t + fetch_width  * samp->dtdx + fetch_height * samp->dtdy;

      mins = MIN4(s0, s1, s2, s3);
      mint = MIN4(t0, t1, t2, t3);
      maxs = MAX4(s0, s1, s2, s3);
      maxt = MAX4(t0, t1, t2, t3);
   }

   if (is_nearest) {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs >= (texture->width  << FIXED16_SHIFT) ||
                   maxt >= (texture->height << FIXED16_SHIFT));
   } else {
      /* Linear reads one extra texel to the right/bottom. */
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs + FIXED16_ONE >= (texture->width  << FIXED16_SHIFT) ||
                   maxt + FIXED16_ONE >= (texture->height << FIXED16_SHIFT));
   }

   if (0 && need_wrap) {   /* debug dump, normally compiled out */
      debug_printf("%u x %u %s\n",
                   texture->width, texture->height,
                   is_nearest ? "nearest" : "linear");
      debug_printf("mins = %f\n", mins*1.0f/FIXED16_ONE);
      debug_printf("mint = %f\n", mint*1.0f/FIXED16_ONE);
      debug_printf("maxs = %f\n", maxs*1.0f/FIXED16_ONE);
      debug_printf("maxt = %f\n", maxt*1.0f/FIXED16_ONE);
      debug_printf("\n");
   }

   /* We accept any mode below, but we only implement clamping.
    */
   if (need_wrap &&
       (sampler_state->sampler_state.wrap_s != PIPE_TEX_WRAP_CLAMP_TO_EDGE ||
        sampler_state->sampler_state.wrap_t != PIPE_TEX_WRAP_CLAMP_TO_EDGE)) {
       return false;
   }

   if (is_nearest) {
      /* Select a nearest fetch routine: clamped, general, axis-aligned,
       * or straight memcpy, per format and component order.
       */
      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra;
            else
               samp->base.fetch = fetch_memcpy_bgra;
         }
         return true;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx;
            else
               samp->base.fetch = fetch_memcpy_bgrx;
         }
         return true;
      case PIPE_FORMAT_R8G8B8A8_UNORM:
         /* rgba memory order: the swapped fetchers apply when the
          * destination wants the opposite component order.
          */
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra;
            else
               samp->base.fetch = fetch_memcpy_bgra;
         }
         return true;
      case PIPE_FORMAT_R8G8B8X8_UNORM:
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx;
            else
               samp->base.fetch = fetch_memcpy_bgrx;
         }
         return true;
      default:
         break;
      }

      FAIL("unknown format for nearest");
   } else {
      /* Linear path: reset the stretched-row cache before first use. */
      samp->stretched_row_y[0] = -1;
      samp->stretched_row_y[1] = -1;
      samp->stretched_row_index = 0;

      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra;
         }
         return true;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx;
         }
         return true;
      case PIPE_FORMAT_R8G8B8A8_UNORM:
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra;
         }
         return true;
      case PIPE_FORMAT_R8G8B8X8_UNORM:
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx;
         }
         return true;
      default:
         break;
      }

      FAIL("unknown format");
   }
}
866 
867 
868 static const uint32_t *
fetch_noop(struct lp_linear_elem * elem)869 fetch_noop(struct lp_linear_elem *elem)
870 {
871    struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
872    return samp->row;
873 }
874 
875 
/*
 * Initialize a sampler whose fetch routine does no work and simply
 * returns samp->row unmodified (see fetch_noop above).
 */
void
lp_linear_init_noop_sampler(struct lp_linear_sampler *samp)
{
   samp->base.fetch = fetch_noop;
}
881 
882 
883 /*
884  * Check the given sampler and texture info for linear path compatibility.
885  */
886 bool
lp_linear_check_sampler(const struct lp_sampler_static_state * sampler,const struct lp_tgsi_texture_info * tex)887 lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
888                         const struct lp_tgsi_texture_info *tex)
889 {
890    if (tex->modifier != LP_BLD_TEX_MODIFIER_NONE)
891       return false;
892 
893    if (tex->target != TGSI_TEXTURE_2D)
894       return false;
895 
896    if (tex->coord[0].file != TGSI_FILE_INPUT ||
897        tex->coord[1].file != TGSI_FILE_INPUT)
898       return false;
899 
900    /* These are the only sampling modes we support at the moment.
901     *
902     * Actually we'll accept any mode as we're failing on any
903     * interpolant which exceeds 0..1.  Clamping is applied only to
904     * avoid invalid reads.
905     */
906    if (!is_nearest_sampler(sampler) &&
907        !is_linear_sampler(sampler))
908       return false;
909 
910    /* These are the only texture formats we support at the moment
911     */
912    if (sampler->texture_state.format != PIPE_FORMAT_B8G8R8A8_UNORM &&
913        sampler->texture_state.format != PIPE_FORMAT_B8G8R8X8_UNORM &&
914        sampler->texture_state.format != PIPE_FORMAT_R8G8B8A8_UNORM &&
915        sampler->texture_state.format != PIPE_FORMAT_R8G8B8X8_UNORM)
916       return false;
917 
918    /* We don't support sampler view swizzling on the linear path */
919    if (sampler->texture_state.swizzle_r != PIPE_SWIZZLE_X ||
920        sampler->texture_state.swizzle_g != PIPE_SWIZZLE_Y ||
921        sampler->texture_state.swizzle_b != PIPE_SWIZZLE_Z ||
922        sampler->texture_state.swizzle_a != PIPE_SWIZZLE_W) {
923       return false;
924    }
925 
926    return true;
927 }
928 
929 #else  // DETECT_ARCH_SSE
930 
/*
 * Non-SSE build: the linear fast path is compiled out entirely, so no
 * sampler/texture combination ever qualifies.
 */
bool
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   return false;
}
937 
938 #endif  // DETECT_ARCH_SSE
939