1 /**************************************************************************
2 *
3 * Copyright 2010-2021 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20 * USE OR OTHER DEALINGS IN THE SOFTWARE.
21 *
22 * The above copyright notice and this permission notice (including the
23 * next paragraph) shall be included in all copies or substantial portions
24 * of the Software.
25 *
26 **************************************************************************/
27
28
29 #include "util/detect.h"
30
31 #include "util/u_math.h"
32 #include "util/u_cpu_detect.h"
33 #include "util/u_pack_color.h"
34 #include "util/u_rect.h"
35 #include "util/u_sse.h"
36
37 #include "lp_jit.h"
38 #include "lp_debug.h"
39 #include "lp_state_fs.h"
40 #include "lp_linear_priv.h"
41
42 #if DETECT_ARCH_SSE
43
44 #define FIXED16_SHIFT 16
45 #define FIXED16_ONE (1<<16)
46 #define FIXED16_HALF (1<<15)
47
48 /*
49 * Color tolerance. Allow 1 bit of error in 8 bit unorm colors.
50 */
51 #define FIXED16_TOL (FIXED16_ONE >> 7)
52
53 /*
54 * Tolerance for texture coordinate derivatives when doing linear filtering.
55 *
56 * (Note that extra care needs to be taken when doing linear filtering as
57 * coordinates may snap up to neighbour texels inside the tile).
58 */
59 #define FIXED16_TOL_DERIV (FIXED16_TOL / TILE_SIZE)
60
61
62 static inline int
float_to_fixed16(float f)63 float_to_fixed16(float f)
64 {
65 return f * (float)FIXED16_ONE;
66 }
67
68
69 static inline int
fixed16_frac(int x)70 fixed16_frac(int x)
71 {
72 return x & (FIXED16_ONE - 1);
73 }
74
75
/* Return non-zero if x is within +/- tol of y. */
static inline int
fixed16_approx(int x, int y, int tol)
{
   const int delta = x - y;
   return -tol <= delta && delta <= tol;
}
81
82 /* set alpha channel of rgba value to 0xff. */
83 static inline uint32_t
rgbx(uint32_t src_val)84 rgbx(uint32_t src_val)
85 {
86 return src_val | 0xff000000;
87 }
88
89 /* swap red/blue channels of a 32-bit rgba value. */
90 static inline uint32_t
rb_swap(uint32_t src_val)91 rb_swap(uint32_t src_val)
92 {
93 uint32_t dst_val = src_val & 0xff00ff00;
94 dst_val |= (src_val & 0xff) << 16;
95 dst_val |= (src_val & 0xff0000) >> 16;
96 return dst_val;
97 }
98
99 /* swap red/blue channels and set alpha to 0xff
100 * of a 32-bit rgbx value. */
101 static inline uint32_t
rbx_swap(uint32_t src_val)102 rbx_swap(uint32_t src_val)
103 {
104 uint32_t dst_val = 0xff000000;
105 dst_val |= src_val & 0xff00;
106 dst_val |= (src_val & 0xff) << 16;
107 dst_val |= (src_val & 0xff0000) >> 16;
108 return dst_val;
109 }
110
/* Force the alpha channel of four packed rgba values to 0xff. */
static inline __m128i
rgbx_128(const __m128i src_val)
{
   return _mm_or_si128(src_val, _mm_set1_epi32(0xff000000));
}
119
/* Exchange red/blue channels of four packed rgba values.
 * (With SSSE3 a single pshufb would do; this sticks to SSE2.)
 */
static inline __m128i
rb_swap_128(const __m128i src_val)
{
   const __m128i ag_mask   = _mm_set1_epi32(0xff00ff00);
   const __m128i byte_mask = _mm_set1_epi32(0xff);

   __m128i hi = _mm_and_si128(_mm_srli_epi32(src_val, 16), byte_mask);
   __m128i lo = _mm_slli_epi32(_mm_and_si128(src_val, byte_mask), 16);
   __m128i ag = _mm_and_si128(src_val, ag_mask);

   return _mm_or_si128(ag, _mm_or_si128(hi, lo));
}
137
/* Exchange red/blue channels of four packed rgbx values and force
 * alpha to 0xff.
 */
static inline __m128i
rbx_swap_128(const __m128i src_val)
{
   const __m128i alpha     = _mm_set1_epi32(0xff000000);
   const __m128i g_mask    = _mm_set1_epi32(0xff00);
   const __m128i byte_mask = _mm_set1_epi32(0xff);

   __m128i hi = _mm_and_si128(_mm_srli_epi32(src_val, 16), byte_mask);
   __m128i lo = _mm_slli_epi32(_mm_and_si128(src_val, byte_mask), 16);
   __m128i g  = _mm_and_si128(src_val, g_mask);

   __m128i out = _mm_or_si128(alpha, g);
   return _mm_or_si128(out, _mm_or_si128(hi, lo));
}
157
158 /*
159 * Unstretched blit of a bgra texture.
160 */
161 static const uint32_t *
fetch_memcpy_bgra(struct lp_linear_elem * elem)162 fetch_memcpy_bgra(struct lp_linear_elem *elem)
163 {
164 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
165 const struct lp_jit_texture *texture = samp->texture;
166 const uint32_t *src_row =
167 (const uint32_t *)((const uint8_t *)texture->base +
168 (samp->t >> FIXED16_SHIFT) * texture->row_stride[0]);
169 const int s = samp->s;
170 const int width = samp->width;
171 const uint32_t *row;
172
173 src_row = &src_row[s >> FIXED16_SHIFT];
174
175 if (((uintptr_t)src_row & 0xf) == 0) {
176 /* The source texels are already aligned. Return them */
177 row = src_row;
178 } else {
179 memcpy(samp->row, src_row, width * sizeof *row);
180 row = samp->row;
181 }
182
183 samp->t += samp->dtdy;
184 return row;
185 }
186
/**
 * Fetch and stretch one row.
 *
 * Fetches texture row \p y and horizontally stretches it (according to
 * samp->s / samp->dsdx) into one of two 16-byte-aligned scratch rows.
 * A two-entry cache keyed by y is kept in the sampler so bilinear
 * filtering, which needs rows y and y+1, restretches each source row
 * only once as it walks down the texture.
 */
static inline const uint32_t *
fetch_and_stretch_bgra_row(struct lp_linear_sampler *samp,
                           int y)
{
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const int width = samp->width;

   /*
    * Search the stretched row cache first.
    */

   if (y == samp->stretched_row_y[0]) {
      /* Hit entry 0: make entry 1 the next replacement victim. */
      samp->stretched_row_index = 1;
      return samp->stretched_row[0];
   }

   if (y == samp->stretched_row_y[1]) {
      /* Hit entry 1: make entry 0 the next replacement victim. */
      samp->stretched_row_index = 0;
      return samp->stretched_row[1];
   }

   /*
    * Replace one entry.
    */

   const uint32_t * restrict src_row = data + y * stride;
   uint32_t * restrict dst_row = samp->stretched_row[samp->stretched_row_index];

   if (fixed16_frac(samp->s) == 0 &&
       samp->dsdx == FIXED16_ONE) { // TODO: could be relaxed
      /*
       * 1:1 blit on the x direction.
       */
      src_row += samp->s >> FIXED16_SHIFT;

      if (((uintptr_t)src_row & 0xf) == 0) {
         /* The source texture is already aligned. Return it directly.
          * Note this path bypasses the cache: nothing is recorded for
          * this y, so a later request for it falls through to here again.
          */
         return src_row;
      }

      /* Unaligned 1:1 case: copy the source row into the aligned
       * scratch row, four texels per iteration.
       */
      for (int i = 0; i < width; i += 4) {
         __m128i src = _mm_loadu_si128((const __m128i *)&src_row[i]);
         *(__m128i *)&dst_row[i] = src;
      }
   } else {
      /* General case: SSE2 fixed-point horizontal stretch, width
       * rounded up to a multiple of 4 texels.
       */
      util_sse2_stretch_row_8unorm((__m128i *)dst_row,
                                   align(width, 4),
                                   src_row, samp->s, samp->dsdx);
   }

   /* Record the new entry and flip the replacement index. */
   samp->stretched_row_y[samp->stretched_row_index] = y;
   samp->stretched_row_index ^= 1;

   return dst_row;
}
248
249
/* Maximise only as we fetch unscaled pixels linearly into a size-64
 * temporary. For minimise, we will want to either have a bigger
 * temporary or fetch sparsely.
 *
 * Axis-aligned bilinear fetch: stretch rows y and y+1 horizontally,
 * then blend them with a single per-row weight.
 */
static const uint32_t *
fetch_axis_aligned_linear_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const int width = samp->width;
   uint32_t * restrict row = samp->row;
   const int y = samp->t >> FIXED16_SHIFT;
   /* Vertical lerp weight: top 8 bits of t's 16-bit fractional part. */
   const int w = (samp->t >> 8) & 0xff;

   samp->t += samp->dtdy;

   const uint32_t * restrict src_row0 = fetch_and_stretch_bgra_row(samp, y);

   if (w == 0) {
      /* Landed exactly on a source row: no vertical blend needed. */
      return src_row0;
   }

   const uint32_t * restrict src_row1 = fetch_and_stretch_bgra_row(samp, y + 1);

   __m128i wt = _mm_set1_epi16(w);

   /* Combine the two rows using a constant weight.
    */
   for (int i = 0; i < width; i += 4) {
      __m128i srca = _mm_load_si128((const __m128i *)&src_row0[i]);
      __m128i srcb = _mm_load_si128((const __m128i *)&src_row1[i]);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed88(srca, srcb, &wt, &wt);
   }

   return row;
}
286
287
/* Non-axis-aligned version. Don't try to take advantage of
 * maximize.
 *
 * Full bilinear fetch: for each output pixel, gather its 2x2 texel
 * footprint and lerp vertically then horizontally.  No clamping is
 * applied, so the caller must have verified all accesses stay inside
 * the texture.
 */
static const uint32_t *
fetch_linear_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const uint32_t *data = (const uint32_t *)texture->base;
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;

   /* Produce four output pixels per iteration. */
   for (int i = 0; i < width; i += 4) {
      union m128i si0, si1, si2, si3, ws, wt;
      __m128i si02, si13;

      /* Scalar gather of the 2x2 footprint and the 8-bit weights for
       * each of the four pixels (SSE2 has no gather instruction).
       */
      for (int j = 0; j < 4; j++) {
         const uint32_t *src = data + (t >> 16) * stride + (s >> 16);

         si0.ui[j] = src[0];            /* top-left */
         si1.ui[j] = src[1];            /* top-right */
         si2.ui[j] = src[stride + 0];   /* bottom-left */
         si3.ui[j] = src[stride + 1];   /* bottom-right */

         /* Top 8 bits of the 16-bit fractional coordinates. */
         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }

      /* Replicate each weight byte across all four bytes of its lane
       * so it applies to every color channel.
       */
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 16));
      ws.m = _mm_or_si128(ws.m, _mm_slli_epi32(ws.m, 8));

      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 16));
      wt.m = _mm_or_si128(wt.m, _mm_slli_epi32(wt.m, 8));

      /* Lerp vertically (t weight), then horizontally (s weight). */
      si02 = util_sse2_lerp_epi8_fixed08(si0.m, si2.m, wt.m);
      si13 = util_sse2_lerp_epi8_fixed08(si1.m, si3.m, wt.m);

      *(__m128i *)&row[i] = util_sse2_lerp_epi8_fixed08(si02, si13, ws.m);
   }

   /* Step the row start coordinate down one scanline. */
   samp->s += samp->dsdy;
   samp->t += samp->dtdy;
   return row;
}
340
341
/* Clamped, non-axis-aligned version. Don't try to take advantage of
 * maximize.
 *
 * Same as fetch_linear_bgra but with texel coordinates clamped to the
 * texture edges (CLAMP_TO_EDGE behavior), so out-of-bounds footprints
 * are safe.  Address computation is vectorized; the texel gather
 * itself remains scalar.
 */
static const uint32_t *
fetch_clamp_linear_bgra(struct lp_linear_elem *elem)
{
   struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
   const struct lp_jit_texture *texture = samp->texture;
   const uint32_t *data = (const uint32_t *)texture->base;
   const int stride = texture->row_stride[0] / sizeof(uint32_t);
   const int tex_height = texture->height - 1;  /* max valid t index */
   const int tex_width = texture->width - 1;    /* max valid s index */
   const int dsdx = samp->dsdx;
   const int dtdx = samp->dtdx;
   const int width = samp->width;
   uint32_t *row = samp->row;
   int s = samp->s;
   int t = samp->t;

   /* width, height, stride (in pixels) must be smaller than 32768:
    * clamping and addressing below use signed 16-bit SSE2 ops
    * (_mm_min_epi16/_mm_max_epi16/_mm_madd_epi16).
    */
   __m128i dsdx4, dtdx4, s4, t4, stride4, w4, h4, zero, one;
   s4 = _mm_set1_epi32(s);
   t4 = _mm_set1_epi32(t);
   /* Spread the coordinates of four consecutive pixels across lanes. */
   s4 = _mm_add_epi32(s4, _mm_set_epi32(3*dsdx, 2*dsdx, dsdx, 0));
   t4 = _mm_add_epi32(t4, _mm_set_epi32(3*dtdx, 2*dtdx, dtdx, 0));
   dsdx4 = _mm_set1_epi32(4*dsdx);
   dtdx4 = _mm_set1_epi32(4*dtdx);
   stride4 = _mm_set1_epi32(stride);
   w4 = _mm_set1_epi32(tex_width);
   h4 = _mm_set1_epi32(tex_height);
   zero = _mm_setzero_si128();
   one = _mm_set1_epi32(1);

   for (int i = 0; i < width; i += 4) {
      union m128i addr[4];
      __m128i ws, wt, wsl, wsh, wtl, wth;
      __m128i s4s, t4s, cs0, cs1, ct0, ct1, tmp, si[4];

      /* Integer texel coordinates for the four pixels. */
      s4s = _mm_srli_epi32(s4, 16);
      t4s = _mm_srli_epi32(t4, 16);
      /* Clamp s, s+1, t, t+1 to [0, width-1] / [0, height-1]. */
      cs0 = _mm_min_epi16(_mm_max_epi16(s4s, zero), w4);
      cs1 = _mm_add_epi16(s4s, one);
      cs1 = _mm_min_epi16(_mm_max_epi16(cs1, zero), w4);
      ct0 = _mm_min_epi16(_mm_max_epi16(t4s, zero), h4);
      ct1 = _mm_add_epi16(t4s, one);
      ct1 = _mm_min_epi16(_mm_max_epi16(ct1, zero), h4);
      /* Linear addresses of the four corners of each footprint:
       * t * stride + s, via 16-bit multiply-add.
       */
      tmp = _mm_madd_epi16(ct0, stride4);
      addr[0].m = _mm_add_epi32(tmp, cs0);
      addr[1].m = _mm_add_epi32(tmp, cs1);
      tmp = _mm_madd_epi16(ct1, stride4);
      addr[2].m = _mm_add_epi32(tmp, cs0);
      addr[3].m = _mm_add_epi32(tmp, cs1);

      /* Scalar gather of each corner for the four pixels, packed into
       * one vector per corner.
       */
      for (int j = 0; j < 4; j++) {
         __m128i ld1, ld2, ld3;
         si[j] = _mm_cvtsi32_si128(data[addr[j].ui[0]]);
         ld1 = _mm_cvtsi32_si128(data[addr[j].ui[1]]);
         si[j] = _mm_unpacklo_epi32(si[j], ld1);
         ld2 = _mm_cvtsi32_si128(data[addr[j].ui[2]]);
         ld3 = _mm_cvtsi32_si128(data[addr[j].ui[3]]);
         ld2 = _mm_unpacklo_epi32(ld2, ld3);
         si[j] = _mm_unpacklo_epi64(si[j], ld2);
      }

      /* 8-bit lerp weights: top byte of the fractional coordinate. */
      ws = _mm_srli_epi32(s4, 8);
      ws = _mm_and_si128(ws, _mm_set1_epi32(0xFF));
      wt = _mm_srli_epi32(t4, 8);
      wt = _mm_and_si128(wt, _mm_set1_epi32(0xFF));

      s4 = _mm_add_epi32(s4, dsdx4);
      t4 = _mm_add_epi32(t4, dtdx4);

#if 0
      /* scalar code for reference */
      for (int j = 0; j < 4; j++) {
         int s0 = s >> FIXED16_SHIFT;
         int t0 = t >> FIXED16_SHIFT;
         int cs0 = CLAMP(s0    , 0, tex_width);
         int cs1 = CLAMP(s0 + 1, 0, tex_width);
         int ct0 = CLAMP(t0    , 0, tex_height);
         int ct1 = CLAMP(t0 + 1, 0, tex_height);

         si0.ui[j] = data[ct0 * stride + cs0];
         si1.ui[j] = data[ct0 * stride + cs1];
         si2.ui[j] = data[ct1 * stride + cs0];
         si3.ui[j] = data[ct1 * stride + cs1];

         ws.ui[j] = (s>>8) & 0xff;
         wt.ui[j] = (t>>8) & 0xff;

         s += dsdx;
         t += dtdx;
      }
#endif

      /* Expand the weights to 16 bits and duplicate them per pixel
       * pair as util_sse2_lerp_2d_epi8_fixed88 expects.
       */
      ws = _mm_or_si128(ws, _mm_slli_epi32(ws, 16));
      wsl = _mm_shuffle_epi32(ws, _MM_SHUFFLE(1,1,0,0));
      wsh = _mm_shuffle_epi32(ws, _MM_SHUFFLE(3,3,2,2));

      wt = _mm_or_si128(wt, _mm_slli_epi32(wt, 16));
      wtl = _mm_shuffle_epi32(wt, _MM_SHUFFLE(1,1,0,0));
      wth = _mm_shuffle_epi32(wt, _MM_SHUFFLE(3,3,2,2));

      /* 2D lerp of all four footprints in one call. */
      *(__m128i *)&row[i] = util_sse2_lerp_2d_epi8_fixed88(si[0], si[2],
                                                           &si[1], &si[3],
                                                           &wtl, &wth,
                                                           &wsl, &wsh);
   }

   /* Step the row start coordinate down one scanline. */
   samp->s += samp->dsdy;
   samp->t += samp->dtdy;

   return row;
}
456
/* don't generate bgra 128-bit or memcpy ops; they have their own path */
458 #define FETCH_TYPE bgra
459 #define OP
460 #define NO_MEMCPY
461 #include "lp_linear_sampler_tmp.h"
462
463 #define FETCH_TYPE bgrx
464 #define OP rgbx
465 #define OP128 rgbx_128
466 #include "lp_linear_sampler_tmp.h"
467
468 #define FETCH_TYPE bgra_swapped
469 #define OP rb_swap
470 #define OP128 rb_swap_128
471 #include "lp_linear_sampler_tmp.h"
472
473 #define FETCH_TYPE bgrx_swapped
474 #define OP rbx_swap
475 #define OP128 rbx_swap_128
476 #include "lp_linear_sampler_tmp.h"
477
478 static bool
sampler_is_nearest(const struct lp_linear_sampler * samp,const struct lp_sampler_static_state * sampler_state,bool minify)479 sampler_is_nearest(const struct lp_linear_sampler *samp,
480 const struct lp_sampler_static_state *sampler_state,
481 bool minify)
482 {
483 unsigned img_filter;
484
485 if (minify)
486 img_filter = sampler_state->sampler_state.min_img_filter;
487 else
488 img_filter = sampler_state->sampler_state.mag_img_filter;
489
490 /* Is it obviously nearest?
491 */
492 if (img_filter == PIPE_TEX_FILTER_NEAREST)
493 return true;
494
495 /* Otherwise look for linear samplers which devolve to nearest.
496 */
497
498 /* Needs to be axis aligned.
499 */
500 if (!samp->axis_aligned)
501 return false;
502
503 if (0) {
504 /* For maximizing shaders, revert to nearest
505 */
506 if (samp->dsdx < -FIXED16_HALF && samp->dsdx < FIXED16_HALF &&
507 samp->dtdy < -FIXED16_HALF && samp->dtdy < FIXED16_HALF)
508 return true;
509
510 /* For severely minimising shaders, revert to nearest:
511 */
512 if ((samp->dsdx < 2 * FIXED16_ONE || samp->dsdx > 2 * FIXED16_ONE) &&
513 (samp->dtdy < 2 * FIXED16_ONE || samp->dtdy > 2 * FIXED16_ONE))
514 return true;
515 }
516
517 /*
518 * Must be near a pixel center:
519 */
520 if (!fixed16_approx(fixed16_frac(samp->s), FIXED16_HALF, FIXED16_TOL) ||
521 !fixed16_approx(fixed16_frac(samp->t), FIXED16_HALF, FIXED16_TOL))
522 return false;
523
524 /*
525 * Must make a full step between pixels:
526 */
527 if (!fixed16_approx(samp->dsdx, FIXED16_ONE, FIXED16_TOL_DERIV) ||
528 !fixed16_approx(samp->dtdy, FIXED16_ONE, FIXED16_TOL_DERIV))
529 return false;
530
531 /* Treat it as nearest!
532 */
533 return true;
534 }
535
536
/* XXX: Lots of static-state parameters being passed in here but very
 * little info is extracted from each one. Consolidate it all down to
 * something succinct in the prepare phase?
 */
/**
 * Set up a linear-path sampler for one quad of interpolants.
 *
 * Converts the interpolated s/t plane equations to 16.16 fixed point
 * in texel units, decides between nearest and linear fetch routines,
 * checks whether any access would leave the texture (need_wrap), and
 * binds the matching fetch function for the texture format.
 *
 * Returns false when the combination (format, wrap mode) is not
 * handled by the linear path; the caller must then fall back.
 */
bool
lp_linear_init_sampler(struct lp_linear_sampler *samp,
                       const struct lp_tgsi_texture_info *info,
                       const struct lp_sampler_static_state *sampler_state,
                       const struct lp_jit_texture *texture,
                       int x0, int y0, int width, int height,
                       const float (*a0)[4],
                       const float (*dadx)[4],
                       const float (*dady)[4],
                       bool rgba_order)
{
   const struct lp_tgsi_channel_info *schan = &info->coord[0];
   const struct lp_tgsi_channel_info *tchan = &info->coord[1];

   assert(schan->file == TGSI_FILE_INPUT);
   assert(tchan->file == TGSI_FILE_INPUT);

   /* w comes from slot 0 of the plane equations. */
   float w0 = a0[0][3];

   /* Texcoord inputs are offset by one slot relative to their TGSI
    * index.  NOTE(review): presumably slot 0 holds position (w0 above
    * reads a0[0][3]) — confirm against the interpolation setup.
    */
   int foo = 1;
   float s0 = a0[schan->u.index+foo][schan->swizzle];
   float dsdx = dadx[schan->u.index+foo][schan->swizzle];
   float dsdy = dady[schan->u.index+foo][schan->swizzle];

   float t0 = a0[tchan->u.index+foo][tchan->swizzle];
   float dtdx = dadx[tchan->u.index+foo][tchan->swizzle];
   float dtdy = dady[tchan->u.index+foo][tchan->swizzle];

   int mins, mint, maxs, maxt;
   /* Undo the perspective divide (single w for the whole tile) and
    * scale normalized coordinates to texel units.
    */
   float oow = 1.0f / w0;
   float width_oow = texture->width * oow;
   float height_oow = texture->height * oow;
   float fdsdx = dsdx * width_oow;
   float fdsdy = dsdy * width_oow;
   float fdtdx = dtdx * height_oow;
   float fdtdy = dtdy * height_oow;
   int fetch_width;
   int fetch_height;
   bool minify;
   bool need_wrap;
   bool is_nearest;

   samp->texture = texture;
   samp->width = width;

   /* Evaluate the plane equations at (x0, y0) and convert everything
    * to 16.16 fixed point.
    */
   samp->s = float_to_fixed16(fdsdx * x0 +
                              fdsdy * y0 +
                              s0 * width_oow);

   samp->t = float_to_fixed16(fdtdx * x0 +
                              fdtdy * y0 +
                              t0 * height_oow);

   samp->dsdx = float_to_fixed16(fdsdx);
   samp->dsdy = float_to_fixed16(fdsdy);
   samp->dtdx = float_to_fixed16(fdtdx);
   samp->dtdy = float_to_fixed16(fdtdy);


   samp->axis_aligned = (samp->dsdy == 0 &&
                         samp->dtdx == 0); // TODO: could be relaxed

   /* Estimate the rate of change (rho) as the largest absolute
    * derivative; more than one texel per pixel means minification.
    */
   {
      int dsdx = samp->dsdx >= 0 ? samp->dsdx : -samp->dsdx;
      int dsdy = samp->dsdy >= 0 ? samp->dsdy : -samp->dsdy;
      int dtdx = samp->dtdx >= 0 ? samp->dtdx : -samp->dtdx;
      int dtdy = samp->dtdy >= 0 ? samp->dtdy : -samp->dtdy;
      int rho = MAX4(dsdx, dsdy, dtdx, dtdy);

      minify = (rho > FIXED16_ONE);
   }

   is_nearest = sampler_is_nearest(samp, sampler_state, minify);

   if (!is_nearest) {
      /* Linear filtering samples relative to texel centers. */
      samp->s -= FIXED16_HALF;
      samp->t -= FIXED16_HALF;
   }

   /* Check for clamping. This rarely happens as we're rejecting interpolants
    * which fall outside the 0..1 range.
    */

   if (is_nearest) {
      /* Nearest fetch routines don't employ SSE and always operate one pixel
       * at a time.
       */
      fetch_width = width - 1;
   } else {
      /* Linear fetch routines employ SSE, and always fetch groups of four
       * texels.
       */
      fetch_width = align(width, 4) - 1;
   }
   fetch_height = height - 1;

   /* Bounding box of all s/t values that will be fetched. */
   if (samp->axis_aligned) {
      int s0 = samp->s;
      int s1 = samp->s + fetch_width * samp->dsdx;
      int t0 = samp->t;
      int t1 = samp->t + fetch_height * samp->dtdy;

      mins = MIN2(s0, s1);
      mint = MIN2(t0, t1);
      maxs = MAX2(s0, s1);
      maxt = MAX2(t0, t1);
   } else {
      /* Rotated mapping: all four corners of the fetch region matter. */
      int s0 = samp->s;
      int s1 = samp->s + fetch_width * samp->dsdx;
      int s2 = samp->s + fetch_height * samp->dsdy;
      int s3 = samp->s + fetch_width * samp->dsdx + fetch_height * samp->dsdy;
      int t0 = samp->t;
      int t1 = samp->t + fetch_width * samp->dtdx;
      int t2 = samp->t + fetch_height * samp->dtdy;
      int t3 = samp->t + fetch_width * samp->dtdx + fetch_height * samp->dtdy;

      mins = MIN4(s0, s1, s2, s3);
      mint = MIN4(t0, t1, t2, t3);
      maxs = MAX4(s0, s1, s2, s3);
      maxt = MAX4(t0, t1, t2, t3);
   }

   /* Linear fetches touch one extra texel (the +FIXED16_ONE below). */
   if (is_nearest) {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs >= (texture->width << FIXED16_SHIFT) ||
                   maxt >= (texture->height << FIXED16_SHIFT));
   } else {
      need_wrap = (mins < 0 ||
                   mint < 0 ||
                   maxs + FIXED16_ONE >= (texture->width << FIXED16_SHIFT) ||
                   maxt + FIXED16_ONE >= (texture->height << FIXED16_SHIFT));
   }

   /* Disabled debug dump of the fetch extents. */
   if (0 && need_wrap) {
      debug_printf("%u x %u %s\n",
                   texture->width, texture->height,
                   is_nearest ? "nearest" : "linear");
      debug_printf("mins = %f\n", mins*1.0f/FIXED16_ONE);
      debug_printf("mint = %f\n", mint*1.0f/FIXED16_ONE);
      debug_printf("maxs = %f\n", maxs*1.0f/FIXED16_ONE);
      debug_printf("maxt = %f\n", maxt*1.0f/FIXED16_ONE);
      debug_printf("\n");
   }

   /* We accept any mode below, but we only implement clamping.
    */
   if (need_wrap &&
       (sampler_state->sampler_state.wrap_s != PIPE_TEX_WRAP_CLAMP_TO_EDGE ||
        sampler_state->sampler_state.wrap_t != PIPE_TEX_WRAP_CLAMP_TO_EDGE)) {
      return false;
   }

   /* Bind the fetch routine: choose by filter, format, desired channel
    * order, clamping, and how specialized the mapping is
    * (general -> axis-aligned -> 1:1 memcpy).
    */
   if (is_nearest) {
      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra;
            else
               samp->base.fetch = fetch_memcpy_bgra;
         }
         return true;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx;
            else
               samp->base.fetch = fetch_memcpy_bgrx;
         }
         return true;
      case PIPE_FORMAT_R8G8B8A8_UNORM:
         /* rgba memory order with rgba output wanted: no swap needed,
          * so the plain bgra routines apply (and vice versa).
          */
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgra;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgra;
            else
               samp->base.fetch = fetch_memcpy_bgra;
         }
         return true;
      case PIPE_FORMAT_R8G8B8X8_UNORM:
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx_swapped;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx_swapped;
            else
               samp->base.fetch = fetch_memcpy_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_bgrx;
            else if (samp->dsdx != FIXED16_ONE) // TODO: could be relaxed
               samp->base.fetch = fetch_axis_aligned_bgrx;
            else
               samp->base.fetch = fetch_memcpy_bgrx;
         }
         return true;
      default:
         break;
      }

      FAIL("unknown format for nearest");
   } else {
      /* Invalidate the stretched-row cache before any linear fetch. */
      samp->stretched_row_y[0] = -1;
      samp->stretched_row_y[1] = -1;
      samp->stretched_row_index = 0;

      switch (sampler_state->texture_state.format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra;
         }
         return true;
      case PIPE_FORMAT_B8G8R8X8_UNORM:
         if (rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx;
         }
         return true;
      case PIPE_FORMAT_R8G8B8A8_UNORM:
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgra;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgra;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgra;
         }
         return true;
      case PIPE_FORMAT_R8G8B8X8_UNORM:
         if (!rgba_order) {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx_swapped;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx_swapped;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx_swapped;
         } else {
            if (need_wrap)
               samp->base.fetch = fetch_clamp_linear_bgrx;
            else if (!samp->axis_aligned)
               samp->base.fetch = fetch_linear_bgrx;
            else
               samp->base.fetch = fetch_axis_aligned_linear_bgrx;
         }
         return true;
      default:
         break;
      }

      FAIL("unknown format");
   }
}
866
867
868 static const uint32_t *
fetch_noop(struct lp_linear_elem * elem)869 fetch_noop(struct lp_linear_elem *elem)
870 {
871 struct lp_linear_sampler *samp = (struct lp_linear_sampler *)elem;
872 return samp->row;
873 }
874
875
/**
 * Initialize a sampler that produces no data: its fetch callback just
 * returns the (unwritten) scratch row.
 */
void
lp_linear_init_noop_sampler(struct lp_linear_sampler *samp)
{
   samp->base.fetch = fetch_noop;
}
881
882
883 /*
884 * Check the given sampler and texture info for linear path compatibility.
885 */
886 bool
lp_linear_check_sampler(const struct lp_sampler_static_state * sampler,const struct lp_tgsi_texture_info * tex)887 lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
888 const struct lp_tgsi_texture_info *tex)
889 {
890 if (tex->modifier != LP_BLD_TEX_MODIFIER_NONE)
891 return false;
892
893 if (tex->target != TGSI_TEXTURE_2D)
894 return false;
895
896 if (tex->coord[0].file != TGSI_FILE_INPUT ||
897 tex->coord[1].file != TGSI_FILE_INPUT)
898 return false;
899
900 /* These are the only sampling modes we support at the moment.
901 *
902 * Actually we'll accept any mode as we're failing on any
903 * interpolant which exceeds 0..1. Clamping is applied only to
904 * avoid invalid reads.
905 */
906 if (!is_nearest_sampler(sampler) &&
907 !is_linear_sampler(sampler))
908 return false;
909
910 /* These are the only texture formats we support at the moment
911 */
912 if (sampler->texture_state.format != PIPE_FORMAT_B8G8R8A8_UNORM &&
913 sampler->texture_state.format != PIPE_FORMAT_B8G8R8X8_UNORM &&
914 sampler->texture_state.format != PIPE_FORMAT_R8G8B8A8_UNORM &&
915 sampler->texture_state.format != PIPE_FORMAT_R8G8B8X8_UNORM)
916 return false;
917
918 /* We don't support sampler view swizzling on the linear path */
919 if (sampler->texture_state.swizzle_r != PIPE_SWIZZLE_X ||
920 sampler->texture_state.swizzle_g != PIPE_SWIZZLE_Y ||
921 sampler->texture_state.swizzle_b != PIPE_SWIZZLE_Z ||
922 sampler->texture_state.swizzle_a != PIPE_SWIZZLE_W) {
923 return false;
924 }
925
926 return true;
927 }
928
929 #else // DETECT_ARCH_SSE
930
/* Stub for builds without SSE: the linear sampling path is never taken. */
bool
lp_linear_check_sampler(const struct lp_sampler_static_state *sampler,
                        const struct lp_tgsi_texture_info *tex)
{
   return false;
}
937
938 #endif // DETECT_ARCH_SSE
939