xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/llvmpipe/lp_linear_interp.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /**************************************************************************
2  *
3  * Copyright 2010-2021 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
18  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
19  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
20  * USE OR OTHER DEALINGS IN THE SOFTWARE.
21  *
22  * The above copyright notice and this permission notice (including the
23  * next paragraph) shall be included in all copies or substantial portions
24  * of the Software.
25  *
26  **************************************************************************/
27 
28 
29 #include "util/detect.h"
30 
31 #include "util/u_math.h"
32 #include "util/u_cpu_detect.h"
33 #include "util/u_pack_color.h"
34 #include "util/u_rect.h"
35 #include "util/u_sse.h"
36 
37 #include "lp_jit.h"
38 #include "lp_rast.h"
39 #include "lp_debug.h"
40 #include "lp_state_fs.h"
41 #include "lp_linear_priv.h"
42 
43 
44 #if DETECT_ARCH_SSE
45 
/* 1.0 expressed in 1.15 fixed-point (the largest positive int16_t). */
#define FIXED15_ONE 0x7fff


/* Translate floating point value to 1.15 unsigned fixed-point.
 *
 * The scaled value is clamped to [0, FIXED15_ONE] while it is still a
 * float, and only then converted.  Converting first would make the lower
 * clamp meaningless (an unsigned value is never below zero) and would be
 * an out-of-range float-to-integer conversion for negative or NaN input.
 *
 * Negative and NaN inputs yield 0; inputs >= 1.0 yield FIXED15_ONE.
 * In-range values are truncated toward zero.
 */
static inline uint16_t
float_to_ufixed_1_15(float f)
{
   const float scaled = f * (float)FIXED15_ONE;

   /* Negated comparison so that NaN takes this branch as well. */
   if (!(scaled > 0.0f)) {
      return 0;
   }
   if (scaled >= (float)FIXED15_ONE) {
      return FIXED15_ONE;
   }
   return (uint16_t)scaled;
}


/* Translate floating point value to 1.15 signed fixed-point.
 *
 * The scaled value is clamped to [-FIXED15_ONE, FIXED15_ONE] while it is
 * still a float, so that arbitrarily large magnitudes never reach the
 * float-to-integer conversion (which is undefined when the value does
 * not fit the destination type).
 *
 * NaN yields 0.  In-range values are truncated toward zero.
 */
static inline int16_t
float_to_sfixed_1_15(float f)
{
   const float limit = (float)FIXED15_ONE;
   const float scaled = f * limit;

   if (scaled >= limit) {
      return FIXED15_ONE;
   }
   if (scaled <= -limit) {
      return -FIXED15_ONE;
   }
   /* Only NaN compares unequal to itself. */
   if (scaled != scaled) {
      return 0;
   }
   return (int16_t)scaled;
}
65 
66 
67 /* Interpolate in 1.15 space, but produce a packed row of 0.8 values.
68  */
69 static const uint32_t *
interp_0_8(struct lp_linear_elem * elem)70 interp_0_8(struct lp_linear_elem *elem)
71 {
72    struct lp_linear_interp *interp = (struct lp_linear_interp *)elem;
73    uint32_t *row = interp->row;
74    __m128i a0 = interp->a0;
75    const __m128i dadx = interp->dadx;
76    const int width = (interp->width + 3) & ~3;
77 
78    for (int i = 0; i < width; i += 4) {
79       __m128i l = _mm_srai_epi16(a0, 7); // l = a0 >> 7
80       a0 = _mm_add_epi16(a0, dadx);      // a0 += dadx
81 
82       __m128i h = _mm_srai_epi16(a0, 7); // h = a0 >> 7
83       a0 = _mm_add_epi16(a0, dadx);      // a0 += dadx
84 
85       // pack l[0..7] and h[0..7] as 16 bytes
86       *(__m128i *)&row[i] =  _mm_packus_epi16(l, h);
87    }
88 
89    // advance to next row
90    interp->a0 = _mm_add_epi16(interp->a0, interp->dady);
91    return interp->row;
92 }
93 
94 
95 static const uint32_t *
interp_noop(struct lp_linear_elem * elem)96 interp_noop(struct lp_linear_elem *elem)
97 {
98    struct lp_linear_interp *interp = (struct lp_linear_interp *)elem;
99    return interp->row;
100 }
101 
102 
103 static const uint32_t *
interp_check(struct lp_linear_elem * elem)104 interp_check(struct lp_linear_elem *elem)
105 {
106    struct lp_linear_interp *interp = (struct lp_linear_interp *)elem;
107    interp->row[0] = 1;
108    return interp->row;
109 }
110 
111 
112 /* Not quite a noop - we use row[0] to track whether this gets called
113  * or not, so we can optimize which interpolants we care about.
114  */
115 void
lp_linear_init_noop_interp(struct lp_linear_interp * interp)116 lp_linear_init_noop_interp(struct lp_linear_interp *interp)
117 {
118    interp->row[0] = 0;
119    interp->base.fetch = interp_check;
120 }
121 
122 
123 bool
lp_linear_init_interp(struct lp_linear_interp * interp,int x,int y,int width,int height,unsigned usage_mask,bool perspective,float oow,const float * a0,const float * dadx,const float * dady)124 lp_linear_init_interp(struct lp_linear_interp *interp,
125                       int x, int y, int width, int height,
126                       unsigned usage_mask,
127                       bool perspective,
128                       float oow,
129                       const float *a0,
130                       const float *dadx,
131                       const float *dady)
132 {
133    float s0[4];
134    float dsdx[4];
135    float dsdy[4];
136    int16_t s0_fp[8];
137    int16_t dsdx_fp[4];
138    int16_t dsdy_fp[4];
139 
140    /* Zero coefficients to avoid using uninitialised values */
141    memset(s0, 0, sizeof(s0));
142    memset(dsdx, 0, sizeof(dsdx));
143    memset(dsdy, 0, sizeof(dsdy));
144    memset(s0_fp, 0, sizeof(s0_fp));
145    memset(dsdx_fp, 0, sizeof(dsdx_fp));
146    memset(dsdy_fp, 0, sizeof(dsdy_fp));
147 
148    if (perspective && oow != 1.0f) {
149       for (unsigned j = 0; j < 4; j++) {
150          if (usage_mask & (1<<j)) {
151             s0[j]   =   a0[j] * oow;
152             dsdx[j] = dadx[j] * oow;
153             dsdy[j] = dady[j] * oow;
154          }
155       }
156    } else {
157       for (unsigned j = 0; j < 4; j++) {
158          if (usage_mask & (1<<j)) {
159             s0[j]   =   a0[j];
160             dsdx[j] = dadx[j];
161             dsdy[j] = dady[j];
162          }
163       }
164    }
165 
166    s0[0] += x * dsdx[0] + y * dsdy[0];
167    s0[1] += x * dsdx[1] + y * dsdy[1];
168    s0[2] += x * dsdx[2] + y * dsdy[2];
169    s0[3] += x * dsdx[3] + y * dsdy[3];
170 
171    /* XXX: lift all of this into the rectangle setup code.
172     *
173     * For rectangles with linear shaders, at setup time:
174     *    - if w is constant (else mark as non-fastpath)
175     *        - premultiply perspective interpolants by w
176     *        - set w = 1 in position
177     *   - check all interpolants for min/max 0..1 (else mark as
178     *          non-fastpath)
179     */
180    for (unsigned j = 0; j < 4; j++) {
181       if (usage_mask & (1<<j)) {
182          // compute texcoords at rect corners
183          float a = s0[j];
184          float b = s0[j] + (width  - 1) * dsdx[j];
185          float c = s0[j] + (height - 1) * dsdy[j];
186          float d = s0[j] + (height - 1) * dsdy[j] + (width - 1) * dsdx[j];
187 
188          if (MIN4(a,b,c,d) < 0.0)
189             FAIL("min < 0.0"); // out of bounds
190 
191          if (MAX4(a,b,c,d) > 1.0)
192             FAIL("max > 1.0"); // out of bounds
193 
194          dsdx_fp[j]   = float_to_sfixed_1_15(dsdx[j]);
195          dsdy_fp[j]   = float_to_sfixed_1_15(dsdy[j]);
196 
197          s0_fp[j]     = float_to_ufixed_1_15(s0[j]);  // first pixel
198          s0_fp[j + 4] = s0_fp[j] + dsdx_fp[j];        // second pixel
199 
200          dsdx_fp[j] *= 2;
201       }
202    }
203 
204    interp->width = align(width, 4);
205    /* RGBA->BGRA swizzle here */
206    interp->a0    = _mm_setr_epi16(s0_fp[2], s0_fp[1], s0_fp[0], s0_fp[3],
207                                   s0_fp[6], s0_fp[5], s0_fp[4], s0_fp[7]);
208 
209    interp->dadx  = _mm_setr_epi16(dsdx_fp[2], dsdx_fp[1], dsdx_fp[0], dsdx_fp[3],
210                                   dsdx_fp[2], dsdx_fp[1], dsdx_fp[0], dsdx_fp[3]);
211 
212    interp->dady  = _mm_setr_epi16(dsdy_fp[2], dsdy_fp[1], dsdy_fp[0], dsdy_fp[3],
213                                   dsdy_fp[2], dsdy_fp[1], dsdy_fp[0], dsdy_fp[3]);
214 
215    /* If the value is y-invariant, eagerly calculate it here and then
216     * always return the precalculated value.
217     */
218    if (dsdy[0] == 0 &&
219        dsdy[1] == 0 &&
220        dsdy[2] == 0 &&
221        dsdy[3] == 0) {
222       interp_0_8(&interp->base);
223       interp->base.fetch = interp_noop;
224    } else {
225       interp->base.fetch = interp_0_8;
226    }
227 
228    return true;
229 }
230 
231 #else //DETECT_ARCH_SSE
232 
/* Build without SSE support: there is no fixed-point interpolator, so
 * setup is always refused.  None of the arguments are looked at.
 *
 * Returns false unconditionally.
 */
bool
lp_linear_init_interp(struct lp_linear_interp *interp,
                      int x, int y, int width, int height,
                      unsigned usage_mask,
                      bool perspective,
                      float oow,
                      const float *a0,
                      const float *dadx,
                      const float *dady)
{
   (void)interp;
   (void)x;
   (void)y;
   (void)width;
   (void)height;
   (void)usage_mask;
   (void)perspective;
   (void)oow;
   (void)a0;
   (void)dadx;
   (void)dady;

   return false;
}
245 
246 #endif //DETECT_ARCH_SSE
247