xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/llvmpipe/lp_setup_tri.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /**************************************************************************
2  *
3  * Copyright 2007 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 /*
29  * Binning code for triangles
30  */
31 
32 #include "util/detect.h"
33 
34 #if DETECT_ARCH_SSE
35 #include <emmintrin.h>
36 #elif defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN
37 #include <altivec.h>
38 /*
39 altivec.h inclusion in -std=c++98..11 causes bool to be redefined
40  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58241
41 */
42 #undef bool
43 #endif
44 
45 #include <stdbool.h>
46 
47 #include "util/u_math.h"
48 #include "util/u_memory.h"
49 #include "util/u_rect.h"
50 #include "util/u_sse.h"
51 #include "lp_perf.h"
52 #include "lp_setup_context.h"
53 #include "lp_rast.h"
54 #include "lp_state_fs.h"
55 #include "lp_state_setup.h"
56 #include "lp_context.h"
57 
58 #include <inttypes.h>
59 
60 #if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN
61 #include "util/u_pwr8.h"
62 #endif
63 
64 #if !DETECT_ARCH_SSE
65 
66 static inline int
subpixel_snap(float a)67 subpixel_snap(float a)
68 {
69    return util_iround(FIXED_ONE * a);
70 }
71 
72 #endif
73 
74 /*
 * Vertex positions and edge deltas in fixed point coordinates.
 *
 * do_triangle_ccw() reads elements 0..2 of x[] / y[] as the three vertex
 * coordinates; its SIMD paths load each array whole as one 128-bit vector
 * (presumably why the arrays have a fourth element - TODO confirm).
 */
75 struct fixed_position {
   /* Fixed-point x coordinate of each vertex. */
76    int32_t x[4];
   /* Fixed-point y coordinate of each vertex. */
77    int32_t y[4];
   /* x[0] - x[1], as recomputed by do_triangle_ccw() after a rotation. */
78    int32_t dx01;
   /* y[0] - y[1] */
79    int32_t dy01;
   /* x[2] - x[0], as recomputed by do_triangle_ccw() after a rotation. */
80    int32_t dx20;
   /* y[2] - y[0] */
81    int32_t dy20;
82 };
83 
84 
85 /**
86  * Alloc space for a new triangle plus the input.a0/dadx/dady arrays
87  * immediately after it.
88  * The memory is allocated from the per-scene pool, not per-tile.
89  * \param num_inputs  number of fragment shader inputs
90  * \return pointer to triangle space
91  */
92 struct lp_rast_triangle *
lp_setup_alloc_triangle(struct lp_scene * scene,unsigned nr_inputs,unsigned nr_planes)93 lp_setup_alloc_triangle(struct lp_scene *scene,
94                         unsigned nr_inputs,
95                         unsigned nr_planes)
96 {
97    // add 1 for XYZW position
98    unsigned input_array_sz = (nr_inputs + 1) * sizeof(float[4]);
99    unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane);
100 
101    STATIC_ASSERT(sizeof(struct lp_rast_plane) % 8 == 0);
102 
103    const unsigned tri_size  = sizeof(struct lp_rast_triangle)
104       + 3 * input_array_sz +   // 3 = da + dadx + dady
105       + plane_sz;
106 
107    struct lp_rast_triangle *tri = lp_scene_alloc_aligned(scene, tri_size, 16);
108    if (!tri)
109       return NULL;
110 
111    tri->inputs.stride = input_array_sz;
112 
113    {
114       ASSERTED char *a = (char *)tri;
115       ASSERTED char *b = (char *)&GET_PLANES(tri)[nr_planes];
116 
117       assert(b - a == tri_size);
118    }
119 
120    return tri;
121 }
122 
123 
124 void
lp_setup_print_vertex(struct lp_setup_context * setup,const char * name,const float (* v)[4])125 lp_setup_print_vertex(struct lp_setup_context *setup,
126                       const char *name,
127                       const float (*v)[4])
128 {
129    const struct lp_setup_variant_key *key = &setup->setup.variant->key;
130 
131    debug_printf("   wpos (%s[0]) xyzw %f %f %f %f\n",
132                 name,
133                 v[0][0], v[0][1], v[0][2], v[0][3]);
134 
135    for (int i = 0; i < key->num_inputs; i++) {
136       const float *in = v[key->inputs[i].src_index];
137 
138       debug_printf("  in[%d] (%s[%d]) %s%s%s%s ",
139                    i,
140                    name, key->inputs[i].src_index,
141                    (key->inputs[i].usage_mask & 0x1) ? "x" : " ",
142                    (key->inputs[i].usage_mask & 0x2) ? "y" : " ",
143                    (key->inputs[i].usage_mask & 0x4) ? "z" : " ",
144                    (key->inputs[i].usage_mask & 0x8) ? "w" : " ");
145 
146       for (int j = 0; j < 4; j++)
147          if (key->inputs[i].usage_mask & (1<<j))
148             debug_printf("%.5f ", in[j]);
149 
150       debug_printf("\n");
151    }
152 }
153 
154 
/**
 * Print a triangle (for debug): its winding as derived from the sign of
 * the 2D cross product of the position edges, then all three vertices.
 *
 * \param setup       setup context, forwarded to lp_setup_print_vertex()
 * \param v0,v1,v2    vertex attribute arrays; element [0] is the position
 */
void
lp_setup_print_triangle(struct lp_setup_context *setup,
                        const float (*v0)[4],
                        const float (*v1)[4],
                        const float (*v2)[4])
{
   /* Edge vectors e = v0 - v2 and f = v1 - v2 in x/y. */
   const float ex = v0[0][0] - v2[0][0];
   const float ey = v0[0][1] - v2[0][1];
   const float fx = v1[0][0] - v2[0][0];
   const float fy = v1[0][1] - v2[0][1];

   /* det = cross(e,f).z */
   const float det = ex * fy - ey * fx;

   debug_printf("triangle\n");

   if (det < 0.0f) {
      debug_printf("   - ccw\n");
   } else if (det > 0.0f) {
      debug_printf("   - cw\n");
   } else {
      debug_printf("   - zero area\n");
   }

   lp_setup_print_vertex(setup, "v0", v0);
   lp_setup_print_vertex(setup, "v1", v1);
   lp_setup_print_vertex(setup, "v2", v2);
}
186 
187 
188 #define MAX_PLANES 8
189 static unsigned
190 lp_rast_tri_tab[MAX_PLANES+1] = {
191    0,               /* should be impossible */
192    LP_RAST_OP_TRIANGLE_1,
193    LP_RAST_OP_TRIANGLE_2,
194    LP_RAST_OP_TRIANGLE_3,
195    LP_RAST_OP_TRIANGLE_4,
196    LP_RAST_OP_TRIANGLE_5,
197    LP_RAST_OP_TRIANGLE_6,
198    LP_RAST_OP_TRIANGLE_7,
199    LP_RAST_OP_TRIANGLE_8
200 };
201 
202 static unsigned
203 lp_rast_32_tri_tab[MAX_PLANES+1] = {
204    0,               /* should be impossible */
205    LP_RAST_OP_TRIANGLE_32_1,
206    LP_RAST_OP_TRIANGLE_32_2,
207    LP_RAST_OP_TRIANGLE_32_3,
208    LP_RAST_OP_TRIANGLE_32_4,
209    LP_RAST_OP_TRIANGLE_32_5,
210    LP_RAST_OP_TRIANGLE_32_6,
211    LP_RAST_OP_TRIANGLE_32_7,
212    LP_RAST_OP_TRIANGLE_32_8
213 };
214 
215 
216 static unsigned
217 lp_rast_ms_tri_tab[MAX_PLANES+1] = {
218    0,               /* should be impossible */
219    LP_RAST_OP_MS_TRIANGLE_1,
220    LP_RAST_OP_MS_TRIANGLE_2,
221    LP_RAST_OP_MS_TRIANGLE_3,
222    LP_RAST_OP_MS_TRIANGLE_4,
223    LP_RAST_OP_MS_TRIANGLE_5,
224    LP_RAST_OP_MS_TRIANGLE_6,
225    LP_RAST_OP_MS_TRIANGLE_7,
226    LP_RAST_OP_MS_TRIANGLE_8
227 };
228 
229 
230 /*
231  * Detect big primitives drawn with an alpha == 1.0.
232  *
233  * This is used when simulating anti-aliasing primitives in shaders, e.g.,
234  * when drawing the windows client area in Aero's flip-3d effect.
235  */
236 static bool
check_opaque(const struct lp_setup_context * setup,const float (* v1)[4],const float (* v2)[4],const float (* v3)[4])237 check_opaque(const struct lp_setup_context *setup,
238              const float (*v1)[4],
239              const float (*v2)[4],
240              const float (*v3)[4])
241 {
242    const struct lp_fragment_shader_variant *variant =
243       setup->fs.current.variant;
244 
245    if (variant->opaque)
246       return true;
247 
248    if (!variant->potentially_opaque)
249       return false;
250 
251    const struct lp_tgsi_channel_info *alpha_info = &variant->shader->info.cbuf[0][3];
252    if (alpha_info->file == TGSI_FILE_CONSTANT) {
253       const float *constants = setup->fs.current.jit_resources.constants[0].f;
254       float alpha = constants[alpha_info->u.index*4 +
255                               alpha_info->swizzle];
256       return alpha == 1.0f;
257    }
258 
259    if (alpha_info->file == TGSI_FILE_INPUT) {
260       return (v1[1 + alpha_info->u.index][alpha_info->swizzle] == 1.0f &&
261               v2[1 + alpha_info->u.index][alpha_info->swizzle] == 1.0f &&
262               v3[1 + alpha_info->u.index][alpha_info->swizzle] == 1.0f);
263    }
264 
265    return false;
266 }
267 
268 
269 /**
270  * Do basic setup for triangle rasterization and determine which
271  * framebuffer tiles are touched.  Put the triangle in the scene's
272  * bins for the tiles which we overlap.
 *
 * \param setup        setup context; supplies the scene, draw regions,
 *                     fill rule and the setup / fs variants read below
 * \param position     fixed-point vertex positions and edge deltas; may be
 *                     rotated in place by the DEBUG_ACCURATE_A0 path
 * \param v0,v1,v2     per-vertex attribute arrays, element [0] being the
 *                     position
 * \param frontfacing  facing flag, handed to the interpolant setup
 *                     function and stored in the triangle inputs
 * \return true if the triangle was culled by the draw region test,
 *         false if the triangle allocation failed, otherwise the result
 *         of lp_setup_bin_triangle()
273  */
274 static bool
do_triangle_ccw(struct lp_setup_context * setup,struct fixed_position * position,const float (* v0)[4],const float (* v1)[4],const float (* v2)[4],bool frontfacing)275 do_triangle_ccw(struct lp_setup_context *setup,
276                 struct fixed_position *position,
277                 const float (*v0)[4],
278                 const float (*v1)[4],
279                 const float (*v2)[4],
280                 bool frontfacing)
281 {
282    struct lp_scene *scene = setup->scene;
283 
   /* Vertex from which the viewport index and layer are read below:
    * v0 when flatshade_first is set, v2 otherwise.
    */
284    const float (*pv)[4];
285    if (setup->flatshade_first) {
286       pv = v0;
287    } else {
288       pv = v2;
289    }
290 
291    unsigned viewport_index = 0;
292    if (setup->viewport_index_slot > 0) {
293       unsigned *udata = (unsigned*)pv[setup->viewport_index_slot];
294       viewport_index = lp_clamp_viewport_idx(*udata);
295    }
296 
297    unsigned layer = 0;
298    if (setup->layer_slot > 0) {
299       layer = *(unsigned*)pv[setup->layer_slot];
300       layer = MIN2(layer, scene->fb_max_layer);
301    }
302 
303    /* Bounding rectangle (in pixels) */
304    struct u_rect bbox;
305    {
306       /* Yes this is necessary to accurately calculate bounding boxes
307        * with the two fill-conventions we support.  GL (normally) ends
308        * up needing a bottom-left fill convention, which requires
309        * slightly different rounding.
310        */
311       int adj = (setup->bottom_edge_rule != 0) ? 1 : 0;
312 
313       /* Inclusive x0, exclusive x1 */
314       bbox.x0 =  MIN3(position->x[0], position->x[1],
315                       position->x[2]) >> FIXED_ORDER;
316       bbox.x1 = (MAX3(position->x[0], position->x[1],
317                       position->x[2]) - 1) >> FIXED_ORDER;
318 
319       /* Inclusive / exclusive depending upon adj (bottom-left or top-right) */
320       bbox.y0 = (MIN3(position->y[0], position->y[1],
321                       position->y[2]) + adj) >> FIXED_ORDER;
322       bbox.y1 = (MAX3(position->y[0], position->y[1],
323                       position->y[2]) - 1 + adj) >> FIXED_ORDER;
324    }
325 
326    if (!u_rect_test_intersection(&setup->draw_regions[viewport_index], &bbox)) {
327       if (0) debug_printf("no intersection\n");
328       LP_COUNT(nr_culled_tris);
329       return true;
330    }
331 
   /* Extent of the untrimmed bbox (x0/y0 rounded down to a multiple of 4);
    * decides whether the 32-bit rasterizer variants may be requested.
    */
332    int max_szorig = ((bbox.x1 - (bbox.x0 & ~3)) |
333                      (bbox.y1 - (bbox.y0 & ~3)));
334    bool use_32bits = max_szorig <= MAX_FIXED_LENGTH32;
335 #if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN
336    bool pwr8_limit_check = (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
337       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32;
338 #endif
339 
340    /* Can safely discard negative regions, but need to keep hold of
341     * information about when the triangle extends past screen
342     * boundaries.  See trimmed_box in lp_setup_bin_triangle().
343     */
344    bbox.x0 = MAX2(bbox.x0, 0);
345    bbox.y0 = MAX2(bbox.y0, 0);
346 
   /* Three triangle edge planes; needed scissor planes are added below. */
347    int nr_planes = 3;
348 
349    /*
350     * Determine how many scissor planes we need, that is drop scissor
351     * edges if the bounding box of the tri is fully inside that edge.
352     */
353    const struct u_rect *scissor = &setup->draw_regions[viewport_index];
354    bool s_planes[4];
355    scissor_planes_needed(s_planes, &bbox, scissor);
356    nr_planes += s_planes[0] + s_planes[1] + s_planes[2] + s_planes[3];
357 
358    const struct lp_setup_variant_key *key = &setup->setup.variant->key;
359    struct lp_rast_triangle *tri =
360       lp_setup_alloc_triangle(scene, key->num_inputs, nr_planes);
361    if (!tri)
362       return false;
363 
364 #if MESA_DEBUG
365    tri->v[0][0] = v0[0][0];
366    tri->v[1][0] = v1[0][0];
367    tri->v[2][0] = v2[0][0];
368    tri->v[0][1] = v0[0][1];
369    tri->v[1][1] = v1[0][1];
370    tri->v[2][1] = v2[0][1];
371 #endif
372 
373    LP_COUNT(nr_tris);
374 
375    /*
376     * Rotate the tri such that v0 is closest to the fb origin.
377     * This can give more accurate a0 value (which is at fb origin)
378     * when calculating the interpolants.
379     * It can't work when there's flat shading for instance in one
380     * of the attributes, hence restrict this to just a single attribute
381     * which is what causes some test failures.
382     * (This does not address the problem that interpolation may be
383     * inaccurate if gradients are relatively steep in small tris far
384     * away from the origin. It does however fix the (silly) wgf11rasterizer
385     * Interpolator test.)
386     * XXX This causes problems with mipgen -EmuTexture for not yet really
387     * understood reasons (if the vertices would be submitted in a different
388     * order, we'd also generate the same "wrong" results here without
389     * rotation). In any case, that we generate different values if a prim
390     * has the vertices rotated but is otherwise the same (which is due to
391     * numerical issues) is not a nice property. An additional problem by
392     * swapping the vertices here (which is possibly worse) is that
393     * the same primitive coming in twice might generate different values
394     * (in particular for z) due to the swapping potentially not happening
395     * both times, if the attributes to be interpolated are different. For now,
396     * just restrict this to not get used with dx9 (by checking pixel offset),
397     * could also restrict it further to only trigger with wgf11Interpolator
398     * Rasterizer test (the only place which needs it, with always the same
399     * vertices even).
400     */
401    if ((LP_DEBUG & DEBUG_ACCURATE_A0) &&
402        setup->pixel_offset == 0.5f &&
403        key->num_inputs == 1 &&
404        (key->inputs[0].interp == LP_INTERP_LINEAR ||
405         key->inputs[0].interp == LP_INTERP_PERSPECTIVE) &&
406         setup->fs.current_tex_num == 0 &&
407         setup->cullmode == 0) {
      /* Squared distance of each vertex position from the origin. */
408       float dist0 = v0[0][0] * v0[0][0] + v0[0][1] * v0[0][1];
409       float dist1 = v1[0][0] * v1[0][0] + v1[0][1] * v1[0][1];
410       float dist2 = v2[0][0] * v2[0][0] + v2[0][1] * v2[0][1];
411       if (dist0 > dist1 && dist1 < dist2) {
         /* v1 is the closest vertex: rotate so that it becomes v0. */
412          const float (*vt)[4];
413          vt = v0;
414          v0 = v1;
415          v1 = v2;
416          v2 = vt;
417          // rotate positions
418          int x = position->x[0];
419          int y = position->y[0];
420          position->x[0] = position->x[1];
421          position->y[0] = position->y[1];
422          position->x[1] = position->x[2];
423          position->y[1] = position->y[2];
424          position->x[2] = x;
425          position->y[2] = y;
426 
427          position->dx20 = position->dx01;
428          position->dy20 = position->dy01;
429          position->dx01 = position->x[0] - position->x[1];
430          position->dy01 = position->y[0] - position->y[1];
431       } else if (dist0 > dist2) {
         /* v2 is closer than v0: rotate so that it becomes v0. */
432          const float (*vt)[4];
433          vt = v0;
434          v0 = v2;
435          v2 = v1;
436          v1 = vt;
437          // rotate positions
438          int x = position->x[0];
439          int y = position->y[0];
440          position->x[0] = position->x[2];
441          position->y[0] = position->y[2];
442          position->x[2] = position->x[1];
443          position->y[2] = position->y[1];
444          position->x[1] = x;
445          position->y[1] = y;
446 
447          position->dx01 = position->dx20;
448          position->dy01 = position->dy20;
449          position->dx20 = position->x[2] - position->x[0];
450          position->dy20 = position->y[2] - position->y[0];
451       }
452    }
453 
454    /* Setup parameter interpolants:
455     */
456    setup->setup.variant->jit_function(v0, v1, v2,
457                                       frontfacing,
458                                       GET_A0(&tri->inputs),
459                                       GET_DADX(&tri->inputs),
460                                       GET_DADY(&tri->inputs),
461                                       &setup->setup.variant->key);
462 
463    tri->inputs.frontfacing = frontfacing;
464    tri->inputs.disable = false;
465    tri->inputs.is_blit = false;
466    tri->inputs.layer = layer;
467    tri->inputs.viewport_index = viewport_index;
468    tri->inputs.view_index = setup->view_index;
469 
470    if (0)
471       lp_dump_setup_coef(&setup->setup.variant->key,
472                          GET_A0(&tri->inputs),
473                          GET_DADX(&tri->inputs),
474                          GET_DADY(&tri->inputs));
475 
   /* The first three planes are the triangle edges, filled in by exactly
    * one of the SSE, POWER8 or scalar code paths below.
    */
476    struct lp_rast_plane *plane = GET_PLANES(tri);
477 
478 #if DETECT_ARCH_SSE
479    if (1) {
480       __m128i vertx, verty;
481       __m128i shufx, shufy;
482       __m128i dcdx, dcdy;
483       __m128i cdx02, cdx13, cdy02, cdy13, c02, c13;
484       __m128i c01, c23, unused;
485       __m128i dcdx_neg_mask;
486       __m128i dcdy_neg_mask;
487       __m128i dcdx_zero_mask;
488       __m128i top_left_flag, c_dec;
489       __m128i eo, p0, p1, p2;
490       __m128i zero = _mm_setzero_si128();
491 
492       vertx = _mm_load_si128((__m128i *)position->x); /* vertex x coords */
493       verty = _mm_load_si128((__m128i *)position->y); /* vertex y coords */
494 
495       shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
496       shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));
497 
498       dcdx = _mm_sub_epi32(verty, shufy);
499       dcdy = _mm_sub_epi32(vertx, shufx);
500 
501       dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
502       dcdx_zero_mask = _mm_cmpeq_epi32(dcdx, zero);
503       dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
504 
505       top_left_flag = _mm_set1_epi32((setup->bottom_edge_rule == 0) ? ~0 : 0);
506 
507       c_dec = _mm_or_si128(dcdx_neg_mask,
508                            _mm_and_si128(dcdx_zero_mask,
509                                          _mm_xor_si128(dcdy_neg_mask,
510                                                        top_left_flag)));
511 
512       /*
513        * 64 bit arithmetic.
514        * Note we need _signed_ mul (_mm_mul_epi32) which we emulate.
515        */
516       cdx02 = mm_mullohi_epi32(dcdx, vertx, &cdx13);
517       cdy02 = mm_mullohi_epi32(dcdy, verty, &cdy13);
518       c02 = _mm_sub_epi64(cdx02, cdy02);
519       c13 = _mm_sub_epi64(cdx13, cdy13);
520       c02 = _mm_sub_epi64(c02, _mm_shuffle_epi32(c_dec,
521                                                  _MM_SHUFFLE(2,2,0,0)));
522       c13 = _mm_sub_epi64(c13, _mm_shuffle_epi32(c_dec,
523                                                  _MM_SHUFFLE(3,3,1,1)));
524 
525       /*
526        * Useful for very small fbs/tris (or fewer subpixel bits) only:
527        * c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
528        *                   mm_mullo_epi32(dcdy, verty));
529        *
530        * c = _mm_sub_epi32(c, c_dec);
531        */
532 
533       /* Scale up to match c:
534        */
535       dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER);
536       dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER);
537 
538       /*
539        * Calculate trivial reject values:
540        * Note eo cannot overflow even if dcdx/dcdy would already have
541        * 31 bits (which they shouldn't have). This is because eo
542        * is never negative (albeit if we rely on that need to be careful...)
543        */
544       eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
545                          _mm_and_si128(dcdx_neg_mask, dcdx));
546 
547       /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
548 
549       /*
550        * Pointless transpose which gets undone immediately in
551        * rasterization.
552        * It is actually difficult to do away with it - would essentially
553        * need GET_PLANES_DX, GET_PLANES_DY etc., but the calculations
554        * for this then would need to depend on the number of planes.
555        * The transpose is quite special here due to c being 64bit...
556        * The store has to be unaligned (unless we'd make the plane size
557        * a multiple of 128), and of course storing eo separately...
558        */
559       c01 = _mm_unpacklo_epi64(c02, c13);
560       c23 = _mm_unpackhi_epi64(c02, c13);
561       transpose2_64_2_32(&c01, &c23, &dcdx, &dcdy,
562                          &p0, &p1, &p2, &unused);
563       _mm_storeu_si128((__m128i *)&plane[0], p0);
564       plane[0].eo = (uint32_t)_mm_cvtsi128_si32(eo);
565       _mm_storeu_si128((__m128i *)&plane[1], p1);
566       eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(3,2,0,1));
567       plane[1].eo = (uint32_t)_mm_cvtsi128_si32(eo);
568       _mm_storeu_si128((__m128i *)&plane[2], p2);
569       eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(0,0,0,2));
570       plane[2].eo = (uint32_t)_mm_cvtsi128_si32(eo);
571    } else
572 #elif defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN
573    /*
574     * XXX this code is effectively disabled for all practical purposes,
575     * as the allowed fb size is tiny if FIXED_ORDER is 8.
576     */
577    if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
578        setup->fb.height <= MAX_FIXED_LENGTH32 &&
579        pwr8_limit_check) {
580       unsigned int bottom_edge;
581       __m128i vertx, verty;
582       __m128i shufx, shufy;
583       __m128i dcdx, dcdy, c;
584       __m128i unused;
585       __m128i dcdx_neg_mask;
586       __m128i dcdy_neg_mask;
587       __m128i dcdx_zero_mask;
588       __m128i top_left_flag;
589       __m128i c_inc_mask, c_inc;
590       __m128i eo, p0, p1, p2;
591       __m128i_union vshuf_mask;
592       __m128i zero = vec_splats((unsigned char) 0);
593       alignas(16) int32_t temp_vec[4];
594 
595 #if UTIL_ARCH_LITTLE_ENDIAN
596       vshuf_mask.i[0] = 0x07060504;
597       vshuf_mask.i[1] = 0x0B0A0908;
598       vshuf_mask.i[2] = 0x03020100;
599       vshuf_mask.i[3] = 0x0F0E0D0C;
600 #else
601       vshuf_mask.i[0] = 0x00010203;
602       vshuf_mask.i[1] = 0x0C0D0E0F;
603       vshuf_mask.i[2] = 0x04050607;
604       vshuf_mask.i[3] = 0x08090A0B;
605 #endif
606 
607       /* vertex x coords */
608       vertx = vec_load_si128((const uint32_t *) position->x);
609       /* vertex y coords */
610       verty = vec_load_si128((const uint32_t *) position->y);
611 
612       shufx = vec_perm (vertx, vertx, vshuf_mask.m128i);
613       shufy = vec_perm (verty, verty, vshuf_mask.m128i);
614 
615       dcdx = vec_sub_epi32(verty, shufy);
616       dcdy = vec_sub_epi32(vertx, shufx);
617 
618       dcdx_neg_mask = vec_srai_epi32(dcdx, 31);
619       dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero);
620       dcdy_neg_mask = vec_srai_epi32(dcdy, 31);
621 
622       bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0;
623       top_left_flag = (__m128i) vec_splats(bottom_edge);
624 
625       c_inc_mask = vec_or(dcdx_neg_mask,
626                                 vec_and(dcdx_zero_mask,
627                                               vec_xor(dcdy_neg_mask,
628                                                             top_left_flag)));
629 
630       c_inc = vec_srli_epi32(c_inc_mask, 31);
631 
632       c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx),
633                         vec_mullo_epi32(dcdy, verty));
634 
635       c = vec_add_epi32(c, c_inc);
636 
637       /* Scale up to match c:
638        */
639       dcdx = vec_slli_epi32(dcdx, FIXED_ORDER);
640       dcdy = vec_slli_epi32(dcdy, FIXED_ORDER);
641 
642       /* Calculate trivial reject values:
643        */
644       eo = vec_sub_epi32(vec_andnot_si128(dcdy_neg_mask, dcdy),
645                          vec_and(dcdx_neg_mask, dcdx));
646 
647       /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
648 
649       /* Pointless transpose which gets undone immediately in
650        * rasterization:
651        */
652       transpose4_epi32(&c, &dcdx, &dcdy, &eo,
653                        &p0, &p1, &p2, &unused);
654 
655 #define STORE_PLANE(plane, vec) do {                  \
656          vec_store_si128((uint32_t *)&temp_vec, vec); \
657          plane.c    = (int64_t)temp_vec[0];           \
658          plane.dcdx = temp_vec[1];                    \
659          plane.dcdy = temp_vec[2];                    \
660          plane.eo   = temp_vec[3];                    \
661       } while(0)
662 
663       STORE_PLANE(plane[0], p0);
664       STORE_PLANE(plane[1], p1);
665       STORE_PLANE(plane[2], p2);
666 #undef STORE_PLANE
667    } else
668 #endif
669    {
      /* Scalar path.  It is the else-branch of whichever SIMD condition
       * was compiled in above, and the only path when none was.
       */
670       plane[0].dcdy = position->dx01;
671       plane[1].dcdy = position->x[1] - position->x[2];
672       plane[2].dcdy = position->dx20;
673       plane[0].dcdx = position->dy01;
674       plane[1].dcdx = position->y[1] - position->y[2];
675       plane[2].dcdx = position->dy20;
676 
677       for (int i = 0; i < 3; i++) {
678          /* half-edge constants, will be iterated over the whole render
679           * target.
680           */
681          plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) -
682                       IMUL64(plane[i].dcdy, position->y[i]);
683 
684          /* correct for top-left vs. bottom-left fill convention.
685           */
686          if (plane[i].dcdx < 0) {
687             /* both fill conventions want this - adjust for left edges */
688             plane[i].c++;
689          }
690          else if (plane[i].dcdx == 0) {
691             if (setup->bottom_edge_rule == 0) {
692                /* correct for top-left fill convention:
693                 */
694                if (plane[i].dcdy > 0)
695                   plane[i].c++;
696             } else {
697                /* correct for bottom-left fill convention:
698                 */
699                if (plane[i].dcdy < 0)
700                   plane[i].c++;
701             }
702          }
703 
704          /* Scale up to match c:
705           */
706          assert((plane[i].dcdx << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdx);
707          assert((plane[i].dcdy << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdy);
708          plane[i].dcdx <<= FIXED_ORDER;
709          plane[i].dcdy <<= FIXED_ORDER;
710 
711          /* find trivial reject offsets for each edge for a single-pixel
712           * sized block.  These will be scaled up at each recursive level to
713           * match the active blocksize.  Scaling in this way works best if
714           * the blocks are square.
715           */
716          plane[i].eo = 0;
717          if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
718          if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
719       }
720    }
721 
722    if (0) {
723       debug_printf("p0: %"PRIx64"/%08x/%08x/%08x\n",
724                    plane[0].c,
725                    plane[0].dcdx,
726                    plane[0].dcdy,
727                    plane[0].eo);
728 
729       debug_printf("p1: %"PRIx64"/%08x/%08x/%08x\n",
730                    plane[1].c,
731                    plane[1].dcdx,
732                    plane[1].dcdy,
733                    plane[1].eo);
734 
735       debug_printf("p2: %"PRIx64"/%08x/%08x/%08x\n",
736                    plane[2].c,
737                    plane[2].dcdx,
738                    plane[2].dcdy,
739                    plane[2].eo);
740    }
741 
   /* Scissor planes, if any are needed, follow the three edge planes. */
742    if (nr_planes > 3) {
743       lp_setup_add_scissor_planes(scissor, &plane[3],
744                                   s_planes, setup->multisample);
745    }
746 
   /* Note: bbox has had x0/y0 clamped to 0 above, while use_32bits was
    * derived from the unclamped extent.
    */
747    return lp_setup_bin_triangle(setup, tri, use_32bits,
748                                 check_opaque(setup, v0, v1, v2),
749                                 &bbox, nr_planes, viewport_index);
750 }
751 
/*
 * Round the input down to the nearest power of two that is less than or
 * equal to it.
 *
 * \param n  value to round down
 * \return the largest power of two <= n, or 0 if n is 0 (both the inline
 *         assembly path and the portable path yield 0 for that input)
 */
static inline uint32_t
floor_pot(uint32_t n)
{
#if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   /* bsr leaves its destination undefined for a zero source. */
   if (n == 0)
      return 0;

   /* After this, n holds the index of the highest set bit (0..31). */
   __asm__("bsr %1,%0"
          : "=r" (n)
          : "rm" (n)
          : "cc");

   /* Shift an unsigned one: n may be 31, and shifting a signed int 1 into
    * the sign bit is undefined behaviour.
    */
   return (uint32_t)1 << n;
#else
   /* Smear the highest set bit into every lower position... */
   n |= (n >>  1);
   n |= (n >>  2);
   n |= (n >>  4);
   n |= (n >>  8);
   n |= (n >> 16);
   /* ...then drop everything but that highest bit. */
   return n - (n >> 1);
#endif
}
778 
779 
780 bool
lp_setup_bin_triangle(struct lp_setup_context * setup,struct lp_rast_triangle * tri,bool use_32bits,bool opaque,const struct u_rect * bbox,int nr_planes,unsigned viewport_index)781 lp_setup_bin_triangle(struct lp_setup_context *setup,
782                       struct lp_rast_triangle *tri,
783                       bool use_32bits,
784                       bool opaque,
785                       const struct u_rect *bbox,
786                       int nr_planes,
787                       unsigned viewport_index)
788 {
789    struct lp_scene *scene = setup->scene;
790    unsigned cmd;
791 
792    /* What is the largest power-of-two boundary this triangle crosses:
793     */
794    const int dx = floor_pot((bbox->x0 ^ bbox->x1) |
795                             (bbox->y0 ^ bbox->y1));
796 
797    /* The largest dimension of the rasterized area of the triangle
798     * (aligned to a 4x4 grid), rounded down to the nearest power of two:
799     */
800    const int max_sz = ((bbox->x1 - (bbox->x0 & ~3)) |
801                        (bbox->y1 - (bbox->y0 & ~3)));
802    const int sz = floor_pot(max_sz);
803 
804    /*
805     * NOTE: It is important to use the original bounding box
806     * which might contain negative values here, because if the
807     * plane math may overflow or not with the 32bit rasterization
808     * functions depends on the original extent of the triangle.
809     */
810 
811    /* Now apply scissor, etc to the bounding box.  Could do this
812     * earlier, but it confuses the logic for tri-16 and would force
813     * the rasterizer to also respect scissor, etc, just for the rare
814     * cases where a small triangle extends beyond the scissor.
815     */
816    struct u_rect trimmed_box = *bbox;
817    u_rect_find_intersection(&setup->draw_regions[viewport_index],
818                             &trimmed_box);
819 
820    /* Determine which tile(s) intersect the triangle's bounding box
821     */
822    if (dx < TILE_SIZE) {
823       const int ix0 = bbox->x0 / TILE_SIZE;
824       const int iy0 = bbox->y0 / TILE_SIZE;
825       unsigned px = bbox->x0 & 63 & ~3;
826       unsigned py = bbox->y0 & 63 & ~3;
827 
828       assert(iy0 == bbox->y1 / TILE_SIZE &&
829              ix0 == bbox->x1 / TILE_SIZE);
830 
831       if (nr_planes == 3) {
832          if (sz < 4) {
833             /* Triangle is contained in a single 4x4 stamp:
834              */
835             assert(px + 4 <= TILE_SIZE);
836             assert(py + 4 <= TILE_SIZE);
837             if (setup->multisample)
838                cmd = LP_RAST_OP_MS_TRIANGLE_3_4;
839             else
840                cmd = use_32bits ? LP_RAST_OP_TRIANGLE_32_3_4 : LP_RAST_OP_TRIANGLE_3_4;
841             return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
842                                                setup->fs.stored, cmd,
843                                                lp_rast_arg_triangle_contained(tri, px, py));
844          }
845 
846          if (sz < 16) {
847             /* Triangle is contained in a single 16x16 block:
848              */
849 
850             /*
851              * The 16x16 block is only 4x4 aligned, and can exceed the tile
852              * dimensions if the triangle is 16 pixels in one dimension but 4
853              * in the other. So budge the 16x16 back inside the tile.
854              */
855             px = MIN2(px, TILE_SIZE - 16);
856             py = MIN2(py, TILE_SIZE - 16);
857 
858             assert(px + 16 <= TILE_SIZE);
859             assert(py + 16 <= TILE_SIZE);
860 
861             if (setup->multisample)
862                cmd = LP_RAST_OP_MS_TRIANGLE_3_16;
863             else
864                cmd = use_32bits ? LP_RAST_OP_TRIANGLE_32_3_16 : LP_RAST_OP_TRIANGLE_3_16;
865             return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
866                                                setup->fs.stored, cmd,
867                                                lp_rast_arg_triangle_contained(tri, px, py));
868          }
869       } else if (nr_planes == 4 && sz < 16) {
870          px = MIN2(px, TILE_SIZE - 16);
871          py = MIN2(py, TILE_SIZE - 16);
872 
873          assert(px + 16 <= TILE_SIZE);
874          assert(py + 16 <= TILE_SIZE);
875 
876          if (setup->multisample)
877             cmd = LP_RAST_OP_MS_TRIANGLE_4_16;
878          else
879             cmd = use_32bits ? LP_RAST_OP_TRIANGLE_32_4_16 : LP_RAST_OP_TRIANGLE_4_16;
880          return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
881                                             setup->fs.stored, cmd,
882                                             lp_rast_arg_triangle_contained(tri, px, py));
883       }
884 
885       /* Triangle is contained in a single tile:
886        */
887       if (setup->multisample)
888          cmd = lp_rast_ms_tri_tab[nr_planes];
889       else
890          cmd = use_32bits ? lp_rast_32_tri_tab[nr_planes] : lp_rast_tri_tab[nr_planes];
891       return lp_scene_bin_cmd_with_state(scene, ix0, iy0, setup->fs.stored,
892                                          cmd,
893                                          lp_rast_arg_triangle(tri,
894                                                               (1<<nr_planes)-1));
895    } else {
896       struct lp_rast_plane *plane = GET_PLANES(tri);
897       int64_t c[MAX_PLANES];
898       int64_t ei[MAX_PLANES];
899 
900       int64_t eo[MAX_PLANES];
901       int64_t xstep[MAX_PLANES];
902       int64_t ystep[MAX_PLANES];
903 
904       const int ix0 = trimmed_box.x0 / TILE_SIZE;
905       const int iy0 = trimmed_box.y0 / TILE_SIZE;
906       const int ix1 = trimmed_box.x1 / TILE_SIZE;
907       const int iy1 = trimmed_box.y1 / TILE_SIZE;
908 
909       for (int i = 0; i < nr_planes; i++) {
910          c[i] = (plane[i].c +
911                  IMUL64(plane[i].dcdy, iy0) * TILE_SIZE -
912                  IMUL64(plane[i].dcdx, ix0) * TILE_SIZE);
913 
914          ei[i] = (plane[i].dcdy -
915                   plane[i].dcdx -
916                   (int64_t)plane[i].eo) << TILE_ORDER;
917 
918          eo[i] = (int64_t)plane[i].eo << TILE_ORDER;
919          xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER);
920          ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER;
921       }
922 
923       tri->inputs.is_blit = lp_setup_is_blit(setup, &tri->inputs);
924 
925       /* Test tile-sized blocks against the triangle.
926        * Discard blocks fully outside the tri.  If the block is fully
927        * contained inside the tri, bin an lp_rast_shade_tile command.
928        * Else, bin a lp_rast_triangle command.
929        */
930       for (int y = iy0; y <= iy1; y++) {
931          bool in = false;  /* are we inside the triangle? */
932          int64_t cx[MAX_PLANES];
933 
934          for (int i = 0; i < nr_planes; i++)
935             cx[i] = c[i];
936 
937          for (int x = ix0; x <= ix1; x++) {
938             int out = 0, partial = 0;
939 
940             for (int i = 0; i < nr_planes; i++) {
941                int64_t planeout = cx[i] + eo[i];
942                int64_t planepartial = cx[i] + ei[i] - 1;
943                out |= (int) (planeout >> 63);
944                partial |= ((int) (planepartial >> 63)) & (1<<i);
945             }
946 
947             if (out) {
948                /* do nothing */
949                if (in)
950                   break;  /* exiting triangle, all done with this row */
951                LP_COUNT(nr_empty_64);
952             } else if (partial) {
953                /* Not trivially accepted by at least one plane -
954                 * rasterize/shade partial tile
955                 */
956                int count = util_bitcount(partial);
957                in = true;
958 
959                if (setup->multisample)
960                   cmd = lp_rast_ms_tri_tab[count];
961                else
962                   cmd = use_32bits ? lp_rast_32_tri_tab[count] : lp_rast_tri_tab[count];
963                if (!lp_scene_bin_cmd_with_state(scene, x, y,
964                                                 setup->fs.stored, cmd,
965                                                 lp_rast_arg_triangle(tri, partial)))
966                   goto fail;
967 
968                LP_COUNT(nr_partially_covered_64);
969             } else {
970                /* triangle covers the whole tile- shade whole tile */
971                LP_COUNT(nr_fully_covered_64);
972                in = true;
973                if (!lp_setup_whole_tile(setup, &tri->inputs, x, y, opaque))
974                   goto fail;
975             }
976 
977             /* Iterate cx values across the region: */
978             for (int i = 0; i < nr_planes; i++)
979                cx[i] += xstep[i];
980          }
981 
982          /* Iterate c values down the region: */
983          for (int i = 0; i < nr_planes; i++)
984             c[i] += ystep[i];
985       }
986    }
987 
988    return true;
989 
990 fail:
991    /* Need to disable any partially binned triangle.  This is easier
992     * than trying to locate all the triangle, shade-tile, etc,
993     * commands which may have been binned.
994     */
995    tri->inputs.disable = true;
996    return false;
997 }
998 
999 
1000 /**
1001  * Try to draw the triangle, restart the scene on failure.
1002  */
1003 static inline void
retry_triangle_ccw(struct lp_setup_context * setup,struct fixed_position * position,const float (* v0)[4],const float (* v1)[4],const float (* v2)[4],bool front)1004 retry_triangle_ccw(struct lp_setup_context *setup,
1005                    struct fixed_position *position,
1006                    const float (*v0)[4],
1007                    const float (*v1)[4],
1008                    const float (*v2)[4],
1009                    bool front)
1010 {
1011    if (0)
1012       lp_setup_print_triangle(setup, v0, v1, v2);
1013 
1014    if (lp_setup_zero_sample_mask(setup)) {
1015       if (0) debug_printf("zero sample mask\n");
1016       LP_COUNT(nr_culled_tris);
1017       return;
1018    }
1019 
1020    if (!do_triangle_ccw(setup, position, v0, v1, v2, front)) {
1021       if (!lp_setup_flush_and_restart(setup))
1022          return;
1023 
1024       if (!do_triangle_ccw(setup, position, v0, v1, v2, front))
1025          return;
1026    }
1027 }
1028 
1029 
1030 /**
1031  * Calculate fixed position data for a triangle
1032  * It is unfortunate we need to do that here (as we need area
1033  * calculated in fixed point), as there's quite some code duplication
1034  * to what is done in the jit setup prog.
1035  */
1036 static inline int8_t
calc_fixed_position(struct lp_setup_context * setup,struct fixed_position * position,const float (* v0)[4],const float (* v1)[4],const float (* v2)[4])1037 calc_fixed_position(struct lp_setup_context *setup,
1038                     struct fixed_position* position,
1039                     const float (*v0)[4],
1040                     const float (*v1)[4],
1041                     const float (*v2)[4])
1042 {
1043    float pixel_offset = setup->multisample ? 0.0 : setup->pixel_offset;
1044    /*
1045     * The rounding may not be quite the same with DETECT_ARCH_SSE
1046     * (util_iround right now only does nearest/even on x87,
1047     * otherwise nearest/away-from-zero).
1048     * Both should be acceptable, I think.
1049     */
1050 #if DETECT_ARCH_SSE
1051    __m128 v0r, v1r;
1052    __m128 vxy0xy2, vxy1xy0;
1053    __m128i vxy0xy2i, vxy1xy0i;
1054    __m128i dxdy0120, x0x2y0y2, x1x0y1y0, x0120, y0120;
1055    __m128 pix_offset = _mm_set1_ps(pixel_offset);
1056    __m128 fixed_one = _mm_set1_ps((float)FIXED_ONE);
1057    v0r = _mm_castpd_ps(_mm_load_sd((double *)v0[0]));
1058    vxy0xy2 = _mm_loadh_pi(v0r, (__m64 *)v2[0]);
1059    v1r = _mm_castpd_ps(_mm_load_sd((double *)v1[0]));
1060    vxy1xy0 = _mm_movelh_ps(v1r, vxy0xy2);
1061    vxy0xy2 = _mm_sub_ps(vxy0xy2, pix_offset);
1062    vxy1xy0 = _mm_sub_ps(vxy1xy0, pix_offset);
1063    vxy0xy2 = _mm_mul_ps(vxy0xy2, fixed_one);
1064    vxy1xy0 = _mm_mul_ps(vxy1xy0, fixed_one);
1065    vxy0xy2i = _mm_cvtps_epi32(vxy0xy2);
1066    vxy1xy0i = _mm_cvtps_epi32(vxy1xy0);
1067    dxdy0120 = _mm_sub_epi32(vxy0xy2i, vxy1xy0i);
1068    _mm_store_si128((__m128i *)&position->dx01, dxdy0120);
1069    /*
1070     * For the mul, would need some more shuffles, plus emulation
1071     * for the signed mul (without sse41), so don't bother.
1072     */
1073    x0x2y0y2 = _mm_shuffle_epi32(vxy0xy2i, _MM_SHUFFLE(3,1,2,0));
1074    x1x0y1y0 = _mm_shuffle_epi32(vxy1xy0i, _MM_SHUFFLE(3,1,2,0));
1075    x0120 = _mm_unpacklo_epi32(x0x2y0y2, x1x0y1y0);
1076    y0120 = _mm_unpackhi_epi32(x0x2y0y2, x1x0y1y0);
1077    _mm_store_si128((__m128i *)&position->x[0], x0120);
1078    _mm_store_si128((__m128i *)&position->y[0], y0120);
1079 
1080 #else
1081    position->x[0] = subpixel_snap(v0[0][0] - pixel_offset);
1082    position->x[1] = subpixel_snap(v1[0][0] - pixel_offset);
1083    position->x[2] = subpixel_snap(v2[0][0] - pixel_offset);
1084    position->x[3] = 0; // should be unused
1085 
1086    position->y[0] = subpixel_snap(v0[0][1] - pixel_offset);
1087    position->y[1] = subpixel_snap(v1[0][1] - pixel_offset);
1088    position->y[2] = subpixel_snap(v2[0][1] - pixel_offset);
1089    position->y[3] = 0; // should be unused
1090 
1091    position->dx01 = position->x[0] - position->x[1];
1092    position->dy01 = position->y[0] - position->y[1];
1093 
1094    position->dx20 = position->x[2] - position->x[0];
1095    position->dy20 = position->y[2] - position->y[0];
1096 #endif
1097 
1098    uint64_t area = IMUL64(position->dx01, position->dy20) -
1099       IMUL64(position->dx20, position->dy01);
1100    return area == 0 ? 0 : (area & (1ULL << 63)) ? -1 : 1;
1101 }
1102 
1103 
1104 /**
1105  * Rotate a triangle, flipping its clockwise direction,
1106  * Swaps values for xy[0] and xy[1]
1107  */
1108 static inline void
rotate_fixed_position_01(struct fixed_position * position)1109 rotate_fixed_position_01(struct fixed_position* position)
1110 {
1111    int x = position->x[1];
1112    int y = position->y[1];
1113 
1114    position->x[1] = position->x[0];
1115    position->y[1] = position->y[0];
1116    position->x[0] = x;
1117    position->y[0] = y;
1118 
1119    position->dx01 = -position->dx01;
1120    position->dy01 = -position->dy01;
1121    position->dx20 = position->x[2] - position->x[0];
1122    position->dy20 = position->y[2] - position->y[0];
1123 }
1124 
1125 
1126 /**
1127  * Rotate a triangle, flipping its clockwise direction,
1128  * Swaps values for xy[1] and xy[2]
1129  */
1130 static inline void
rotate_fixed_position_12(struct fixed_position * position)1131 rotate_fixed_position_12(struct fixed_position* position)
1132 {
1133    int x = position->x[2];
1134    int y = position->y[2];
1135 
1136    position->x[2] = position->x[1];
1137    position->y[2] = position->y[1];
1138    position->x[1] = x;
1139    position->y[1] = y;
1140 
1141    x = position->dx01;
1142    y = position->dy01;
1143    position->dx01 = -position->dx20;
1144    position->dy01 = -position->dy20;
1145    position->dx20 = -x;
1146    position->dy20 = -y;
1147 }
1148 
1149 
1150 /**
1151  * Draw triangle if it's CW, cull otherwise.
1152  */
1153 static void
triangle_cw(struct lp_setup_context * setup,const float (* v0)[4],const float (* v1)[4],const float (* v2)[4])1154 triangle_cw(struct lp_setup_context *setup,
1155             const float (*v0)[4],
1156             const float (*v1)[4],
1157             const float (*v2)[4])
1158 {
1159    alignas(16) struct fixed_position position;
1160    struct llvmpipe_context *lp_context = llvmpipe_context(setup->pipe);
1161 
1162    if (lp_context->active_statistics_queries) {
1163       lp_context->pipeline_statistics.c_primitives++;
1164    }
1165 
1166    int8_t area_sign = calc_fixed_position(setup, &position, v0, v1, v2);
1167 
1168    if (area_sign < 0) {
1169       if (setup->flatshade_first) {
1170          rotate_fixed_position_12(&position);
1171          retry_triangle_ccw(setup, &position, v0, v2, v1,
1172                             !setup->ccw_is_frontface);
1173       } else {
1174          rotate_fixed_position_01(&position);
1175          retry_triangle_ccw(setup, &position, v1, v0, v2,
1176                             !setup->ccw_is_frontface);
1177       }
1178    }
1179 }
1180 
1181 
1182 static void
triangle_ccw(struct lp_setup_context * setup,const float (* v0)[4],const float (* v1)[4],const float (* v2)[4])1183 triangle_ccw(struct lp_setup_context *setup,
1184              const float (*v0)[4],
1185              const float (*v1)[4],
1186              const float (*v2)[4])
1187 {
1188    alignas(16) struct fixed_position position;
1189    struct llvmpipe_context *lp_context = llvmpipe_context(setup->pipe);
1190 
1191    if (lp_context->active_statistics_queries) {
1192       lp_context->pipeline_statistics.c_primitives++;
1193    }
1194 
1195    int8_t area_sign = calc_fixed_position(setup, &position, v0, v1, v2);
1196 
1197    if (area_sign > 0)
1198       retry_triangle_ccw(setup, &position, v0, v1, v2, setup->ccw_is_frontface);
1199 }
1200 
1201 
1202 /**
1203  * Draw triangle whether it's CW or CCW.
1204  */
1205 static void
triangle_both(struct lp_setup_context * setup,const float (* v0)[4],const float (* v1)[4],const float (* v2)[4])1206 triangle_both(struct lp_setup_context *setup,
1207               const float (*v0)[4],
1208               const float (*v1)[4],
1209               const float (*v2)[4])
1210 {
1211    alignas(16) struct fixed_position position;
1212    struct llvmpipe_context *lp_context = llvmpipe_context(setup->pipe);
1213 
1214    if (lp_context->active_statistics_queries) {
1215       lp_context->pipeline_statistics.c_primitives++;
1216    }
1217 
1218    int8_t area_sign = calc_fixed_position(setup, &position, v0, v1, v2);
1219 
1220    if (0) {
1221       assert(!util_is_inf_or_nan(v0[0][0]));
1222       assert(!util_is_inf_or_nan(v0[0][1]));
1223       assert(!util_is_inf_or_nan(v1[0][0]));
1224       assert(!util_is_inf_or_nan(v1[0][1]));
1225       assert(!util_is_inf_or_nan(v2[0][0]));
1226       assert(!util_is_inf_or_nan(v2[0][1]));
1227    }
1228 
1229    if (area_sign > 0) {
1230       retry_triangle_ccw(setup, &position, v0, v1, v2,
1231                          setup->ccw_is_frontface);
1232    } else if (area_sign < 0) {
1233       if (setup->flatshade_first) {
1234          rotate_fixed_position_12(&position);
1235          retry_triangle_ccw(setup, &position, v0, v2, v1,
1236                             !setup->ccw_is_frontface);
1237       } else {
1238          rotate_fixed_position_01(&position);
1239          retry_triangle_ccw(setup, &position, v1, v0, v2,
1240                             !setup->ccw_is_frontface);
1241       }
1242    }
1243 }
1244 
1245 
/**
 * Triangle entry point that discards every triangle.
 *
 * All arguments are ignored and nothing is binned or counted.
 */
static void
triangle_noop(struct lp_setup_context *setup,
              const float (*v0)[4],
              const float (*v1)[4],
              const float (*v2)[4])
{
   /* Intentionally empty; the casts only mark the parameters as unused. */
   (void) setup;
   (void) v0;
   (void) v1;
   (void) v2;
}
1253 
1254 
1255 void
lp_setup_choose_triangle(struct lp_setup_context * setup)1256 lp_setup_choose_triangle(struct lp_setup_context *setup)
1257 {
1258    if (setup->rasterizer_discard) {
1259       setup->triangle = triangle_noop;
1260       return;
1261    }
1262    switch (setup->cullmode) {
1263    case PIPE_FACE_NONE:
1264       setup->triangle = triangle_both;
1265       break;
1266    case PIPE_FACE_BACK:
1267       setup->triangle = setup->ccw_is_frontface ? triangle_ccw : triangle_cw;
1268       break;
1269    case PIPE_FACE_FRONT:
1270       setup->triangle = setup->ccw_is_frontface ? triangle_cw : triangle_ccw;
1271       break;
1272    default:
1273       setup->triangle = triangle_noop;
1274       break;
1275    }
1276 }
1277