xref: /aosp_15_r20/external/mesa3d/src/gallium/auxiliary/tgsi/tgsi_exec.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /**************************************************************************
2  *
3  * Copyright 2007-2008 VMware, Inc.
4  * All Rights Reserved.
5  * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the
9  * "Software"), to deal in the Software without restriction, including
10  * without limitation the rights to use, copy, modify, merge, publish,
11  * distribute, sub license, and/or sell copies of the Software, and to
12  * permit persons to whom the Software is furnished to do so, subject to
13  * the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the
16  * next paragraph) shall be included in all copies or substantial portions
17  * of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26  *
27  **************************************************************************/
28 
/**
 * TGSI interpreter/executor.
 *
 * Flow control information:
 *
 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel),
 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
 * care since a condition may be true for some quad components but false
 * for other components.
 *
 * We basically execute all statements (even if they're in the part of
 * an IF/ELSE clause that's "not taken") and use a special mask to
 * control writing to destination registers.  This is the ExecMask.
 * See store_dest().
 *
 * The ExecMask is computed from three other masks (CondMask, LoopMask and
 * ContMask) which are controlled by the flow control instructions (namely:
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
 *
 *
 * Authors:
 *   Michal Krol
 *   Brian Paul
 */
53 
54 #include "util/compiler.h"
55 #include "pipe/p_state.h"
56 #include "pipe/p_shader_tokens.h"
57 #include "tgsi/tgsi_dump.h"
58 #include "tgsi/tgsi_parse.h"
59 #include "tgsi/tgsi_util.h"
60 #include "tgsi_exec.h"
61 #include "util/compiler.h"
62 #include "util/half_float.h"
63 #include "util/u_memory.h"
64 #include "util/u_math.h"
65 #include "util/rounding.h"
66 
67 
/*
 * Compile-time debug switch, currently 0.
 * NOTE(review): presumably gates execution tracing elsewhere in this
 * file -- confirm against its users.
 */
#define DEBUG_EXECUTION 0


/*
 * Channel indices naming the four entries of a quad by position.  The
 * derivative helpers (micro_ddx(), micro_ddy() and their _fine variants)
 * use them to index the float lanes of a tgsi_exec_channel.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
75 
76 static_assert(alignof(union tgsi_exec_channel) == 16, "");
77 static_assert(alignof(struct tgsi_exec_vector) == 16, "");
78 static_assert(alignof(struct tgsi_exec_machine) == 16, "");
79 
80 union tgsi_double_channel {
81    alignas(16)
82    double d[TGSI_QUAD_SIZE];
83    unsigned u[TGSI_QUAD_SIZE][2];
84    uint64_t u64[TGSI_QUAD_SIZE];
85    int64_t i64[TGSI_QUAD_SIZE];
86 };
87 
88 struct tgsi_double_vector {
89    alignas(16)
90    union tgsi_double_channel xy;
91    union tgsi_double_channel zw;
92 };
93 
94 static_assert(alignof(union tgsi_double_channel) == 16, "");
95 static_assert(alignof(struct tgsi_double_vector) == 16, "");
96 
97 static void
micro_abs(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)98 micro_abs(union tgsi_exec_channel *dst,
99           const union tgsi_exec_channel *src)
100 {
101    dst->f[0] = fabsf(src->f[0]);
102    dst->f[1] = fabsf(src->f[1]);
103    dst->f[2] = fabsf(src->f[2]);
104    dst->f[3] = fabsf(src->f[3]);
105 }
106 
107 static void
micro_arl(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)108 micro_arl(union tgsi_exec_channel *dst,
109           const union tgsi_exec_channel *src)
110 {
111    dst->i[0] = (int)floorf(src->f[0]);
112    dst->i[1] = (int)floorf(src->f[1]);
113    dst->i[2] = (int)floorf(src->f[2]);
114    dst->i[3] = (int)floorf(src->f[3]);
115 }
116 
117 static void
micro_arr(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)118 micro_arr(union tgsi_exec_channel *dst,
119           const union tgsi_exec_channel *src)
120 {
121    dst->i[0] = (int)floorf(src->f[0] + 0.5f);
122    dst->i[1] = (int)floorf(src->f[1] + 0.5f);
123    dst->i[2] = (int)floorf(src->f[2] + 0.5f);
124    dst->i[3] = (int)floorf(src->f[3] + 0.5f);
125 }
126 
127 static void
micro_ceil(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)128 micro_ceil(union tgsi_exec_channel *dst,
129            const union tgsi_exec_channel *src)
130 {
131    dst->f[0] = ceilf(src->f[0]);
132    dst->f[1] = ceilf(src->f[1]);
133    dst->f[2] = ceilf(src->f[2]);
134    dst->f[3] = ceilf(src->f[3]);
135 }
136 
137 static void
micro_cmp(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1,const union tgsi_exec_channel * src2)138 micro_cmp(union tgsi_exec_channel *dst,
139           const union tgsi_exec_channel *src0,
140           const union tgsi_exec_channel *src1,
141           const union tgsi_exec_channel *src2)
142 {
143    dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
144    dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
145    dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
146    dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
147 }
148 
149 static void
micro_cos(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)150 micro_cos(union tgsi_exec_channel *dst,
151           const union tgsi_exec_channel *src)
152 {
153    dst->f[0] = cosf(src->f[0]);
154    dst->f[1] = cosf(src->f[1]);
155    dst->f[2] = cosf(src->f[2]);
156    dst->f[3] = cosf(src->f[3]);
157 }
158 
159 static void
micro_d2f(union tgsi_exec_channel * dst,const union tgsi_double_channel * src)160 micro_d2f(union tgsi_exec_channel *dst,
161           const union tgsi_double_channel *src)
162 {
163    dst->f[0] = (float)src->d[0];
164    dst->f[1] = (float)src->d[1];
165    dst->f[2] = (float)src->d[2];
166    dst->f[3] = (float)src->d[3];
167 }
168 
169 static void
micro_d2i(union tgsi_exec_channel * dst,const union tgsi_double_channel * src)170 micro_d2i(union tgsi_exec_channel *dst,
171           const union tgsi_double_channel *src)
172 {
173    dst->i[0] = (int)src->d[0];
174    dst->i[1] = (int)src->d[1];
175    dst->i[2] = (int)src->d[2];
176    dst->i[3] = (int)src->d[3];
177 }
178 
179 static void
micro_d2u(union tgsi_exec_channel * dst,const union tgsi_double_channel * src)180 micro_d2u(union tgsi_exec_channel *dst,
181           const union tgsi_double_channel *src)
182 {
183    dst->u[0] = (unsigned)src->d[0];
184    dst->u[1] = (unsigned)src->d[1];
185    dst->u[2] = (unsigned)src->d[2];
186    dst->u[3] = (unsigned)src->d[3];
187 }
188 static void
micro_dabs(union tgsi_double_channel * dst,const union tgsi_double_channel * src)189 micro_dabs(union tgsi_double_channel *dst,
190            const union tgsi_double_channel *src)
191 {
192    dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
193    dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
194    dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
195    dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
196 }
197 
198 static void
micro_dadd(union tgsi_double_channel * dst,const union tgsi_double_channel * src)199 micro_dadd(union tgsi_double_channel *dst,
200           const union tgsi_double_channel *src)
201 {
202    dst->d[0] = src[0].d[0] + src[1].d[0];
203    dst->d[1] = src[0].d[1] + src[1].d[1];
204    dst->d[2] = src[0].d[2] + src[1].d[2];
205    dst->d[3] = src[0].d[3] + src[1].d[3];
206 }
207 
208 static void
micro_ddiv(union tgsi_double_channel * dst,const union tgsi_double_channel * src)209 micro_ddiv(union tgsi_double_channel *dst,
210           const union tgsi_double_channel *src)
211 {
212    dst->d[0] = src[0].d[0] / src[1].d[0];
213    dst->d[1] = src[0].d[1] / src[1].d[1];
214    dst->d[2] = src[0].d[2] / src[1].d[2];
215    dst->d[3] = src[0].d[3] / src[1].d[3];
216 }
217 
218 static void
micro_ddx(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)219 micro_ddx(union tgsi_exec_channel *dst,
220           const union tgsi_exec_channel *src)
221 {
222    dst->f[0] =
223    dst->f[1] =
224    dst->f[2] =
225    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
226 }
227 
228 static void
micro_ddx_fine(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)229 micro_ddx_fine(union tgsi_exec_channel *dst,
230           const union tgsi_exec_channel *src)
231 {
232    dst->f[0] =
233    dst->f[1] = src->f[TILE_TOP_RIGHT] - src->f[TILE_TOP_LEFT];
234    dst->f[2] =
235    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
236 }
237 
238 
239 static void
micro_ddy(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)240 micro_ddy(union tgsi_exec_channel *dst,
241           const union tgsi_exec_channel *src)
242 {
243    dst->f[0] =
244    dst->f[1] =
245    dst->f[2] =
246    dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
247 }
248 
249 static void
micro_ddy_fine(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)250 micro_ddy_fine(union tgsi_exec_channel *dst,
251           const union tgsi_exec_channel *src)
252 {
253    dst->f[0] =
254    dst->f[2] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
255    dst->f[1] =
256    dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_TOP_RIGHT];
257 }
258 
259 static void
micro_dmul(union tgsi_double_channel * dst,const union tgsi_double_channel * src)260 micro_dmul(union tgsi_double_channel *dst,
261            const union tgsi_double_channel *src)
262 {
263    dst->d[0] = src[0].d[0] * src[1].d[0];
264    dst->d[1] = src[0].d[1] * src[1].d[1];
265    dst->d[2] = src[0].d[2] * src[1].d[2];
266    dst->d[3] = src[0].d[3] * src[1].d[3];
267 }
268 
269 static void
micro_dmax(union tgsi_double_channel * dst,const union tgsi_double_channel * src)270 micro_dmax(union tgsi_double_channel *dst,
271            const union tgsi_double_channel *src)
272 {
273    dst->d[0] = fmax(src[0].d[0], src[1].d[0]);
274    dst->d[1] = fmax(src[0].d[1], src[1].d[1]);
275    dst->d[2] = fmax(src[0].d[2], src[1].d[2]);
276    dst->d[3] = fmax(src[0].d[3], src[1].d[3]);
277 }
278 
279 static void
micro_dmin(union tgsi_double_channel * dst,const union tgsi_double_channel * src)280 micro_dmin(union tgsi_double_channel *dst,
281            const union tgsi_double_channel *src)
282 {
283    dst->d[0] = fmin(src[0].d[0], src[1].d[0]);
284    dst->d[1] = fmin(src[0].d[1], src[1].d[1]);
285    dst->d[2] = fmin(src[0].d[2], src[1].d[2]);
286    dst->d[3] = fmin(src[0].d[3], src[1].d[3]);
287 }
288 
289 static void
micro_dneg(union tgsi_double_channel * dst,const union tgsi_double_channel * src)290 micro_dneg(union tgsi_double_channel *dst,
291            const union tgsi_double_channel *src)
292 {
293    dst->d[0] = -src->d[0];
294    dst->d[1] = -src->d[1];
295    dst->d[2] = -src->d[2];
296    dst->d[3] = -src->d[3];
297 }
298 
299 static void
micro_dslt(union tgsi_double_channel * dst,const union tgsi_double_channel * src)300 micro_dslt(union tgsi_double_channel *dst,
301            const union tgsi_double_channel *src)
302 {
303    dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
304    dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
305    dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
306    dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
307 }
308 
309 static void
micro_dsne(union tgsi_double_channel * dst,const union tgsi_double_channel * src)310 micro_dsne(union tgsi_double_channel *dst,
311            const union tgsi_double_channel *src)
312 {
313    dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
314    dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
315    dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
316    dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
317 }
318 
319 static void
micro_dsge(union tgsi_double_channel * dst,const union tgsi_double_channel * src)320 micro_dsge(union tgsi_double_channel *dst,
321            const union tgsi_double_channel *src)
322 {
323    dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
324    dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
325    dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
326    dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
327 }
328 
329 static void
micro_dseq(union tgsi_double_channel * dst,const union tgsi_double_channel * src)330 micro_dseq(union tgsi_double_channel *dst,
331            const union tgsi_double_channel *src)
332 {
333    dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
334    dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
335    dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
336    dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
337 }
338 
339 static void
micro_drcp(union tgsi_double_channel * dst,const union tgsi_double_channel * src)340 micro_drcp(union tgsi_double_channel *dst,
341            const union tgsi_double_channel *src)
342 {
343    dst->d[0] = 1.0 / src->d[0];
344    dst->d[1] = 1.0 / src->d[1];
345    dst->d[2] = 1.0 / src->d[2];
346    dst->d[3] = 1.0 / src->d[3];
347 }
348 
349 static void
micro_dsqrt(union tgsi_double_channel * dst,const union tgsi_double_channel * src)350 micro_dsqrt(union tgsi_double_channel *dst,
351             const union tgsi_double_channel *src)
352 {
353    dst->d[0] = sqrt(src->d[0]);
354    dst->d[1] = sqrt(src->d[1]);
355    dst->d[2] = sqrt(src->d[2]);
356    dst->d[3] = sqrt(src->d[3]);
357 }
358 
359 static void
micro_drsq(union tgsi_double_channel * dst,const union tgsi_double_channel * src)360 micro_drsq(union tgsi_double_channel *dst,
361           const union tgsi_double_channel *src)
362 {
363    dst->d[0] = 1.0 / sqrt(src->d[0]);
364    dst->d[1] = 1.0 / sqrt(src->d[1]);
365    dst->d[2] = 1.0 / sqrt(src->d[2]);
366    dst->d[3] = 1.0 / sqrt(src->d[3]);
367 }
368 
369 static void
micro_dmad(union tgsi_double_channel * dst,const union tgsi_double_channel * src)370 micro_dmad(union tgsi_double_channel *dst,
371            const union tgsi_double_channel *src)
372 {
373    dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
374    dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
375    dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
376    dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
377 }
378 
379 static void
micro_dfrac(union tgsi_double_channel * dst,const union tgsi_double_channel * src)380 micro_dfrac(union tgsi_double_channel *dst,
381             const union tgsi_double_channel *src)
382 {
383    dst->d[0] = src->d[0] - floor(src->d[0]);
384    dst->d[1] = src->d[1] - floor(src->d[1]);
385    dst->d[2] = src->d[2] - floor(src->d[2]);
386    dst->d[3] = src->d[3] - floor(src->d[3]);
387 }
388 
389 static void
micro_dflr(union tgsi_double_channel * dst,const union tgsi_double_channel * src)390 micro_dflr(union tgsi_double_channel *dst,
391            const union tgsi_double_channel *src)
392 {
393    dst->d[0] = floor(src->d[0]);
394    dst->d[1] = floor(src->d[1]);
395    dst->d[2] = floor(src->d[2]);
396    dst->d[3] = floor(src->d[3]);
397 }
398 
399 static void
micro_dldexp(union tgsi_double_channel * dst,const union tgsi_double_channel * src0,union tgsi_exec_channel * src1)400 micro_dldexp(union tgsi_double_channel *dst,
401              const union tgsi_double_channel *src0,
402              union tgsi_exec_channel *src1)
403 {
404    dst->d[0] = ldexp(src0->d[0], src1->i[0]);
405    dst->d[1] = ldexp(src0->d[1], src1->i[1]);
406    dst->d[2] = ldexp(src0->d[2], src1->i[2]);
407    dst->d[3] = ldexp(src0->d[3], src1->i[3]);
408 }
409 
410 static void
micro_exp2(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)411 micro_exp2(union tgsi_exec_channel *dst,
412            const union tgsi_exec_channel *src)
413 {
414 #if MESA_DEBUG
415    /* Inf is okay for this instruction, so clamp it to silence assertions. */
416    unsigned i;
417    union tgsi_exec_channel clamped;
418 
419    for (i = 0; i < 4; i++) {
420       if (src->f[i] > 127.99999f) {
421          clamped.f[i] = 127.99999f;
422       } else if (src->f[i] < -126.99999f) {
423          clamped.f[i] = -126.99999f;
424       } else {
425          clamped.f[i] = src->f[i];
426       }
427    }
428    src = &clamped;
429 #endif /* MESA_DEBUG */
430 
431    dst->f[0] = powf(2.0f, src->f[0]);
432    dst->f[1] = powf(2.0f, src->f[1]);
433    dst->f[2] = powf(2.0f, src->f[2]);
434    dst->f[3] = powf(2.0f, src->f[3]);
435 }
436 
437 static void
micro_f2d(union tgsi_double_channel * dst,const union tgsi_exec_channel * src)438 micro_f2d(union tgsi_double_channel *dst,
439           const union tgsi_exec_channel *src)
440 {
441    dst->d[0] = (double)src->f[0];
442    dst->d[1] = (double)src->f[1];
443    dst->d[2] = (double)src->f[2];
444    dst->d[3] = (double)src->f[3];
445 }
446 
447 static void
micro_flr(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)448 micro_flr(union tgsi_exec_channel *dst,
449           const union tgsi_exec_channel *src)
450 {
451    dst->f[0] = floorf(src->f[0]);
452    dst->f[1] = floorf(src->f[1]);
453    dst->f[2] = floorf(src->f[2]);
454    dst->f[3] = floorf(src->f[3]);
455 }
456 
457 static void
micro_frc(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)458 micro_frc(union tgsi_exec_channel *dst,
459           const union tgsi_exec_channel *src)
460 {
461    dst->f[0] = src->f[0] - floorf(src->f[0]);
462    dst->f[1] = src->f[1] - floorf(src->f[1]);
463    dst->f[2] = src->f[2] - floorf(src->f[2]);
464    dst->f[3] = src->f[3] - floorf(src->f[3]);
465 }
466 
467 static void
micro_i2d(union tgsi_double_channel * dst,const union tgsi_exec_channel * src)468 micro_i2d(union tgsi_double_channel *dst,
469           const union tgsi_exec_channel *src)
470 {
471    dst->d[0] = (double)src->i[0];
472    dst->d[1] = (double)src->i[1];
473    dst->d[2] = (double)src->i[2];
474    dst->d[3] = (double)src->i[3];
475 }
476 
477 static void
micro_iabs(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)478 micro_iabs(union tgsi_exec_channel *dst,
479            const union tgsi_exec_channel *src)
480 {
481    dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
482    dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
483    dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
484    dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
485 }
486 
487 static void
micro_ineg(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)488 micro_ineg(union tgsi_exec_channel *dst,
489            const union tgsi_exec_channel *src)
490 {
491    dst->i[0] = -src->i[0];
492    dst->i[1] = -src->i[1];
493    dst->i[2] = -src->i[2];
494    dst->i[3] = -src->i[3];
495 }
496 
497 static void
micro_lg2(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)498 micro_lg2(union tgsi_exec_channel *dst,
499           const union tgsi_exec_channel *src)
500 {
501    dst->f[0] = logf(src->f[0]) * 1.442695f;
502    dst->f[1] = logf(src->f[1]) * 1.442695f;
503    dst->f[2] = logf(src->f[2]) * 1.442695f;
504    dst->f[3] = logf(src->f[3]) * 1.442695f;
505 }
506 
507 static void
micro_lrp(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1,const union tgsi_exec_channel * src2)508 micro_lrp(union tgsi_exec_channel *dst,
509           const union tgsi_exec_channel *src0,
510           const union tgsi_exec_channel *src1,
511           const union tgsi_exec_channel *src2)
512 {
513    dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
514    dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
515    dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
516    dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
517 }
518 
519 static void
micro_mad(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1,const union tgsi_exec_channel * src2)520 micro_mad(union tgsi_exec_channel *dst,
521           const union tgsi_exec_channel *src0,
522           const union tgsi_exec_channel *src1,
523           const union tgsi_exec_channel *src2)
524 {
525    dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
526    dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
527    dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
528    dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
529 }
530 
531 static void
micro_mov(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)532 micro_mov(union tgsi_exec_channel *dst,
533           const union tgsi_exec_channel *src)
534 {
535    dst->u[0] = src->u[0];
536    dst->u[1] = src->u[1];
537    dst->u[2] = src->u[2];
538    dst->u[3] = src->u[3];
539 }
540 
541 static void
micro_rcp(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)542 micro_rcp(union tgsi_exec_channel *dst,
543           const union tgsi_exec_channel *src)
544 {
545 #if 0 /* for debugging */
546    assert(src->f[0] != 0.0f);
547    assert(src->f[1] != 0.0f);
548    assert(src->f[2] != 0.0f);
549    assert(src->f[3] != 0.0f);
550 #endif
551    dst->f[0] = 1.0f / src->f[0];
552    dst->f[1] = 1.0f / src->f[1];
553    dst->f[2] = 1.0f / src->f[2];
554    dst->f[3] = 1.0f / src->f[3];
555 }
556 
557 static void
micro_rnd(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)558 micro_rnd(union tgsi_exec_channel *dst,
559           const union tgsi_exec_channel *src)
560 {
561    dst->f[0] = _mesa_roundevenf(src->f[0]);
562    dst->f[1] = _mesa_roundevenf(src->f[1]);
563    dst->f[2] = _mesa_roundevenf(src->f[2]);
564    dst->f[3] = _mesa_roundevenf(src->f[3]);
565 }
566 
567 static void
micro_rsq(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)568 micro_rsq(union tgsi_exec_channel *dst,
569           const union tgsi_exec_channel *src)
570 {
571 #if 0 /* for debugging */
572    assert(src->f[0] != 0.0f);
573    assert(src->f[1] != 0.0f);
574    assert(src->f[2] != 0.0f);
575    assert(src->f[3] != 0.0f);
576 #endif
577    dst->f[0] = 1.0f / sqrtf(src->f[0]);
578    dst->f[1] = 1.0f / sqrtf(src->f[1]);
579    dst->f[2] = 1.0f / sqrtf(src->f[2]);
580    dst->f[3] = 1.0f / sqrtf(src->f[3]);
581 }
582 
583 static void
micro_sqrt(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)584 micro_sqrt(union tgsi_exec_channel *dst,
585            const union tgsi_exec_channel *src)
586 {
587    dst->f[0] = sqrtf(src->f[0]);
588    dst->f[1] = sqrtf(src->f[1]);
589    dst->f[2] = sqrtf(src->f[2]);
590    dst->f[3] = sqrtf(src->f[3]);
591 }
592 
593 static void
micro_seq(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)594 micro_seq(union tgsi_exec_channel *dst,
595           const union tgsi_exec_channel *src0,
596           const union tgsi_exec_channel *src1)
597 {
598    dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
599    dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
600    dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
601    dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
602 }
603 
604 static void
micro_sge(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)605 micro_sge(union tgsi_exec_channel *dst,
606           const union tgsi_exec_channel *src0,
607           const union tgsi_exec_channel *src1)
608 {
609    dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
610    dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
611    dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
612    dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
613 }
614 
615 static void
micro_sgn(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)616 micro_sgn(union tgsi_exec_channel *dst,
617           const union tgsi_exec_channel *src)
618 {
619    dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
620    dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
621    dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
622    dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
623 }
624 
625 static void
micro_isgn(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)626 micro_isgn(union tgsi_exec_channel *dst,
627           const union tgsi_exec_channel *src)
628 {
629    dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
630    dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
631    dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
632    dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
633 }
634 
635 static void
micro_sgt(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)636 micro_sgt(union tgsi_exec_channel *dst,
637           const union tgsi_exec_channel *src0,
638           const union tgsi_exec_channel *src1)
639 {
640    dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
641    dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
642    dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
643    dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
644 }
645 
646 static void
micro_sin(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)647 micro_sin(union tgsi_exec_channel *dst,
648           const union tgsi_exec_channel *src)
649 {
650    dst->f[0] = sinf(src->f[0]);
651    dst->f[1] = sinf(src->f[1]);
652    dst->f[2] = sinf(src->f[2]);
653    dst->f[3] = sinf(src->f[3]);
654 }
655 
656 static void
micro_sle(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)657 micro_sle(union tgsi_exec_channel *dst,
658           const union tgsi_exec_channel *src0,
659           const union tgsi_exec_channel *src1)
660 {
661    dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
662    dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
663    dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
664    dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
665 }
666 
667 static void
micro_slt(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)668 micro_slt(union tgsi_exec_channel *dst,
669           const union tgsi_exec_channel *src0,
670           const union tgsi_exec_channel *src1)
671 {
672    dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
673    dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
674    dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
675    dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
676 }
677 
678 static void
micro_sne(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)679 micro_sne(union tgsi_exec_channel *dst,
680           const union tgsi_exec_channel *src0,
681           const union tgsi_exec_channel *src1)
682 {
683    dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
684    dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
685    dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
686    dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
687 }
688 
689 static void
micro_trunc(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)690 micro_trunc(union tgsi_exec_channel *dst,
691             const union tgsi_exec_channel *src)
692 {
693    dst->f[0] = truncf(src->f[0]);
694    dst->f[1] = truncf(src->f[1]);
695    dst->f[2] = truncf(src->f[2]);
696    dst->f[3] = truncf(src->f[3]);
697 }
698 
699 static void
micro_u2d(union tgsi_double_channel * dst,const union tgsi_exec_channel * src)700 micro_u2d(union tgsi_double_channel *dst,
701           const union tgsi_exec_channel *src)
702 {
703    dst->d[0] = (double)src->u[0];
704    dst->d[1] = (double)src->u[1];
705    dst->d[2] = (double)src->u[2];
706    dst->d[3] = (double)src->u[3];
707 }
708 
709 static void
micro_i64abs(union tgsi_double_channel * dst,const union tgsi_double_channel * src)710 micro_i64abs(union tgsi_double_channel *dst,
711              const union tgsi_double_channel *src)
712 {
713    dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
714    dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
715    dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
716    dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
717 }
718 
719 static void
micro_i64sgn(union tgsi_double_channel * dst,const union tgsi_double_channel * src)720 micro_i64sgn(union tgsi_double_channel *dst,
721              const union tgsi_double_channel *src)
722 {
723    dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
724    dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
725    dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
726    dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
727 }
728 
729 static void
micro_i64neg(union tgsi_double_channel * dst,const union tgsi_double_channel * src)730 micro_i64neg(union tgsi_double_channel *dst,
731              const union tgsi_double_channel *src)
732 {
733    dst->i64[0] = -src->i64[0];
734    dst->i64[1] = -src->i64[1];
735    dst->i64[2] = -src->i64[2];
736    dst->i64[3] = -src->i64[3];
737 }
738 
739 static void
micro_u64seq(union tgsi_double_channel * dst,const union tgsi_double_channel * src)740 micro_u64seq(union tgsi_double_channel *dst,
741            const union tgsi_double_channel *src)
742 {
743    dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
744    dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
745    dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
746    dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
747 }
748 
749 static void
micro_u64sne(union tgsi_double_channel * dst,const union tgsi_double_channel * src)750 micro_u64sne(union tgsi_double_channel *dst,
751              const union tgsi_double_channel *src)
752 {
753    dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
754    dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
755    dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
756    dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
757 }
758 
759 static void
micro_i64slt(union tgsi_double_channel * dst,const union tgsi_double_channel * src)760 micro_i64slt(union tgsi_double_channel *dst,
761              const union tgsi_double_channel *src)
762 {
763    dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
764    dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
765    dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
766    dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
767 }
768 
769 static void
micro_u64slt(union tgsi_double_channel * dst,const union tgsi_double_channel * src)770 micro_u64slt(union tgsi_double_channel *dst,
771              const union tgsi_double_channel *src)
772 {
773    dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
774    dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
775    dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
776    dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
777 }
778 
779 static void
micro_i64sge(union tgsi_double_channel * dst,const union tgsi_double_channel * src)780 micro_i64sge(union tgsi_double_channel *dst,
781            const union tgsi_double_channel *src)
782 {
783    dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
784    dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
785    dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
786    dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
787 }
788 
789 static void
micro_u64sge(union tgsi_double_channel * dst,const union tgsi_double_channel * src)790 micro_u64sge(union tgsi_double_channel *dst,
791              const union tgsi_double_channel *src)
792 {
793    dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
794    dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
795    dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
796    dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
797 }
798 
799 static void
micro_u64max(union tgsi_double_channel * dst,const union tgsi_double_channel * src)800 micro_u64max(union tgsi_double_channel *dst,
801              const union tgsi_double_channel *src)
802 {
803    dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
804    dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
805    dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
806    dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
807 }
808 
809 static void
micro_i64max(union tgsi_double_channel * dst,const union tgsi_double_channel * src)810 micro_i64max(union tgsi_double_channel *dst,
811              const union tgsi_double_channel *src)
812 {
813    dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
814    dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
815    dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
816    dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
817 }
818 
819 static void
micro_u64min(union tgsi_double_channel * dst,const union tgsi_double_channel * src)820 micro_u64min(union tgsi_double_channel *dst,
821              const union tgsi_double_channel *src)
822 {
823    dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
824    dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
825    dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
826    dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
827 }
828 
829 static void
micro_i64min(union tgsi_double_channel * dst,const union tgsi_double_channel * src)830 micro_i64min(union tgsi_double_channel *dst,
831              const union tgsi_double_channel *src)
832 {
833    dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
834    dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
835    dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
836    dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
837 }
838 
839 static void
micro_u64add(union tgsi_double_channel * dst,const union tgsi_double_channel * src)840 micro_u64add(union tgsi_double_channel *dst,
841              const union tgsi_double_channel *src)
842 {
843    dst->u64[0] = src[0].u64[0] + src[1].u64[0];
844    dst->u64[1] = src[0].u64[1] + src[1].u64[1];
845    dst->u64[2] = src[0].u64[2] + src[1].u64[2];
846    dst->u64[3] = src[0].u64[3] + src[1].u64[3];
847 }
848 
849 static void
micro_u64mul(union tgsi_double_channel * dst,const union tgsi_double_channel * src)850 micro_u64mul(union tgsi_double_channel *dst,
851              const union tgsi_double_channel *src)
852 {
853    dst->u64[0] = src[0].u64[0] * src[1].u64[0];
854    dst->u64[1] = src[0].u64[1] * src[1].u64[1];
855    dst->u64[2] = src[0].u64[2] * src[1].u64[2];
856    dst->u64[3] = src[0].u64[3] * src[1].u64[3];
857 }
858 
859 static void
micro_u64div(union tgsi_double_channel * dst,const union tgsi_double_channel * src)860 micro_u64div(union tgsi_double_channel *dst,
861              const union tgsi_double_channel *src)
862 {
863    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] / src[1].u64[0] : ~0ull;
864    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] / src[1].u64[1] : ~0ull;
865    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] / src[1].u64[2] : ~0ull;
866    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] / src[1].u64[3] : ~0ull;
867 }
868 
869 static void
micro_i64div(union tgsi_double_channel * dst,const union tgsi_double_channel * src)870 micro_i64div(union tgsi_double_channel *dst,
871              const union tgsi_double_channel *src)
872 {
873    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] / src[1].i64[0] : 0;
874    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] / src[1].i64[1] : 0;
875    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] / src[1].i64[2] : 0;
876    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] / src[1].i64[3] : 0;
877 }
878 
879 static void
micro_u64mod(union tgsi_double_channel * dst,const union tgsi_double_channel * src)880 micro_u64mod(union tgsi_double_channel *dst,
881              const union tgsi_double_channel *src)
882 {
883    dst->u64[0] = src[1].u64[0] ? src[0].u64[0] % src[1].u64[0] : ~0ull;
884    dst->u64[1] = src[1].u64[1] ? src[0].u64[1] % src[1].u64[1] : ~0ull;
885    dst->u64[2] = src[1].u64[2] ? src[0].u64[2] % src[1].u64[2] : ~0ull;
886    dst->u64[3] = src[1].u64[3] ? src[0].u64[3] % src[1].u64[3] : ~0ull;
887 }
888 
889 static void
micro_i64mod(union tgsi_double_channel * dst,const union tgsi_double_channel * src)890 micro_i64mod(union tgsi_double_channel *dst,
891              const union tgsi_double_channel *src)
892 {
893    dst->i64[0] = src[1].i64[0] ? src[0].i64[0] % src[1].i64[0] : ~0ll;
894    dst->i64[1] = src[1].i64[1] ? src[0].i64[1] % src[1].i64[1] : ~0ll;
895    dst->i64[2] = src[1].i64[2] ? src[0].i64[2] % src[1].i64[2] : ~0ll;
896    dst->i64[3] = src[1].i64[3] ? src[0].i64[3] % src[1].i64[3] : ~0ll;
897 }
898 
899 static void
micro_u64shl(union tgsi_double_channel * dst,const union tgsi_double_channel * src0,union tgsi_exec_channel * src1)900 micro_u64shl(union tgsi_double_channel *dst,
901              const union tgsi_double_channel *src0,
902              union tgsi_exec_channel *src1)
903 {
904    unsigned masked_count;
905    masked_count = src1->u[0] & 0x3f;
906    dst->u64[0] = src0->u64[0] << masked_count;
907    masked_count = src1->u[1] & 0x3f;
908    dst->u64[1] = src0->u64[1] << masked_count;
909    masked_count = src1->u[2] & 0x3f;
910    dst->u64[2] = src0->u64[2] << masked_count;
911    masked_count = src1->u[3] & 0x3f;
912    dst->u64[3] = src0->u64[3] << masked_count;
913 }
914 
915 static void
micro_i64shr(union tgsi_double_channel * dst,const union tgsi_double_channel * src0,union tgsi_exec_channel * src1)916 micro_i64shr(union tgsi_double_channel *dst,
917              const union tgsi_double_channel *src0,
918              union tgsi_exec_channel *src1)
919 {
920    unsigned masked_count;
921    masked_count = src1->u[0] & 0x3f;
922    dst->i64[0] = src0->i64[0] >> masked_count;
923    masked_count = src1->u[1] & 0x3f;
924    dst->i64[1] = src0->i64[1] >> masked_count;
925    masked_count = src1->u[2] & 0x3f;
926    dst->i64[2] = src0->i64[2] >> masked_count;
927    masked_count = src1->u[3] & 0x3f;
928    dst->i64[3] = src0->i64[3] >> masked_count;
929 }
930 
931 static void
micro_u64shr(union tgsi_double_channel * dst,const union tgsi_double_channel * src0,union tgsi_exec_channel * src1)932 micro_u64shr(union tgsi_double_channel *dst,
933              const union tgsi_double_channel *src0,
934              union tgsi_exec_channel *src1)
935 {
936    unsigned masked_count;
937    masked_count = src1->u[0] & 0x3f;
938    dst->u64[0] = src0->u64[0] >> masked_count;
939    masked_count = src1->u[1] & 0x3f;
940    dst->u64[1] = src0->u64[1] >> masked_count;
941    masked_count = src1->u[2] & 0x3f;
942    dst->u64[2] = src0->u64[2] >> masked_count;
943    masked_count = src1->u[3] & 0x3f;
944    dst->u64[3] = src0->u64[3] >> masked_count;
945 }
946 
/** Data type tag used to select how a channel's bits are interpreted. */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT  = 0,
   TGSI_EXEC_DATA_INT    = 1,
   TGSI_EXEC_DATA_UINT   = 2,
   TGSI_EXEC_DATA_DOUBLE = 3,
   TGSI_EXEC_DATA_INT64  = 4,
   TGSI_EXEC_DATA_UINT64 = 5,
};
955 
/**
 * Recompute the execution mask: it is the AND of the conditional, loop,
 * continue, switch and function masks.
 *
 * MACH is parenthesized at every use so any pointer expression can be
 * passed, and the whole expansion is parenthesized so it behaves as one
 * expression.  Note that MACH is evaluated several times.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
959 
960 
961 static const union tgsi_exec_channel ZeroVec =
962    { { 0.0, 0.0, 0.0, 0.0 } };
963 
964 static const union tgsi_exec_channel OneVec = {
965    {1.0f, 1.0f, 1.0f, 1.0f}
966 };
967 
968 static const union tgsi_exec_channel P128Vec = {
969    {128.0f, 128.0f, 128.0f, 128.0f}
970 };
971 
972 static const union tgsi_exec_channel M128Vec = {
973    {-128.0f, -128.0f, -128.0f, -128.0f}
974 };
975 
#if MESA_DEBUG
/**
 * Debug helper: print the four float lanes of a channel, prefixed by
 * the caller-supplied label.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lanes = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
984 
985 
#if MESA_DEBUG
/**
 * Debug helper: print temporary register number 'index' of the machine,
 * one line per component (X, Y, Z, W), each showing its four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, unsigned index)
{
   static const char comp_names[] = "XYZW";
   const struct tgsi_exec_vector *vec = &mach->Temps[index];

   debug_printf("Temp[%u] =\n", index);

   for (int comp = 0; comp < 4; comp++) {
      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   comp_names[comp],
                   vec->xyzw[comp].f[0],
                   vec->xyzw[comp].f[1],
                   vec->xyzw[comp].f[2],
                   vec->xyzw[comp].f[3]);
   }
}
#endif
1003 
1004 
1005 void
tgsi_exec_set_constant_buffers(struct tgsi_exec_machine * mach,unsigned num_bufs,const struct tgsi_exec_consts_info * bufs)1006 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
1007                                unsigned num_bufs,
1008                                const struct tgsi_exec_consts_info *bufs)
1009 {
1010    unsigned i;
1011 
1012    for (i = 0; i < num_bufs; i++) {
1013       mach->Consts[i] = bufs[i].ptr;
1014       mach->ConstsSize[i] = bufs[i].size;
1015    }
1016 }
1017 
1018 /**
1019  * Initialize machine state by expanding tokens to full instructions,
1020  * allocating temporary storage, setting up constants, etc.
1021  * After this, we can call tgsi_exec_machine_run() many times.
1022  */
1023 void
tgsi_exec_machine_bind_shader(struct tgsi_exec_machine * mach,const struct tgsi_token * tokens,struct tgsi_sampler * sampler,struct tgsi_image * image,struct tgsi_buffer * buffer)1024 tgsi_exec_machine_bind_shader(
1025    struct tgsi_exec_machine *mach,
1026    const struct tgsi_token *tokens,
1027    struct tgsi_sampler *sampler,
1028    struct tgsi_image *image,
1029    struct tgsi_buffer *buffer)
1030 {
1031    unsigned k;
1032    struct tgsi_parse_context parse;
1033    struct tgsi_full_instruction *instructions;
1034    struct tgsi_full_declaration *declarations;
1035    unsigned maxInstructions = 10, numInstructions = 0;
1036    unsigned maxDeclarations = 10, numDeclarations = 0;
1037 
1038 #if 0
1039    tgsi_dump(tokens, 0);
1040 #endif
1041 
1042    mach->Tokens = tokens;
1043    mach->Sampler = sampler;
1044    mach->Image = image;
1045    mach->Buffer = buffer;
1046 
1047    if (!tokens) {
1048       /* unbind and free all */
1049       FREE(mach->Declarations);
1050       mach->Declarations = NULL;
1051       mach->NumDeclarations = 0;
1052 
1053       FREE(mach->Instructions);
1054       mach->Instructions = NULL;
1055       mach->NumInstructions = 0;
1056 
1057       return;
1058    }
1059 
1060    k = tgsi_parse_init (&parse, mach->Tokens);
1061    if (k != TGSI_PARSE_OK) {
1062       debug_printf( "Problem parsing!\n" );
1063       return;
1064    }
1065 
1066    mach->ImmLimit = 0;
1067    mach->NumOutputs = 0;
1068 
1069    for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
1070       mach->SysSemanticToIndex[k] = -1;
1071 
1072    if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
1073        !mach->UsedGeometryShader) {
1074       struct tgsi_exec_vector *inputs;
1075       struct tgsi_exec_vector *outputs;
1076 
1077       inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1078                             TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
1079                             16);
1080 
1081       if (!inputs)
1082          return;
1083 
1084       outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1085                              TGSI_MAX_TOTAL_VERTICES, 16);
1086 
1087       if (!outputs) {
1088          align_free(inputs);
1089          return;
1090       }
1091 
1092       align_free(mach->Inputs);
1093       align_free(mach->Outputs);
1094 
1095       mach->Inputs = inputs;
1096       mach->Outputs = outputs;
1097       mach->UsedGeometryShader = true;
1098    }
1099 
1100    declarations = (struct tgsi_full_declaration *)
1101       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
1102 
1103    if (!declarations) {
1104       return;
1105    }
1106 
1107    instructions = (struct tgsi_full_instruction *)
1108       MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
1109 
1110    if (!instructions) {
1111       FREE( declarations );
1112       return;
1113    }
1114 
1115    while( !tgsi_parse_end_of_tokens( &parse ) ) {
1116       unsigned i;
1117 
1118       tgsi_parse_token( &parse );
1119       switch( parse.FullToken.Token.Type ) {
1120       case TGSI_TOKEN_TYPE_DECLARATION:
1121          /* save expanded declaration */
1122          if (numDeclarations == maxDeclarations) {
1123             declarations = REALLOC(declarations,
1124                                    maxDeclarations
1125                                    * sizeof(struct tgsi_full_declaration),
1126                                    (maxDeclarations + 10)
1127                                    * sizeof(struct tgsi_full_declaration));
1128             maxDeclarations += 10;
1129          }
1130          if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT)
1131             mach->NumOutputs = MAX2(mach->NumOutputs, parse.FullToken.FullDeclaration.Range.Last + 1);
1132          else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1133             const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
1134             mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
1135          }
1136 
1137          memcpy(declarations + numDeclarations,
1138                 &parse.FullToken.FullDeclaration,
1139                 sizeof(declarations[0]));
1140          numDeclarations++;
1141          break;
1142 
1143       case TGSI_TOKEN_TYPE_IMMEDIATE:
1144          {
1145             unsigned size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1146             assert( size <= 4 );
1147             if (mach->ImmLimit >= mach->ImmsReserved) {
1148                unsigned newReserved = mach->ImmsReserved ? 2 * mach->ImmsReserved : 128;
1149                float4 *imms = REALLOC(mach->Imms, mach->ImmsReserved, newReserved * sizeof(float4));
1150                if (imms) {
1151                   mach->ImmsReserved = newReserved;
1152                   mach->Imms = imms;
1153                } else {
1154                   debug_printf("Unable to (re)allocate space for immidiate constants\n");
1155                   break;
1156                }
1157             }
1158 
1159             for( i = 0; i < size; i++ ) {
1160                mach->Imms[mach->ImmLimit][i] =
1161 		  parse.FullToken.FullImmediate.u[i].Float;
1162             }
1163             mach->ImmLimit += 1;
1164          }
1165          break;
1166 
1167       case TGSI_TOKEN_TYPE_INSTRUCTION:
1168 
1169          /* save expanded instruction */
1170          if (numInstructions == maxInstructions) {
1171             instructions = REALLOC(instructions,
1172                                    maxInstructions
1173                                    * sizeof(struct tgsi_full_instruction),
1174                                    (maxInstructions + 10)
1175                                    * sizeof(struct tgsi_full_instruction));
1176             maxInstructions += 10;
1177          }
1178 
1179          memcpy(instructions + numInstructions,
1180                 &parse.FullToken.FullInstruction,
1181                 sizeof(instructions[0]));
1182 
1183          numInstructions++;
1184          break;
1185 
1186       case TGSI_TOKEN_TYPE_PROPERTY:
1187          if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
1188             if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
1189                mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
1190             }
1191          }
1192          break;
1193 
1194       default:
1195          assert( 0 );
1196       }
1197    }
1198    tgsi_parse_free (&parse);
1199 
1200    FREE(mach->Declarations);
1201    mach->Declarations = declarations;
1202    mach->NumDeclarations = numDeclarations;
1203 
1204    FREE(mach->Instructions);
1205    mach->Instructions = instructions;
1206    mach->NumInstructions = numInstructions;
1207 }
1208 
1209 
1210 struct tgsi_exec_machine *
tgsi_exec_machine_create(enum pipe_shader_type shader_type)1211 tgsi_exec_machine_create(enum pipe_shader_type shader_type)
1212 {
1213    struct tgsi_exec_machine *mach;
1214 
1215    mach = align_malloc( sizeof *mach, 16 );
1216    if (!mach)
1217       goto fail;
1218 
1219    memset(mach, 0, sizeof(*mach));
1220 
1221    mach->ShaderType = shader_type;
1222 
1223    if (shader_type != PIPE_SHADER_COMPUTE) {
1224       mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
1225       mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
1226       if (!mach->Inputs || !mach->Outputs)
1227          goto fail;
1228    }
1229 
1230    if (shader_type == PIPE_SHADER_FRAGMENT) {
1231       mach->InputSampleOffsetApply = align_malloc(sizeof(apply_sample_offset_func) * PIPE_MAX_SHADER_INPUTS, 16);
1232       if (!mach->InputSampleOffsetApply)
1233          goto fail;
1234    }
1235 
1236 #if MESA_DEBUG
1237    /* silence warnings */
1238    (void) print_chan;
1239    (void) print_temp;
1240 #endif
1241 
1242    return mach;
1243 
1244 fail:
1245    if (mach) {
1246       align_free(mach->InputSampleOffsetApply);
1247       align_free(mach->Inputs);
1248       align_free(mach->Outputs);
1249       align_free(mach);
1250    }
1251    return NULL;
1252 }
1253 
1254 
1255 void
tgsi_exec_machine_destroy(struct tgsi_exec_machine * mach)1256 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
1257 {
1258    if (mach) {
1259       FREE(mach->Instructions);
1260       FREE(mach->Declarations);
1261       FREE(mach->Imms);
1262 
1263       align_free(mach->InputSampleOffsetApply);
1264       align_free(mach->Inputs);
1265       align_free(mach->Outputs);
1266 
1267       align_free(mach);
1268    }
1269 }
1270 
1271 static void
micro_add(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)1272 micro_add(union tgsi_exec_channel *dst,
1273           const union tgsi_exec_channel *src0,
1274           const union tgsi_exec_channel *src1)
1275 {
1276    dst->f[0] = src0->f[0] + src1->f[0];
1277    dst->f[1] = src0->f[1] + src1->f[1];
1278    dst->f[2] = src0->f[2] + src1->f[2];
1279    dst->f[3] = src0->f[3] + src1->f[3];
1280 }
1281 
1282 static void
micro_div(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)1283 micro_div(
1284    union tgsi_exec_channel *dst,
1285    const union tgsi_exec_channel *src0,
1286    const union tgsi_exec_channel *src1 )
1287 {
1288    dst->f[0] = src0->f[0] / src1->f[0];
1289    dst->f[1] = src0->f[1] / src1->f[1];
1290    dst->f[2] = src0->f[2] / src1->f[2];
1291    dst->f[3] = src0->f[3] / src1->f[3];
1292 }
1293 
1294 static void
micro_lt(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1,const union tgsi_exec_channel * src2,const union tgsi_exec_channel * src3)1295 micro_lt(
1296    union tgsi_exec_channel *dst,
1297    const union tgsi_exec_channel *src0,
1298    const union tgsi_exec_channel *src1,
1299    const union tgsi_exec_channel *src2,
1300    const union tgsi_exec_channel *src3 )
1301 {
1302    dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
1303    dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
1304    dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
1305    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
1306 }
1307 
1308 static void
micro_max(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)1309 micro_max(union tgsi_exec_channel *dst,
1310           const union tgsi_exec_channel *src0,
1311           const union tgsi_exec_channel *src1)
1312 {
1313    dst->f[0] = fmaxf(src0->f[0], src1->f[0]);
1314    dst->f[1] = fmaxf(src0->f[1], src1->f[1]);
1315    dst->f[2] = fmaxf(src0->f[2], src1->f[2]);
1316    dst->f[3] = fmaxf(src0->f[3], src1->f[3]);
1317 }
1318 
1319 static void
micro_min(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)1320 micro_min(union tgsi_exec_channel *dst,
1321           const union tgsi_exec_channel *src0,
1322           const union tgsi_exec_channel *src1)
1323 {
1324    dst->f[0] = fminf(src0->f[0], src1->f[0]);
1325    dst->f[1] = fminf(src0->f[1], src1->f[1]);
1326    dst->f[2] = fminf(src0->f[2], src1->f[2]);
1327    dst->f[3] = fminf(src0->f[3], src1->f[3]);
1328 }
1329 
1330 static void
micro_mul(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)1331 micro_mul(union tgsi_exec_channel *dst,
1332           const union tgsi_exec_channel *src0,
1333           const union tgsi_exec_channel *src1)
1334 {
1335    dst->f[0] = src0->f[0] * src1->f[0];
1336    dst->f[1] = src0->f[1] * src1->f[1];
1337    dst->f[2] = src0->f[2] * src1->f[2];
1338    dst->f[3] = src0->f[3] * src1->f[3];
1339 }
1340 
1341 static void
micro_neg(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)1342 micro_neg(
1343    union tgsi_exec_channel *dst,
1344    const union tgsi_exec_channel *src )
1345 {
1346    dst->f[0] = -src->f[0];
1347    dst->f[1] = -src->f[1];
1348    dst->f[2] = -src->f[2];
1349    dst->f[3] = -src->f[3];
1350 }
1351 
1352 static void
micro_pow(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)1353 micro_pow(
1354    union tgsi_exec_channel *dst,
1355    const union tgsi_exec_channel *src0,
1356    const union tgsi_exec_channel *src1 )
1357 {
1358    dst->f[0] = powf( src0->f[0], src1->f[0] );
1359    dst->f[1] = powf( src0->f[1], src1->f[1] );
1360    dst->f[2] = powf( src0->f[2], src1->f[2] );
1361    dst->f[3] = powf( src0->f[3], src1->f[3] );
1362 }
1363 
1364 static void
micro_ldexp(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)1365 micro_ldexp(union tgsi_exec_channel *dst,
1366             const union tgsi_exec_channel *src0,
1367             const union tgsi_exec_channel *src1)
1368 {
1369    dst->f[0] = ldexpf(src0->f[0], src1->i[0]);
1370    dst->f[1] = ldexpf(src0->f[1], src1->i[1]);
1371    dst->f[2] = ldexpf(src0->f[2], src1->i[2]);
1372    dst->f[3] = ldexpf(src0->f[3], src1->i[3]);
1373 }
1374 
1375 static void
micro_sub(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)1376 micro_sub(union tgsi_exec_channel *dst,
1377           const union tgsi_exec_channel *src0,
1378           const union tgsi_exec_channel *src1)
1379 {
1380    dst->f[0] = src0->f[0] - src1->f[0];
1381    dst->f[1] = src0->f[1] - src1->f[1];
1382    dst->f[2] = src0->f[2] - src1->f[2];
1383    dst->f[3] = src0->f[3] - src1->f[3];
1384 }
1385 
1386 static void
fetch_src_file_channel(const struct tgsi_exec_machine * mach,const unsigned file,const unsigned swizzle,const union tgsi_exec_channel * index,const union tgsi_exec_channel * index2D,union tgsi_exec_channel * chan)1387 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1388                        const unsigned file,
1389                        const unsigned swizzle,
1390                        const union tgsi_exec_channel *index,
1391                        const union tgsi_exec_channel *index2D,
1392                        union tgsi_exec_channel *chan)
1393 {
1394    unsigned i;
1395 
1396    assert(swizzle < 4);
1397 
1398    switch (file) {
1399    case TGSI_FILE_CONSTANT:
1400       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1401          /* NOTE: copying the const value as a unsigned instead of float */
1402          const unsigned constbuf = index2D->i[i];
1403          const unsigned pos = index->i[i] * 4 + swizzle;
1404          /* const buffer bounds check */
1405          if (pos >= mach->ConstsSize[constbuf] / 4) {
1406             if (0) {
1407                /* Debug: print warning */
1408                static int count = 0;
1409                if (count++ < 100)
1410                   debug_printf("TGSI Exec: const buffer index %d"
1411                                  " out of bounds\n", pos);
1412             }
1413             chan->u[i] = 0;
1414          } else {
1415             const unsigned *buf = (const unsigned *)mach->Consts[constbuf];
1416             chan->u[i] = buf[pos];
1417          }
1418       }
1419       break;
1420 
1421    case TGSI_FILE_INPUT:
1422       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1423          /*
1424          if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1425             debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1426                          index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1427                          index2D->i[i], index->i[i]);
1428                          }*/
1429          int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1430          assert(pos >= 0);
1431          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1432          chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1433       }
1434       break;
1435 
1436    case TGSI_FILE_SYSTEM_VALUE:
1437       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1438          chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
1439       }
1440       break;
1441 
1442    case TGSI_FILE_TEMPORARY:
1443       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1444          assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1445          assert(index2D->i[i] == 0);
1446 
1447          chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1448       }
1449       break;
1450 
1451    case TGSI_FILE_IMMEDIATE:
1452       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1453          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1454          assert(index2D->i[i] == 0);
1455 
1456          chan->f[i] = mach->Imms[index->i[i]][swizzle];
1457       }
1458       break;
1459 
1460    case TGSI_FILE_ADDRESS:
1461       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1462          assert(index->i[i] >= 0 && index->i[i] < ARRAY_SIZE(mach->Addrs));
1463          assert(index2D->i[i] == 0);
1464 
1465          chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1466       }
1467       break;
1468 
1469    case TGSI_FILE_OUTPUT:
1470       /* vertex/fragment output vars can be read too */
1471       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1472          assert(index->i[i] >= 0);
1473          assert(index2D->i[i] == 0);
1474 
1475          chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1476       }
1477       break;
1478 
1479    default:
1480       assert(0);
1481       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1482          chan->u[i] = 0;
1483       }
1484    }
1485 }
1486 
1487 static void
get_index_registers(const struct tgsi_exec_machine * mach,const struct tgsi_full_src_register * reg,union tgsi_exec_channel * index,union tgsi_exec_channel * index2D)1488 get_index_registers(const struct tgsi_exec_machine *mach,
1489                     const struct tgsi_full_src_register *reg,
1490                     union tgsi_exec_channel *index,
1491                     union tgsi_exec_channel *index2D)
1492 {
1493    /* We start with a direct index into a register file.
1494     *
1495     *    file[1],
1496     *    where:
1497     *       file = Register.File
1498     *       [1] = Register.Index
1499     */
1500    index->i[0] =
1501    index->i[1] =
1502    index->i[2] =
1503    index->i[3] = reg->Register.Index;
1504 
1505    /* There is an extra source register that indirectly subscripts
1506     * a register file. The direct index now becomes an offset
1507     * that is being added to the indirect register.
1508     *
1509     *    file[ind[2].x+1],
1510     *    where:
1511     *       ind = Indirect.File
1512     *       [2] = Indirect.Index
1513     *       .x = Indirect.SwizzleX
1514     */
1515    if (reg->Register.Indirect) {
1516       const unsigned execmask = mach->ExecMask;
1517 
1518       assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
1519       const union tgsi_exec_channel *addr = &mach->Addrs[reg->Indirect.Index].xyzw[reg->Indirect.Swizzle];
1520       for (int i = 0; i < TGSI_QUAD_SIZE; i++)
1521          index->i[i] += addr->u[i];
1522 
1523       /* for disabled execution channels, zero-out the index to
1524        * avoid using a potential garbage value.
1525        */
1526       for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
1527          if ((execmask & (1 << i)) == 0)
1528             index->i[i] = 0;
1529       }
1530    }
1531 
1532    /* There is an extra source register that is a second
1533     * subscript to a register file. Effectively it means that
1534     * the register file is actually a 2D array of registers.
1535     *
1536     *    file[3][1],
1537     *    where:
1538     *       [3] = Dimension.Index
1539     */
1540    if (reg->Register.Dimension) {
1541       index2D->i[0] =
1542       index2D->i[1] =
1543       index2D->i[2] =
1544       index2D->i[3] = reg->Dimension.Index;
1545 
1546       /* Again, the second subscript index can be addressed indirectly
1547        * identically to the first one.
1548        * Nothing stops us from indirectly addressing the indirect register,
1549        * but there is no need for that, so we won't exercise it.
1550        *
1551        *    file[ind[4].y+3][1],
1552        *    where:
1553        *       ind = DimIndirect.File
1554        *       [4] = DimIndirect.Index
1555        *       .y = DimIndirect.SwizzleX
1556        */
1557       if (reg->Dimension.Indirect) {
1558          const unsigned execmask = mach->ExecMask;
1559 
1560          assert(reg->DimIndirect.File == TGSI_FILE_ADDRESS);
1561          const union tgsi_exec_channel *addr = &mach->Addrs[reg->DimIndirect.Index].xyzw[reg->DimIndirect.Swizzle];
1562          for (int i = 0; i < TGSI_QUAD_SIZE; i++)
1563             index2D->i[i] += addr->u[i];
1564 
1565          /* for disabled execution channels, zero-out the index to
1566           * avoid using a potential garbage value.
1567           */
1568          for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
1569             if ((execmask & (1 << i)) == 0) {
1570                index2D->i[i] = 0;
1571             }
1572          }
1573       }
1574 
1575       /* If by any chance there was a need for a 3D array of register
1576        * files, we would have to check whether Dimension is followed
1577        * by a dimension register and continue the saga.
1578        */
1579    } else {
1580       index2D->i[0] =
1581       index2D->i[1] =
1582       index2D->i[2] =
1583       index2D->i[3] = 0;
1584    }
1585 }
1586 
1587 
1588 static void
fetch_source_d(const struct tgsi_exec_machine * mach,union tgsi_exec_channel * chan,const struct tgsi_full_src_register * reg,const unsigned chan_index)1589 fetch_source_d(const struct tgsi_exec_machine *mach,
1590                union tgsi_exec_channel *chan,
1591                const struct tgsi_full_src_register *reg,
1592 	       const unsigned chan_index)
1593 {
1594    union tgsi_exec_channel index;
1595    union tgsi_exec_channel index2D;
1596    unsigned swizzle;
1597 
1598    get_index_registers(mach, reg, &index, &index2D);
1599 
1600 
1601    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1602    fetch_src_file_channel(mach,
1603                           reg->Register.File,
1604                           swizzle,
1605                           &index,
1606                           &index2D,
1607                           chan);
1608 }
1609 
1610 static void
fetch_source(const struct tgsi_exec_machine * mach,union tgsi_exec_channel * chan,const struct tgsi_full_src_register * reg,const unsigned chan_index,enum tgsi_exec_datatype src_datatype)1611 fetch_source(const struct tgsi_exec_machine *mach,
1612              union tgsi_exec_channel *chan,
1613              const struct tgsi_full_src_register *reg,
1614              const unsigned chan_index,
1615              enum tgsi_exec_datatype src_datatype)
1616 {
1617    fetch_source_d(mach, chan, reg, chan_index);
1618 
1619    if (reg->Register.Absolute) {
1620       assert(src_datatype == TGSI_EXEC_DATA_FLOAT);
1621       micro_abs(chan, chan);
1622    }
1623 
1624    if (reg->Register.Negate) {
1625       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1626          micro_neg(chan, chan);
1627       } else {
1628          micro_ineg(chan, chan);
1629       }
1630    }
1631 }
1632 
1633 static union tgsi_exec_channel *
store_dest_dstret(struct tgsi_exec_machine * mach,const union tgsi_exec_channel * chan,const struct tgsi_full_dst_register * reg,unsigned chan_index)1634 store_dest_dstret(struct tgsi_exec_machine *mach,
1635                  const union tgsi_exec_channel *chan,
1636                  const struct tgsi_full_dst_register *reg,
1637                  unsigned chan_index)
1638 {
1639    static union tgsi_exec_channel null;
1640    union tgsi_exec_channel *dst;
1641    int offset = 0;  /* indirection offset */
1642    int index;
1643 
1644 
1645    /* There is an extra source register that indirectly subscripts
1646     * a register file. The direct index now becomes an offset
1647     * that is being added to the indirect register.
1648     *
1649     *    file[ind[2].x+1],
1650     *    where:
1651     *       ind = Indirect.File
1652     *       [2] = Indirect.Index
1653     *       .x = Indirect.SwizzleX
1654     */
1655    if (reg->Register.Indirect) {
1656       union tgsi_exec_channel index;
1657       union tgsi_exec_channel indir_index;
1658       unsigned swizzle;
1659 
1660       /* which address register (always zero for now) */
1661       index.i[0] =
1662       index.i[1] =
1663       index.i[2] =
1664       index.i[3] = reg->Indirect.Index;
1665 
1666       /* get current value of address register[swizzle] */
1667       swizzle = reg->Indirect.Swizzle;
1668 
1669       /* fetch values from the address/indirection register */
1670       fetch_src_file_channel(mach,
1671                              reg->Indirect.File,
1672                              swizzle,
1673                              &index,
1674                              &ZeroVec,
1675                              &indir_index);
1676 
1677       /* save indirection offset */
1678       offset = indir_index.i[0];
1679    }
1680 
1681    switch (reg->Register.File) {
1682    case TGSI_FILE_NULL:
1683       dst = &null;
1684       break;
1685 
1686    case TGSI_FILE_OUTPUT:
1687       index = mach->OutputVertexOffset + reg->Register.Index;
1688       dst = &mach->Outputs[offset + index].xyzw[chan_index];
1689 #if 0
1690       debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
1691                    mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
1692                    reg->Register.Index);
1693       if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1694          debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1695          for (i = 0; i < TGSI_QUAD_SIZE; i++)
1696             if (execmask & (1 << i))
1697                debug_printf("%f, ", chan->f[i]);
1698          debug_printf(")\n");
1699       }
1700 #endif
1701       break;
1702 
1703    case TGSI_FILE_TEMPORARY:
1704       index = reg->Register.Index;
1705       assert( index < TGSI_EXEC_NUM_TEMPS );
1706       dst = &mach->Temps[offset + index].xyzw[chan_index];
1707       break;
1708 
1709    case TGSI_FILE_ADDRESS:
1710       index = reg->Register.Index;
1711       assert(index >= 0 && index < ARRAY_SIZE(mach->Addrs));
1712       dst = &mach->Addrs[index].xyzw[chan_index];
1713       break;
1714 
1715    default:
1716       unreachable("Bad destination file");
1717    }
1718 
1719    return dst;
1720 }
1721 
1722 static void
store_dest_double(struct tgsi_exec_machine * mach,const union tgsi_exec_channel * chan,const struct tgsi_full_dst_register * reg,unsigned chan_index)1723 store_dest_double(struct tgsi_exec_machine *mach,
1724                  const union tgsi_exec_channel *chan,
1725                  const struct tgsi_full_dst_register *reg,
1726                  unsigned chan_index)
1727 {
1728    union tgsi_exec_channel *dst;
1729    const unsigned execmask = mach->ExecMask;
1730    int i;
1731 
1732    dst = store_dest_dstret(mach, chan, reg, chan_index);
1733    if (!dst)
1734       return;
1735 
1736    /* doubles path */
1737    for (i = 0; i < TGSI_QUAD_SIZE; i++)
1738       if (execmask & (1 << i))
1739          dst->i[i] = chan->i[i];
1740 }
1741 
1742 static void
store_dest(struct tgsi_exec_machine * mach,const union tgsi_exec_channel * chan,const struct tgsi_full_dst_register * reg,const struct tgsi_full_instruction * inst,unsigned chan_index)1743 store_dest(struct tgsi_exec_machine *mach,
1744            const union tgsi_exec_channel *chan,
1745            const struct tgsi_full_dst_register *reg,
1746            const struct tgsi_full_instruction *inst,
1747            unsigned chan_index)
1748 {
1749    union tgsi_exec_channel *dst;
1750    const unsigned execmask = mach->ExecMask;
1751    int i;
1752 
1753    dst = store_dest_dstret(mach, chan, reg, chan_index);
1754    if (!dst)
1755       return;
1756 
1757    if (!inst->Instruction.Saturate) {
1758       for (i = 0; i < TGSI_QUAD_SIZE; i++)
1759          if (execmask & (1 << i))
1760             dst->i[i] = chan->i[i];
1761    }
1762    else {
1763       for (i = 0; i < TGSI_QUAD_SIZE; i++)
1764          if (execmask & (1 << i))
1765             dst->f[i] = fminf(fmaxf(chan->f[i], 0.0f), 1.0f);
1766    }
1767 }
1768 
/*
 * Shorthand for fetch_source() on source operand INDEX of the current
 * instruction.  Both macros rely on variables named `mach` and `inst`
 * being in scope at the point of use.
 *
 * FETCH reads the operand as float data, IFETCH as integer data.
 */
#define FETCH(VAL, INDEX, CHAN) \
    fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_FLOAT)

#define IFETCH(VAL, INDEX, CHAN) \
    fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_INT)
1774 
1775 
1776 /**
1777  * Execute ARB-style KIL which is predicated by a src register.
1778  * Kill fragment if any of the four values is less than zero.
1779  */
1780 static void
exec_kill_if(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)1781 exec_kill_if(struct tgsi_exec_machine *mach,
1782              const struct tgsi_full_instruction *inst)
1783 {
1784    unsigned uniquemask;
1785    unsigned chan_index;
1786    unsigned kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1787    union tgsi_exec_channel r[1];
1788 
1789    /* This mask stores component bits that were already tested. */
1790    uniquemask = 0;
1791 
1792    for (chan_index = 0; chan_index < 4; chan_index++)
1793    {
1794       unsigned swizzle;
1795       unsigned i;
1796 
1797       /* unswizzle channel */
1798       swizzle = tgsi_util_get_full_src_register_swizzle (
1799                         &inst->Src[0],
1800                         chan_index);
1801 
1802       /* check if the component has not been already tested */
1803       if (uniquemask & (1 << swizzle))
1804          continue;
1805       uniquemask |= 1 << swizzle;
1806 
1807       FETCH(&r[0], 0, chan_index);
1808       for (i = 0; i < 4; i++)
1809          if (r[0].f[i] < 0.0f)
1810             kilmask |= 1 << i;
1811    }
1812 
1813    /* restrict to fragments currently executing */
1814    kilmask &= mach->ExecMask;
1815 
1816    mach->KillMask |= kilmask;
1817 }
1818 
1819 /**
1820  * Unconditional fragment kill/discard.
1821  */
1822 static void
exec_kill(struct tgsi_exec_machine * mach)1823 exec_kill(struct tgsi_exec_machine *mach)
1824 {
1825    /* kill fragment for all fragments currently executing.
1826     * bit 0 = pixel 0, bit 1 = pixel 1, etc.
1827     */
1828    mach->KillMask |= mach->ExecMask;
1829 }
1830 
1831 static void
emit_vertex(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)1832 emit_vertex(struct tgsi_exec_machine *mach,
1833             const struct tgsi_full_instruction *inst)
1834 {
1835    union tgsi_exec_channel r[1];
1836    unsigned stream_id;
1837    unsigned prim_count;
1838    /* FIXME: check for exec mask correctly
1839    unsigned i;
1840    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
1841          if ((mach->ExecMask & (1 << i)))
1842    */
1843    IFETCH(&r[0], 0, TGSI_CHAN_X);
1844    stream_id = r[0].u[0];
1845    prim_count = mach->OutputPrimCount[stream_id];
1846    if (mach->ExecMask) {
1847       if (mach->Primitives[stream_id][prim_count] >= mach->MaxOutputVertices)
1848          return;
1849 
1850       if (mach->Primitives[stream_id][prim_count] == 0)
1851          mach->PrimitiveOffsets[stream_id][prim_count] = mach->OutputVertexOffset;
1852       mach->OutputVertexOffset += mach->NumOutputs;
1853       mach->Primitives[stream_id][prim_count]++;
1854    }
1855 }
1856 
1857 static void
emit_primitive(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)1858 emit_primitive(struct tgsi_exec_machine *mach,
1859                const struct tgsi_full_instruction *inst)
1860 {
1861    unsigned *prim_count;
1862    union tgsi_exec_channel r[1];
1863    unsigned stream_id = 0;
1864    /* FIXME: check for exec mask correctly
1865    unsigned i;
1866    for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
1867          if ((mach->ExecMask & (1 << i)))
1868    */
1869    if (inst) {
1870       IFETCH(&r[0], 0, TGSI_CHAN_X);
1871       stream_id = r[0].u[0];
1872    }
1873    prim_count = &mach->OutputPrimCount[stream_id];
1874    if (mach->ExecMask) {
1875       ++(*prim_count);
1876       assert((*prim_count * mach->NumOutputs) < TGSI_MAX_TOTAL_VERTICES);
1877       mach->Primitives[stream_id][*prim_count] = 0;
1878    }
1879 }
1880 
1881 static void
conditional_emit_primitive(struct tgsi_exec_machine * mach)1882 conditional_emit_primitive(struct tgsi_exec_machine *mach)
1883 {
1884    if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1885       int emitted_verts = mach->Primitives[0][mach->OutputPrimCount[0]];
1886       if (emitted_verts) {
1887          emit_primitive(mach, NULL);
1888       }
1889    }
1890 }
1891 
1892 
1893 /*
1894  * Fetch four texture samples using STR texture coordinates.
1895  */
1896 static void
fetch_texel(struct tgsi_sampler * sampler,const unsigned sview_idx,const unsigned sampler_idx,const union tgsi_exec_channel * s,const union tgsi_exec_channel * t,const union tgsi_exec_channel * p,const union tgsi_exec_channel * c0,const union tgsi_exec_channel * c1,float derivs[3][2][TGSI_QUAD_SIZE],const int8_t offset[3],enum tgsi_sampler_control control,union tgsi_exec_channel * r,union tgsi_exec_channel * g,union tgsi_exec_channel * b,union tgsi_exec_channel * a)1897 fetch_texel( struct tgsi_sampler *sampler,
1898              const unsigned sview_idx,
1899              const unsigned sampler_idx,
1900              const union tgsi_exec_channel *s,
1901              const union tgsi_exec_channel *t,
1902              const union tgsi_exec_channel *p,
1903              const union tgsi_exec_channel *c0,
1904              const union tgsi_exec_channel *c1,
1905              float derivs[3][2][TGSI_QUAD_SIZE],
1906              const int8_t offset[3],
1907              enum tgsi_sampler_control control,
1908              union tgsi_exec_channel *r,
1909              union tgsi_exec_channel *g,
1910              union tgsi_exec_channel *b,
1911              union tgsi_exec_channel *a )
1912 {
1913    unsigned j;
1914    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
1915 
1916    /* FIXME: handle explicit derivs, offsets */
1917    sampler->get_samples(sampler, sview_idx, sampler_idx,
1918                         s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
1919 
1920    for (j = 0; j < 4; j++) {
1921       r->f[j] = rgba[0][j];
1922       g->f[j] = rgba[1][j];
1923       b->f[j] = rgba[2][j];
1924       a->f[j] = rgba[3][j];
1925    }
1926 }
1927 
1928 
/*
 * Variant selector for the texture sampling instructions.
 * The numeric values run consecutively from 0.
 */
enum tex_modifier {
   TEX_MODIFIER_NONE = 0,      /* plain sample, no extra operand */
   TEX_MODIFIER_PROJECTED,     /* coordinates are divided by an extra operand */
   TEX_MODIFIER_LOD_BIAS,      /* extra operand is a LOD bias */
   TEX_MODIFIER_EXPLICIT_LOD,  /* extra operand is an explicit LOD */
   TEX_MODIFIER_LEVEL_ZERO,    /* sample with LOD forced to zero */
   TEX_MODIFIER_GATHER,        /* gather variant */
};
1937 
1938 /*
1939  * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
1940  */
1941 static void
fetch_texel_offsets(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,int8_t offsets[3])1942 fetch_texel_offsets(struct tgsi_exec_machine *mach,
1943                     const struct tgsi_full_instruction *inst,
1944                     int8_t offsets[3])
1945 {
1946    if (inst->Texture.NumOffsets == 1) {
1947       union tgsi_exec_channel index;
1948       union tgsi_exec_channel offset[3];
1949       index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
1950       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
1951                              inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
1952       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
1953                              inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
1954       fetch_src_file_channel(mach, inst->TexOffsets[0].File,
1955                              inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
1956      offsets[0] = offset[0].i[0];
1957      offsets[1] = offset[1].i[0];
1958      offsets[2] = offset[2].i[0];
1959    } else {
1960      assert(inst->Texture.NumOffsets == 0);
1961      offsets[0] = offsets[1] = offsets[2] = 0;
1962    }
1963 }
1964 
1965 
1966 /*
1967  * Fetch dx and dy values for one channel (s, t or r).
1968  * Put dx values into one float array, dy values into another.
1969  */
1970 static void
fetch_assign_deriv_channel(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,unsigned regdsrcx,unsigned chan,float derivs[2][TGSI_QUAD_SIZE])1971 fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
1972                            const struct tgsi_full_instruction *inst,
1973                            unsigned regdsrcx,
1974                            unsigned chan,
1975                            float derivs[2][TGSI_QUAD_SIZE])
1976 {
1977    union tgsi_exec_channel d;
1978    FETCH(&d, regdsrcx, chan);
1979    derivs[0][0] = d.f[0];
1980    derivs[0][1] = d.f[1];
1981    derivs[0][2] = d.f[2];
1982    derivs[0][3] = d.f[3];
1983    FETCH(&d, regdsrcx + 1, chan);
1984    derivs[1][0] = d.f[0];
1985    derivs[1][1] = d.f[1];
1986    derivs[1][2] = d.f[2];
1987    derivs[1][3] = d.f[3];
1988 }
1989 
1990 static unsigned
fetch_sampler_unit(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,unsigned sampler)1991 fetch_sampler_unit(struct tgsi_exec_machine *mach,
1992                    const struct tgsi_full_instruction *inst,
1993                    unsigned sampler)
1994 {
1995    unsigned unit = 0;
1996    int i;
1997    if (inst->Src[sampler].Register.Indirect) {
1998       const struct tgsi_full_src_register *reg = &inst->Src[sampler];
1999       union tgsi_exec_channel indir_index, index2;
2000       const unsigned execmask = mach->ExecMask;
2001       index2.i[0] =
2002       index2.i[1] =
2003       index2.i[2] =
2004       index2.i[3] = reg->Indirect.Index;
2005 
2006       fetch_src_file_channel(mach,
2007                              reg->Indirect.File,
2008                              reg->Indirect.Swizzle,
2009                              &index2,
2010                              &ZeroVec,
2011                              &indir_index);
2012       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2013          if (execmask & (1 << i)) {
2014             unit = inst->Src[sampler].Register.Index + indir_index.i[i];
2015             break;
2016          }
2017       }
2018 
2019    } else {
2020       unit = inst->Src[sampler].Register.Index;
2021    }
2022    return unit;
2023 }
2024 
2025 /*
2026  * execute a texture instruction.
2027  *
2028  * modifier is used to control the channel routing for the
2029  * instruction variants like proj, lod, and texture with lod bias.
2030  * sampler indicates which src register the sampler is contained in.
2031  */
2032 static void
exec_tex(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,enum tex_modifier modifier,unsigned sampler)2033 exec_tex(struct tgsi_exec_machine *mach,
2034          const struct tgsi_full_instruction *inst,
2035          enum tex_modifier modifier, unsigned sampler)
2036 {
2037    const union tgsi_exec_channel *args[5], *proj = NULL;
2038    union tgsi_exec_channel r[5];
2039    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2040    unsigned chan;
2041    unsigned unit;
2042    int8_t offsets[3];
2043    int dim, shadow_ref, i;
2044 
2045    unit = fetch_sampler_unit(mach, inst, sampler);
2046    /* always fetch all 3 offsets, overkill but keeps code simple */
2047    fetch_texel_offsets(mach, inst, offsets);
2048 
2049    assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
2050    assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
2051 
2052    dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2053    shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
2054 
2055    assert(dim <= 4);
2056    if (shadow_ref >= 0)
2057       assert(shadow_ref >= dim && shadow_ref < (int)ARRAY_SIZE(args));
2058 
2059    /* fetch modifier to the last argument */
2060    if (modifier != TEX_MODIFIER_NONE) {
2061       const int last = ARRAY_SIZE(args) - 1;
2062 
2063       /* fetch modifier from src0.w or src1.x */
2064       if (sampler == 1) {
2065          assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
2066          FETCH(&r[last], 0, TGSI_CHAN_W);
2067       }
2068       else {
2069          FETCH(&r[last], 1, TGSI_CHAN_X);
2070       }
2071 
2072       if (modifier != TEX_MODIFIER_PROJECTED) {
2073          args[last] = &r[last];
2074       }
2075       else {
2076          proj = &r[last];
2077          args[last] = &ZeroVec;
2078       }
2079 
2080       /* point unused arguments to zero vector */
2081       for (i = dim; i < last; i++)
2082          args[i] = &ZeroVec;
2083 
2084       if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
2085          control = TGSI_SAMPLER_LOD_EXPLICIT;
2086       else if (modifier == TEX_MODIFIER_LOD_BIAS)
2087          control = TGSI_SAMPLER_LOD_BIAS;
2088       else if (modifier == TEX_MODIFIER_GATHER)
2089          control = TGSI_SAMPLER_GATHER;
2090    }
2091    else {
2092       for (i = dim; i < (int)ARRAY_SIZE(args); i++)
2093          args[i] = &ZeroVec;
2094    }
2095 
2096    /* fetch coordinates */
2097    for (i = 0; i < dim; i++) {
2098       FETCH(&r[i], 0, TGSI_CHAN_X + i);
2099 
2100       if (proj)
2101          micro_div(&r[i], &r[i], proj);
2102 
2103       args[i] = &r[i];
2104    }
2105 
2106    /* fetch reference value */
2107    if (shadow_ref >= 0) {
2108       FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
2109 
2110       if (proj)
2111          micro_div(&r[shadow_ref], &r[shadow_ref], proj);
2112 
2113       args[shadow_ref] = &r[shadow_ref];
2114    }
2115 
2116    fetch_texel(mach->Sampler, unit, unit,
2117          args[0], args[1], args[2], args[3], args[4],
2118          NULL, offsets, control,
2119          &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2120 
2121 #if 0
2122    debug_printf("fetch r: %g %g %g %g\n",
2123          r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
2124    debug_printf("fetch g: %g %g %g %g\n",
2125          r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
2126    debug_printf("fetch b: %g %g %g %g\n",
2127          r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
2128    debug_printf("fetch a: %g %g %g %g\n",
2129          r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
2130 #endif
2131 
2132    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2133       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2134          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
2135       }
2136    }
2137 }
2138 
2139 static void
exec_lodq(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)2140 exec_lodq(struct tgsi_exec_machine *mach,
2141           const struct tgsi_full_instruction *inst)
2142 {
2143    unsigned resource_unit, sampler_unit;
2144    unsigned dim;
2145    unsigned i;
2146    union tgsi_exec_channel coords[4];
2147    const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
2148    union tgsi_exec_channel r[2];
2149 
2150    resource_unit = fetch_sampler_unit(mach, inst, 1);
2151    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2152       unsigned target = mach->SamplerViews[resource_unit].Resource;
2153       dim = tgsi_util_get_texture_coord_dim(target);
2154       sampler_unit = fetch_sampler_unit(mach, inst, 2);
2155    } else {
2156       dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2157       sampler_unit = resource_unit;
2158    }
2159    assert(dim <= ARRAY_SIZE(coords));
2160    /* fetch coordinates */
2161    for (i = 0; i < dim; i++) {
2162       FETCH(&coords[i], 0, TGSI_CHAN_X + i);
2163       args[i] = &coords[i];
2164    }
2165    for (i = dim; i < ARRAY_SIZE(coords); i++) {
2166       args[i] = &ZeroVec;
2167    }
2168    mach->Sampler->query_lod(mach->Sampler, resource_unit, sampler_unit,
2169                             args[0]->f,
2170                             args[1]->f,
2171                             args[2]->f,
2172                             args[3]->f,
2173                             TGSI_SAMPLER_LOD_NONE,
2174                             r[0].f,
2175                             r[1].f);
2176 
2177    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2178       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X);
2179    }
2180    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2181       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y);
2182    }
2183    if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2184       unsigned char swizzles[4];
2185       unsigned chan;
2186       swizzles[0] = inst->Src[1].Register.SwizzleX;
2187       swizzles[1] = inst->Src[1].Register.SwizzleY;
2188       swizzles[2] = inst->Src[1].Register.SwizzleZ;
2189       swizzles[3] = inst->Src[1].Register.SwizzleW;
2190 
2191       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2192          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2193             if (swizzles[chan] >= 2) {
2194                store_dest(mach, &ZeroVec,
2195                           &inst->Dst[0], inst, chan);
2196             } else {
2197                store_dest(mach, &r[swizzles[chan]],
2198                           &inst->Dst[0], inst, chan);
2199             }
2200          }
2201       }
2202    } else {
2203       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2204          store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X);
2205       }
2206       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2207          store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y);
2208       }
2209    }
2210 }
2211 
2212 static void
exec_txd(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)2213 exec_txd(struct tgsi_exec_machine *mach,
2214          const struct tgsi_full_instruction *inst)
2215 {
2216    union tgsi_exec_channel r[4];
2217    float derivs[3][2][TGSI_QUAD_SIZE];
2218    unsigned chan;
2219    unsigned unit;
2220    int8_t offsets[3];
2221 
2222    unit = fetch_sampler_unit(mach, inst, 3);
2223    /* always fetch all 3 offsets, overkill but keeps code simple */
2224    fetch_texel_offsets(mach, inst, offsets);
2225 
2226    switch (inst->Texture.Texture) {
2227    case TGSI_TEXTURE_1D:
2228       FETCH(&r[0], 0, TGSI_CHAN_X);
2229 
2230       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2231 
2232       fetch_texel(mach->Sampler, unit, unit,
2233                   &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2234                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2235                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2236       break;
2237 
2238    case TGSI_TEXTURE_SHADOW1D:
2239    case TGSI_TEXTURE_1D_ARRAY:
2240    case TGSI_TEXTURE_SHADOW1D_ARRAY:
2241       /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
2242       FETCH(&r[0], 0, TGSI_CHAN_X);
2243       FETCH(&r[1], 0, TGSI_CHAN_Y);
2244       FETCH(&r[2], 0, TGSI_CHAN_Z);
2245 
2246       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2247 
2248       fetch_texel(mach->Sampler, unit, unit,
2249                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2250                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2251                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2252       break;
2253 
2254    case TGSI_TEXTURE_2D:
2255    case TGSI_TEXTURE_RECT:
2256       FETCH(&r[0], 0, TGSI_CHAN_X);
2257       FETCH(&r[1], 0, TGSI_CHAN_Y);
2258 
2259       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2260       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2261 
2262       fetch_texel(mach->Sampler, unit, unit,
2263                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2264                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2265                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2266       break;
2267 
2268 
2269    case TGSI_TEXTURE_SHADOW2D:
2270    case TGSI_TEXTURE_SHADOWRECT:
2271    case TGSI_TEXTURE_2D_ARRAY:
2272    case TGSI_TEXTURE_SHADOW2D_ARRAY:
2273       /* only SHADOW2D_ARRAY actually needs W */
2274       FETCH(&r[0], 0, TGSI_CHAN_X);
2275       FETCH(&r[1], 0, TGSI_CHAN_Y);
2276       FETCH(&r[2], 0, TGSI_CHAN_Z);
2277       FETCH(&r[3], 0, TGSI_CHAN_W);
2278 
2279       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2280       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2281 
2282       fetch_texel(mach->Sampler, unit, unit,
2283                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2284                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2285                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2286       break;
2287 
2288    case TGSI_TEXTURE_3D:
2289    case TGSI_TEXTURE_CUBE:
2290    case TGSI_TEXTURE_CUBE_ARRAY:
2291    case TGSI_TEXTURE_SHADOWCUBE:
2292       /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
2293       FETCH(&r[0], 0, TGSI_CHAN_X);
2294       FETCH(&r[1], 0, TGSI_CHAN_Y);
2295       FETCH(&r[2], 0, TGSI_CHAN_Z);
2296       FETCH(&r[3], 0, TGSI_CHAN_W);
2297 
2298       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2299       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2300       fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
2301 
2302       fetch_texel(mach->Sampler, unit, unit,
2303                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2304                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2305                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2306       break;
2307 
2308    default:
2309       assert(0);
2310    }
2311 
2312    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2313       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2314          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
2315       }
2316    }
2317 }
2318 
2319 
2320 static void
exec_txf(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)2321 exec_txf(struct tgsi_exec_machine *mach,
2322          const struct tgsi_full_instruction *inst)
2323 {
2324    union tgsi_exec_channel r[4];
2325    unsigned chan;
2326    unsigned unit;
2327    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2328    int j;
2329    int8_t offsets[3];
2330    unsigned target;
2331 
2332    unit = fetch_sampler_unit(mach, inst, 1);
2333    /* always fetch all 3 offsets, overkill but keeps code simple */
2334    fetch_texel_offsets(mach, inst, offsets);
2335 
2336    IFETCH(&r[3], 0, TGSI_CHAN_W);
2337 
2338    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2339        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2340       target = mach->SamplerViews[unit].Resource;
2341    }
2342    else {
2343       target = inst->Texture.Texture;
2344    }
2345    switch(target) {
2346    case TGSI_TEXTURE_3D:
2347    case TGSI_TEXTURE_2D_ARRAY:
2348    case TGSI_TEXTURE_SHADOW2D_ARRAY:
2349    case TGSI_TEXTURE_2D_ARRAY_MSAA:
2350       IFETCH(&r[2], 0, TGSI_CHAN_Z);
2351       FALLTHROUGH;
2352    case TGSI_TEXTURE_2D:
2353    case TGSI_TEXTURE_RECT:
2354    case TGSI_TEXTURE_SHADOW1D_ARRAY:
2355    case TGSI_TEXTURE_SHADOW2D:
2356    case TGSI_TEXTURE_SHADOWRECT:
2357    case TGSI_TEXTURE_1D_ARRAY:
2358    case TGSI_TEXTURE_2D_MSAA:
2359       IFETCH(&r[1], 0, TGSI_CHAN_Y);
2360       FALLTHROUGH;
2361    case TGSI_TEXTURE_BUFFER:
2362    case TGSI_TEXTURE_1D:
2363    case TGSI_TEXTURE_SHADOW1D:
2364       IFETCH(&r[0], 0, TGSI_CHAN_X);
2365       break;
2366    default:
2367       assert(0);
2368       break;
2369    }
2370 
2371    mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
2372                             offsets, rgba);
2373 
2374    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
2375       r[0].f[j] = rgba[0][j];
2376       r[1].f[j] = rgba[1][j];
2377       r[2].f[j] = rgba[2][j];
2378       r[3].f[j] = rgba[3][j];
2379    }
2380 
2381    if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2382        inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2383       unsigned char swizzles[4];
2384       swizzles[0] = inst->Src[1].Register.SwizzleX;
2385       swizzles[1] = inst->Src[1].Register.SwizzleY;
2386       swizzles[2] = inst->Src[1].Register.SwizzleZ;
2387       swizzles[3] = inst->Src[1].Register.SwizzleW;
2388 
2389       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2390          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2391             store_dest(mach, &r[swizzles[chan]],
2392                        &inst->Dst[0], inst, chan);
2393          }
2394       }
2395    }
2396    else {
2397       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2398          if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2399             store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
2400          }
2401       }
2402    }
2403 }
2404 
2405 static void
exec_txq(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)2406 exec_txq(struct tgsi_exec_machine *mach,
2407          const struct tgsi_full_instruction *inst)
2408 {
2409    int result[4];
2410    union tgsi_exec_channel r[4], src;
2411    unsigned chan;
2412    unsigned unit;
2413    int i,j;
2414 
2415    unit = fetch_sampler_unit(mach, inst, 1);
2416 
2417    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
2418 
2419    /* XXX: This interface can't return per-pixel values */
2420    mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
2421 
2422    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2423       for (j = 0; j < 4; j++) {
2424          r[j].i[i] = result[j];
2425       }
2426    }
2427 
2428    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2429       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2430          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
2431       }
2432    }
2433 }
2434 
2435 static void
exec_sample(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,enum tex_modifier modifier,bool compare)2436 exec_sample(struct tgsi_exec_machine *mach,
2437             const struct tgsi_full_instruction *inst,
2438             enum tex_modifier modifier, bool compare)
2439 {
2440    const unsigned resource_unit = inst->Src[1].Register.Index;
2441    const unsigned sampler_unit = inst->Src[2].Register.Index;
2442    union tgsi_exec_channel r[5], c1;
2443    const union tgsi_exec_channel *lod = &ZeroVec;
2444    enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2445    unsigned chan;
2446    unsigned char swizzles[4];
2447    int8_t offsets[3];
2448 
2449    /* always fetch all 3 offsets, overkill but keeps code simple */
2450    fetch_texel_offsets(mach, inst, offsets);
2451 
2452    assert(modifier != TEX_MODIFIER_PROJECTED);
2453 
2454    if (modifier != TEX_MODIFIER_NONE) {
2455       if (modifier == TEX_MODIFIER_LOD_BIAS) {
2456          FETCH(&c1, 3, TGSI_CHAN_X);
2457          lod = &c1;
2458          control = TGSI_SAMPLER_LOD_BIAS;
2459       }
2460       else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2461          FETCH(&c1, 3, TGSI_CHAN_X);
2462          lod = &c1;
2463          control = TGSI_SAMPLER_LOD_EXPLICIT;
2464       }
2465       else if (modifier == TEX_MODIFIER_GATHER) {
2466          control = TGSI_SAMPLER_GATHER;
2467       }
2468       else {
2469          assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
2470          control = TGSI_SAMPLER_LOD_ZERO;
2471       }
2472    }
2473 
2474    FETCH(&r[0], 0, TGSI_CHAN_X);
2475 
2476    switch (mach->SamplerViews[resource_unit].Resource) {
2477    case TGSI_TEXTURE_1D:
2478       if (compare) {
2479          FETCH(&r[2], 3, TGSI_CHAN_X);
2480          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2481                      &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
2482                      NULL, offsets, control,
2483                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2484       }
2485       else {
2486          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2487                      &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
2488                      NULL, offsets, control,
2489                      &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2490       }
2491       break;
2492 
2493    case TGSI_TEXTURE_1D_ARRAY:
2494    case TGSI_TEXTURE_2D:
2495    case TGSI_TEXTURE_RECT:
2496       FETCH(&r[1], 0, TGSI_CHAN_Y);
2497       if (compare) {
2498          FETCH(&r[2], 3, TGSI_CHAN_X);
2499          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2500                      &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
2501                      NULL, offsets, control,
2502                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2503       }
2504       else {
2505          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2506                      &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
2507                      NULL, offsets, control,
2508                      &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2509       }
2510       break;
2511 
2512    case TGSI_TEXTURE_2D_ARRAY:
2513    case TGSI_TEXTURE_3D:
2514    case TGSI_TEXTURE_CUBE:
2515       FETCH(&r[1], 0, TGSI_CHAN_Y);
2516       FETCH(&r[2], 0, TGSI_CHAN_Z);
2517       if(compare) {
2518          FETCH(&r[3], 3, TGSI_CHAN_X);
2519          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2520                      &r[0], &r[1], &r[2], &r[3], lod,
2521                      NULL, offsets, control,
2522                      &r[0], &r[1], &r[2], &r[3]);
2523       }
2524       else {
2525          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2526                      &r[0], &r[1], &r[2], &ZeroVec, lod,
2527                      NULL, offsets, control,
2528                      &r[0], &r[1], &r[2], &r[3]);
2529       }
2530       break;
2531 
2532    case TGSI_TEXTURE_CUBE_ARRAY:
2533       FETCH(&r[1], 0, TGSI_CHAN_Y);
2534       FETCH(&r[2], 0, TGSI_CHAN_Z);
2535       FETCH(&r[3], 0, TGSI_CHAN_W);
2536       if(compare) {
2537          FETCH(&r[4], 3, TGSI_CHAN_X);
2538          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2539                      &r[0], &r[1], &r[2], &r[3], &r[4],
2540                      NULL, offsets, control,
2541                      &r[0], &r[1], &r[2], &r[3]);
2542       }
2543       else {
2544          fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2545                      &r[0], &r[1], &r[2], &r[3], lod,
2546                      NULL, offsets, control,
2547                      &r[0], &r[1], &r[2], &r[3]);
2548       }
2549       break;
2550 
2551 
2552    default:
2553       assert(0);
2554    }
2555 
2556    swizzles[0] = inst->Src[1].Register.SwizzleX;
2557    swizzles[1] = inst->Src[1].Register.SwizzleY;
2558    swizzles[2] = inst->Src[1].Register.SwizzleZ;
2559    swizzles[3] = inst->Src[1].Register.SwizzleW;
2560 
2561    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2562       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2563          store_dest(mach, &r[swizzles[chan]],
2564                     &inst->Dst[0], inst, chan);
2565       }
2566    }
2567 }
2568 
2569 static void
exec_sample_d(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)2570 exec_sample_d(struct tgsi_exec_machine *mach,
2571               const struct tgsi_full_instruction *inst)
2572 {
2573    const unsigned resource_unit = inst->Src[1].Register.Index;
2574    const unsigned sampler_unit = inst->Src[2].Register.Index;
2575    union tgsi_exec_channel r[4];
2576    float derivs[3][2][TGSI_QUAD_SIZE];
2577    unsigned chan;
2578    unsigned char swizzles[4];
2579    int8_t offsets[3];
2580 
2581    /* always fetch all 3 offsets, overkill but keeps code simple */
2582    fetch_texel_offsets(mach, inst, offsets);
2583 
2584    FETCH(&r[0], 0, TGSI_CHAN_X);
2585 
2586    switch (mach->SamplerViews[resource_unit].Resource) {
2587    case TGSI_TEXTURE_1D:
2588    case TGSI_TEXTURE_1D_ARRAY:
2589       /* only 1D array actually needs Y */
2590       FETCH(&r[1], 0, TGSI_CHAN_Y);
2591 
2592       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2593 
2594       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2595                   &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2596                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2597                   &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2598       break;
2599 
2600    case TGSI_TEXTURE_2D:
2601    case TGSI_TEXTURE_RECT:
2602    case TGSI_TEXTURE_2D_ARRAY:
2603       /* only 2D array actually needs Z */
2604       FETCH(&r[1], 0, TGSI_CHAN_Y);
2605       FETCH(&r[2], 0, TGSI_CHAN_Z);
2606 
2607       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2608       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2609 
2610       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2611                   &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
2612                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2613                   &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2614       break;
2615 
2616    case TGSI_TEXTURE_3D:
2617    case TGSI_TEXTURE_CUBE:
2618    case TGSI_TEXTURE_CUBE_ARRAY:
2619       /* only cube array actually needs W */
2620       FETCH(&r[1], 0, TGSI_CHAN_Y);
2621       FETCH(&r[2], 0, TGSI_CHAN_Z);
2622       FETCH(&r[3], 0, TGSI_CHAN_W);
2623 
2624       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2625       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2626       fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
2627 
2628       fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2629                   &r[0], &r[1], &r[2], &r[3], &ZeroVec,
2630                   derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2631                   &r[0], &r[1], &r[2], &r[3]);
2632       break;
2633 
2634    default:
2635       assert(0);
2636    }
2637 
2638    swizzles[0] = inst->Src[1].Register.SwizzleX;
2639    swizzles[1] = inst->Src[1].Register.SwizzleY;
2640    swizzles[2] = inst->Src[1].Register.SwizzleZ;
2641    swizzles[3] = inst->Src[1].Register.SwizzleW;
2642 
2643    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2644       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2645          store_dest(mach, &r[swizzles[chan]],
2646                     &inst->Dst[0], inst, chan);
2647       }
2648    }
2649 }
2650 
2651 
2652 /**
2653  * Evaluate a constant-valued coefficient at the position of the
2654  * current quad.
2655  */
2656 static void
eval_constant_coef(struct tgsi_exec_machine * mach,unsigned attrib,unsigned chan)2657 eval_constant_coef(
2658    struct tgsi_exec_machine *mach,
2659    unsigned attrib,
2660    unsigned chan )
2661 {
2662    unsigned i;
2663 
2664    for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
2665       mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2666    }
2667 }
2668 
2669 static void
interp_constant_offset(UNUSED const struct tgsi_exec_machine * mach,UNUSED unsigned attrib,UNUSED unsigned chan,UNUSED float ofs_x,UNUSED float ofs_y,UNUSED union tgsi_exec_channel * out_chan)2670 interp_constant_offset(
2671       UNUSED const struct tgsi_exec_machine *mach,
2672       UNUSED unsigned attrib,
2673       UNUSED unsigned chan,
2674       UNUSED float ofs_x,
2675       UNUSED float ofs_y,
2676       UNUSED union tgsi_exec_channel *out_chan)
2677 {
2678 }
2679 
2680 /**
2681  * Evaluate a linear-valued coefficient at the position of the
2682  * current quad.
2683  */
2684 static void
interp_linear_offset(const struct tgsi_exec_machine * mach,unsigned attrib,unsigned chan,float ofs_x,float ofs_y,union tgsi_exec_channel * out_chan)2685 interp_linear_offset(
2686       const struct tgsi_exec_machine *mach,
2687       unsigned attrib,
2688       unsigned chan,
2689       float ofs_x,
2690       float ofs_y,
2691       union tgsi_exec_channel *out_chan)
2692 {
2693    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2694    const float dady = mach->InterpCoefs[attrib].dady[chan];
2695    const float delta = ofs_x * dadx + ofs_y * dady;
2696    out_chan->f[0] += delta;
2697    out_chan->f[1] += delta;
2698    out_chan->f[2] += delta;
2699    out_chan->f[3] += delta;
2700 }
2701 
2702 static void
eval_linear_coef(struct tgsi_exec_machine * mach,unsigned attrib,unsigned chan)2703 eval_linear_coef(struct tgsi_exec_machine *mach,
2704                  unsigned attrib,
2705                  unsigned chan)
2706 {
2707    const float x = mach->QuadPos.xyzw[0].f[0];
2708    const float y = mach->QuadPos.xyzw[1].f[0];
2709    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2710    const float dady = mach->InterpCoefs[attrib].dady[chan];
2711    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2712 
2713    mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2714    mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2715    mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2716    mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2717 }
2718 
2719 /**
2720  * Evaluate a perspective-valued coefficient at the position of the
2721  * current quad.
2722  */
2723 
2724 static void
interp_perspective_offset(const struct tgsi_exec_machine * mach,unsigned attrib,unsigned chan,float ofs_x,float ofs_y,union tgsi_exec_channel * out_chan)2725 interp_perspective_offset(
2726    const struct tgsi_exec_machine *mach,
2727    unsigned attrib,
2728    unsigned chan,
2729    float ofs_x,
2730    float ofs_y,
2731    union tgsi_exec_channel *out_chan)
2732 {
2733    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2734    const float dady = mach->InterpCoefs[attrib].dady[chan];
2735    const float *w = mach->QuadPos.xyzw[3].f;
2736    const float delta = ofs_x * dadx + ofs_y * dady;
2737    out_chan->f[0] += delta / w[0];
2738    out_chan->f[1] += delta / w[1];
2739    out_chan->f[2] += delta / w[2];
2740    out_chan->f[3] += delta / w[3];
2741 }
2742 
2743 static void
eval_perspective_coef(struct tgsi_exec_machine * mach,unsigned attrib,unsigned chan)2744 eval_perspective_coef(
2745    struct tgsi_exec_machine *mach,
2746    unsigned attrib,
2747    unsigned chan )
2748 {
2749    const float x = mach->QuadPos.xyzw[0].f[0];
2750    const float y = mach->QuadPos.xyzw[1].f[0];
2751    const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2752    const float dady = mach->InterpCoefs[attrib].dady[chan];
2753    const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2754    const float *w = mach->QuadPos.xyzw[3].f;
2755    /* divide by W here */
2756    mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2757    mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2758    mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2759    mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2760 }
2761 
2762 
/** Callback that evaluates one channel of one input attribute. */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
2767 
2768 static void
exec_declaration(struct tgsi_exec_machine * mach,const struct tgsi_full_declaration * decl)2769 exec_declaration(struct tgsi_exec_machine *mach,
2770                  const struct tgsi_full_declaration *decl)
2771 {
2772    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
2773       mach->SamplerViews[decl->Range.First] = decl->SamplerView;
2774       return;
2775    }
2776 
2777    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
2778       if (decl->Declaration.File == TGSI_FILE_INPUT) {
2779          unsigned first, last, mask;
2780 
2781          first = decl->Range.First;
2782          last = decl->Range.Last;
2783          mask = decl->Declaration.UsageMask;
2784 
2785          /* XXX we could remove this special-case code since
2786           * mach->InterpCoefs[first].a0 should already have the
2787           * front/back-face value.  But we should first update the
2788           * ureg code to emit the right UsageMask value (WRITEMASK_X).
2789           * Then, we could remove the tgsi_exec_machine::Face field.
2790           */
2791          /* XXX make FACE a system value */
2792          if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2793             unsigned i;
2794 
2795             assert(decl->Semantic.Index == 0);
2796             assert(first == last);
2797 
2798             for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2799                mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2800             }
2801          } else {
2802             eval_coef_func eval;
2803             apply_sample_offset_func interp;
2804             unsigned i, j;
2805 
2806             switch (decl->Interp.Interpolate) {
2807             case TGSI_INTERPOLATE_CONSTANT:
2808                eval = eval_constant_coef;
2809                interp = interp_constant_offset;
2810                break;
2811 
2812             case TGSI_INTERPOLATE_LINEAR:
2813                eval = eval_linear_coef;
2814                interp = interp_linear_offset;
2815                break;
2816 
2817             case TGSI_INTERPOLATE_PERSPECTIVE:
2818                eval = eval_perspective_coef;
2819                interp = interp_perspective_offset;
2820                break;
2821 
2822             case TGSI_INTERPOLATE_COLOR:
2823                eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
2824                interp = mach->flatshade_color ? interp_constant_offset : interp_perspective_offset;
2825                break;
2826 
2827             default:
2828                assert(0);
2829                return;
2830             }
2831 
2832             for (i = first; i <= last; i++)
2833                mach->InputSampleOffsetApply[i] = interp;
2834 
2835             for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
2836                if (mask & (1 << j)) {
2837                   for (i = first; i <= last; i++) {
2838                      eval(mach, i, j);
2839                   }
2840                }
2841             }
2842          }
2843 
2844          if (DEBUG_EXECUTION) {
2845             unsigned i, j;
2846             for (i = first; i <= last; ++i) {
2847                debug_printf("IN[%2u] = ", i);
2848                for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
2849                   if (j > 0) {
2850                      debug_printf("         ");
2851                   }
2852                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
2853                                mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
2854                                mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
2855                                mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
2856                                mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
2857                }
2858             }
2859          }
2860       }
2861    }
2862 
2863 }
2864 
/** Per-channel operation taking one source channel. */
typedef void (*micro_unary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src);
2867 
2868 static void
exec_scalar_unary(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_unary_op op,enum tgsi_exec_datatype src_datatype)2869 exec_scalar_unary(struct tgsi_exec_machine *mach,
2870                   const struct tgsi_full_instruction *inst,
2871                   micro_unary_op op,
2872                   enum tgsi_exec_datatype src_datatype)
2873 {
2874    unsigned int chan;
2875    union tgsi_exec_channel src;
2876    union tgsi_exec_channel dst;
2877 
2878    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
2879    op(&dst, &src);
2880    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2881       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2882          store_dest(mach, &dst, &inst->Dst[0], inst, chan);
2883       }
2884    }
2885 }
2886 
2887 static void
exec_vector_unary(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_unary_op op,enum tgsi_exec_datatype src_datatype)2888 exec_vector_unary(struct tgsi_exec_machine *mach,
2889                   const struct tgsi_full_instruction *inst,
2890                   micro_unary_op op,
2891                   enum tgsi_exec_datatype src_datatype)
2892 {
2893    unsigned int chan;
2894    struct tgsi_exec_vector dst;
2895 
2896    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2897       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2898          union tgsi_exec_channel src;
2899 
2900          fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2901          op(&dst.xyzw[chan], &src);
2902       }
2903    }
2904    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2905       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2906          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
2907       }
2908    }
2909 }
2910 
/** Per-channel operation taking two source channels. */
typedef void (*micro_binary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1);
2914 
2915 static void
exec_scalar_binary(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_binary_op op,enum tgsi_exec_datatype src_datatype)2916 exec_scalar_binary(struct tgsi_exec_machine *mach,
2917                    const struct tgsi_full_instruction *inst,
2918                    micro_binary_op op,
2919                    enum tgsi_exec_datatype src_datatype)
2920 {
2921    unsigned int chan;
2922    union tgsi_exec_channel src[2];
2923    union tgsi_exec_channel dst;
2924 
2925    fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
2926    fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
2927    op(&dst, &src[0], &src[1]);
2928    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2929       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2930          store_dest(mach, &dst, &inst->Dst[0], inst, chan);
2931       }
2932    }
2933 }
2934 
2935 static void
exec_vector_binary(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_binary_op op,enum tgsi_exec_datatype src_datatype)2936 exec_vector_binary(struct tgsi_exec_machine *mach,
2937                    const struct tgsi_full_instruction *inst,
2938                    micro_binary_op op,
2939                    enum tgsi_exec_datatype src_datatype)
2940 {
2941    unsigned int chan;
2942    struct tgsi_exec_vector dst;
2943 
2944    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2945       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2946          union tgsi_exec_channel src[2];
2947 
2948          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2949          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2950          op(&dst.xyzw[chan], &src[0], &src[1]);
2951       }
2952    }
2953    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2954       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2955          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
2956       }
2957    }
2958 }
2959 
/** Per-channel operation taking three source channels. */
typedef void (*micro_trinary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2);
2964 
2965 static void
exec_vector_trinary(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_trinary_op op,enum tgsi_exec_datatype src_datatype)2966 exec_vector_trinary(struct tgsi_exec_machine *mach,
2967                     const struct tgsi_full_instruction *inst,
2968                     micro_trinary_op op,
2969                     enum tgsi_exec_datatype src_datatype)
2970 {
2971    unsigned int chan;
2972    struct tgsi_exec_vector dst;
2973 
2974    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2975       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2976          union tgsi_exec_channel src[3];
2977 
2978          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
2979          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
2980          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
2981          op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
2982       }
2983    }
2984    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2985       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2986          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
2987       }
2988    }
2989 }
2990 
/** Per-channel operation taking four source channels. */
typedef void (*micro_quaternary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3);
2996 
2997 static void
exec_vector_quaternary(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_quaternary_op op,enum tgsi_exec_datatype src_datatype)2998 exec_vector_quaternary(struct tgsi_exec_machine *mach,
2999                        const struct tgsi_full_instruction *inst,
3000                        micro_quaternary_op op,
3001                        enum tgsi_exec_datatype src_datatype)
3002 {
3003    unsigned int chan;
3004    struct tgsi_exec_vector dst;
3005 
3006    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3007       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3008          union tgsi_exec_channel src[4];
3009 
3010          fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3011          fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3012          fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3013          fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
3014          op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
3015       }
3016    }
3017    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3018       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3019          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
3020       }
3021    }
3022 }
3023 
3024 static void
exec_dp3(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3025 exec_dp3(struct tgsi_exec_machine *mach,
3026          const struct tgsi_full_instruction *inst)
3027 {
3028    unsigned int chan;
3029    union tgsi_exec_channel arg[3];
3030 
3031    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3032    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3033    micro_mul(&arg[2], &arg[0], &arg[1]);
3034 
3035    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
3036       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3037       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3038       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3039    }
3040 
3041    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3042       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3043          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan);
3044       }
3045    }
3046 }
3047 
3048 static void
exec_dp4(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3049 exec_dp4(struct tgsi_exec_machine *mach,
3050          const struct tgsi_full_instruction *inst)
3051 {
3052    unsigned int chan;
3053    union tgsi_exec_channel arg[3];
3054 
3055    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3056    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3057    micro_mul(&arg[2], &arg[0], &arg[1]);
3058 
3059    for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
3060       fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3061       fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3062       micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3063    }
3064 
3065    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3066       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3067          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan);
3068       }
3069    }
3070 }
3071 
3072 static void
exec_dp2(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3073 exec_dp2(struct tgsi_exec_machine *mach,
3074          const struct tgsi_full_instruction *inst)
3075 {
3076    unsigned int chan;
3077    union tgsi_exec_channel arg[3];
3078 
3079    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3080    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3081    micro_mul(&arg[2], &arg[0], &arg[1]);
3082 
3083    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3084    fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3085    micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3086 
3087    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3088       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3089          store_dest(mach, &arg[2], &inst->Dst[0], inst, chan);
3090       }
3091    }
3092 }
3093 
3094 static void
exec_pk2h(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3095 exec_pk2h(struct tgsi_exec_machine *mach,
3096           const struct tgsi_full_instruction *inst)
3097 {
3098    unsigned chan;
3099    union tgsi_exec_channel arg[2], dst;
3100 
3101    fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3102    fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3103    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3104       dst.u[chan] = _mesa_float_to_half(arg[0].f[chan]) |
3105          (_mesa_float_to_half(arg[1].f[chan]) << 16);
3106    }
3107    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3108       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3109          store_dest(mach, &dst, &inst->Dst[0], inst, chan);
3110       }
3111    }
3112 }
3113 
3114 static void
exec_up2h(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3115 exec_up2h(struct tgsi_exec_machine *mach,
3116           const struct tgsi_full_instruction *inst)
3117 {
3118    unsigned chan;
3119    union tgsi_exec_channel arg, dst[2];
3120 
3121    fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3122    for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3123       dst[0].f[chan] = _mesa_half_to_float(arg.u[chan] & 0xffff);
3124       dst[1].f[chan] = _mesa_half_to_float(arg.u[chan] >> 16);
3125    }
3126    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3127       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3128          store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan);
3129       }
3130    }
3131 }
3132 
3133 static void
micro_ucmp(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1,const union tgsi_exec_channel * src2)3134 micro_ucmp(union tgsi_exec_channel *dst,
3135            const union tgsi_exec_channel *src0,
3136            const union tgsi_exec_channel *src1,
3137            const union tgsi_exec_channel *src2)
3138 {
3139    dst->f[0] = src0->u[0] ? src1->f[0] : src2->f[0];
3140    dst->f[1] = src0->u[1] ? src1->f[1] : src2->f[1];
3141    dst->f[2] = src0->u[2] ? src1->f[2] : src2->f[2];
3142    dst->f[3] = src0->u[3] ? src1->f[3] : src2->f[3];
3143 }
3144 
3145 static void
exec_ucmp(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3146 exec_ucmp(struct tgsi_exec_machine *mach,
3147           const struct tgsi_full_instruction *inst)
3148 {
3149    unsigned int chan;
3150    struct tgsi_exec_vector dst;
3151 
3152    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3153       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3154          union tgsi_exec_channel src[3];
3155 
3156          fetch_source(mach, &src[0], &inst->Src[0], chan,
3157                       TGSI_EXEC_DATA_UINT);
3158          fetch_source(mach, &src[1], &inst->Src[1], chan,
3159                       TGSI_EXEC_DATA_FLOAT);
3160          fetch_source(mach, &src[2], &inst->Src[2], chan,
3161                       TGSI_EXEC_DATA_FLOAT);
3162          micro_ucmp(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3163       }
3164    }
3165    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3166       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3167          store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
3168       }
3169    }
3170 }
3171 
3172 static void
exec_dst(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3173 exec_dst(struct tgsi_exec_machine *mach,
3174          const struct tgsi_full_instruction *inst)
3175 {
3176    union tgsi_exec_channel r[2];
3177    union tgsi_exec_channel d[4];
3178 
3179    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3180       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3181       fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3182       micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
3183    }
3184    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3185       fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3186    }
3187    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3188       fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3189    }
3190 
3191    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3192       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X);
3193    }
3194    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3195       store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y);
3196    }
3197    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3198       store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z);
3199    }
3200    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3201       store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W);
3202    }
3203 }
3204 
3205 static void
exec_log(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3206 exec_log(struct tgsi_exec_machine *mach,
3207          const struct tgsi_full_instruction *inst)
3208 {
3209    union tgsi_exec_channel r[3];
3210 
3211    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3212    micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
3213    micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
3214    micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
3215    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3216       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X);
3217    }
3218    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3219       micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
3220       micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
3221       store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y);
3222    }
3223    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3224       store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z);
3225    }
3226    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3227       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W);
3228    }
3229 }
3230 
3231 static void
exec_exp(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3232 exec_exp(struct tgsi_exec_machine *mach,
3233          const struct tgsi_full_instruction *inst)
3234 {
3235    union tgsi_exec_channel r[3];
3236 
3237    fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3238    micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
3239    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3240       micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
3241       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X);
3242    }
3243    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3244       micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
3245       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y);
3246    }
3247    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3248       micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
3249       store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z);
3250    }
3251    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3252       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W);
3253    }
3254 }
3255 
3256 static void
exec_lit(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3257 exec_lit(struct tgsi_exec_machine *mach,
3258          const struct tgsi_full_instruction *inst)
3259 {
3260    union tgsi_exec_channel r[3];
3261    union tgsi_exec_channel d[3];
3262 
3263    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3264       fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3265       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3266          fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3267          micro_max(&r[1], &r[1], &ZeroVec);
3268 
3269          fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3270          micro_min(&r[2], &r[2], &P128Vec);
3271          micro_max(&r[2], &r[2], &M128Vec);
3272          micro_pow(&r[1], &r[1], &r[2]);
3273          micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3274          store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z);
3275       }
3276       if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3277          micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
3278          store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y);
3279       }
3280    }
3281    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3282       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X);
3283    }
3284 
3285    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3286       store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W);
3287    }
3288 }
3289 
3290 static void
exec_break(struct tgsi_exec_machine * mach)3291 exec_break(struct tgsi_exec_machine *mach)
3292 {
3293    if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3294       /* turn off loop channels for each enabled exec channel */
3295       mach->LoopMask &= ~mach->ExecMask;
3296       /* Todo: if mach->LoopMask == 0, jump to end of loop */
3297       UPDATE_EXEC_MASK(mach);
3298    } else {
3299       assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3300 
3301       mach->Switch.mask = 0x0;
3302 
3303       UPDATE_EXEC_MASK(mach);
3304    }
3305 }
3306 
3307 static void
exec_switch(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3308 exec_switch(struct tgsi_exec_machine *mach,
3309             const struct tgsi_full_instruction *inst)
3310 {
3311    assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3312    assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3313 
3314    mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3315    fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3316    mach->Switch.mask = 0x0;
3317    mach->Switch.defaultMask = 0x0;
3318 
3319    mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3320    mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3321 
3322    UPDATE_EXEC_MASK(mach);
3323 }
3324 
3325 static void
exec_case(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3326 exec_case(struct tgsi_exec_machine *mach,
3327           const struct tgsi_full_instruction *inst)
3328 {
3329    unsigned prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3330    union tgsi_exec_channel src;
3331    unsigned mask = 0;
3332 
3333    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3334 
3335    if (mach->Switch.selector.u[0] == src.u[0]) {
3336       mask |= 0x1;
3337    }
3338    if (mach->Switch.selector.u[1] == src.u[1]) {
3339       mask |= 0x2;
3340    }
3341    if (mach->Switch.selector.u[2] == src.u[2]) {
3342       mask |= 0x4;
3343    }
3344    if (mach->Switch.selector.u[3] == src.u[3]) {
3345       mask |= 0x8;
3346    }
3347 
3348    mach->Switch.defaultMask |= mask;
3349 
3350    mach->Switch.mask |= mask & prevMask;
3351 
3352    UPDATE_EXEC_MASK(mach);
3353 }
3354 
3355 /* FIXME: this will only work if default is last */
3356 static void
exec_default(struct tgsi_exec_machine * mach)3357 exec_default(struct tgsi_exec_machine *mach)
3358 {
3359    unsigned prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3360 
3361    mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3362 
3363    UPDATE_EXEC_MASK(mach);
3364 }
3365 
3366 static void
exec_endswitch(struct tgsi_exec_machine * mach)3367 exec_endswitch(struct tgsi_exec_machine *mach)
3368 {
3369    mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3370    mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3371 
3372    UPDATE_EXEC_MASK(mach);
3373 }
3374 
/*
 * Function-pointer types for the 64-bit ("double channel") micro-ops.
 */

/* 64-bit result from 64-bit source(s).  The binary and trinary executors
 * pass an array of two or three operands through `src`. */
typedef void (*micro_dop)(union tgsi_double_channel *dst,
                          const union tgsi_double_channel *src);

/* 64-bit result from one 64-bit operand and one 32-bit operand. */
typedef void (*micro_dop_sop)(union tgsi_double_channel *dst,
                              const union tgsi_double_channel *src0,
                              union tgsi_exec_channel *src1);

/* 64-bit result from a 32-bit source. */
typedef void (*micro_dop_s)(union tgsi_double_channel *dst,
                            const union tgsi_exec_channel *src);

/* 32-bit result from a 64-bit source. */
typedef void (*micro_sop_d)(union tgsi_exec_channel *dst,
                            const union tgsi_double_channel *src);
3387 
3388 static void
fetch_double_channel(struct tgsi_exec_machine * mach,union tgsi_double_channel * chan,const struct tgsi_full_src_register * reg,unsigned chan_0,unsigned chan_1)3389 fetch_double_channel(struct tgsi_exec_machine *mach,
3390                      union tgsi_double_channel *chan,
3391                      const struct tgsi_full_src_register *reg,
3392                      unsigned chan_0,
3393                      unsigned chan_1)
3394 {
3395    union tgsi_exec_channel src[2];
3396    unsigned i;
3397 
3398    fetch_source_d(mach, &src[0], reg, chan_0);
3399    fetch_source_d(mach, &src[1], reg, chan_1);
3400 
3401    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3402       chan->u[i][0] = src[0].u[i];
3403       chan->u[i][1] = src[1].u[i];
3404    }
3405    assert(!reg->Register.Absolute);
3406    assert(!reg->Register.Negate);
3407 }
3408 
3409 static void
store_double_channel(struct tgsi_exec_machine * mach,const union tgsi_double_channel * chan,const struct tgsi_full_dst_register * reg,const struct tgsi_full_instruction * inst,unsigned chan_0,unsigned chan_1)3410 store_double_channel(struct tgsi_exec_machine *mach,
3411                      const union tgsi_double_channel *chan,
3412                      const struct tgsi_full_dst_register *reg,
3413                      const struct tgsi_full_instruction *inst,
3414                      unsigned chan_0,
3415                      unsigned chan_1)
3416 {
3417    union tgsi_exec_channel dst[2];
3418    unsigned i;
3419    union tgsi_double_channel temp;
3420    const unsigned execmask = mach->ExecMask;
3421 
3422    if (!inst->Instruction.Saturate) {
3423       for (i = 0; i < TGSI_QUAD_SIZE; i++)
3424          if (execmask & (1 << i)) {
3425             dst[0].u[i] = chan->u[i][0];
3426             dst[1].u[i] = chan->u[i][1];
3427          }
3428    }
3429    else {
3430       for (i = 0; i < TGSI_QUAD_SIZE; i++)
3431          if (execmask & (1 << i)) {
3432             if (chan->d[i] < 0.0 || isnan(chan->d[i]))
3433                temp.d[i] = 0.0;
3434             else if (chan->d[i] > 1.0)
3435                temp.d[i] = 1.0;
3436             else
3437                temp.d[i] = chan->d[i];
3438 
3439             dst[0].u[i] = temp.u[i][0];
3440             dst[1].u[i] = temp.u[i][1];
3441          }
3442    }
3443 
3444    store_dest_double(mach, &dst[0], reg, chan_0);
3445    if (chan_1 != (unsigned)-1)
3446       store_dest_double(mach, &dst[1], reg, chan_1);
3447 }
3448 
3449 static void
exec_double_unary(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_dop op)3450 exec_double_unary(struct tgsi_exec_machine *mach,
3451                   const struct tgsi_full_instruction *inst,
3452                   micro_dop op)
3453 {
3454    union tgsi_double_channel src;
3455    union tgsi_double_channel dst;
3456 
3457    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3458       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3459       op(&dst, &src);
3460       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3461    }
3462    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3463       fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3464       op(&dst, &src);
3465       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3466    }
3467 }
3468 
3469 static void
exec_double_binary(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_dop op,enum tgsi_exec_datatype dst_datatype)3470 exec_double_binary(struct tgsi_exec_machine *mach,
3471                    const struct tgsi_full_instruction *inst,
3472                    micro_dop op,
3473                    enum tgsi_exec_datatype dst_datatype)
3474 {
3475    union tgsi_double_channel src[2];
3476    union tgsi_double_channel dst;
3477    int first_dest_chan, second_dest_chan;
3478    int wmask;
3479 
3480    wmask = inst->Dst[0].Register.WriteMask;
3481    /* these are & because of the way DSLT etc store their destinations */
3482    if (wmask & TGSI_WRITEMASK_XY) {
3483       first_dest_chan = TGSI_CHAN_X;
3484       second_dest_chan = TGSI_CHAN_Y;
3485       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3486          first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
3487          second_dest_chan = -1;
3488       }
3489 
3490       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3491       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3492       op(&dst, src);
3493       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3494    }
3495 
3496    if (wmask & TGSI_WRITEMASK_ZW) {
3497       first_dest_chan = TGSI_CHAN_Z;
3498       second_dest_chan = TGSI_CHAN_W;
3499       if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3500          first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
3501          second_dest_chan = -1;
3502       }
3503 
3504       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3505       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3506       op(&dst, src);
3507       store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3508    }
3509 }
3510 
3511 static void
exec_double_trinary(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_dop op)3512 exec_double_trinary(struct tgsi_exec_machine *mach,
3513                     const struct tgsi_full_instruction *inst,
3514                     micro_dop op)
3515 {
3516    union tgsi_double_channel src[3];
3517    union tgsi_double_channel dst;
3518 
3519    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3520       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3521       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3522       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
3523       op(&dst, src);
3524       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3525    }
3526    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3527       fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3528       fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3529       fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
3530       op(&dst, src);
3531       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3532    }
3533 }
3534 
3535 static void
exec_dldexp(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3536 exec_dldexp(struct tgsi_exec_machine *mach,
3537             const struct tgsi_full_instruction *inst)
3538 {
3539    union tgsi_double_channel src0;
3540    union tgsi_exec_channel src1;
3541    union tgsi_double_channel dst;
3542    int wmask;
3543 
3544    wmask = inst->Dst[0].Register.WriteMask;
3545    if (wmask & TGSI_WRITEMASK_XY) {
3546       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3547       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3548       micro_dldexp(&dst, &src0, &src1);
3549       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3550    }
3551 
3552    if (wmask & TGSI_WRITEMASK_ZW) {
3553       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3554       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3555       micro_dldexp(&dst, &src0, &src1);
3556       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3557    }
3558 }
3559 
3560 static void
exec_arg0_64_arg1_32(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_dop_sop op)3561 exec_arg0_64_arg1_32(struct tgsi_exec_machine *mach,
3562             const struct tgsi_full_instruction *inst,
3563             micro_dop_sop op)
3564 {
3565    union tgsi_double_channel src0;
3566    union tgsi_exec_channel src1;
3567    union tgsi_double_channel dst;
3568    int wmask;
3569 
3570    wmask = inst->Dst[0].Register.WriteMask;
3571    if (wmask & TGSI_WRITEMASK_XY) {
3572       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3573       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3574       op(&dst, &src0, &src1);
3575       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3576    }
3577 
3578    if (wmask & TGSI_WRITEMASK_ZW) {
3579       fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3580       fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3581       op(&dst, &src0, &src1);
3582       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3583    }
3584 }
3585 
3586 static int
get_image_coord_dim(unsigned tgsi_tex)3587 get_image_coord_dim(unsigned tgsi_tex)
3588 {
3589    int dim;
3590    switch (tgsi_tex) {
3591    case TGSI_TEXTURE_BUFFER:
3592    case TGSI_TEXTURE_1D:
3593       dim = 1;
3594       break;
3595    case TGSI_TEXTURE_2D:
3596    case TGSI_TEXTURE_RECT:
3597    case TGSI_TEXTURE_1D_ARRAY:
3598    case TGSI_TEXTURE_2D_MSAA:
3599       dim = 2;
3600       break;
3601    case TGSI_TEXTURE_3D:
3602    case TGSI_TEXTURE_CUBE:
3603    case TGSI_TEXTURE_2D_ARRAY:
3604    case TGSI_TEXTURE_2D_ARRAY_MSAA:
3605    case TGSI_TEXTURE_CUBE_ARRAY:
3606       dim = 3;
3607       break;
3608    default:
3609       assert(!"unknown texture target");
3610       dim = 0;
3611       break;
3612    }
3613 
3614    return dim;
3615 }
3616 
3617 static int
get_image_coord_sample(unsigned tgsi_tex)3618 get_image_coord_sample(unsigned tgsi_tex)
3619 {
3620    int sample = 0;
3621    switch (tgsi_tex) {
3622    case TGSI_TEXTURE_2D_MSAA:
3623       sample = 3;
3624       break;
3625    case TGSI_TEXTURE_2D_ARRAY_MSAA:
3626       sample = 4;
3627       break;
3628    default:
3629       break;
3630    }
3631    return sample;
3632 }
3633 
3634 static void
exec_load_img(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3635 exec_load_img(struct tgsi_exec_machine *mach,
3636               const struct tgsi_full_instruction *inst)
3637 {
3638    union tgsi_exec_channel r[4], sample_r;
3639    unsigned unit;
3640    int sample;
3641    int i, j;
3642    int dim;
3643    unsigned chan;
3644    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3645    struct tgsi_image_params params;
3646 
3647    unit = fetch_sampler_unit(mach, inst, 0);
3648    dim = get_image_coord_dim(inst->Memory.Texture);
3649    sample = get_image_coord_sample(inst->Memory.Texture);
3650    assert(dim <= 3);
3651 
3652    params.execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
3653    params.unit = unit;
3654    params.tgsi_tex_instr = inst->Memory.Texture;
3655    params.format = inst->Memory.Format;
3656 
3657    for (i = 0; i < dim; i++) {
3658       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
3659    }
3660 
3661    if (sample)
3662       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
3663 
3664    mach->Image->load(mach->Image, &params,
3665                      r[0].i, r[1].i, r[2].i, sample_r.i,
3666                      rgba);
3667    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3668       r[0].f[j] = rgba[0][j];
3669       r[1].f[j] = rgba[1][j];
3670       r[2].f[j] = rgba[2][j];
3671       r[3].f[j] = rgba[3][j];
3672    }
3673    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3674       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3675          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
3676       }
3677    }
3678 }
3679 
3680 static void
exec_load_membuf(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3681 exec_load_membuf(struct tgsi_exec_machine *mach,
3682                  const struct tgsi_full_instruction *inst)
3683 {
3684    uint32_t unit = fetch_sampler_unit(mach, inst, 0);
3685 
3686    uint32_t size;
3687    const char *ptr;
3688    switch (inst->Src[0].Register.File) {
3689    case TGSI_FILE_MEMORY:
3690       ptr = mach->LocalMem;
3691       size = mach->LocalMemSize;
3692       break;
3693 
3694    case TGSI_FILE_BUFFER:
3695       ptr = mach->Buffer->lookup(mach->Buffer, unit, &size);
3696       break;
3697 
3698    case TGSI_FILE_CONSTANT:
3699       if (unit < ARRAY_SIZE(mach->Consts)) {
3700          ptr = mach->Consts[unit];
3701          size = mach->ConstsSize[unit];
3702       } else {
3703          ptr = NULL;
3704          size = 0;
3705       }
3706       break;
3707 
3708    default:
3709       unreachable("unsupported TGSI_OPCODE_LOAD file");
3710    }
3711 
3712    union tgsi_exec_channel offset;
3713    IFETCH(&offset, 1, TGSI_CHAN_X);
3714 
3715    assert(inst->Dst[0].Register.WriteMask);
3716    uint32_t load_size = util_last_bit(inst->Dst[0].Register.WriteMask) * 4;
3717 
3718    union tgsi_exec_channel rgba[TGSI_NUM_CHANNELS];
3719    memset(&rgba, 0, sizeof(rgba));
3720    for (int j = 0; j < TGSI_QUAD_SIZE; j++) {
3721       if (size >= load_size && offset.u[j] <= (size - load_size)) {
3722          for (int chan = 0; chan < load_size / 4; chan++)
3723             rgba[chan].u[j] = *(uint32_t *)(ptr + offset.u[j] + chan * 4);
3724       }
3725    }
3726 
3727    for (int chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3728       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3729          store_dest(mach, &rgba[chan], &inst->Dst[0], inst, chan);
3730       }
3731    }
3732 }
3733 
3734 static void
exec_load(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3735 exec_load(struct tgsi_exec_machine *mach,
3736           const struct tgsi_full_instruction *inst)
3737 {
3738    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
3739       exec_load_img(mach, inst);
3740    else
3741       exec_load_membuf(mach, inst);
3742 }
3743 
3744 static unsigned
fetch_store_img_unit(struct tgsi_exec_machine * mach,const struct tgsi_full_dst_register * dst)3745 fetch_store_img_unit(struct tgsi_exec_machine *mach,
3746                      const struct tgsi_full_dst_register *dst)
3747 {
3748    unsigned unit = 0;
3749    int i;
3750    if (dst->Register.Indirect) {
3751       union tgsi_exec_channel indir_index, index2;
3752       const unsigned execmask = mach->ExecMask;
3753       index2.i[0] =
3754       index2.i[1] =
3755       index2.i[2] =
3756       index2.i[3] = dst->Indirect.Index;
3757 
3758       fetch_src_file_channel(mach,
3759                              dst->Indirect.File,
3760                              dst->Indirect.Swizzle,
3761                              &index2,
3762                              &ZeroVec,
3763                              &indir_index);
3764       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3765          if (execmask & (1 << i)) {
3766             unit = dst->Register.Index + indir_index.i[i];
3767             break;
3768          }
3769       }
3770    } else {
3771       unit = dst->Register.Index;
3772    }
3773    return unit;
3774 }
3775 
3776 static void
exec_store_img(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3777 exec_store_img(struct tgsi_exec_machine *mach,
3778                const struct tgsi_full_instruction *inst)
3779 {
3780    union tgsi_exec_channel r[3], sample_r;
3781    union tgsi_exec_channel value[4];
3782    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3783    struct tgsi_image_params params;
3784    int dim;
3785    int sample;
3786    int i, j;
3787    unsigned unit;
3788    unit = fetch_store_img_unit(mach, &inst->Dst[0]);
3789    dim = get_image_coord_dim(inst->Memory.Texture);
3790    sample = get_image_coord_sample(inst->Memory.Texture);
3791    assert(dim <= 3);
3792 
3793    params.execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
3794    params.unit = unit;
3795    params.tgsi_tex_instr = inst->Memory.Texture;
3796    params.format = inst->Memory.Format;
3797 
3798    for (i = 0; i < dim; i++) {
3799       IFETCH(&r[i], 0, TGSI_CHAN_X + i);
3800    }
3801 
3802    for (i = 0; i < 4; i++) {
3803       FETCH(&value[i], 1, TGSI_CHAN_X + i);
3804    }
3805    if (sample)
3806       IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
3807 
3808    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3809       rgba[0][j] = value[0].f[j];
3810       rgba[1][j] = value[1].f[j];
3811       rgba[2][j] = value[2].f[j];
3812       rgba[3][j] = value[3].f[j];
3813    }
3814 
3815    mach->Image->store(mach->Image, &params,
3816                       r[0].i, r[1].i, r[2].i, sample_r.i,
3817                       rgba);
3818 }
3819 
3820 
3821 static void
exec_store_membuf(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3822 exec_store_membuf(struct tgsi_exec_machine *mach,
3823                const struct tgsi_full_instruction *inst)
3824 {
3825    uint32_t unit = fetch_store_img_unit(mach, &inst->Dst[0]);
3826    uint32_t size;
3827 
3828    int execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
3829 
3830    const char *ptr;
3831    switch (inst->Dst[0].Register.File) {
3832    case TGSI_FILE_MEMORY:
3833       ptr = mach->LocalMem;
3834       size = mach->LocalMemSize;
3835       break;
3836 
3837    case TGSI_FILE_BUFFER:
3838       ptr = mach->Buffer->lookup(mach->Buffer, unit, &size);
3839       break;
3840 
3841    default:
3842       unreachable("unsupported TGSI_OPCODE_STORE file");
3843    }
3844 
3845    union tgsi_exec_channel offset;
3846    IFETCH(&offset, 0, TGSI_CHAN_X);
3847 
3848    union tgsi_exec_channel value[4];
3849    for (int i = 0; i < 4; i++)
3850       FETCH(&value[i], 1, TGSI_CHAN_X + i);
3851 
3852    for (int j = 0; j < TGSI_QUAD_SIZE; j++) {
3853       if (!(execmask & (1 << j)))
3854          continue;
3855       if (size < offset.u[j])
3856          continue;
3857 
3858       uint32_t *invocation_ptr = (uint32_t *)(ptr + offset.u[j]);
3859       uint32_t size_avail = size - offset.u[j];
3860 
3861       for (int chan = 0; chan < MIN2(4, size_avail / 4); chan++) {
3862          if (inst->Dst[0].Register.WriteMask & (1 << chan))
3863             memcpy(&invocation_ptr[chan], &value[chan].u[j], 4);
3864       }
3865    }
3866 }
3867 
3868 static void
exec_store(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3869 exec_store(struct tgsi_exec_machine *mach,
3870            const struct tgsi_full_instruction *inst)
3871 {
3872    if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
3873       exec_store_img(mach, inst);
3874    else
3875       exec_store_membuf(mach, inst);
3876 }
3877 
3878 static void
exec_atomop_img(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3879 exec_atomop_img(struct tgsi_exec_machine *mach,
3880                 const struct tgsi_full_instruction *inst)
3881 {
3882    union tgsi_exec_channel r[4], sample_r;
3883    union tgsi_exec_channel value[4], value2[4];
3884    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3885    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3886    struct tgsi_image_params params;
3887    int dim;
3888    int sample;
3889    int i, j;
3890    unsigned unit, chan;
3891    unit = fetch_sampler_unit(mach, inst, 0);
3892    dim = get_image_coord_dim(inst->Memory.Texture);
3893    sample = get_image_coord_sample(inst->Memory.Texture);
3894    assert(dim <= 3);
3895 
3896    params.execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
3897    params.unit = unit;
3898    params.tgsi_tex_instr = inst->Memory.Texture;
3899    params.format = inst->Memory.Format;
3900 
3901    for (i = 0; i < dim; i++) {
3902       IFETCH(&r[i], 1, TGSI_CHAN_X + i);
3903    }
3904 
3905    for (i = 0; i < 4; i++) {
3906       FETCH(&value[i], 2, TGSI_CHAN_X + i);
3907       if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
3908          FETCH(&value2[i], 3, TGSI_CHAN_X + i);
3909    }
3910    if (sample)
3911       IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
3912 
3913    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3914       rgba[0][j] = value[0].f[j];
3915       rgba[1][j] = value[1].f[j];
3916       rgba[2][j] = value[2].f[j];
3917       rgba[3][j] = value[3].f[j];
3918    }
3919    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
3920       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3921          rgba2[0][j] = value2[0].f[j];
3922          rgba2[1][j] = value2[1].f[j];
3923          rgba2[2][j] = value2[2].f[j];
3924          rgba2[3][j] = value2[3].f[j];
3925       }
3926    }
3927 
3928    mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
3929                    r[0].i, r[1].i, r[2].i, sample_r.i,
3930                    rgba, rgba2);
3931 
3932    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3933       r[0].f[j] = rgba[0][j];
3934       r[1].f[j] = rgba[1][j];
3935       r[2].f[j] = rgba[2][j];
3936       r[3].f[j] = rgba[3][j];
3937    }
3938    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3939       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3940          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
3941       }
3942    }
3943 }
3944 
3945 static void
exec_atomop_membuf(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)3946 exec_atomop_membuf(struct tgsi_exec_machine *mach,
3947                    const struct tgsi_full_instruction *inst)
3948 {
3949    union tgsi_exec_channel offset, r0, r1;
3950    unsigned chan, i;
3951    int execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
3952    IFETCH(&offset, 1, TGSI_CHAN_X);
3953 
3954    if (!(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X))
3955       return;
3956 
3957    void *ptr[TGSI_QUAD_SIZE];
3958    if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
3959       uint32_t unit = fetch_sampler_unit(mach, inst, 0);
3960       uint32_t size;
3961       char *buffer = mach->Buffer->lookup(mach->Buffer, unit, &size);
3962       for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
3963          if (likely(size >= 4 && offset.u[i] <= size - 4))
3964             ptr[i] = buffer + offset.u[i];
3965          else
3966             ptr[i] = NULL;
3967       }
3968    } else {
3969       assert(inst->Src[0].Register.File == TGSI_FILE_MEMORY);
3970 
3971       for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3972          if (likely(mach->LocalMemSize >= 4 && offset.u[i] <= mach->LocalMemSize - 4))
3973             ptr[i] = (char *)mach->LocalMem + offset.u[i];
3974          else
3975             ptr[i] = NULL;
3976       }
3977    }
3978 
3979    FETCH(&r0, 2, TGSI_CHAN_X);
3980    if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
3981       FETCH(&r1, 3, TGSI_CHAN_X);
3982 
3983    /* The load/op/store sequence has to happen inside the loop since ptr
3984     * may have the same ptr in some of the invocations.
3985     */
3986    for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
3987       if (!(execmask & (1 << i)))
3988          continue;
3989 
3990       uint32_t val = 0;
3991       if (ptr[i]) {
3992          memcpy(&val, ptr[i], sizeof(val));
3993 
3994          uint32_t result;
3995          switch (inst->Instruction.Opcode) {
3996          case TGSI_OPCODE_ATOMUADD:
3997             result = val + r0.u[i];
3998             break;
3999          case TGSI_OPCODE_ATOMXOR:
4000             result = val ^ r0.u[i];
4001             break;
4002          case TGSI_OPCODE_ATOMOR:
4003             result = val | r0.u[i];
4004             break;
4005          case TGSI_OPCODE_ATOMAND:
4006             result = val & r0.u[i];
4007             break;
4008          case TGSI_OPCODE_ATOMUMIN:
4009             result = MIN2(val, r0.u[i]);
4010             break;
4011          case TGSI_OPCODE_ATOMUMAX:
4012             result = MAX2(val, r0.u[i]);
4013             break;
4014          case TGSI_OPCODE_ATOMIMIN:
4015             result = MIN2((int32_t)val, r0.i[i]);
4016             break;
4017          case TGSI_OPCODE_ATOMIMAX:
4018             result = MAX2((int32_t)val, r0.i[i]);
4019             break;
4020          case TGSI_OPCODE_ATOMXCHG:
4021             result = r0.u[i];
4022             break;
4023          case TGSI_OPCODE_ATOMCAS:
4024             if (val == r0.u[i])
4025                result = r1.u[i];
4026             else
4027                result = val;
4028             break;
4029          case TGSI_OPCODE_ATOMFADD:
4030                result = fui(uif(val) + r0.f[i]);
4031             break;
4032          default:
4033             unreachable("bad atomic op");
4034          }
4035          memcpy(ptr[i], &result, sizeof(result));
4036       }
4037 
4038       r0.u[i] = val;
4039    }
4040 
4041    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
4042       store_dest(mach, &r0, &inst->Dst[0], inst, chan);
4043 }
4044 
4045 static void
exec_atomop(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)4046 exec_atomop(struct tgsi_exec_machine *mach,
4047             const struct tgsi_full_instruction *inst)
4048 {
4049    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4050       exec_atomop_img(mach, inst);
4051    else
4052       exec_atomop_membuf(mach, inst);
4053 }
4054 
4055 static void
exec_resq_img(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)4056 exec_resq_img(struct tgsi_exec_machine *mach,
4057               const struct tgsi_full_instruction *inst)
4058 {
4059    int result[4];
4060    union tgsi_exec_channel r[4];
4061    unsigned unit;
4062    int i, chan, j;
4063    struct tgsi_image_params params;
4064 
4065    unit = fetch_sampler_unit(mach, inst, 0);
4066 
4067    params.execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
4068    params.unit = unit;
4069    params.tgsi_tex_instr = inst->Memory.Texture;
4070    params.format = inst->Memory.Format;
4071 
4072    mach->Image->get_dims(mach->Image, &params, result);
4073 
4074    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4075       for (j = 0; j < 4; j++) {
4076          r[j].i[i] = result[j];
4077       }
4078    }
4079 
4080    for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4081       if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4082          store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
4083       }
4084    }
4085 }
4086 
4087 static void
exec_resq_buf(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)4088 exec_resq_buf(struct tgsi_exec_machine *mach,
4089               const struct tgsi_full_instruction *inst)
4090 {
4091    uint32_t unit = fetch_sampler_unit(mach, inst, 0);
4092    uint32_t size;
4093    (void)mach->Buffer->lookup(mach->Buffer, unit, &size);
4094 
4095    union tgsi_exec_channel r;
4096    for (int i = 0; i < TGSI_QUAD_SIZE; i++)
4097       r.i[i] = size;
4098 
4099    if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
4100       for (int chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4101          store_dest(mach, &r, &inst->Dst[0], inst, TGSI_CHAN_X);
4102       }
4103    }
4104 }
4105 
4106 static void
exec_resq(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)4107 exec_resq(struct tgsi_exec_machine *mach,
4108           const struct tgsi_full_instruction *inst)
4109 {
4110    if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4111       exec_resq_img(mach, inst);
4112    else
4113       exec_resq_buf(mach, inst);
4114 }
4115 
4116 static void
micro_f2u64(union tgsi_double_channel * dst,const union tgsi_exec_channel * src)4117 micro_f2u64(union tgsi_double_channel *dst,
4118             const union tgsi_exec_channel *src)
4119 {
4120    dst->u64[0] = (uint64_t)src->f[0];
4121    dst->u64[1] = (uint64_t)src->f[1];
4122    dst->u64[2] = (uint64_t)src->f[2];
4123    dst->u64[3] = (uint64_t)src->f[3];
4124 }
4125 
4126 static void
micro_f2i64(union tgsi_double_channel * dst,const union tgsi_exec_channel * src)4127 micro_f2i64(union tgsi_double_channel *dst,
4128             const union tgsi_exec_channel *src)
4129 {
4130    dst->i64[0] = (int64_t)src->f[0];
4131    dst->i64[1] = (int64_t)src->f[1];
4132    dst->i64[2] = (int64_t)src->f[2];
4133    dst->i64[3] = (int64_t)src->f[3];
4134 }
4135 
4136 static void
micro_u2i64(union tgsi_double_channel * dst,const union tgsi_exec_channel * src)4137 micro_u2i64(union tgsi_double_channel *dst,
4138             const union tgsi_exec_channel *src)
4139 {
4140    dst->u64[0] = (uint64_t)src->u[0];
4141    dst->u64[1] = (uint64_t)src->u[1];
4142    dst->u64[2] = (uint64_t)src->u[2];
4143    dst->u64[3] = (uint64_t)src->u[3];
4144 }
4145 
4146 static void
micro_i2i64(union tgsi_double_channel * dst,const union tgsi_exec_channel * src)4147 micro_i2i64(union tgsi_double_channel *dst,
4148             const union tgsi_exec_channel *src)
4149 {
4150    dst->i64[0] = (int64_t)src->i[0];
4151    dst->i64[1] = (int64_t)src->i[1];
4152    dst->i64[2] = (int64_t)src->i[2];
4153    dst->i64[3] = (int64_t)src->i[3];
4154 }
4155 
4156 static void
micro_d2u64(union tgsi_double_channel * dst,const union tgsi_double_channel * src)4157 micro_d2u64(union tgsi_double_channel *dst,
4158            const union tgsi_double_channel *src)
4159 {
4160    dst->u64[0] = (uint64_t)src->d[0];
4161    dst->u64[1] = (uint64_t)src->d[1];
4162    dst->u64[2] = (uint64_t)src->d[2];
4163    dst->u64[3] = (uint64_t)src->d[3];
4164 }
4165 
4166 static void
micro_d2i64(union tgsi_double_channel * dst,const union tgsi_double_channel * src)4167 micro_d2i64(union tgsi_double_channel *dst,
4168            const union tgsi_double_channel *src)
4169 {
4170    dst->i64[0] = (int64_t)src->d[0];
4171    dst->i64[1] = (int64_t)src->d[1];
4172    dst->i64[2] = (int64_t)src->d[2];
4173    dst->i64[3] = (int64_t)src->d[3];
4174 }
4175 
4176 static void
micro_u642d(union tgsi_double_channel * dst,const union tgsi_double_channel * src)4177 micro_u642d(union tgsi_double_channel *dst,
4178            const union tgsi_double_channel *src)
4179 {
4180    dst->d[0] = (double)src->u64[0];
4181    dst->d[1] = (double)src->u64[1];
4182    dst->d[2] = (double)src->u64[2];
4183    dst->d[3] = (double)src->u64[3];
4184 }
4185 
4186 static void
micro_i642d(union tgsi_double_channel * dst,const union tgsi_double_channel * src)4187 micro_i642d(union tgsi_double_channel *dst,
4188            const union tgsi_double_channel *src)
4189 {
4190    dst->d[0] = (double)src->i64[0];
4191    dst->d[1] = (double)src->i64[1];
4192    dst->d[2] = (double)src->i64[2];
4193    dst->d[3] = (double)src->i64[3];
4194 }
4195 
4196 static void
micro_u642f(union tgsi_exec_channel * dst,const union tgsi_double_channel * src)4197 micro_u642f(union tgsi_exec_channel *dst,
4198             const union tgsi_double_channel *src)
4199 {
4200    dst->f[0] = (float)src->u64[0];
4201    dst->f[1] = (float)src->u64[1];
4202    dst->f[2] = (float)src->u64[2];
4203    dst->f[3] = (float)src->u64[3];
4204 }
4205 
4206 static void
micro_i642f(union tgsi_exec_channel * dst,const union tgsi_double_channel * src)4207 micro_i642f(union tgsi_exec_channel *dst,
4208             const union tgsi_double_channel *src)
4209 {
4210    dst->f[0] = (float)src->i64[0];
4211    dst->f[1] = (float)src->i64[1];
4212    dst->f[2] = (float)src->i64[2];
4213    dst->f[3] = (float)src->i64[3];
4214 }
4215 
4216 static void
exec_t_2_64(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_dop_s op,enum tgsi_exec_datatype src_datatype)4217 exec_t_2_64(struct tgsi_exec_machine *mach,
4218           const struct tgsi_full_instruction *inst,
4219           micro_dop_s op,
4220           enum tgsi_exec_datatype src_datatype)
4221 {
4222    union tgsi_exec_channel src;
4223    union tgsi_double_channel dst;
4224 
4225    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
4226       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
4227       op(&dst, &src);
4228       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
4229    }
4230    if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
4231       fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, src_datatype);
4232       op(&dst, &src);
4233       store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
4234    }
4235 }
4236 
4237 static void
exec_64_2_t(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,micro_sop_d op)4238 exec_64_2_t(struct tgsi_exec_machine *mach,
4239             const struct tgsi_full_instruction *inst,
4240             micro_sop_d op)
4241 {
4242    union tgsi_double_channel src;
4243    union tgsi_exec_channel dst;
4244    int wm = inst->Dst[0].Register.WriteMask;
4245    int i;
4246    int bit;
4247    for (i = 0; i < 2; i++) {
4248       bit = ffs(wm);
4249       if (bit) {
4250          wm &= ~(1 << (bit - 1));
4251          if (i == 0)
4252             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
4253          else
4254             fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
4255          op(&dst, &src);
4256          store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1);
4257       }
4258    }
4259 }
4260 
4261 static void
micro_i2f(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)4262 micro_i2f(union tgsi_exec_channel *dst,
4263           const union tgsi_exec_channel *src)
4264 {
4265    dst->f[0] = (float)src->i[0];
4266    dst->f[1] = (float)src->i[1];
4267    dst->f[2] = (float)src->i[2];
4268    dst->f[3] = (float)src->i[3];
4269 }
4270 
4271 static void
micro_not(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)4272 micro_not(union tgsi_exec_channel *dst,
4273           const union tgsi_exec_channel *src)
4274 {
4275    dst->u[0] = ~src->u[0];
4276    dst->u[1] = ~src->u[1];
4277    dst->u[2] = ~src->u[2];
4278    dst->u[3] = ~src->u[3];
4279 }
4280 
4281 static void
micro_shl(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4282 micro_shl(union tgsi_exec_channel *dst,
4283           const union tgsi_exec_channel *src0,
4284           const union tgsi_exec_channel *src1)
4285 {
4286    unsigned masked_count;
4287    masked_count = src1->u[0] & 0x1f;
4288    dst->u[0] = src0->u[0] << masked_count;
4289    masked_count = src1->u[1] & 0x1f;
4290    dst->u[1] = src0->u[1] << masked_count;
4291    masked_count = src1->u[2] & 0x1f;
4292    dst->u[2] = src0->u[2] << masked_count;
4293    masked_count = src1->u[3] & 0x1f;
4294    dst->u[3] = src0->u[3] << masked_count;
4295 }
4296 
4297 static void
micro_and(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4298 micro_and(union tgsi_exec_channel *dst,
4299           const union tgsi_exec_channel *src0,
4300           const union tgsi_exec_channel *src1)
4301 {
4302    dst->u[0] = src0->u[0] & src1->u[0];
4303    dst->u[1] = src0->u[1] & src1->u[1];
4304    dst->u[2] = src0->u[2] & src1->u[2];
4305    dst->u[3] = src0->u[3] & src1->u[3];
4306 }
4307 
4308 static void
micro_or(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4309 micro_or(union tgsi_exec_channel *dst,
4310          const union tgsi_exec_channel *src0,
4311          const union tgsi_exec_channel *src1)
4312 {
4313    dst->u[0] = src0->u[0] | src1->u[0];
4314    dst->u[1] = src0->u[1] | src1->u[1];
4315    dst->u[2] = src0->u[2] | src1->u[2];
4316    dst->u[3] = src0->u[3] | src1->u[3];
4317 }
4318 
4319 static void
micro_xor(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4320 micro_xor(union tgsi_exec_channel *dst,
4321           const union tgsi_exec_channel *src0,
4322           const union tgsi_exec_channel *src1)
4323 {
4324    dst->u[0] = src0->u[0] ^ src1->u[0];
4325    dst->u[1] = src0->u[1] ^ src1->u[1];
4326    dst->u[2] = src0->u[2] ^ src1->u[2];
4327    dst->u[3] = src0->u[3] ^ src1->u[3];
4328 }
4329 
4330 static void
micro_mod(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4331 micro_mod(union tgsi_exec_channel *dst,
4332           const union tgsi_exec_channel *src0,
4333           const union tgsi_exec_channel *src1)
4334 {
4335    dst->i[0] = src1->i[0] ? src0->i[0] % src1->i[0] : ~0;
4336    dst->i[1] = src1->i[1] ? src0->i[1] % src1->i[1] : ~0;
4337    dst->i[2] = src1->i[2] ? src0->i[2] % src1->i[2] : ~0;
4338    dst->i[3] = src1->i[3] ? src0->i[3] % src1->i[3] : ~0;
4339 }
4340 
4341 static void
micro_f2i(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)4342 micro_f2i(union tgsi_exec_channel *dst,
4343           const union tgsi_exec_channel *src)
4344 {
4345    dst->i[0] = (int)src->f[0];
4346    dst->i[1] = (int)src->f[1];
4347    dst->i[2] = (int)src->f[2];
4348    dst->i[3] = (int)src->f[3];
4349 }
4350 
4351 static void
micro_fseq(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4352 micro_fseq(union tgsi_exec_channel *dst,
4353            const union tgsi_exec_channel *src0,
4354            const union tgsi_exec_channel *src1)
4355 {
4356    dst->u[0] = src0->f[0] == src1->f[0] ? ~0 : 0;
4357    dst->u[1] = src0->f[1] == src1->f[1] ? ~0 : 0;
4358    dst->u[2] = src0->f[2] == src1->f[2] ? ~0 : 0;
4359    dst->u[3] = src0->f[3] == src1->f[3] ? ~0 : 0;
4360 }
4361 
4362 static void
micro_fsge(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4363 micro_fsge(union tgsi_exec_channel *dst,
4364            const union tgsi_exec_channel *src0,
4365            const union tgsi_exec_channel *src1)
4366 {
4367    dst->u[0] = src0->f[0] >= src1->f[0] ? ~0 : 0;
4368    dst->u[1] = src0->f[1] >= src1->f[1] ? ~0 : 0;
4369    dst->u[2] = src0->f[2] >= src1->f[2] ? ~0 : 0;
4370    dst->u[3] = src0->f[3] >= src1->f[3] ? ~0 : 0;
4371 }
4372 
4373 static void
micro_fslt(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4374 micro_fslt(union tgsi_exec_channel *dst,
4375            const union tgsi_exec_channel *src0,
4376            const union tgsi_exec_channel *src1)
4377 {
4378    dst->u[0] = src0->f[0] < src1->f[0] ? ~0 : 0;
4379    dst->u[1] = src0->f[1] < src1->f[1] ? ~0 : 0;
4380    dst->u[2] = src0->f[2] < src1->f[2] ? ~0 : 0;
4381    dst->u[3] = src0->f[3] < src1->f[3] ? ~0 : 0;
4382 }
4383 
4384 static void
micro_fsne(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4385 micro_fsne(union tgsi_exec_channel *dst,
4386            const union tgsi_exec_channel *src0,
4387            const union tgsi_exec_channel *src1)
4388 {
4389    dst->u[0] = src0->f[0] != src1->f[0] ? ~0 : 0;
4390    dst->u[1] = src0->f[1] != src1->f[1] ? ~0 : 0;
4391    dst->u[2] = src0->f[2] != src1->f[2] ? ~0 : 0;
4392    dst->u[3] = src0->f[3] != src1->f[3] ? ~0 : 0;
4393 }
4394 
4395 static void
micro_idiv(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4396 micro_idiv(union tgsi_exec_channel *dst,
4397            const union tgsi_exec_channel *src0,
4398            const union tgsi_exec_channel *src1)
4399 {
4400    dst->i[0] = src1->i[0] ? src0->i[0] / src1->i[0] : 0;
4401    dst->i[1] = src1->i[1] ? src0->i[1] / src1->i[1] : 0;
4402    dst->i[2] = src1->i[2] ? src0->i[2] / src1->i[2] : 0;
4403    dst->i[3] = src1->i[3] ? src0->i[3] / src1->i[3] : 0;
4404 }
4405 
4406 static void
micro_imax(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4407 micro_imax(union tgsi_exec_channel *dst,
4408            const union tgsi_exec_channel *src0,
4409            const union tgsi_exec_channel *src1)
4410 {
4411    dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
4412    dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
4413    dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
4414    dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
4415 }
4416 
4417 static void
micro_imin(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4418 micro_imin(union tgsi_exec_channel *dst,
4419            const union tgsi_exec_channel *src0,
4420            const union tgsi_exec_channel *src1)
4421 {
4422    dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
4423    dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
4424    dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
4425    dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
4426 }
4427 
4428 static void
micro_isge(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4429 micro_isge(union tgsi_exec_channel *dst,
4430            const union tgsi_exec_channel *src0,
4431            const union tgsi_exec_channel *src1)
4432 {
4433    dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
4434    dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
4435    dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
4436    dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
4437 }
4438 
4439 static void
micro_ishr(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4440 micro_ishr(union tgsi_exec_channel *dst,
4441            const union tgsi_exec_channel *src0,
4442            const union tgsi_exec_channel *src1)
4443 {
4444    unsigned masked_count;
4445    masked_count = src1->i[0] & 0x1f;
4446    dst->i[0] = src0->i[0] >> masked_count;
4447    masked_count = src1->i[1] & 0x1f;
4448    dst->i[1] = src0->i[1] >> masked_count;
4449    masked_count = src1->i[2] & 0x1f;
4450    dst->i[2] = src0->i[2] >> masked_count;
4451    masked_count = src1->i[3] & 0x1f;
4452    dst->i[3] = src0->i[3] >> masked_count;
4453 }
4454 
4455 static void
micro_islt(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4456 micro_islt(union tgsi_exec_channel *dst,
4457            const union tgsi_exec_channel *src0,
4458            const union tgsi_exec_channel *src1)
4459 {
4460    dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
4461    dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
4462    dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
4463    dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
4464 }
4465 
4466 static void
micro_f2u(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)4467 micro_f2u(union tgsi_exec_channel *dst,
4468           const union tgsi_exec_channel *src)
4469 {
4470    dst->u[0] = (uint32_t)src->f[0];
4471    dst->u[1] = (uint32_t)src->f[1];
4472    dst->u[2] = (uint32_t)src->f[2];
4473    dst->u[3] = (uint32_t)src->f[3];
4474 }
4475 
4476 static void
micro_u2f(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)4477 micro_u2f(union tgsi_exec_channel *dst,
4478           const union tgsi_exec_channel *src)
4479 {
4480    dst->f[0] = (float)src->u[0];
4481    dst->f[1] = (float)src->u[1];
4482    dst->f[2] = (float)src->u[2];
4483    dst->f[3] = (float)src->u[3];
4484 }
4485 
4486 static void
micro_uadd(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4487 micro_uadd(union tgsi_exec_channel *dst,
4488            const union tgsi_exec_channel *src0,
4489            const union tgsi_exec_channel *src1)
4490 {
4491    dst->u[0] = src0->u[0] + src1->u[0];
4492    dst->u[1] = src0->u[1] + src1->u[1];
4493    dst->u[2] = src0->u[2] + src1->u[2];
4494    dst->u[3] = src0->u[3] + src1->u[3];
4495 }
4496 
4497 static void
micro_udiv(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4498 micro_udiv(union tgsi_exec_channel *dst,
4499            const union tgsi_exec_channel *src0,
4500            const union tgsi_exec_channel *src1)
4501 {
4502    dst->u[0] = src1->u[0] ? src0->u[0] / src1->u[0] : ~0u;
4503    dst->u[1] = src1->u[1] ? src0->u[1] / src1->u[1] : ~0u;
4504    dst->u[2] = src1->u[2] ? src0->u[2] / src1->u[2] : ~0u;
4505    dst->u[3] = src1->u[3] ? src0->u[3] / src1->u[3] : ~0u;
4506 }
4507 
4508 static void
micro_umad(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1,const union tgsi_exec_channel * src2)4509 micro_umad(union tgsi_exec_channel *dst,
4510            const union tgsi_exec_channel *src0,
4511            const union tgsi_exec_channel *src1,
4512            const union tgsi_exec_channel *src2)
4513 {
4514    dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
4515    dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
4516    dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
4517    dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
4518 }
4519 
4520 static void
micro_umax(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4521 micro_umax(union tgsi_exec_channel *dst,
4522            const union tgsi_exec_channel *src0,
4523            const union tgsi_exec_channel *src1)
4524 {
4525    dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
4526    dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
4527    dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
4528    dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
4529 }
4530 
4531 static void
micro_umin(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4532 micro_umin(union tgsi_exec_channel *dst,
4533            const union tgsi_exec_channel *src0,
4534            const union tgsi_exec_channel *src1)
4535 {
4536    dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
4537    dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
4538    dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
4539    dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
4540 }
4541 
4542 static void
micro_umod(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4543 micro_umod(union tgsi_exec_channel *dst,
4544            const union tgsi_exec_channel *src0,
4545            const union tgsi_exec_channel *src1)
4546 {
4547    dst->u[0] = src1->u[0] ? src0->u[0] % src1->u[0] : ~0u;
4548    dst->u[1] = src1->u[1] ? src0->u[1] % src1->u[1] : ~0u;
4549    dst->u[2] = src1->u[2] ? src0->u[2] % src1->u[2] : ~0u;
4550    dst->u[3] = src1->u[3] ? src0->u[3] % src1->u[3] : ~0u;
4551 }
4552 
4553 static void
micro_umul(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4554 micro_umul(union tgsi_exec_channel *dst,
4555            const union tgsi_exec_channel *src0,
4556            const union tgsi_exec_channel *src1)
4557 {
4558    dst->u[0] = src0->u[0] * src1->u[0];
4559    dst->u[1] = src0->u[1] * src1->u[1];
4560    dst->u[2] = src0->u[2] * src1->u[2];
4561    dst->u[3] = src0->u[3] * src1->u[3];
4562 }
4563 
4564 static void
micro_imul_hi(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4565 micro_imul_hi(union tgsi_exec_channel *dst,
4566               const union tgsi_exec_channel *src0,
4567               const union tgsi_exec_channel *src1)
4568 {
4569 #define I64M(x, y) ((((int64_t)x) * ((int64_t)y)) >> 32)
4570    dst->i[0] = I64M(src0->i[0], src1->i[0]);
4571    dst->i[1] = I64M(src0->i[1], src1->i[1]);
4572    dst->i[2] = I64M(src0->i[2], src1->i[2]);
4573    dst->i[3] = I64M(src0->i[3], src1->i[3]);
4574 #undef I64M
4575 }
4576 
4577 static void
micro_umul_hi(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4578 micro_umul_hi(union tgsi_exec_channel *dst,
4579               const union tgsi_exec_channel *src0,
4580               const union tgsi_exec_channel *src1)
4581 {
4582 #define U64M(x, y) ((((uint64_t)x) * ((uint64_t)y)) >> 32)
4583    dst->u[0] = U64M(src0->u[0], src1->u[0]);
4584    dst->u[1] = U64M(src0->u[1], src1->u[1]);
4585    dst->u[2] = U64M(src0->u[2], src1->u[2]);
4586    dst->u[3] = U64M(src0->u[3], src1->u[3]);
4587 #undef U64M
4588 }
4589 
4590 static void
micro_useq(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4591 micro_useq(union tgsi_exec_channel *dst,
4592            const union tgsi_exec_channel *src0,
4593            const union tgsi_exec_channel *src1)
4594 {
4595    dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
4596    dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
4597    dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
4598    dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
4599 }
4600 
4601 static void
micro_usge(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4602 micro_usge(union tgsi_exec_channel *dst,
4603            const union tgsi_exec_channel *src0,
4604            const union tgsi_exec_channel *src1)
4605 {
4606    dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
4607    dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
4608    dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
4609    dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
4610 }
4611 
4612 static void
micro_ushr(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4613 micro_ushr(union tgsi_exec_channel *dst,
4614            const union tgsi_exec_channel *src0,
4615            const union tgsi_exec_channel *src1)
4616 {
4617    unsigned masked_count;
4618    masked_count = src1->u[0] & 0x1f;
4619    dst->u[0] = src0->u[0] >> masked_count;
4620    masked_count = src1->u[1] & 0x1f;
4621    dst->u[1] = src0->u[1] >> masked_count;
4622    masked_count = src1->u[2] & 0x1f;
4623    dst->u[2] = src0->u[2] >> masked_count;
4624    masked_count = src1->u[3] & 0x1f;
4625    dst->u[3] = src0->u[3] >> masked_count;
4626 }
4627 
4628 static void
micro_uslt(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4629 micro_uslt(union tgsi_exec_channel *dst,
4630            const union tgsi_exec_channel *src0,
4631            const union tgsi_exec_channel *src1)
4632 {
4633    dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
4634    dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
4635    dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
4636    dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
4637 }
4638 
4639 static void
micro_usne(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1)4640 micro_usne(union tgsi_exec_channel *dst,
4641            const union tgsi_exec_channel *src0,
4642            const union tgsi_exec_channel *src1)
4643 {
4644    dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
4645    dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
4646    dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
4647    dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
4648 }
4649 
4650 static void
micro_uarl(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)4651 micro_uarl(union tgsi_exec_channel *dst,
4652            const union tgsi_exec_channel *src)
4653 {
4654    dst->i[0] = src->u[0];
4655    dst->i[1] = src->u[1];
4656    dst->i[2] = src->u[2];
4657    dst->i[3] = src->u[3];
4658 }
4659 
4660 /**
4661  * Signed bitfield extract (i.e. sign-extend the extracted bits)
4662  */
4663 static void
micro_ibfe(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1,const union tgsi_exec_channel * src2)4664 micro_ibfe(union tgsi_exec_channel *dst,
4665            const union tgsi_exec_channel *src0,
4666            const union tgsi_exec_channel *src1,
4667            const union tgsi_exec_channel *src2)
4668 {
4669    int i;
4670    for (i = 0; i < 4; i++) {
4671       int width = src2->i[i];
4672       int offset = src1->i[i] & 0x1f;
4673       if (width == 32 && offset == 0) {
4674          dst->i[i] = src0->i[i];
4675          continue;
4676       }
4677       width &= 0x1f;
4678       if (width == 0)
4679          dst->i[i] = 0;
4680       else if (width + offset < 32)
4681          dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
4682       else
4683          dst->i[i] = src0->i[i] >> offset;
4684    }
4685 }
4686 
4687 /**
4688  * Unsigned bitfield extract
4689  */
4690 static void
micro_ubfe(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1,const union tgsi_exec_channel * src2)4691 micro_ubfe(union tgsi_exec_channel *dst,
4692            const union tgsi_exec_channel *src0,
4693            const union tgsi_exec_channel *src1,
4694            const union tgsi_exec_channel *src2)
4695 {
4696    int i;
4697    for (i = 0; i < 4; i++) {
4698       int width = src2->u[i];
4699       int offset = src1->u[i] & 0x1f;
4700       if (width == 32 && offset == 0) {
4701          dst->u[i] = src0->u[i];
4702          continue;
4703       }
4704       width &= 0x1f;
4705       if (width == 0)
4706          dst->u[i] = 0;
4707       else if (width + offset < 32)
4708          dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
4709       else
4710          dst->u[i] = src0->u[i] >> offset;
4711    }
4712 }
4713 
4714 /**
4715  * Bitfield insert: copy low bits from src1 into a region of src0.
4716  */
4717 static void
micro_bfi(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src0,const union tgsi_exec_channel * src1,const union tgsi_exec_channel * src2,const union tgsi_exec_channel * src3)4718 micro_bfi(union tgsi_exec_channel *dst,
4719           const union tgsi_exec_channel *src0,
4720           const union tgsi_exec_channel *src1,
4721           const union tgsi_exec_channel *src2,
4722           const union tgsi_exec_channel *src3)
4723 {
4724    int i;
4725    for (i = 0; i < 4; i++) {
4726       int width = src3->u[i];
4727       int offset = src2->u[i] & 0x1f;
4728       if (width == 32) {
4729          dst->u[i] = src1->u[i];
4730       } else {
4731          int bitmask = ((1 << width) - 1) << offset;
4732          dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
4733       }
4734    }
4735 }
4736 
4737 static void
micro_brev(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)4738 micro_brev(union tgsi_exec_channel *dst,
4739            const union tgsi_exec_channel *src)
4740 {
4741    dst->u[0] = util_bitreverse(src->u[0]);
4742    dst->u[1] = util_bitreverse(src->u[1]);
4743    dst->u[2] = util_bitreverse(src->u[2]);
4744    dst->u[3] = util_bitreverse(src->u[3]);
4745 }
4746 
4747 static void
micro_popc(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)4748 micro_popc(union tgsi_exec_channel *dst,
4749            const union tgsi_exec_channel *src)
4750 {
4751    dst->u[0] = util_bitcount(src->u[0]);
4752    dst->u[1] = util_bitcount(src->u[1]);
4753    dst->u[2] = util_bitcount(src->u[2]);
4754    dst->u[3] = util_bitcount(src->u[3]);
4755 }
4756 
4757 static void
micro_lsb(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)4758 micro_lsb(union tgsi_exec_channel *dst,
4759           const union tgsi_exec_channel *src)
4760 {
4761    dst->i[0] = ffs(src->u[0]) - 1;
4762    dst->i[1] = ffs(src->u[1]) - 1;
4763    dst->i[2] = ffs(src->u[2]) - 1;
4764    dst->i[3] = ffs(src->u[3]) - 1;
4765 }
4766 
4767 static void
micro_imsb(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)4768 micro_imsb(union tgsi_exec_channel *dst,
4769            const union tgsi_exec_channel *src)
4770 {
4771    dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
4772    dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
4773    dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
4774    dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
4775 }
4776 
4777 static void
micro_umsb(union tgsi_exec_channel * dst,const union tgsi_exec_channel * src)4778 micro_umsb(union tgsi_exec_channel *dst,
4779            const union tgsi_exec_channel *src)
4780 {
4781    dst->i[0] = util_last_bit(src->u[0]) - 1;
4782    dst->i[1] = util_last_bit(src->u[1]) - 1;
4783    dst->i[2] = util_last_bit(src->u[2]) - 1;
4784    dst->i[3] = util_last_bit(src->u[3]) - 1;
4785 }
4786 
4787 
4788 static void
exec_interp_at_sample(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)4789 exec_interp_at_sample(struct tgsi_exec_machine *mach,
4790                       const struct tgsi_full_instruction *inst)
4791 {
4792    union tgsi_exec_channel index;
4793    union tgsi_exec_channel index2D;
4794    union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
4795    const struct tgsi_full_src_register *reg = &inst->Src[0];
4796 
4797    assert(reg->Register.File == TGSI_FILE_INPUT);
4798    assert(inst->Src[1].Register.File == TGSI_FILE_IMMEDIATE);
4799 
4800    get_index_registers(mach, reg, &index, &index2D);
4801    float sample = mach->Imms[inst->Src[1].Register.Index][inst->Src[1].Register.SwizzleX];
4802 
4803    /* Short cut: sample 0 is like a normal fetch */
4804    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4805       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
4806          continue;
4807 
4808       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
4809                              &result[chan]);
4810       if (sample != 0.0f) {
4811 
4812       /* TODO: define the samples > 0, but so far we only do fake MSAA */
4813          float x = 0;
4814          float y = 0;
4815 
4816          unsigned pos = index2D.i[chan] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[chan];
4817          assert(pos >= 0);
4818          assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
4819          mach->InputSampleOffsetApply[pos](mach, pos, chan, x, y, &result[chan]);
4820       }
4821       store_dest(mach, &result[chan], &inst->Dst[0], inst, chan);
4822    }
4823 }
4824 
4825 
4826 static void
exec_interp_at_offset(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)4827 exec_interp_at_offset(struct tgsi_exec_machine *mach,
4828                       const struct tgsi_full_instruction *inst)
4829 {
4830    union tgsi_exec_channel index;
4831    union tgsi_exec_channel index2D;
4832    union tgsi_exec_channel ofsx;
4833    union tgsi_exec_channel ofsy;
4834    const struct tgsi_full_src_register *reg = &inst->Src[0];
4835 
4836    assert(reg->Register.File == TGSI_FILE_INPUT);
4837 
4838    get_index_registers(mach, reg, &index, &index2D);
4839    unsigned pos = index2D.i[0] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[0];
4840 
4841    fetch_source(mach, &ofsx, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
4842    fetch_source(mach, &ofsy, &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
4843 
4844    for (int chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4845       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
4846          continue;
4847       union tgsi_exec_channel result;
4848       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D, &result);
4849       mach->InputSampleOffsetApply[pos](mach, pos, chan, ofsx.f[chan], ofsy.f[chan], &result);
4850       store_dest(mach, &result, &inst->Dst[0], inst, chan);
4851    }
4852 }
4853 
4854 
4855 static void
exec_interp_at_centroid(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst)4856 exec_interp_at_centroid(struct tgsi_exec_machine *mach,
4857                         const struct tgsi_full_instruction *inst)
4858 {
4859    union tgsi_exec_channel index;
4860    union tgsi_exec_channel index2D;
4861    union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
4862    const struct tgsi_full_src_register *reg = &inst->Src[0];
4863 
4864    assert(reg->Register.File == TGSI_FILE_INPUT);
4865    get_index_registers(mach, reg, &index, &index2D);
4866 
4867    for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4868       if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
4869          continue;
4870 
4871       /* Here we should add the change to use a sample that lies within the
4872        * primitive (Section 15.2):
4873        *
4874        * "When interpolating variables declared using centroid in ,
4875        * the variable is sampled at a location within the pixel covered
4876        * by the primitive generating the fragment.
4877        * ...
4878        * The built-in functions interpolateAtCentroid ... will sample
4879        * variables as though they were declared with the centroid ...
4880        * qualifier[s]."
4881        *
4882        * Since we only support 1 sample currently, this is just a pass-through.
4883        */
4884       fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
4885                              &result[chan]);
4886       store_dest(mach, &result[chan], &inst->Dst[0], inst, chan);
4887    }
4888 
4889 }
4890 
4891 
4892 /**
4893  * Execute a TGSI instruction.
4894  * Returns TRUE if a barrier instruction is hit,
4895  * otherwise FALSE.
4896  */
4897 static bool
exec_instruction(struct tgsi_exec_machine * mach,const struct tgsi_full_instruction * inst,int * pc)4898 exec_instruction(
4899    struct tgsi_exec_machine *mach,
4900    const struct tgsi_full_instruction *inst,
4901    int *pc )
4902 {
4903    union tgsi_exec_channel r[10];
4904 
4905    (*pc)++;
4906 
4907    switch (inst->Instruction.Opcode) {
4908    case TGSI_OPCODE_ARL:
4909       exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_FLOAT);
4910       break;
4911 
4912    case TGSI_OPCODE_MOV:
4913       exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_FLOAT);
4914       break;
4915 
4916    case TGSI_OPCODE_LIT:
4917       exec_lit(mach, inst);
4918       break;
4919 
4920    case TGSI_OPCODE_RCP:
4921       exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT);
4922       break;
4923 
4924    case TGSI_OPCODE_RSQ:
4925       exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT);
4926       break;
4927 
4928    case TGSI_OPCODE_EXP:
4929       exec_exp(mach, inst);
4930       break;
4931 
4932    case TGSI_OPCODE_LOG:
4933       exec_log(mach, inst);
4934       break;
4935 
4936    case TGSI_OPCODE_MUL:
4937       exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT);
4938       break;
4939 
4940    case TGSI_OPCODE_ADD:
4941       exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT);
4942       break;
4943 
4944    case TGSI_OPCODE_DP3:
4945       exec_dp3(mach, inst);
4946       break;
4947 
4948    case TGSI_OPCODE_DP4:
4949       exec_dp4(mach, inst);
4950       break;
4951 
4952    case TGSI_OPCODE_DST:
4953       exec_dst(mach, inst);
4954       break;
4955 
4956    case TGSI_OPCODE_MIN:
4957       exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT);
4958       break;
4959 
4960    case TGSI_OPCODE_MAX:
4961       exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT);
4962       break;
4963 
4964    case TGSI_OPCODE_SLT:
4965       exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT);
4966       break;
4967 
4968    case TGSI_OPCODE_SGE:
4969       exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT);
4970       break;
4971 
4972    case TGSI_OPCODE_MAD:
4973       exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT);
4974       break;
4975 
4976    case TGSI_OPCODE_LRP:
4977       exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT);
4978       break;
4979 
4980    case TGSI_OPCODE_SQRT:
4981       exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT);
4982       break;
4983 
4984    case TGSI_OPCODE_FRC:
4985       exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT);
4986       break;
4987 
4988    case TGSI_OPCODE_FLR:
4989       exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT);
4990       break;
4991 
4992    case TGSI_OPCODE_ROUND:
4993       exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT);
4994       break;
4995 
4996    case TGSI_OPCODE_EX2:
4997       exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT);
4998       break;
4999 
5000    case TGSI_OPCODE_LG2:
5001       exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT);
5002       break;
5003 
5004    case TGSI_OPCODE_POW:
5005       exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT);
5006       break;
5007 
5008    case TGSI_OPCODE_LDEXP:
5009       exec_vector_binary(mach, inst, micro_ldexp, TGSI_EXEC_DATA_FLOAT);
5010       break;
5011 
5012    case TGSI_OPCODE_COS:
5013       exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT);
5014       break;
5015 
5016    case TGSI_OPCODE_DDX_FINE:
5017       exec_vector_unary(mach, inst, micro_ddx_fine, TGSI_EXEC_DATA_FLOAT);
5018       break;
5019 
5020    case TGSI_OPCODE_DDX:
5021       exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT);
5022       break;
5023 
5024    case TGSI_OPCODE_DDY_FINE:
5025       exec_vector_unary(mach, inst, micro_ddy_fine, TGSI_EXEC_DATA_FLOAT);
5026       break;
5027 
5028    case TGSI_OPCODE_DDY:
5029       exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT);
5030       break;
5031 
5032    case TGSI_OPCODE_KILL:
5033       exec_kill (mach);
5034       break;
5035 
5036    case TGSI_OPCODE_KILL_IF:
5037       exec_kill_if (mach, inst);
5038       break;
5039 
5040    case TGSI_OPCODE_PK2H:
5041       exec_pk2h(mach, inst);
5042       break;
5043 
5044    case TGSI_OPCODE_PK2US:
5045       assert (0);
5046       break;
5047 
5048    case TGSI_OPCODE_PK4B:
5049       assert (0);
5050       break;
5051 
5052    case TGSI_OPCODE_PK4UB:
5053       assert (0);
5054       break;
5055 
5056    case TGSI_OPCODE_SEQ:
5057       exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT);
5058       break;
5059 
5060    case TGSI_OPCODE_SGT:
5061       exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT);
5062       break;
5063 
5064    case TGSI_OPCODE_SIN:
5065       exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT);
5066       break;
5067 
5068    case TGSI_OPCODE_SLE:
5069       exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT);
5070       break;
5071 
5072    case TGSI_OPCODE_SNE:
5073       exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT);
5074       break;
5075 
5076    case TGSI_OPCODE_TEX:
5077       /* simple texture lookup */
5078       /* src[0] = texcoord */
5079       /* src[1] = sampler unit */
5080       exec_tex(mach, inst, TEX_MODIFIER_NONE, 1);
5081       break;
5082 
5083    case TGSI_OPCODE_TXB:
5084       /* Texture lookup with lod bias */
5085       /* src[0] = texcoord (src[0].w = LOD bias) */
5086       /* src[1] = sampler unit */
5087       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 1);
5088       break;
5089 
5090    case TGSI_OPCODE_TXD:
5091       /* Texture lookup with explicit partial derivatives */
5092       /* src[0] = texcoord */
5093       /* src[1] = d[strq]/dx */
5094       /* src[2] = d[strq]/dy */
5095       /* src[3] = sampler unit */
5096       exec_txd(mach, inst);
5097       break;
5098 
5099    case TGSI_OPCODE_TXL:
5100       /* Texture lookup with explicit LOD */
5101       /* src[0] = texcoord (src[0].w = LOD) */
5102       /* src[1] = sampler unit */
5103       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 1);
5104       break;
5105 
5106    case TGSI_OPCODE_TXP:
5107       /* Texture lookup with projection */
5108       /* src[0] = texcoord (src[0].w = projection) */
5109       /* src[1] = sampler unit */
5110       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
5111       break;
5112 
5113    case TGSI_OPCODE_TG4:
5114       /* src[0] = texcoord */
5115       /* src[1] = component */
5116       /* src[2] = sampler unit */
5117       exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
5118       break;
5119 
5120    case TGSI_OPCODE_LODQ:
5121       /* src[0] = texcoord */
5122       /* src[1] = sampler unit */
5123       exec_lodq(mach, inst);
5124       break;
5125 
5126    case TGSI_OPCODE_UP2H:
5127       exec_up2h(mach, inst);
5128       break;
5129 
5130    case TGSI_OPCODE_UP2US:
5131       assert (0);
5132       break;
5133 
5134    case TGSI_OPCODE_UP4B:
5135       assert (0);
5136       break;
5137 
5138    case TGSI_OPCODE_UP4UB:
5139       assert (0);
5140       break;
5141 
5142    case TGSI_OPCODE_ARR:
5143       exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_FLOAT);
5144       break;
5145 
5146    case TGSI_OPCODE_CAL:
5147       /* skip the call if no execution channels are enabled */
5148       if (mach->ExecMask) {
5149          /* do the call */
5150 
5151          /* First, record the depths of the execution stacks.
5152           * This is important for deeply nested/looped return statements.
5153           * We have to unwind the stacks by the correct amount.  For a
5154           * real code generator, we could determine the number of entries
5155           * to pop off each stack with simple static analysis and avoid
5156           * implementing this data structure at run time.
5157           */
5158          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
5159          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
5160          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
5161          mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
5162          mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
5163          /* note that PC was already incremented above */
5164          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
5165 
5166          mach->CallStackTop++;
5167 
5168          /* Second, push the Cond, Loop, Cont, Func stacks */
5169          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5170          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5171          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5172          assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
5173          assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5174          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
5175 
5176          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5177          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5178          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5179          mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
5180          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5181          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
5182 
5183          /* Finally, jump to the subroutine.  The label is a pointer
5184           * (an instruction number) to the BGNSUB instruction.
5185           */
5186          *pc = inst->Label.Label;
5187          assert(mach->Instructions[*pc].Instruction.Opcode
5188                 == TGSI_OPCODE_BGNSUB);
5189       }
5190       break;
5191 
5192    case TGSI_OPCODE_RET:
5193       mach->FuncMask &= ~mach->ExecMask;
5194       UPDATE_EXEC_MASK(mach);
5195 
5196       if (mach->FuncMask == 0x0) {
5197          /* really return now (otherwise, keep executing) */
5198 
5199          if (mach->CallStackTop == 0) {
5200             /* returning from main() */
5201             mach->CondStackTop = 0;
5202             mach->LoopStackTop = 0;
5203             mach->ContStackTop = 0;
5204             mach->LoopLabelStackTop = 0;
5205             mach->SwitchStackTop = 0;
5206             mach->BreakStackTop = 0;
5207             *pc = -1;
5208             return false;
5209          }
5210 
5211          assert(mach->CallStackTop > 0);
5212          mach->CallStackTop--;
5213 
5214          mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5215          mach->CondMask = mach->CondStack[mach->CondStackTop];
5216 
5217          mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5218          mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5219 
5220          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5221          mach->ContMask = mach->ContStack[mach->ContStackTop];
5222 
5223          mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5224          mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5225 
5226          mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5227          mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5228 
5229          assert(mach->FuncStackTop > 0);
5230          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5231 
5232          *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5233 
5234          UPDATE_EXEC_MASK(mach);
5235       }
5236       break;
5237 
5238    case TGSI_OPCODE_SSG:
5239       exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT);
5240       break;
5241 
5242    case TGSI_OPCODE_CMP:
5243       exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT);
5244       break;
5245 
5246    case TGSI_OPCODE_DIV:
5247       exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT);
5248       break;
5249 
5250    case TGSI_OPCODE_DP2:
5251       exec_dp2(mach, inst);
5252       break;
5253 
5254    case TGSI_OPCODE_IF:
5255       /* push CondMask */
5256       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5257       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5258       FETCH( &r[0], 0, TGSI_CHAN_X );
5259       for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
5260          if (!r[0].f[i])
5261             mach->CondMask &= ~(1 << i);
5262       }
5263       UPDATE_EXEC_MASK(mach);
5264       /* If no channels are taking the then branch, jump to ELSE. */
5265       if (!mach->CondMask)
5266          *pc = inst->Label.Label;
5267       break;
5268 
5269    case TGSI_OPCODE_UIF:
5270       /* push CondMask */
5271       assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5272       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5273       IFETCH( &r[0], 0, TGSI_CHAN_X );
5274       for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
5275          if (!r[0].u[i])
5276             mach->CondMask &= ~(1 << i);
5277       }
5278       UPDATE_EXEC_MASK(mach);
5279       /* If no channels are taking the then branch, jump to ELSE. */
5280       if (!mach->CondMask)
5281          *pc = inst->Label.Label;
5282       break;
5283 
5284    case TGSI_OPCODE_ELSE:
5285       /* invert CondMask wrt previous mask */
5286       {
5287          unsigned prevMask;
5288          assert(mach->CondStackTop > 0);
5289          prevMask = mach->CondStack[mach->CondStackTop - 1];
5290          mach->CondMask = ~mach->CondMask & prevMask;
5291          UPDATE_EXEC_MASK(mach);
5292 
5293          /* If no channels are taking ELSE, jump to ENDIF */
5294          if (!mach->CondMask)
5295             *pc = inst->Label.Label;
5296       }
5297       break;
5298 
5299    case TGSI_OPCODE_ENDIF:
5300       /* pop CondMask */
5301       assert(mach->CondStackTop > 0);
5302       mach->CondMask = mach->CondStack[--mach->CondStackTop];
5303       UPDATE_EXEC_MASK(mach);
5304       break;
5305 
5306    case TGSI_OPCODE_END:
5307       /* make sure we end primitives which haven't
5308        * been explicitly emitted */
5309       conditional_emit_primitive(mach);
5310       /* halt execution */
5311       *pc = -1;
5312       break;
5313 
5314    case TGSI_OPCODE_CEIL:
5315       exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT);
5316       break;
5317 
5318    case TGSI_OPCODE_I2F:
5319       exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_INT);
5320       break;
5321 
5322    case TGSI_OPCODE_NOT:
5323       exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT);
5324       break;
5325 
5326    case TGSI_OPCODE_TRUNC:
5327       exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT);
5328       break;
5329 
5330    case TGSI_OPCODE_SHL:
5331       exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT);
5332       break;
5333 
5334    case TGSI_OPCODE_AND:
5335       exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT);
5336       break;
5337 
5338    case TGSI_OPCODE_OR:
5339       exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT);
5340       break;
5341 
5342    case TGSI_OPCODE_MOD:
5343       exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT);
5344       break;
5345 
5346    case TGSI_OPCODE_XOR:
5347       exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT);
5348       break;
5349 
5350    case TGSI_OPCODE_TXF:
5351       exec_txf(mach, inst);
5352       break;
5353 
5354    case TGSI_OPCODE_TXQ:
5355       exec_txq(mach, inst);
5356       break;
5357 
5358    case TGSI_OPCODE_EMIT:
5359       emit_vertex(mach, inst);
5360       break;
5361 
5362    case TGSI_OPCODE_ENDPRIM:
5363       emit_primitive(mach, inst);
5364       break;
5365 
5366    case TGSI_OPCODE_BGNLOOP:
5367       /* push LoopMask and ContMasks */
5368       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5369       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5370       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5371       assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5372 
5373       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5374       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5375       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
5376       mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5377       mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
5378       break;
5379 
5380    case TGSI_OPCODE_ENDLOOP:
5381       /* Restore ContMask, but don't pop */
5382       assert(mach->ContStackTop > 0);
5383       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
5384       UPDATE_EXEC_MASK(mach);
5385       if (mach->ExecMask) {
5386          /* repeat loop: jump to instruction just past BGNLOOP */
5387          assert(mach->LoopLabelStackTop > 0);
5388          *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
5389       }
5390       else {
5391          /* exit loop: pop LoopMask */
5392          assert(mach->LoopStackTop > 0);
5393          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
5394          /* pop ContMask */
5395          assert(mach->ContStackTop > 0);
5396          mach->ContMask = mach->ContStack[--mach->ContStackTop];
5397          assert(mach->LoopLabelStackTop > 0);
5398          --mach->LoopLabelStackTop;
5399 
5400          mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
5401       }
5402       UPDATE_EXEC_MASK(mach);
5403       break;
5404 
5405    case TGSI_OPCODE_BRK:
5406       exec_break(mach);
5407       break;
5408 
5409    case TGSI_OPCODE_CONT:
5410       /* turn off cont channels for each enabled exec channel */
5411       mach->ContMask &= ~mach->ExecMask;
5412       /* Todo: if mach->LoopMask == 0, jump to end of loop */
5413       UPDATE_EXEC_MASK(mach);
5414       break;
5415 
5416    case TGSI_OPCODE_BGNSUB:
5417       /* no-op */
5418       break;
5419 
5420    case TGSI_OPCODE_ENDSUB:
5421       /*
5422        * XXX: This really should be a no-op. We should never reach this opcode.
5423        */
5424 
5425       assert(mach->CallStackTop > 0);
5426       mach->CallStackTop--;
5427 
5428       mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5429       mach->CondMask = mach->CondStack[mach->CondStackTop];
5430 
5431       mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5432       mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5433 
5434       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5435       mach->ContMask = mach->ContStack[mach->ContStackTop];
5436 
5437       mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5438       mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5439 
5440       mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5441       mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5442 
5443       assert(mach->FuncStackTop > 0);
5444       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5445 
5446       *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5447 
5448       UPDATE_EXEC_MASK(mach);
5449       break;
5450 
5451    case TGSI_OPCODE_NOP:
5452       break;
5453 
5454    case TGSI_OPCODE_F2I:
5455       exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_FLOAT);
5456       break;
5457 
5458    case TGSI_OPCODE_FSEQ:
5459       exec_vector_binary(mach, inst, micro_fseq, TGSI_EXEC_DATA_FLOAT);
5460       break;
5461 
5462    case TGSI_OPCODE_FSGE:
5463       exec_vector_binary(mach, inst, micro_fsge, TGSI_EXEC_DATA_FLOAT);
5464       break;
5465 
5466    case TGSI_OPCODE_FSLT:
5467       exec_vector_binary(mach, inst, micro_fslt, TGSI_EXEC_DATA_FLOAT);
5468       break;
5469 
5470    case TGSI_OPCODE_FSNE:
5471       exec_vector_binary(mach, inst, micro_fsne, TGSI_EXEC_DATA_FLOAT);
5472       break;
5473 
5474    case TGSI_OPCODE_IDIV:
5475       exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT);
5476       break;
5477 
5478    case TGSI_OPCODE_IMAX:
5479       exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT);
5480       break;
5481 
5482    case TGSI_OPCODE_IMIN:
5483       exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT);
5484       break;
5485 
5486    case TGSI_OPCODE_INEG:
5487       exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT);
5488       break;
5489 
5490    case TGSI_OPCODE_ISGE:
5491       exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT);
5492       break;
5493 
5494    case TGSI_OPCODE_ISHR:
5495       exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT);
5496       break;
5497 
5498    case TGSI_OPCODE_ISLT:
5499       exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT);
5500       break;
5501 
5502    case TGSI_OPCODE_F2U:
5503       exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_FLOAT);
5504       break;
5505 
5506    case TGSI_OPCODE_U2F:
5507       exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_UINT);
5508       break;
5509 
5510    case TGSI_OPCODE_UADD:
5511       exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT);
5512       break;
5513 
5514    case TGSI_OPCODE_UDIV:
5515       exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT);
5516       break;
5517 
5518    case TGSI_OPCODE_UMAD:
5519       exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT);
5520       break;
5521 
5522    case TGSI_OPCODE_UMAX:
5523       exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT);
5524       break;
5525 
5526    case TGSI_OPCODE_UMIN:
5527       exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT);
5528       break;
5529 
5530    case TGSI_OPCODE_UMOD:
5531       exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT);
5532       break;
5533 
5534    case TGSI_OPCODE_UMUL:
5535       exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT);
5536       break;
5537 
5538    case TGSI_OPCODE_IMUL_HI:
5539       exec_vector_binary(mach, inst, micro_imul_hi, TGSI_EXEC_DATA_INT);
5540       break;
5541 
5542    case TGSI_OPCODE_UMUL_HI:
5543       exec_vector_binary(mach, inst, micro_umul_hi, TGSI_EXEC_DATA_UINT);
5544       break;
5545 
5546    case TGSI_OPCODE_USEQ:
5547       exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT);
5548       break;
5549 
5550    case TGSI_OPCODE_USGE:
5551       exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT);
5552       break;
5553 
5554    case TGSI_OPCODE_USHR:
5555       exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT);
5556       break;
5557 
5558    case TGSI_OPCODE_USLT:
5559       exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT);
5560       break;
5561 
5562    case TGSI_OPCODE_USNE:
5563       exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT);
5564       break;
5565 
5566    case TGSI_OPCODE_SWITCH:
5567       exec_switch(mach, inst);
5568       break;
5569 
5570    case TGSI_OPCODE_CASE:
5571       exec_case(mach, inst);
5572       break;
5573 
5574    case TGSI_OPCODE_DEFAULT:
5575       exec_default(mach);
5576       break;
5577 
5578    case TGSI_OPCODE_ENDSWITCH:
5579       exec_endswitch(mach);
5580       break;
5581 
5582    case TGSI_OPCODE_SAMPLE_I:
5583       exec_txf(mach, inst);
5584       break;
5585 
5586    case TGSI_OPCODE_SAMPLE_I_MS:
5587       exec_txf(mach, inst);
5588       break;
5589 
5590    case TGSI_OPCODE_SAMPLE:
5591       exec_sample(mach, inst, TEX_MODIFIER_NONE, false);
5592       break;
5593 
5594    case TGSI_OPCODE_SAMPLE_B:
5595       exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS, false);
5596       break;
5597 
5598    case TGSI_OPCODE_SAMPLE_C:
5599       exec_sample(mach, inst, TEX_MODIFIER_NONE, true);
5600       break;
5601 
5602    case TGSI_OPCODE_SAMPLE_C_LZ:
5603       exec_sample(mach, inst, TEX_MODIFIER_LEVEL_ZERO, true);
5604       break;
5605 
5606    case TGSI_OPCODE_SAMPLE_D:
5607       exec_sample_d(mach, inst);
5608       break;
5609 
5610    case TGSI_OPCODE_SAMPLE_L:
5611       exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, false);
5612       break;
5613 
5614    case TGSI_OPCODE_GATHER4:
5615       exec_sample(mach, inst, TEX_MODIFIER_GATHER, false);
5616       break;
5617 
5618    case TGSI_OPCODE_SVIEWINFO:
5619       exec_txq(mach, inst);
5620       break;
5621 
5622    case TGSI_OPCODE_SAMPLE_POS:
5623       assert(0);
5624       break;
5625 
5626    case TGSI_OPCODE_SAMPLE_INFO:
5627       assert(0);
5628       break;
5629 
5630    case TGSI_OPCODE_LOD:
5631       exec_lodq(mach, inst);
5632       break;
5633 
5634    case TGSI_OPCODE_UARL:
5635       exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_UINT);
5636       break;
5637 
5638    case TGSI_OPCODE_UCMP:
5639       exec_ucmp(mach, inst);
5640       break;
5641 
5642    case TGSI_OPCODE_IABS:
5643       exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT);
5644       break;
5645 
5646    case TGSI_OPCODE_ISSG:
5647       exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT);
5648       break;
5649 
5650    case TGSI_OPCODE_TEX2:
5651       /* simple texture lookup */
5652       /* src[0] = texcoord */
5653       /* src[1] = compare */
5654       /* src[2] = sampler unit */
5655       exec_tex(mach, inst, TEX_MODIFIER_NONE, 2);
5656       break;
5657    case TGSI_OPCODE_TXB2:
5658       /* Texture lookup with lod bias taken from a separate source */
5659       /* src[0] = texcoord */
5660       /* src[1] = bias */
5661       /* src[2] = sampler unit */
5662       exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 2);
5663       break;
5664    case TGSI_OPCODE_TXL2:
5665       /* simple texture lookup */
5666       /* src[0] = texcoord */
5667       /* src[1] = lod */
5668       /* src[2] = sampler unit */
5669       exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
5670       break;
5671 
5672    case TGSI_OPCODE_IBFE:
5673       exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT);
5674       break;
5675    case TGSI_OPCODE_UBFE:
5676       exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT);
5677       break;
5678    case TGSI_OPCODE_BFI:
5679       exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT);
5680       break;
5681    case TGSI_OPCODE_BREV:
5682       exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT);
5683       break;
5684    case TGSI_OPCODE_POPC:
5685       exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT);
5686       break;
5687    case TGSI_OPCODE_LSB:
5688       exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_UINT);
5689       break;
5690    case TGSI_OPCODE_IMSB:
5691       exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT);
5692       break;
5693    case TGSI_OPCODE_UMSB:
5694       exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_UINT);
5695       break;
5696 
5697    case TGSI_OPCODE_F2D:
5698       exec_t_2_64(mach, inst, micro_f2d, TGSI_EXEC_DATA_FLOAT);
5699       break;
5700 
5701    case TGSI_OPCODE_D2F:
5702       exec_64_2_t(mach, inst, micro_d2f);
5703       break;
5704 
5705    case TGSI_OPCODE_DABS:
5706       exec_double_unary(mach, inst, micro_dabs);
5707       break;
5708 
5709    case TGSI_OPCODE_DNEG:
5710       exec_double_unary(mach, inst, micro_dneg);
5711       break;
5712 
5713    case TGSI_OPCODE_DADD:
5714       exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
5715       break;
5716 
5717    case TGSI_OPCODE_DDIV:
5718       exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
5719       break;
5720 
5721    case TGSI_OPCODE_DMUL:
5722       exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
5723       break;
5724 
5725    case TGSI_OPCODE_DMAX:
5726       exec_double_binary(mach, inst, micro_dmax, TGSI_EXEC_DATA_DOUBLE);
5727       break;
5728 
5729    case TGSI_OPCODE_DMIN:
5730       exec_double_binary(mach, inst, micro_dmin, TGSI_EXEC_DATA_DOUBLE);
5731       break;
5732 
5733    case TGSI_OPCODE_DSLT:
5734       exec_double_binary(mach, inst, micro_dslt, TGSI_EXEC_DATA_UINT);
5735       break;
5736 
5737    case TGSI_OPCODE_DSGE:
5738       exec_double_binary(mach, inst, micro_dsge, TGSI_EXEC_DATA_UINT);
5739       break;
5740 
5741    case TGSI_OPCODE_DSEQ:
5742       exec_double_binary(mach, inst, micro_dseq, TGSI_EXEC_DATA_UINT);
5743       break;
5744 
5745    case TGSI_OPCODE_DSNE:
5746       exec_double_binary(mach, inst, micro_dsne, TGSI_EXEC_DATA_UINT);
5747       break;
5748 
5749    case TGSI_OPCODE_DRCP:
5750       exec_double_unary(mach, inst, micro_drcp);
5751       break;
5752 
5753    case TGSI_OPCODE_DSQRT:
5754       exec_double_unary(mach, inst, micro_dsqrt);
5755       break;
5756 
5757    case TGSI_OPCODE_DRSQ:
5758       exec_double_unary(mach, inst, micro_drsq);
5759       break;
5760 
5761    case TGSI_OPCODE_DMAD:
5762       exec_double_trinary(mach, inst, micro_dmad);
5763       break;
5764 
5765    case TGSI_OPCODE_DFRAC:
5766       exec_double_unary(mach, inst, micro_dfrac);
5767       break;
5768 
5769    case TGSI_OPCODE_DFLR:
5770       exec_double_unary(mach, inst, micro_dflr);
5771       break;
5772 
5773    case TGSI_OPCODE_DLDEXP:
5774       exec_dldexp(mach, inst);
5775       break;
5776 
5777    case TGSI_OPCODE_I2D:
5778       exec_t_2_64(mach, inst, micro_i2d, TGSI_EXEC_DATA_FLOAT);
5779       break;
5780 
5781    case TGSI_OPCODE_D2I:
5782       exec_64_2_t(mach, inst, micro_d2i);
5783       break;
5784 
5785    case TGSI_OPCODE_U2D:
5786       exec_t_2_64(mach, inst, micro_u2d, TGSI_EXEC_DATA_FLOAT);
5787       break;
5788 
5789    case TGSI_OPCODE_D2U:
5790       exec_64_2_t(mach, inst, micro_d2u);
5791       break;
5792 
5793    case TGSI_OPCODE_LOAD:
5794       exec_load(mach, inst);
5795       break;
5796 
5797    case TGSI_OPCODE_STORE:
5798       exec_store(mach, inst);
5799       break;
5800 
5801    case TGSI_OPCODE_ATOMUADD:
5802    case TGSI_OPCODE_ATOMXCHG:
5803    case TGSI_OPCODE_ATOMCAS:
5804    case TGSI_OPCODE_ATOMAND:
5805    case TGSI_OPCODE_ATOMOR:
5806    case TGSI_OPCODE_ATOMXOR:
5807    case TGSI_OPCODE_ATOMUMIN:
5808    case TGSI_OPCODE_ATOMUMAX:
5809    case TGSI_OPCODE_ATOMIMIN:
5810    case TGSI_OPCODE_ATOMIMAX:
5811    case TGSI_OPCODE_ATOMFADD:
5812       exec_atomop(mach, inst);
5813       break;
5814 
5815    case TGSI_OPCODE_RESQ:
5816       exec_resq(mach, inst);
5817       break;
5818    case TGSI_OPCODE_BARRIER:
5819    case TGSI_OPCODE_MEMBAR:
5820       return true;
5821       break;
5822 
5823    case TGSI_OPCODE_I64ABS:
5824       exec_double_unary(mach, inst, micro_i64abs);
5825       break;
5826 
5827    case TGSI_OPCODE_I64SSG:
5828       exec_double_unary(mach, inst, micro_i64sgn);
5829       break;
5830 
5831    case TGSI_OPCODE_I64NEG:
5832       exec_double_unary(mach, inst, micro_i64neg);
5833       break;
5834 
5835    case TGSI_OPCODE_U64SEQ:
5836       exec_double_binary(mach, inst, micro_u64seq, TGSI_EXEC_DATA_UINT);
5837       break;
5838 
5839    case TGSI_OPCODE_U64SNE:
5840       exec_double_binary(mach, inst, micro_u64sne, TGSI_EXEC_DATA_UINT);
5841       break;
5842 
5843    case TGSI_OPCODE_I64SLT:
5844       exec_double_binary(mach, inst, micro_i64slt, TGSI_EXEC_DATA_UINT);
5845       break;
5846    case TGSI_OPCODE_U64SLT:
5847       exec_double_binary(mach, inst, micro_u64slt, TGSI_EXEC_DATA_UINT);
5848       break;
5849 
5850    case TGSI_OPCODE_I64SGE:
5851       exec_double_binary(mach, inst, micro_i64sge, TGSI_EXEC_DATA_UINT);
5852       break;
5853    case TGSI_OPCODE_U64SGE:
5854       exec_double_binary(mach, inst, micro_u64sge, TGSI_EXEC_DATA_UINT);
5855       break;
5856 
5857    case TGSI_OPCODE_I64MIN:
5858       exec_double_binary(mach, inst, micro_i64min, TGSI_EXEC_DATA_INT64);
5859       break;
5860    case TGSI_OPCODE_U64MIN:
5861       exec_double_binary(mach, inst, micro_u64min, TGSI_EXEC_DATA_UINT64);
5862       break;
5863    case TGSI_OPCODE_I64MAX:
5864       exec_double_binary(mach, inst, micro_i64max, TGSI_EXEC_DATA_INT64);
5865       break;
5866    case TGSI_OPCODE_U64MAX:
5867       exec_double_binary(mach, inst, micro_u64max, TGSI_EXEC_DATA_UINT64);
5868       break;
5869    case TGSI_OPCODE_U64ADD:
5870       exec_double_binary(mach, inst, micro_u64add, TGSI_EXEC_DATA_UINT64);
5871       break;
5872    case TGSI_OPCODE_U64MUL:
5873       exec_double_binary(mach, inst, micro_u64mul, TGSI_EXEC_DATA_UINT64);
5874       break;
5875    case TGSI_OPCODE_U64SHL:
5876       exec_arg0_64_arg1_32(mach, inst, micro_u64shl);
5877       break;
5878    case TGSI_OPCODE_I64SHR:
5879       exec_arg0_64_arg1_32(mach, inst, micro_i64shr);
5880       break;
5881    case TGSI_OPCODE_U64SHR:
5882       exec_arg0_64_arg1_32(mach, inst, micro_u64shr);
5883       break;
5884    case TGSI_OPCODE_U64DIV:
5885       exec_double_binary(mach, inst, micro_u64div, TGSI_EXEC_DATA_UINT64);
5886       break;
5887    case TGSI_OPCODE_I64DIV:
5888       exec_double_binary(mach, inst, micro_i64div, TGSI_EXEC_DATA_INT64);
5889       break;
5890    case TGSI_OPCODE_U64MOD:
5891       exec_double_binary(mach, inst, micro_u64mod, TGSI_EXEC_DATA_UINT64);
5892       break;
5893    case TGSI_OPCODE_I64MOD:
5894       exec_double_binary(mach, inst, micro_i64mod, TGSI_EXEC_DATA_INT64);
5895       break;
5896 
5897    case TGSI_OPCODE_F2U64:
5898       exec_t_2_64(mach, inst, micro_f2u64, TGSI_EXEC_DATA_FLOAT);
5899       break;
5900 
5901    case TGSI_OPCODE_F2I64:
5902       exec_t_2_64(mach, inst, micro_f2i64, TGSI_EXEC_DATA_FLOAT);
5903       break;
5904 
5905    case TGSI_OPCODE_U2I64:
5906       exec_t_2_64(mach, inst, micro_u2i64, TGSI_EXEC_DATA_INT);
5907       break;
5908    case TGSI_OPCODE_I2I64:
5909       exec_t_2_64(mach, inst, micro_i2i64, TGSI_EXEC_DATA_INT);
5910       break;
5911 
5912    case TGSI_OPCODE_D2U64:
5913       exec_double_unary(mach, inst, micro_d2u64);
5914       break;
5915 
5916    case TGSI_OPCODE_D2I64:
5917       exec_double_unary(mach, inst, micro_d2i64);
5918       break;
5919 
5920    case TGSI_OPCODE_U642F:
5921       exec_64_2_t(mach, inst, micro_u642f);
5922       break;
5923    case TGSI_OPCODE_I642F:
5924       exec_64_2_t(mach, inst, micro_i642f);
5925       break;
5926 
5927    case TGSI_OPCODE_U642D:
5928       exec_double_unary(mach, inst, micro_u642d);
5929       break;
5930    case TGSI_OPCODE_I642D:
5931       exec_double_unary(mach, inst, micro_i642d);
5932       break;
5933    case TGSI_OPCODE_INTERP_SAMPLE:
5934       exec_interp_at_sample(mach, inst);
5935       break;
5936    case TGSI_OPCODE_INTERP_OFFSET:
5937       exec_interp_at_offset(mach, inst);
5938       break;
5939    case TGSI_OPCODE_INTERP_CENTROID:
5940       exec_interp_at_centroid(mach, inst);
5941       break;
5942    default:
5943       assert( 0 );
5944    }
5945    return false;
5946 }
5947 
5948 static void
tgsi_exec_machine_setup_masks(struct tgsi_exec_machine * mach)5949 tgsi_exec_machine_setup_masks(struct tgsi_exec_machine *mach)
5950 {
5951    unsigned default_mask = 0xf;
5952 
5953    mach->KillMask = 0;
5954    mach->OutputVertexOffset = 0;
5955 
5956    if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
5957       for (unsigned i = 0; i < TGSI_MAX_VERTEX_STREAMS; i++) {
5958          mach->OutputPrimCount[i] = 0;
5959          mach->Primitives[i][0] = 0;
5960       }
5961       /* GS runs on a single primitive for now */
5962       default_mask = 0x1;
5963    }
5964 
5965    if (mach->NonHelperMask == 0)
5966       mach->NonHelperMask = default_mask;
5967    mach->CondMask = default_mask;
5968    mach->LoopMask = default_mask;
5969    mach->ContMask = default_mask;
5970    mach->FuncMask = default_mask;
5971    mach->ExecMask = default_mask;
5972 
5973    mach->Switch.mask = default_mask;
5974 
5975    assert(mach->CondStackTop == 0);
5976    assert(mach->LoopStackTop == 0);
5977    assert(mach->ContStackTop == 0);
5978    assert(mach->SwitchStackTop == 0);
5979    assert(mach->BreakStackTop == 0);
5980    assert(mach->CallStackTop == 0);
5981 }
5982 
5983 /**
5984  * Run TGSI interpreter.
5985  * \return bitmask of "alive" quad components
5986  */
5987 uint
tgsi_exec_machine_run(struct tgsi_exec_machine * mach,int start_pc)5988 tgsi_exec_machine_run( struct tgsi_exec_machine *mach, int start_pc )
5989 {
5990    unsigned i;
5991 
5992    mach->pc = start_pc;
5993 
5994    if (!start_pc) {
5995       tgsi_exec_machine_setup_masks(mach);
5996 
5997       /* execute declarations (interpolants) */
5998       for (i = 0; i < mach->NumDeclarations; i++) {
5999          exec_declaration( mach, mach->Declarations+i );
6000       }
6001    }
6002 
6003    {
6004 #if DEBUG_EXECUTION
6005       struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS];
6006       struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
6007       unsigned inst = 1;
6008 
6009       if (!start_pc) {
6010          memset(mach->Temps, 0, sizeof(temps));
6011          if (mach->Outputs)
6012             memset(mach->Outputs, 0, sizeof(outputs));
6013          memset(temps, 0, sizeof(temps));
6014          memset(outputs, 0, sizeof(outputs));
6015       }
6016 #endif
6017 
6018       /* execute instructions, until pc is set to -1 */
6019       while (mach->pc != -1) {
6020          bool barrier_hit;
6021 #if DEBUG_EXECUTION
6022          unsigned i;
6023 
6024          tgsi_dump_instruction(&mach->Instructions[mach->pc], inst++);
6025 #endif
6026 
6027          assert(mach->pc < (int) mach->NumInstructions);
6028          barrier_hit = exec_instruction(mach, mach->Instructions + mach->pc, &mach->pc);
6029 
6030          /* for compute shaders if we hit a barrier return now for later rescheduling */
6031          if (barrier_hit && mach->ShaderType == PIPE_SHADER_COMPUTE)
6032             return 0;
6033 
6034 #if DEBUG_EXECUTION
6035          for (i = 0; i < TGSI_EXEC_NUM_TEMPS; i++) {
6036             if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
6037                unsigned j;
6038 
6039                memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
6040                debug_printf("TEMP[%2u] = ", i);
6041                for (j = 0; j < 4; j++) {
6042                   if (j > 0) {
6043                      debug_printf("           ");
6044                   }
6045                   debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6046                                temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
6047                                temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
6048                                temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
6049                                temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
6050                }
6051             }
6052          }
6053          if (mach->Outputs) {
6054             for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
6055                if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
6056                   unsigned j;
6057 
6058                   memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
6059                   debug_printf("OUT[%2u] =  ", i);
6060                   for (j = 0; j < 4; j++) {
6061                      if (j > 0) {
6062                         debug_printf("           ");
6063                      }
6064                      debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6065                                   outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
6066                                   outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
6067                                   outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
6068                                   outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
6069                   }
6070                }
6071             }
6072          }
6073 #endif
6074       }
6075    }
6076 
6077 #if 0
6078    /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
6079    if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
6080       /*
6081        * Scale back depth component.
6082        */
6083       for (i = 0; i < 4; i++)
6084          mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
6085    }
6086 #endif
6087 
6088    /* Strictly speaking, these assertions aren't really needed but they
6089     * can potentially catch some bugs in the control flow code.
6090     */
6091    assert(mach->CondStackTop == 0);
6092    assert(mach->LoopStackTop == 0);
6093    assert(mach->ContStackTop == 0);
6094    assert(mach->SwitchStackTop == 0);
6095    assert(mach->BreakStackTop == 0);
6096    assert(mach->CallStackTop == 0);
6097 
6098    return ~mach->KillMask;
6099 }
6100