xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/freedreno/a4xx/fd4_program.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
/*
 * Copyright © 2014 Rob Clark <robclark@freedesktop.org>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */
8 
9 #include "pipe/p_state.h"
10 #include "util/format/u_format.h"
11 #include "util/u_inlines.h"
12 #include "util/u_memory.h"
13 #include "util/u_string.h"
14 
15 #include "freedreno_program.h"
16 
17 #include "fd4_emit.h"
18 #include "fd4_format.h"
19 #include "fd4_program.h"
20 #include "fd4_texture.h"
21 
22 void
fd4_emit_shader(struct fd_ringbuffer * ring,const struct ir3_shader_variant * so)23 fd4_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so)
24 {
25    const struct ir3_info *si = &so->info;
26    enum a4xx_state_block sb = fd4_stage2shadersb(so->type);
27    enum a4xx_state_src src;
28    uint32_t i, sz, *bin;
29 
30    if (FD_DBG(DIRECT)) {
31       sz = si->sizedwords;
32       src = SS4_DIRECT;
33       bin = fd_bo_map(so->bo);
34    } else {
35       sz = 0;
36       src = SS4_INDIRECT;
37       bin = NULL;
38    }
39 
40    OUT_PKT3(ring, CP_LOAD_STATE4, 2 + sz);
41    OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) |
42                      CP_LOAD_STATE4_0_STATE_SRC(src) |
43                      CP_LOAD_STATE4_0_STATE_BLOCK(sb) |
44                      CP_LOAD_STATE4_0_NUM_UNIT(so->instrlen));
45    if (bin) {
46       OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) |
47                         CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER));
48    } else {
49       OUT_RELOC(ring, so->bo, 0, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER), 0);
50    }
51 
52    /* for how clever coverity is, it is sometimes rather dull, and
53     * doesn't realize that the only case where bin==NULL, sz==0:
54     */
55    assume(bin || (sz == 0));
56 
57    for (i = 0; i < sz; i++) {
58       OUT_RING(ring, bin[i]);
59    }
60 }
61 
/* Per-shader-stage bookkeeping used while emitting program state. */
struct stage {
   /* shader variant bound to this stage, or NULL if the stage is unused */
   const struct ir3_shader_variant *v;
   /* points at v->info when v is set, NULL otherwise */
   const struct ir3_info *i;
   /* const sizes are in units of 4 * vec4 */
   uint8_t constoff;
   uint8_t constlen;
   /* instr sizes are in units of 16 instructions */
   uint8_t instroff;
   uint8_t instrlen;
};
72 
/* Indices into the per-stage array; MAX_STAGES is the array length. */
enum { VS = 0, FS = 1, HS = 2, DS = 3, GS = 4, MAX_STAGES };
74 
75 static void
setup_stages(struct fd4_emit * emit,struct stage * s)76 setup_stages(struct fd4_emit *emit, struct stage *s)
77 {
78    unsigned i;
79 
80    s[VS].v = fd4_emit_get_vp(emit);
81    s[FS].v = fd4_emit_get_fp(emit);
82 
83    s[HS].v = s[DS].v = s[GS].v = NULL; /* for now */
84 
85    for (i = 0; i < MAX_STAGES; i++) {
86       if (s[i].v) {
87          s[i].i = &s[i].v->info;
88          /* constlen is in units of 4 * vec4: */
89          assert(s[i].v->constlen % 4 == 0);
90          s[i].constlen = s[i].v->constlen / 4;
91          /* instrlen is already in units of 16 instr.. although
92           * probably we should ditch that and not make the compiler
93           * care about instruction group size of a3xx vs a4xx
94           */
95          s[i].instrlen = s[i].v->instrlen;
96       } else {
97          s[i].i = NULL;
98          s[i].constlen = 0;
99          s[i].instrlen = 0;
100       }
101    }
102 
103    /* NOTE: at least for gles2, blob partitions VS at bottom of const
104     * space and FS taking entire remaining space.  We probably don't
105     * need to do that the same way, but for now mimic what the blob
106     * does to make it easier to diff against register values from blob
107     *
108     * NOTE: if VS.instrlen + FS.instrlen > 64, then one or both shaders
109     * is run from external memory.
110     */
111    if ((s[VS].instrlen + s[FS].instrlen) > 64) {
112       /* prioritize FS for internal memory: */
113       if (s[FS].instrlen < 64) {
114          /* if FS can fit, kick VS out to external memory: */
115          s[VS].instrlen = 0;
116       } else if (s[VS].instrlen < 64) {
117          /* otherwise if VS can fit, kick out FS: */
118          s[FS].instrlen = 0;
119       } else {
120          /* neither can fit, run both from external memory: */
121          s[VS].instrlen = 0;
122          s[FS].instrlen = 0;
123       }
124    }
125    s[VS].constlen = 66;
126    s[FS].constlen = 128 - s[VS].constlen;
127    s[VS].instroff = 0;
128    s[VS].constoff = 0;
129    s[FS].instroff = 64 - s[FS].instrlen;
130    s[FS].constoff = s[VS].constlen;
131    s[HS].instroff = s[DS].instroff = s[GS].instroff = s[FS].instroff;
132    s[HS].constoff = s[DS].constoff = s[GS].constoff = s[FS].constoff;
133 }
134 
135 void
fd4_program_emit(struct fd_ringbuffer * ring,struct fd4_emit * emit,int nr,struct pipe_surface ** bufs)136 fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, int nr,
137                  struct pipe_surface **bufs)
138 {
139    struct stage s[MAX_STAGES];
140    uint32_t pos_regid, posz_regid, psize_regid, color_regid[8];
141    uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid,
142       samp_mask_regid, ij_regid[IJ_COUNT];
143    enum a3xx_threadsize fssz;
144    int constmode;
145    int i, j;
146 
147    assert(nr <= ARRAY_SIZE(color_regid));
148 
149    if (emit->binning_pass)
150       nr = 0;
151 
152    setup_stages(emit, s);
153 
154    fssz = (s[FS].i->double_threadsize) ? FOUR_QUADS : TWO_QUADS;
155 
156    /* blob seems to always use constmode currently: */
157    constmode = 1;
158 
159    pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS);
160    if (pos_regid == regid(63, 0)) {
161       /* hw dislikes when there is no position output, which can
162        * happen for transform-feedback vertex shaders.  Just tell
163        * the hw to use r0.x, with whatever random value is there:
164        */
165       pos_regid = regid(0, 0);
166    }
167    posz_regid = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DEPTH);
168    psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ);
169    if (s[FS].v->color0_mrt) {
170       color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
171          color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] =
172             ir3_find_output_regid(s[FS].v, FRAG_RESULT_COLOR);
173    } else {
174       color_regid[0] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA0);
175       color_regid[1] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA1);
176       color_regid[2] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA2);
177       color_regid[3] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA3);
178       color_regid[4] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA4);
179       color_regid[5] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA5);
180       color_regid[6] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA6);
181       color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7);
182    }
183 
184    samp_id_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_ID);
185    samp_mask_regid =
186       ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_MASK_IN);
187    face_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRONT_FACE);
188    coord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRAG_COORD);
189    zwcoord_regid =
190       (coord_regid == regid(63, 0)) ? regid(63, 0) : (coord_regid + 2);
191    for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
192       ij_regid[i] = ir3_find_sysval_regid(
193          s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
194 
195    /* we could probably divide this up into things that need to be
196     * emitted if frag-prog is dirty vs if vert-prog is dirty..
197     */
198 
199    OUT_PKT0(ring, REG_A4XX_HLSQ_UPDATE_CONTROL, 1);
200    OUT_RING(ring, 0x00000003);
201 
202    OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5);
203    OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) |
204                      A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) |
205                      A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE |
206                      /* NOTE:  I guess SHADERRESTART and CONSTFULLUPDATE maybe
207                       * flush some caches? I think we only need to set those
208                       * bits if we have updated const or shader..
209                       */
210                      A4XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART |
211                      A4XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
212    OUT_RING(ring, A4XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
213                      A4XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE |
214                      A4XX_HLSQ_CONTROL_1_REG_COORDREGID(coord_regid) |
215                      A4XX_HLSQ_CONTROL_1_REG_ZWCOORDREGID(zwcoord_regid));
216    OUT_RING(ring, A4XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(63) |
217                      A4XX_HLSQ_CONTROL_2_REG_SAMPLEID_REGID(samp_id_regid) |
218                      A4XX_HLSQ_CONTROL_2_REG_SAMPLEMASK_REGID(samp_mask_regid) |
219                      A4XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid));
220    /* XXX left out centroid/sample for now */
221    OUT_RING(
222       ring,
223       A4XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) |
224          A4XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) |
225          A4XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(
226             ij_regid[IJ_PERSP_CENTROID]) |
227          A4XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(
228             ij_regid[IJ_LINEAR_CENTROID]));
229    OUT_RING(ring, 0x00fcfcfc); /* XXX HLSQ_CONTROL_4 */
230 
231    OUT_PKT0(ring, REG_A4XX_HLSQ_VS_CONTROL_REG, 5);
232    OUT_RING(ring,
233             A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(s[VS].constlen) |
234                A4XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
235                COND(s[VS].v && s[VS].v->has_ssbo, A4XX_HLSQ_VS_CONTROL_REG_SSBO_ENABLE) |
236                COND(s[VS].v, A4XX_HLSQ_VS_CONTROL_REG_ENABLED) |
237                A4XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(s[VS].instrlen) |
238                A4XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET(s[VS].instroff));
239    OUT_RING(ring,
240             A4XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(s[FS].constlen) |
241                A4XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
242                COND(s[FS].v && s[FS].v->has_ssbo, A4XX_HLSQ_FS_CONTROL_REG_SSBO_ENABLE) |
243                COND(s[FS].v, A4XX_HLSQ_FS_CONTROL_REG_ENABLED) |
244                A4XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(s[FS].instrlen) |
245                A4XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET(s[FS].instroff));
246    OUT_RING(ring,
247             A4XX_HLSQ_HS_CONTROL_REG_CONSTLENGTH(s[HS].constlen) |
248                A4XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
249                COND(s[HS].v && s[HS].v->has_ssbo, A4XX_HLSQ_HS_CONTROL_REG_SSBO_ENABLE) |
250                A4XX_HLSQ_HS_CONTROL_REG_INSTRLENGTH(s[HS].instrlen) |
251                A4XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET(s[HS].instroff));
252    OUT_RING(ring,
253             A4XX_HLSQ_DS_CONTROL_REG_CONSTLENGTH(s[DS].constlen) |
254                A4XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
255                COND(s[DS].v && s[DS].v->has_ssbo, A4XX_HLSQ_DS_CONTROL_REG_SSBO_ENABLE) |
256                A4XX_HLSQ_DS_CONTROL_REG_INSTRLENGTH(s[DS].instrlen) |
257                A4XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET(s[DS].instroff));
258    OUT_RING(ring,
259             A4XX_HLSQ_GS_CONTROL_REG_CONSTLENGTH(s[GS].constlen) |
260                A4XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
261                COND(s[GS].v && s[GS].v->has_ssbo, A4XX_HLSQ_GS_CONTROL_REG_SSBO_ENABLE) |
262                A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(s[GS].instrlen) |
263                A4XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET(s[GS].instroff));
264 
265    OUT_PKT0(ring, REG_A4XX_SP_SP_CTRL_REG, 1);
266    OUT_RING(ring,
267             0x140010 | /* XXX */
268                COND(emit->binning_pass, A4XX_SP_SP_CTRL_REG_BINNING_PASS));
269 
270    OUT_PKT0(ring, REG_A4XX_SP_INSTR_CACHE_CTRL, 1);
271    OUT_RING(ring, 0x7f | /* XXX */
272                      COND(s[VS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_VS_BUFFER) |
273                      COND(s[FS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_FS_BUFFER) |
274                      COND(s[VS].instrlen && s[FS].instrlen,
275                           A4XX_SP_INSTR_CACHE_CTRL_INSTR_BUFFER));
276 
277    OUT_PKT0(ring, REG_A4XX_SP_VS_LENGTH_REG, 1);
278    OUT_RING(ring, s[VS].v->instrlen); /* SP_VS_LENGTH_REG */
279 
280    OUT_PKT0(ring, REG_A4XX_SP_VS_CTRL_REG0, 3);
281    OUT_RING(
282       ring,
283       A4XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) |
284          A4XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(s[VS].i->max_half_reg + 1) |
285          A4XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) |
286          A4XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
287          A4XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
288          A4XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
289          COND(s[VS].v->need_pixlod, A4XX_SP_VS_CTRL_REG0_PIXLODENABLE));
290    OUT_RING(ring,
291             A4XX_SP_VS_CTRL_REG1_CONSTLENGTH(s[VS].constlen) |
292                A4XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(s[VS].v->total_in));
293    OUT_RING(ring, A4XX_SP_VS_PARAM_REG_POSREGID(pos_regid) |
294                      A4XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) |
295                      A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(s[FS].v->varying_in));
296 
297    struct ir3_shader_linkage l = {0};
298    ir3_link_shaders(&l, s[VS].v, s[FS].v, false);
299 
300    for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) {
301       uint32_t reg = 0;
302 
303       OUT_PKT0(ring, REG_A4XX_SP_VS_OUT_REG(i), 1);
304 
305       reg |= A4XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid);
306       reg |= A4XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask);
307       j++;
308 
309       reg |= A4XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid);
310       reg |= A4XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask);
311       j++;
312 
313       OUT_RING(ring, reg);
314    }
315 
316    for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) {
317       uint32_t reg = 0;
318 
319       OUT_PKT0(ring, REG_A4XX_SP_VS_VPC_DST_REG(i), 1);
320 
321       reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc + 8);
322       reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc + 8);
323       reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc + 8);
324       reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc + 8);
325 
326       OUT_RING(ring, reg);
327    }
328 
329    OUT_PKT0(ring, REG_A4XX_SP_VS_OBJ_OFFSET_REG, 2);
330    OUT_RING(ring, A4XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[VS].constoff) |
331                      A4XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[VS].instroff));
332    OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */
333 
334    if (emit->binning_pass) {
335       OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1);
336       OUT_RING(ring, 0x00000000); /* SP_FS_LENGTH_REG */
337 
338       OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2);
339       OUT_RING(ring,
340                A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
341                   COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) |
342                   A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(0) |
343                   A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(0) |
344                   A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
345                   A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
346                   A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE);
347       OUT_RING(ring,
348                A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | 0x80000000);
349 
350       OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2);
351       OUT_RING(ring,
352                A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
353                   A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff));
354       OUT_RING(ring, 0x00000000);
355    } else {
356       OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1);
357       OUT_RING(ring, s[FS].v->instrlen); /* SP_FS_LENGTH_REG */
358 
359       OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2);
360       OUT_RING(
361          ring,
362          A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) |
363             COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) |
364             A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) |
365             A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
366             A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
367             A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
368             A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
369             COND(s[FS].v->need_pixlod, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE));
370       OUT_RING(ring,
371                A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) |
372                   0x80000000 | /* XXX */
373                   COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) |
374                   COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) |
375                   COND(s[FS].v->fragcoord_compmask != 0,
376                        A4XX_SP_FS_CTRL_REG1_FRAGCOORD));
377 
378       OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2);
379       OUT_RING(ring,
380                A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) |
381                   A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff));
382       OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */
383    }
384 
385    OUT_PKT0(ring, REG_A4XX_SP_HS_OBJ_OFFSET_REG, 1);
386    OUT_RING(ring, A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[HS].constoff) |
387                      A4XX_SP_HS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[HS].instroff));
388 
389    OUT_PKT0(ring, REG_A4XX_SP_DS_OBJ_OFFSET_REG, 1);
390    OUT_RING(ring, A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[DS].constoff) |
391                      A4XX_SP_DS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[DS].instroff));
392 
393    OUT_PKT0(ring, REG_A4XX_SP_GS_OBJ_OFFSET_REG, 1);
394    OUT_RING(ring, A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[GS].constoff) |
395                      A4XX_SP_GS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[GS].instroff));
396 
397    OUT_PKT0(ring, REG_A4XX_GRAS_CNTL, 1);
398    OUT_RING(ring,
399             CONDREG(face_regid, A4XX_GRAS_CNTL_IJ_PERSP) |
400                CONDREG(zwcoord_regid, A4XX_GRAS_CNTL_IJ_PERSP) |
401                CONDREG(ij_regid[IJ_PERSP_PIXEL], A4XX_GRAS_CNTL_IJ_PERSP) |
402                CONDREG(ij_regid[IJ_LINEAR_PIXEL], A4XX_GRAS_CNTL_IJ_LINEAR) |
403                CONDREG(ij_regid[IJ_PERSP_CENTROID], A4XX_GRAS_CNTL_IJ_PERSP));
404 
405    OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL2, 1);
406    OUT_RING(
407       ring,
408       A4XX_RB_RENDER_CONTROL2_MSAA_SAMPLES(0) |
409          CONDREG(ij_regid[IJ_PERSP_PIXEL],
410                  A4XX_RB_RENDER_CONTROL2_IJ_PERSP_PIXEL) |
411          CONDREG(ij_regid[IJ_PERSP_CENTROID],
412                  A4XX_RB_RENDER_CONTROL2_IJ_PERSP_CENTROID) |
413          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A4XX_RB_RENDER_CONTROL2_SIZE) |
414          CONDREG(samp_id_regid, A4XX_RB_RENDER_CONTROL2_SAMPLEID) |
415          COND(s[FS].v->frag_face, A4XX_RB_RENDER_CONTROL2_FACENESS) |
416          CONDREG(samp_mask_regid, A4XX_RB_RENDER_CONTROL2_SAMPLEMASK) |
417          COND(s[FS].v->fragcoord_compmask != 0,
418               A4XX_RB_RENDER_CONTROL2_COORD_MASK(s[FS].v->fragcoord_compmask)));
419 
420    OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1);
421    OUT_RING(ring,
422             A4XX_RB_FS_OUTPUT_REG_MRT(nr) |
423                COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z));
424 
425    OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1);
426    OUT_RING(ring,
427             A4XX_SP_FS_OUTPUT_REG_MRT(nr) |
428                COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) |
429                A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid));
430 
431    OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8);
432    for (i = 0; i < 8; i++) {
433       enum a4xx_color_fmt format = 0;
434       bool srgb = false;
435       bool uint = false;
436       bool sint = false;
437       if (i < nr) {
438          format = fd4_emit_format(bufs[i]);
439          if (bufs[i]) {
440             if (!emit->no_decode_srgb)
441                srgb = util_format_is_srgb(bufs[i]->format);
442             uint = util_format_is_pure_uint(bufs[i]->format);
443             sint = util_format_is_pure_sint(bufs[i]->format);
444          }
445       }
446       OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) |
447                         A4XX_SP_FS_MRT_REG_MRTFORMAT(format) |
448                         COND(srgb, A4XX_SP_FS_MRT_REG_COLOR_SRGB) |
449                         COND(uint, A4XX_SP_FS_MRT_REG_COLOR_UINT) |
450                         COND(sint, A4XX_SP_FS_MRT_REG_COLOR_SINT) |
451                         COND(color_regid[i] & HALF_REG_ID,
452                              A4XX_SP_FS_MRT_REG_HALF_PRECISION));
453    }
454 
455    if (emit->binning_pass) {
456       OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
457       OUT_RING(ring, A4XX_VPC_ATTR_THRDASSIGN(1) | 0x40000000 | /* XXX */
458                         COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
459       OUT_RING(ring, 0x00000000);
460    } else {
461       uint32_t vinterp[8], vpsrepl[8];
462 
463       memset(vinterp, 0, sizeof(vinterp));
464       memset(vpsrepl, 0, sizeof(vpsrepl));
465 
466       /* looks like we need to do int varyings in the frag
467        * shader on a4xx (no flatshad reg?  or a420.0 bug?):
468        *
469        *    (sy)(ss)nop
470        *    (sy)ldlv.u32 r0.x,l[r0.x], 1
471        *    ldlv.u32 r0.y,l[r0.x+1], 1
472        *    (ss)bary.f (ei)r63.x, 0, r0.x
473        *    (ss)(rpt1)cov.s32f16 hr0.x, (r)r0.x
474        *    (rpt5)nop
475        *    sam (f16)(xyzw)hr0.x, hr0.x, s#0, t#0
476        *
477        * Possibly on later a4xx variants we'll be able to use
478        * something like the code below instead of workaround
479        * in the shader:
480        */
481       /* figure out VARYING_INTERP / VARYING_PS_REPL register values: */
482       for (j = -1;
483            (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count;) {
484          /* NOTE: varyings are packed, so if compmask is 0xb
485           * then first, third, and fourth component occupy
486           * three consecutive varying slots:
487           */
488          unsigned compmask = s[FS].v->inputs[j].compmask;
489 
490          uint32_t inloc = s[FS].v->inputs[j].inloc;
491 
492          if (s[FS].v->inputs[j].flat ||
493              (s[FS].v->inputs[j].rasterflat && emit->rasterflat)) {
494             uint32_t loc = inloc;
495 
496             for (i = 0; i < 4; i++) {
497                if (compmask & (1 << i)) {
498                   vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
499                   // flatshade[loc / 32] |= 1 << (loc % 32);
500                   loc++;
501                }
502             }
503          }
504 
505          bool coord_mode = emit->sprite_coord_mode;
506          if (ir3_point_sprite(s[FS].v, j, emit->sprite_coord_enable,
507                               &coord_mode)) {
508             /* mask is two 2-bit fields, where:
509              *   '01' -> S
510              *   '10' -> T
511              *   '11' -> 1 - T  (flip mode)
512              */
513             unsigned mask = coord_mode ? 0b1101 : 0b1001;
514             uint32_t loc = inloc;
515             if (compmask & 0x1) {
516                vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2);
517                loc++;
518             }
519             if (compmask & 0x2) {
520                vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2);
521                loc++;
522             }
523             if (compmask & 0x4) {
524                /* .z <- 0.0f */
525                vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2);
526                loc++;
527             }
528             if (compmask & 0x8) {
529                /* .w <- 1.0f */
530                vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2);
531                loc++;
532             }
533          }
534       }
535 
536       OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2);
537       OUT_RING(ring, A4XX_VPC_ATTR_TOTALATTR(s[FS].v->total_in) |
538                         A4XX_VPC_ATTR_THRDASSIGN(1) |
539                         COND(s[FS].v->total_in > 0, A4XX_VPC_ATTR_ENABLE) |
540                         0x40000000 | /* XXX */
541                         COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE));
542       OUT_RING(ring, A4XX_VPC_PACK_NUMFPNONPOSVAR(s[FS].v->total_in) |
543                         A4XX_VPC_PACK_NUMNONPOSVSVAR(s[FS].v->total_in));
544 
545       OUT_PKT0(ring, REG_A4XX_VPC_VARYING_INTERP_MODE(0), 8);
546       for (i = 0; i < 8; i++)
547          OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */
548 
549       OUT_PKT0(ring, REG_A4XX_VPC_VARYING_PS_REPL_MODE(0), 8);
550       for (i = 0; i < 8; i++)
551          OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */
552    }
553 
554    if (s[VS].instrlen)
555       fd4_emit_shader(ring, s[VS].v);
556 
557    if (!emit->binning_pass)
558       if (s[FS].instrlen)
559          fd4_emit_shader(ring, s[FS].v);
560 }
561 
562 static struct ir3_program_state *
fd4_program_create(void * data,const struct ir3_shader_variant * bs,const struct ir3_shader_variant * vs,const struct ir3_shader_variant * hs,const struct ir3_shader_variant * ds,const struct ir3_shader_variant * gs,const struct ir3_shader_variant * fs,const struct ir3_cache_key * key)563 fd4_program_create(void *data, const struct ir3_shader_variant *bs,
564                    const struct ir3_shader_variant *vs,
565                    const struct ir3_shader_variant *hs,
566                    const struct ir3_shader_variant *ds,
567                    const struct ir3_shader_variant *gs,
568                    const struct ir3_shader_variant *fs,
569                    const struct ir3_cache_key *key) in_dt
570 {
571    struct fd_context *ctx = fd_context(data);
572    struct fd4_program_state *state = CALLOC_STRUCT(fd4_program_state);
573 
574    tc_assert_driver_thread(ctx->tc);
575 
576    state->bs = bs;
577    state->vs = vs;
578    state->fs = fs;
579 
580    return &state->base;
581 }
582 
/**
 * ir3 cache callback: release a program state created by
 * fd4_program_create().
 *
 * @param data   unused
 * @param state  base state; converted back to the containing
 *               fd4_program_state and freed
 */
static void
fd4_program_destroy(void *data, struct ir3_program_state *state)
{
   struct fd4_program_state *so = fd4_program_state(state);
   free(so);
}
589 
590 static const struct ir3_cache_funcs cache_funcs = {
591    .create_state = fd4_program_create,
592    .destroy_state = fd4_program_destroy,
593 };
594 
595 void
fd4_prog_init(struct pipe_context * pctx)596 fd4_prog_init(struct pipe_context *pctx)
597 {
598    struct fd_context *ctx = fd_context(pctx);
599 
600    ctx->shader_cache = ir3_cache_create(&cache_funcs, ctx);
601    ir3_prog_init(pctx);
602    fd_prog_init(pctx);
603 }
604