1 #include <float.h>
2 #include "pipe/p_context.h"
3 #include "pipe/p_defines.h"
4 #include "pipe/p_state.h"
5 #include "util/u_dynarray.h"
6 #include "util/u_inlines.h"
7 #include "util/u_debug.h"
8 #include "util/u_memory.h"
9 
10 #include "pipe/p_shader_tokens.h"
11 #include "tgsi/tgsi_parse.h"
12 #include "tgsi/tgsi_dump.h"
13 
14 #include "nouveau_debug.h"
15 #include "nv_object.xml.h"
16 #include "nv30/nv30-40_3d.xml.h"
17 #include "nv30/nvfx_shader.h"
18 #include "nv30/nv30_state.h"
19 
20 struct nvfx_fpc {
21    struct nv30_fragprog *fp;
22 
23    unsigned max_temps;
24    unsigned long long r_temps;
25    unsigned long long r_temps_discard;
26    struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
27    struct nvfx_reg r_input[PIPE_MAX_SHADER_INPUTS];
28    struct nvfx_reg *r_temp;
29 
30    int num_regs;
31 
32    unsigned inst_offset;
33    unsigned have_const;
34    unsigned is_nv4x;
35 
36    struct util_dynarray imm_data;
37 
38    struct nvfx_reg* r_imm;
39    unsigned nr_imm;
40 
41    struct util_dynarray if_stack;
42    //struct util_dynarray loop_stack;
43    struct util_dynarray label_relocs;
44 };
45 
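/* Allocate the lowest-numbered free temporary register and mark it for
 * release at the end of the current TGSI instruction. */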
46 static inline struct nvfx_reg
47 temp(struct nvfx_fpc *fpc)
48 {
49    int idx = __builtin_ctzll(~fpc->r_temps);
50 
51    if (idx >= fpc->max_temps) {
52       NOUVEAU_ERR("out of temps!!\n");
53       return nvfx_reg(NVFXSR_TEMP, 0);
54    }
55 
56    fpc->r_temps |= (1ULL << idx);
57    fpc->r_temps_discard |= (1ULL << idx);
58    return nvfx_reg(NVFXSR_TEMP, idx);
59 }
60 
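/* Free all temporaries allocated for the current instruction. */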
61 static inline void
62 release_temps(struct nvfx_fpc *fpc)
63 {
64    fpc->r_temps &= ~fpc->r_temps_discard;
65    fpc->r_temps_discard = 0ULL;
66 }
67 
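/* Append a four-component immediate to imm_data and return a register referencing it. */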
68 static inline struct nvfx_reg
69 nvfx_fp_imm(struct nvfx_fpc *fpc, float a, float b, float c, float d)
70 {
71    float v[4] = {a, b, c, d};
72    int idx = fpc->imm_data.size >> 4;
73 
74    memcpy(util_dynarray_grow(&fpc->imm_data, float, 4), v, 4 * sizeof(float));
75    return nvfx_reg(NVFXSR_IMM, idx);
76 }
77 
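/* Grow the emitted instruction buffer by 'size' 32-bit words. */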
78 static void
79 grow_insns(struct nvfx_fpc *fpc, int size)
80 {
81    struct nv30_fragprog *fp = fpc->fp;
82 
83    fp->insn_len += size;
84    fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len);
85 }
86 
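/* Encode source operand 'pos' of the instruction at inst_offset.
 * Immediates and constants allocate an extra 4-word slot after the instruction. */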
87 static void
88 emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_src src)
89 {
90    struct nv30_fragprog *fp = fpc->fp;
91    uint32_t *hw = &fp->insn[fpc->inst_offset];
92    uint32_t sr = 0;
93 
94    switch (src.reg.type) {
95    case NVFXSR_INPUT:
96       sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
97       hw[0] |= (src.reg.index << NVFX_FP_OP_INPUT_SRC_SHIFT);
98       break;
99    case NVFXSR_OUTPUT:
100       sr |= NVFX_FP_REG_SRC_HALF;
101       FALLTHROUGH;
102    case NVFXSR_TEMP:
103       sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
104       sr |= (src.reg.index << NVFX_FP_REG_SRC_SHIFT);
105       break;
106    case NVFXSR_IMM:
107       if (!fpc->have_const) {
108          grow_insns(fpc, 4);
109          hw = &fp->insn[fpc->inst_offset];
110          fpc->have_const = 1;
111       }
112 
113       memcpy(&fp->insn[fpc->inst_offset + 4],
114             (float*)fpc->imm_data.data + src.reg.index * 4,
115             sizeof(uint32_t) * 4);
116 
117       sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
118       break;
119    case NVFXSR_CONST:
120       if (!fpc->have_const) {
121          grow_insns(fpc, 4);
122          hw = &fp->insn[fpc->inst_offset];
123          fpc->have_const = 1;
124       }
125 
126       {
127          struct nv30_fragprog_data *fpd;
128 
129          fp->consts = realloc(fp->consts, ++fp->nr_consts *
130                     sizeof(*fpd));
131          fpd = &fp->consts[fp->nr_consts - 1];
132          fpd->offset = fpc->inst_offset + 4;
133          fpd->index = src.reg.index;
134          memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
135       }
136 
137       sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
138       break;
139    case NVFXSR_NONE:
140       sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
141       break;
142    default:
143       assert(0);
144    }
145 
146    if (src.negate)
147       sr |= NVFX_FP_REG_NEGATE;
148 
149    if (src.abs)
150       hw[1] |= (1 << (29 + pos));
151 
152    sr |= ((src.swz[0] << NVFX_FP_REG_SWZ_X_SHIFT) |
153           (src.swz[1] << NVFX_FP_REG_SWZ_Y_SHIFT) |
154           (src.swz[2] << NVFX_FP_REG_SWZ_Z_SHIFT) |
155           (src.swz[3] << NVFX_FP_REG_SWZ_W_SHIFT));
156 
157    hw[pos + 1] |= sr;
158 }
159 
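/* Encode the destination register of the instruction at inst_offset. */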
160 static void
161 emit_dst(struct nvfx_fpc *fpc, struct nvfx_reg dst)
162 {
163    struct nv30_fragprog *fp = fpc->fp;
164    uint32_t *hw = &fp->insn[fpc->inst_offset];
165 
166    switch (dst.type) {
167    case NVFXSR_OUTPUT:
168       if (dst.index == 1)
169          fp->fp_control |= 0x0000000e;
170       else {
171          hw[0] |= NVFX_FP_OP_OUT_REG_HALF;
172          dst.index <<= 1;
173       }
174       FALLTHROUGH;
175    case NVFXSR_TEMP:
176       if (fpc->num_regs < (dst.index + 1))
177          fpc->num_regs = dst.index + 1;
178       break;
179    case NVFXSR_NONE:
180       hw[0] |= (1 << 30);
181       break;
182    default:
183       assert(0);
184    }
185 
186    hw[0] |= (dst.index << NVFX_FP_OP_OUT_REG_SHIFT);
187 }
188 
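/* Emit a complete arithmetic or texture instruction (4 words, plus an
 * optional constant slot added by emit_src). */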
189 static void
190 nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn)
191 {
192    struct nv30_fragprog *fp = fpc->fp;
193    uint32_t *hw;
194 
195    fpc->inst_offset = fp->insn_len;
196    fpc->have_const = 0;
197    grow_insns(fpc, 4);
198    hw = &fp->insn[fpc->inst_offset];
199    memset(hw, 0, sizeof(uint32_t) * 4);
200 
201    if (insn.op == NVFX_FP_OP_OPCODE_KIL)
202       fp->fp_control |= NV30_3D_FP_CONTROL_USES_KIL;
203    hw[0] |= (insn.op << NVFX_FP_OP_OPCODE_SHIFT);
204    hw[0] |= (insn.mask << NVFX_FP_OP_OUTMASK_SHIFT);
205    hw[2] |= (insn.scale << NVFX_FP_OP_DST_SCALE_SHIFT);
206 
207    if (insn.sat)
208       hw[0] |= NVFX_FP_OP_OUT_SAT;
209 
210    if (insn.cc_update)
211       hw[0] |= NVFX_FP_OP_COND_WRITE_ENABLE;
212    hw[1] |= (insn.cc_test << NVFX_FP_OP_COND_SHIFT);
213    hw[1] |= ((insn.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
214         (insn.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
215         (insn.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
216         (insn.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));
217 
218    if(insn.unit >= 0)
219    {
220       hw[0] |= (insn.unit << NVFX_FP_OP_TEX_UNIT_SHIFT);
221    }
222 
223    emit_dst(fpc, insn.dst);
224    emit_src(fpc, 0, insn.src[0]);
225    emit_src(fpc, 1, insn.src[1]);
226    emit_src(fpc, 2, insn.src[2]);
227 }
228 
229 #define arith(s,o,d,m,s0,s1,s2) \
230        nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, -1, \
231                        (d), (m), (s0), (s1), (s2))
232 
233 #define tex(s,o,u,d,m,s0,s1,s2) \
234    nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, (u), \
235                    (d), (m), (s0), none, none)
236 
237 /* IF src.x != 0, as TGSI specifies */
238 static void
239 nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src)
240 {
241    const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
242    struct nvfx_insn insn = arith(0, MOV, none.reg, NVFX_FP_MASK_X, src, none, none);
243    uint32_t *hw;
244    insn.cc_update = 1;
245    nvfx_fp_emit(fpc, insn);
246 
247    fpc->inst_offset = fpc->fp->insn_len;
248    grow_insns(fpc, 4);
249    hw = &fpc->fp->insn[fpc->inst_offset];
250    /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
251    hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
252       NV40_FP_OP_OUT_NONE |
253       (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
254    /* Use .xxxx swizzle so that we check only src[0].x*/
255    hw[1] = (0 << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
256          (0 << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
257          (0 << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
258          (0 << NVFX_FP_OP_COND_SWZ_W_SHIFT) |
259          (NVFX_FP_OP_COND_NE << NVFX_FP_OP_COND_SHIFT);
260    hw[2] = 0; /* | NV40_FP_OP_OPCODE_IS_BRANCH | else_offset */
261    hw[3] = 0; /* | endif_offset */
262    util_dynarray_append(&fpc->if_stack, unsigned, fpc->inst_offset);
263 }
264 
265 /* CAL to the given subroutine label; the call offset is patched in later via label_relocs */
266 static void
267 nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target)
268 {
269         struct nvfx_relocation reloc;
270         uint32_t *hw;
271         fpc->inst_offset = fpc->fp->insn_len;
272         grow_insns(fpc, 4);
273         hw = &fpc->fp->insn[fpc->inst_offset];
274         /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
275         hw[0] = (NV40_FP_OP_BRA_OPCODE_CAL << NVFX_FP_OP_OPCODE_SHIFT);
276         /* Unconditional: identity condition swizzle with COND_TR */
277         hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
278                         (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
279         hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
280         hw[3] = 0;
281         reloc.target = target;
282         reloc.location = fpc->inst_offset + 2;
283         util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
284 }
285 
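/* Emit an unconditional RET branch instruction (nv4x only). */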
286 static void
287 nv40_fp_ret(struct nvfx_fpc *fpc)
288 {
289    uint32_t *hw;
290    fpc->inst_offset = fpc->fp->insn_len;
291    grow_insns(fpc, 4);
292    hw = &fpc->fp->insn[fpc->inst_offset];
293    /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
294    hw[0] = (NV40_FP_OP_BRA_OPCODE_RET << NVFX_FP_OP_OPCODE_SHIFT);
295    /* Unconditional: identity condition swizzle with COND_TR */
296    hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
297          (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
298    hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
299    hw[3] = 0;
300 }
301 
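/* Emit a REP branch repeating 'count' times; the end offset is patched in later via label_relocs. */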
302 static void
303 nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
304 {
305         struct nvfx_relocation reloc;
306         uint32_t *hw;
307         fpc->inst_offset = fpc->fp->insn_len;
308         grow_insns(fpc, 4);
309         hw = &fpc->fp->insn[fpc->inst_offset];
310         /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
311         hw[0] = (NV40_FP_OP_BRA_OPCODE_REP << NVFX_FP_OP_OPCODE_SHIFT) |
312                         NV40_FP_OP_OUT_NONE |
313                         (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
314         /* Unconditional: identity condition swizzle with COND_TR */
315         hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
316                         (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
317         hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH |
318                         (count << NV40_FP_OP_REP_COUNT1_SHIFT) |
319                         (count << NV40_FP_OP_REP_COUNT2_SHIFT) |
320                         (count << NV40_FP_OP_REP_COUNT3_SHIFT);
321         hw[3] = 0; /* | end_offset */
322         reloc.target = target;
323         reloc.location = fpc->inst_offset + 3;
324         util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
325         //util_dynarray_append(&fpc->loop_stack, unsigned, target);
326 }
327 
328 #if 0
329 /* documentation only */
330 /* warning: this only works forward, and probably only if not inside any IF */
331 static void
332 nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
333 {
334         struct nvfx_relocation reloc;
335         uint32_t *hw;
336         fpc->inst_offset = fpc->fp->insn_len;
337         grow_insns(fpc, 4);
338         hw = &fpc->fp->insn[fpc->inst_offset];
339         /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
340         hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
341                 NV40_FP_OP_OUT_NONE |
342                 (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
343         /* Use .xxxx swizzle so that we check only src[0].x*/
344         hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
345                         (NVFX_FP_OP_COND_FL << NVFX_FP_OP_COND_SHIFT);
346         hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | else_offset */
347         hw[3] = 0; /* | endif_offset */
348         reloc.target = target;
349         reloc.location = fpc->inst_offset + 2;
350         util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
351         reloc.target = target;
352         reloc.location = fpc->inst_offset + 3;
353         util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
354 }
355 #endif
356 
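/* Emit an unconditional BRK branch to leave the innermost REP loop. */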
357 static void
358 nv40_fp_brk(struct nvfx_fpc *fpc)
359 {
360    uint32_t *hw;
361    fpc->inst_offset = fpc->fp->insn_len;
362    grow_insns(fpc, 4);
363    hw = &fpc->fp->insn[fpc->inst_offset];
364    /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
365    hw[0] = (NV40_FP_OP_BRA_OPCODE_BRK << NVFX_FP_OP_OPCODE_SHIFT) |
366       NV40_FP_OP_OUT_NONE;
367    /* Unconditional: identity condition swizzle with COND_TR */
368    hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
369          (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
370    hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH;
371    hw[3] = 0;
372 }
373 
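/* Translate a TGSI source operand into an nvfx_src. */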
374 static inline struct nvfx_src
375 tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
376 {
377    struct nvfx_src src;
378 
379    switch (fsrc->Register.File) {
380    case TGSI_FILE_INPUT:
381       src.reg = fpc->r_input[fsrc->Register.Index];
382       break;
383    case TGSI_FILE_CONSTANT:
384       src.reg = nvfx_reg(NVFXSR_CONST, fsrc->Register.Index);
385       break;
386    case TGSI_FILE_IMMEDIATE:
387       assert(fsrc->Register.Index < fpc->nr_imm);
388       src.reg = fpc->r_imm[fsrc->Register.Index];
389       break;
390    case TGSI_FILE_TEMPORARY:
391       src.reg = fpc->r_temp[fsrc->Register.Index];
392       break;
393    /* NV40 fragprog result regs are just temps, so this is simple */
394    case TGSI_FILE_OUTPUT:
395       src.reg = fpc->r_result[fsrc->Register.Index];
396       break;
397    default:
398       NOUVEAU_ERR("bad src file\n");
399       src.reg.index = 0;
400       src.reg.type = 0;
401       break;
402    }
403 
404    src.abs = fsrc->Register.Absolute;
405    src.negate = fsrc->Register.Negate;
406    src.swz[0] = fsrc->Register.SwizzleX;
407    src.swz[1] = fsrc->Register.SwizzleY;
408    src.swz[2] = fsrc->Register.SwizzleZ;
409    src.swz[3] = fsrc->Register.SwizzleW;
410    src.indirect = 0;
411    src.indirect_reg = 0;
412    src.indirect_swz = 0;
413    return src;
414 }
415 
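/* Translate a TGSI destination operand into an nvfx_reg. */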
416 static inline struct nvfx_reg
417 tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) {
418    switch (fdst->Register.File) {
419    case TGSI_FILE_OUTPUT:
420       return fpc->r_result[fdst->Register.Index];
421    case TGSI_FILE_TEMPORARY:
422       return fpc->r_temp[fdst->Register.Index];
423    case TGSI_FILE_NULL:
424       return nvfx_reg(NVFXSR_NONE, 0);
425    default:
426       NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
427       return nvfx_reg(NVFXSR_NONE, 0);
428    }
429 }
430 
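/* Convert a TGSI write mask into the hardware write mask. */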
431 static inline int
432 tgsi_mask(uint tgsi)
433 {
434    int mask = 0;
435 
436    if (tgsi & TGSI_WRITEMASK_X) mask |= NVFX_FP_MASK_X;
437    if (tgsi & TGSI_WRITEMASK_Y) mask |= NVFX_FP_MASK_Y;
438    if (tgsi & TGSI_WRITEMASK_Z) mask |= NVFX_FP_MASK_Z;
439    if (tgsi & TGSI_WRITEMASK_W) mask |= NVFX_FP_MASK_W;
440    return mask;
441 }
442 
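/* Translate a single TGSI instruction into one or more hardware instructions. */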
443 static bool
444 nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
445             const struct tgsi_full_instruction *finst)
446 {
447    const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
448    struct nvfx_insn insn;
449    struct nvfx_src src[3], tmp;
450    struct nvfx_reg dst;
451    int mask, sat, unit = 0;
452    int ai = -1, ci = -1, ii = -1;
453    int i;
454 
455    if (finst->Instruction.Opcode == TGSI_OPCODE_END)
456       return true;
457 
458    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
459       const struct tgsi_full_src_register *fsrc;
460 
461       fsrc = &finst->Src[i];
462       if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
463          src[i] = tgsi_src(fpc, fsrc);
464       }
465    }
466 
467    for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
468       const struct tgsi_full_src_register *fsrc;
469 
470       fsrc = &finst->Src[i];
471 
472       switch (fsrc->Register.File) {
473       case TGSI_FILE_INPUT:
474          if(fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG && (0
475                || fsrc->Register.SwizzleX == PIPE_SWIZZLE_W
476                || fsrc->Register.SwizzleY == PIPE_SWIZZLE_W
477                || fsrc->Register.SwizzleZ == PIPE_SWIZZLE_W
478                || fsrc->Register.SwizzleW == PIPE_SWIZZLE_W
479                )) {
480             /* hardware puts 0 in fogcoord.w, but GL/Gallium want 1 there */
481             struct nvfx_src addend = nvfx_src(nvfx_fp_imm(fpc, 0, 0, 0, 1));
482             addend.swz[0] = fsrc->Register.SwizzleX;
483             addend.swz[1] = fsrc->Register.SwizzleY;
484             addend.swz[2] = fsrc->Register.SwizzleZ;
485             addend.swz[3] = fsrc->Register.SwizzleW;
486             src[i] = nvfx_src(temp(fpc));
487             nvfx_fp_emit(fpc, arith(0, ADD, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), addend, none));
488          } else if (ai == -1 || ai == fsrc->Register.Index) {
489             ai = fsrc->Register.Index;
490             src[i] = tgsi_src(fpc, fsrc);
491          } else {
492             src[i] = nvfx_src(temp(fpc));
493             nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
494          }
495          break;
496       case TGSI_FILE_CONSTANT:
497          if ((ci == -1 && ii == -1) ||
498              ci == fsrc->Register.Index) {
499             ci = fsrc->Register.Index;
500             src[i] = tgsi_src(fpc, fsrc);
501          } else {
502             src[i] = nvfx_src(temp(fpc));
503             nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
504          }
505          break;
506       case TGSI_FILE_IMMEDIATE:
507          if ((ci == -1 && ii == -1) ||
508              ii == fsrc->Register.Index) {
509             ii = fsrc->Register.Index;
510             src[i] = tgsi_src(fpc, fsrc);
511          } else {
512             src[i] = nvfx_src(temp(fpc));
513             nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
514          }
515          break;
516       case TGSI_FILE_TEMPORARY:
517          /* handled above */
518          break;
519       case TGSI_FILE_SAMPLER:
520          unit = fsrc->Register.Index;
521          break;
522       case TGSI_FILE_OUTPUT:
523          break;
524       default:
525          NOUVEAU_ERR("bad src file\n");
526          return false;
527       }
528    }
529 
530    dst  = tgsi_dst(fpc, &finst->Dst[0]);
531    mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
532    sat  = finst->Instruction.Saturate;
533 
534    switch (finst->Instruction.Opcode) {
535    case TGSI_OPCODE_ADD:
536       nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], src[1], none));
537       break;
538    case TGSI_OPCODE_CEIL:
539       tmp = nvfx_src(temp(fpc));
540       nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, neg(src[0]), none, none));
541       nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, neg(tmp), none, none));
542       break;
543    case TGSI_OPCODE_CMP:
544       insn = arith(0, MOV, none.reg, mask, src[0], none, none);
545       insn.cc_update = 1;
546       nvfx_fp_emit(fpc, insn);
547 
548       insn = arith(sat, MOV, dst, mask, src[2], none, none);
549       insn.cc_test = NVFX_COND_GE;
550       nvfx_fp_emit(fpc, insn);
551 
552       insn = arith(sat, MOV, dst, mask, src[1], none, none);
553       insn.cc_test = NVFX_COND_LT;
554       nvfx_fp_emit(fpc, insn);
555       break;
556    case TGSI_OPCODE_COS:
557       nvfx_fp_emit(fpc, arith(sat, COS, dst, mask, src[0], none, none));
558       break;
559    case TGSI_OPCODE_DDX:
560       if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
561          tmp = nvfx_src(temp(fpc));
562          nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
563          nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
564          nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
565          nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
566       } else {
567          nvfx_fp_emit(fpc, arith(sat, DDX, dst, mask, src[0], none, none));
568       }
569       break;
570    case TGSI_OPCODE_DDY:
571       if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
572          tmp = nvfx_src(temp(fpc));
573          nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
574          nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
575          nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
576          nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
577       } else {
578          nvfx_fp_emit(fpc, arith(sat, DDY, dst, mask, src[0], none, none));
579       }
580       break;
581    case TGSI_OPCODE_DP2:
582       tmp = nvfx_src(temp(fpc));
583       nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], src[1], none));
584       nvfx_fp_emit(fpc, arith(0, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none));
585       break;
586    case TGSI_OPCODE_DP3:
587       nvfx_fp_emit(fpc, arith(sat, DP3, dst, mask, src[0], src[1], none));
588       break;
589    case TGSI_OPCODE_DP4:
590       nvfx_fp_emit(fpc, arith(sat, DP4, dst, mask, src[0], src[1], none));
591       break;
592    case TGSI_OPCODE_DST:
593       nvfx_fp_emit(fpc, arith(sat, DST, dst, mask, src[0], src[1], none));
594       break;
595    case TGSI_OPCODE_EX2:
596       nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, src[0], none, none));
597       break;
598    case TGSI_OPCODE_FLR:
599       nvfx_fp_emit(fpc, arith(sat, FLR, dst, mask, src[0], none, none));
600       break;
601    case TGSI_OPCODE_FRC:
602       nvfx_fp_emit(fpc, arith(sat, FRC, dst, mask, src[0], none, none));
603       break;
604    case TGSI_OPCODE_KILL:
605       nvfx_fp_emit(fpc, arith(0, KIL, none.reg, 0, none, none, none));
606       break;
607    case TGSI_OPCODE_KILL_IF:
608       insn = arith(0, MOV, none.reg, NVFX_FP_MASK_ALL, src[0], none, none);
609       insn.cc_update = 1;
610       nvfx_fp_emit(fpc, insn);
611 
612       insn = arith(0, KIL, none.reg, 0, none, none, none);
613       insn.cc_test = NVFX_COND_LT;
614       nvfx_fp_emit(fpc, insn);
615       break;
616    case TGSI_OPCODE_LG2:
617       nvfx_fp_emit(fpc, arith(sat, LG2, dst, mask, src[0], none, none));
618       break;
619    case TGSI_OPCODE_LIT:
620       if(!fpc->is_nv4x)
621          nvfx_fp_emit(fpc, arith(sat, LIT_NV30, dst, mask, src[0], none, none));
622       else {
623          /* we use FLT_MIN, so that log2 never gives -infinity, and thus multiplication by
624           * specular 0 always gives 0, so that ex2 gives 1, to satisfy the 0^0 = 1 requirement
625           *
626           * NOTE: if we start using half precision, we might need an fp16 FLT_MIN here instead
627           */
628          struct nvfx_src maxs = nvfx_src(nvfx_fp_imm(fpc, 0, FLT_MIN, 0, 0));
629          tmp = nvfx_src(temp(fpc));
630          if (ci>= 0 || ii >= 0) {
631             nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, maxs, none, none));
632             maxs = tmp;
633          }
634          nvfx_fp_emit(fpc, arith(0, MAX, tmp.reg, NVFX_FP_MASK_Y | NVFX_FP_MASK_W, swz(src[0], X, X, X, Y), swz(maxs, X, X, Y, Y), none));
635          nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), none, none));
636          nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), swz(src[0], W, W, W, W), none));
637          nvfx_fp_emit(fpc, arith(sat, LITEX2_NV40, dst, mask, swz(tmp, Y, Y, W, W), none, none));
638       }
639       break;
640    case TGSI_OPCODE_LRP:
641       if(!fpc->is_nv4x)
642          nvfx_fp_emit(fpc, arith(sat, LRP_NV30, dst, mask, src[0], src[1], src[2]));
643       else {
644          tmp = nvfx_src(temp(fpc));
645          nvfx_fp_emit(fpc, arith(0, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
646          nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], tmp));
647       }
648       break;
649    case TGSI_OPCODE_MAD:
650       nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], src[2]));
651       break;
652    case TGSI_OPCODE_MAX:
653       nvfx_fp_emit(fpc, arith(sat, MAX, dst, mask, src[0], src[1], none));
654       break;
655    case TGSI_OPCODE_MIN:
656       nvfx_fp_emit(fpc, arith(sat, MIN, dst, mask, src[0], src[1], none));
657       break;
658    case TGSI_OPCODE_MOV:
659       nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, src[0], none, none));
660       break;
661    case TGSI_OPCODE_MUL:
662       nvfx_fp_emit(fpc, arith(sat, MUL, dst, mask, src[0], src[1], none));
663       break;
664    case TGSI_OPCODE_NOP:
665       break;
666    case TGSI_OPCODE_POW:
667       if(!fpc->is_nv4x)
668          nvfx_fp_emit(fpc, arith(sat, POW_NV30, dst, mask, src[0], src[1], none));
669       else {
670          tmp = nvfx_src(temp(fpc));
671          nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
672          nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
673          nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, swz(tmp, X, X, X, X), none, none));
674       }
675       break;
676    case TGSI_OPCODE_RCP:
677       nvfx_fp_emit(fpc, arith(sat, RCP, dst, mask, src[0], none, none));
678       break;
679    case TGSI_OPCODE_RSQ:
680       if(!fpc->is_nv4x)
681          nvfx_fp_emit(fpc, arith(sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none));
682       else {
683          tmp = nvfx_src(temp(fpc));
684          insn = arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, abs(swz(src[0], X, X, X, X)), none, none);
685          insn.scale = NVFX_FP_OP_DST_SCALE_INV_2X;
686          nvfx_fp_emit(fpc, insn);
687          nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, neg(swz(tmp, X, X, X, X)), none, none));
688       }
689       break;
690    case TGSI_OPCODE_SEQ:
691       nvfx_fp_emit(fpc, arith(sat, SEQ, dst, mask, src[0], src[1], none));
692       break;
693    case TGSI_OPCODE_SGE:
694       nvfx_fp_emit(fpc, arith(sat, SGE, dst, mask, src[0], src[1], none));
695       break;
696    case TGSI_OPCODE_SGT:
697       nvfx_fp_emit(fpc, arith(sat, SGT, dst, mask, src[0], src[1], none));
698       break;
699    case TGSI_OPCODE_SIN:
700       nvfx_fp_emit(fpc, arith(sat, SIN, dst, mask, src[0], none, none));
701       break;
702    case TGSI_OPCODE_SLE:
703       nvfx_fp_emit(fpc, arith(sat, SLE, dst, mask, src[0], src[1], none));
704       break;
705    case TGSI_OPCODE_SLT:
706       nvfx_fp_emit(fpc, arith(sat, SLT, dst, mask, src[0], src[1], none));
707       break;
708    case TGSI_OPCODE_SNE:
709       nvfx_fp_emit(fpc, arith(sat, SNE, dst, mask, src[0], src[1], none));
710       break;
711    case TGSI_OPCODE_SSG:
712    {
713       struct nvfx_src minones = swz(nvfx_src(nvfx_fp_imm(fpc, -1, -1, -1, -1)), X, X, X, X);
714 
715       insn = arith(sat, MOV, dst, mask, src[0], none, none);
716       insn.cc_update = 1;
717       nvfx_fp_emit(fpc, insn);
718 
719       insn = arith(0, STR, dst, mask, none, none, none);
720       insn.cc_test = NVFX_COND_GT;
721       nvfx_fp_emit(fpc, insn);
722 
723       if(!sat) {
724          insn = arith(0, MOV, dst, mask, minones, none, none);
725          insn.cc_test = NVFX_COND_LT;
726          nvfx_fp_emit(fpc, insn);
727       }
728       break;
729    }
730    case TGSI_OPCODE_TEX:
731       nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
732       break;
733    case TGSI_OPCODE_TRUNC:
734       tmp = nvfx_src(temp(fpc));
735       insn = arith(0, MOV, none.reg, mask, src[0], none, none);
736       insn.cc_update = 1;
737       nvfx_fp_emit(fpc, insn);
738 
739       nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, abs(src[0]), none, none));
740       nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, tmp, none, none));
741 
742       insn = arith(sat, MOV, dst, mask, neg(tmp), none, none);
743       insn.cc_test = NVFX_COND_LT;
744       nvfx_fp_emit(fpc, insn);
745       break;
746    case TGSI_OPCODE_TXB:
747       nvfx_fp_emit(fpc, tex(sat, TXB, unit, dst, mask, src[0], none, none));
748       break;
749    case TGSI_OPCODE_TXL:
750       if(fpc->is_nv4x)
751          nvfx_fp_emit(fpc, tex(sat, TXL_NV40, unit, dst, mask, src[0], none, none));
752       else /* unsupported on nv30, use TEX and hope they like it */
753          nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
754       break;
755    case TGSI_OPCODE_TXP:
756       nvfx_fp_emit(fpc, tex(sat, TXP, unit, dst, mask, src[0], none, none));
757       break;
758 
759    case TGSI_OPCODE_IF:
760       // MOVRC0 R31 (TR0.xyzw), R<src>:
761       // IF (NE.xxxx) ELSE <else> END <end>
762       if(!fpc->is_nv4x)
763          goto nv3x_cflow;
764       nv40_fp_if(fpc, src[0]);
765       break;
766 
767    case TGSI_OPCODE_ELSE:
768    {
769       uint32_t *hw;
770       if(!fpc->is_nv4x)
771          goto nv3x_cflow;
772       assert(util_dynarray_contains(&fpc->if_stack, unsigned));
773       hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)];
774       hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
775       break;
776    }
777 
778    case TGSI_OPCODE_ENDIF:
779    {
780       uint32_t *hw;
781       if(!fpc->is_nv4x)
782          goto nv3x_cflow;
783       assert(util_dynarray_contains(&fpc->if_stack, unsigned));
784       hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)];
785       if(!hw[2])
786          hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
787       hw[3] = fpc->fp->insn_len;
788       break;
789    }
790 
791    case TGSI_OPCODE_BGNSUB:
792    case TGSI_OPCODE_ENDSUB:
793       /* nothing to do here */
794       break;
795 
796    case TGSI_OPCODE_CAL:
797       if(!fpc->is_nv4x)
798          goto nv3x_cflow;
799       nv40_fp_cal(fpc, finst->Label.Label);
800       break;
801 
802    case TGSI_OPCODE_RET:
803       if(!fpc->is_nv4x)
804          goto nv3x_cflow;
805       nv40_fp_ret(fpc);
806       break;
807 
808    case TGSI_OPCODE_BGNLOOP:
809       if(!fpc->is_nv4x)
810          goto nv3x_cflow;
811       /* TODO: we should support using two nested REPs to allow a > 255 iteration count */
812       nv40_fp_rep(fpc, 255, finst->Label.Label);
813       break;
814 
815    case TGSI_OPCODE_ENDLOOP:
816       break;
817 
818    case TGSI_OPCODE_BRK:
819       if(!fpc->is_nv4x)
820          goto nv3x_cflow;
821       nv40_fp_brk(fpc);
822       break;
823 
824    case TGSI_OPCODE_CONT:
825    {
826       static int warned = 0;
827       if(!warned) {
828          NOUVEAU_ERR("Sorry, the continue keyword is not implemented: ignoring it.\n");
829          warned = 1;
830       }
831       break;
832    }
833 
834    default:
835       NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
836       return false;
837    }
838 
839 out:
840    release_temps(fpc);
841    return true;
842 nv3x_cflow:
843    {
844       static int warned = 0;
845       if(!warned) {
846          NOUVEAU_ERR(
847                "Sorry, control flow instructions are not supported in hardware on nv3x: ignoring them\n"
848                "If rendering is incorrect, try to disable GLSL support in the application.\n");
849          warned = 1;
850       }
851    }
852    goto out;
853 }
854 
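/* Map a TGSI input declaration to a hardware input register; generic and
 * point-coord inputs are assigned to free texcoord slots in a later pass. */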
855 static bool
856 nvfx_fragprog_parse_decl_input(struct nvfx_fpc *fpc,
857                                const struct tgsi_full_declaration *fdec)
858 {
859    unsigned idx = fdec->Range.First;
860    unsigned hw;
861 
862    switch (fdec->Semantic.Name) {
863    case TGSI_SEMANTIC_POSITION:
864       hw = NVFX_FP_OP_INPUT_SRC_POSITION;
865       break;
866    case TGSI_SEMANTIC_COLOR:
867       hw = NVFX_FP_OP_INPUT_SRC_COL0 + fdec->Semantic.Index;
868       break;
869    case TGSI_SEMANTIC_FOG:
870       hw = NVFX_FP_OP_INPUT_SRC_FOGC;
871       break;
872    case TGSI_SEMANTIC_FACE:
873       hw = NV40_FP_OP_INPUT_SRC_FACING;
874       break;
875    case TGSI_SEMANTIC_TEXCOORD:
876       assert(fdec->Semantic.Index < 8);
877       fpc->fp->texcoord[fdec->Semantic.Index] = fdec->Semantic.Index;
878       fpc->fp->texcoords |= (1 << fdec->Semantic.Index);
879       fpc->fp->vp_or |= (0x00004000 << fdec->Semantic.Index);
880       hw = NVFX_FP_OP_INPUT_SRC_TC(fdec->Semantic.Index);
881       break;
882    case TGSI_SEMANTIC_GENERIC:
883    case TGSI_SEMANTIC_PCOORD:
884       /* will be assigned to remaining TC slots later */
885       return true;
886    default:
887       assert(0);
888       return false;
889    }
890 
891    fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
892    return true;
893 }
894 
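/* Assign generic and point-coord inputs to the remaining free texcoord slots. */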
895 static bool
896 nvfx_fragprog_assign_generic(struct nvfx_fpc *fpc,
897                              const struct tgsi_full_declaration *fdec)
898 {
899    unsigned num_texcoords = fpc->is_nv4x ? 10 : 8;
900    unsigned idx = fdec->Range.First;
901    unsigned hw;
902 
903    switch (fdec->Semantic.Name) {
904    case TGSI_SEMANTIC_GENERIC:
905    case TGSI_SEMANTIC_PCOORD:
906       for (hw = 0; hw < num_texcoords; hw++) {
907          if (fpc->fp->texcoord[hw] == 0xffff) {
908             if (hw <= 7) {
909                fpc->fp->texcoords |= (0x1 << hw);
910                fpc->fp->vp_or |= (0x00004000 << hw);
911             } else {
912                fpc->fp->vp_or |= (0x00001000 << (hw - 8));
913             }
914             if (fdec->Semantic.Name == TGSI_SEMANTIC_PCOORD) {
915                fpc->fp->texcoord[hw] = 0xfffe;
916                fpc->fp->point_sprite_control |= (0x00000100 << hw);
917             } else {
918                fpc->fp->texcoord[hw] = fdec->Semantic.Index + 8;
919             }
920             hw = NVFX_FP_OP_INPUT_SRC_TC(hw);
921             fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
922             return true;
923          }
924       }
925       return false;
926    default:
927       return true;
928    }
929 }
930 
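/* Map a TGSI output declaration (position or color) to its hardware result register. */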
931 static bool
932 nvfx_fragprog_parse_decl_output(struct nvfx_fpc *fpc,
933             const struct tgsi_full_declaration *fdec)
934 {
935    unsigned idx = fdec->Range.First;
936    unsigned hw;
937 
938    switch (fdec->Semantic.Name) {
939    case TGSI_SEMANTIC_POSITION:
940       hw = 1;
941       break;
942    case TGSI_SEMANTIC_COLOR:
943       hw = ~0;
944       switch (fdec->Semantic.Index) {
945       case 0: hw = 0; break;
946       case 1: hw = 2; break;
947       case 2: hw = 3; break;
948       case 3: hw = 4; break;
949       }
950       if(hw > ((fpc->is_nv4x) ? 4 : 2)) {
951          NOUVEAU_ERR("bad rcol index\n");
952          return false;
953       }
954       break;
955    default:
956       NOUVEAU_ERR("bad output semantic\n");
957       return false;
958    }
959 
960    fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
961    fpc->r_temps |= (1ULL << hw);
962    return true;
963 }
964 
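/* Scan the TGSI tokens to record declarations and immediates, assign generic
 * inputs to texcoord slots, and pre-allocate declared temporaries. */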
965 static bool
966 nvfx_fragprog_prepare(struct nvfx_fpc *fpc)
967 {
968    struct tgsi_parse_context p;
969    int high_temp = -1, i;
970 
971    fpc->r_imm = CALLOC(fpc->fp->info.immediate_count, sizeof(struct nvfx_reg));
972 
973    tgsi_parse_init(&p, fpc->fp->pipe.tokens);
974    while (!tgsi_parse_end_of_tokens(&p)) {
975       const union tgsi_full_token *tok = &p.FullToken;
976 
977       tgsi_parse_token(&p);
978       switch(tok->Token.Type) {
979       case TGSI_TOKEN_TYPE_DECLARATION:
980       {
981          const struct tgsi_full_declaration *fdec;
982          fdec = &p.FullToken.FullDeclaration;
983          switch (fdec->Declaration.File) {
984          case TGSI_FILE_INPUT:
985             if (!nvfx_fragprog_parse_decl_input(fpc, fdec))
986                goto out_err;
987             break;
988          case TGSI_FILE_OUTPUT:
989             if (!nvfx_fragprog_parse_decl_output(fpc, fdec))
990                goto out_err;
991             break;
992          case TGSI_FILE_TEMPORARY:
993             if (fdec->Range.Last > high_temp) {
994                high_temp =
995                   fdec->Range.Last;
996             }
997             break;
998          default:
999             break;
1000          }
1001       }
1002          break;
1003       case TGSI_TOKEN_TYPE_IMMEDIATE:
1004       {
1005          struct tgsi_full_immediate *imm;
1006 
1007          imm = &p.FullToken.FullImmediate;
1008          assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
1009          assert(fpc->nr_imm < fpc->fp->info.immediate_count);
1010 
1011          fpc->r_imm[fpc->nr_imm++] = nvfx_fp_imm(fpc, imm->u[0].Float, imm->u[1].Float, imm->u[2].Float, imm->u[3].Float);
1012          break;
1013       }
1014       default:
1015          break;
1016       }
1017    }
1018    tgsi_parse_free(&p);
1019 
1020    tgsi_parse_init(&p, fpc->fp->pipe.tokens);
1021    while (!tgsi_parse_end_of_tokens(&p)) {
1022       const struct tgsi_full_declaration *fdec;
1023       tgsi_parse_token(&p);
1024       switch(p.FullToken.Token.Type) {
1025       case TGSI_TOKEN_TYPE_DECLARATION:
1026          fdec = &p.FullToken.FullDeclaration;
1027          switch (fdec->Declaration.File) {
1028          case TGSI_FILE_INPUT:
1029             if (!nvfx_fragprog_assign_generic(fpc, fdec))
1030                goto out_err;
1031             break;
1032          default:
1033             break;
1034          }
1035          break;
1036       default:
1037          break;
1038       }
1039    }
1040    tgsi_parse_free(&p);
1041 
1042    if (++high_temp) {
1043       fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
1044       for (i = 0; i < high_temp; i++)
1045          fpc->r_temp[i] = temp(fpc);
1046       fpc->r_temps_discard = 0ULL;
1047    }
1048 
1049    return true;
1050 
1051 out_err:
1052    FREE(fpc->r_temp);
1053    fpc->r_temp = NULL;
1054 
1055    tgsi_parse_free(&p);
1056    return false;
1057 }
1058 
1059 DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", false)
1060 
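/* Translate the TGSI fragment program into nv3x/nv4x hardware code. */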
1061 void
1062 _nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp)
1063 {
1064    struct tgsi_parse_context parse;
1065    struct nvfx_fpc *fpc = NULL;
1066    struct util_dynarray insns;
1067 
1068    fp->translated = false;
1069    fp->point_sprite_control = 0;
1070    fp->vp_or = 0;
1071 
1072    fpc = CALLOC_STRUCT(nvfx_fpc);
1073    if (!fpc)
1074       goto out_err;
1075 
1076    fpc->is_nv4x = (oclass >= NV40_3D_CLASS) ? ~0 : 0;
1077    fpc->max_temps = fpc->is_nv4x ? 48 : 32;
1078    fpc->fp = fp;
1079    fpc->num_regs = 2;
1080    memset(fp->texcoord, 0xff, sizeof(fp->texcoord));
1081 
1082    if (fp->info.properties[TGSI_PROPERTY_FS_COORD_ORIGIN])
1083       fp->coord_conventions |= NV30_3D_COORD_CONVENTIONS_ORIGIN_INVERTED;
1084    if (fp->info.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER])
1085       fp->coord_conventions |= NV30_3D_COORD_CONVENTIONS_CENTER_INTEGER;
1086    if (fp->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
1087       fp->rt_enable |= NV30_3D_RT_ENABLE_MRT;
1088 
1089    if (!nvfx_fragprog_prepare(fpc))
1090       goto out_err;
1091 
1092    tgsi_parse_init(&parse, fp->pipe.tokens);
1093    util_dynarray_init(&insns, NULL);
1094 
1095    while (!tgsi_parse_end_of_tokens(&parse)) {
1096       tgsi_parse_token(&parse);
1097 
1098       switch (parse.FullToken.Token.Type) {
1099       case TGSI_TOKEN_TYPE_INSTRUCTION:
1100       {
1101          const struct tgsi_full_instruction *finst;
1102 
1103          util_dynarray_append(&insns, unsigned, fp->insn_len);
1104          finst = &parse.FullToken.FullInstruction;
1105          if (!nvfx_fragprog_parse_instruction(fpc, finst))
1106             goto out_err;
1107       }
1108          break;
1109       default:
1110          break;
1111       }
1112    }
1113    util_dynarray_append(&insns, unsigned, fp->insn_len);
1114 
1115    for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
1116    {
1117       struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)fpc->label_relocs.data + i);
1118       fp->insn[label_reloc->location] |= ((unsigned*)insns.data)[label_reloc->target];
1119    }
1120    util_dynarray_fini(&insns);
1121 
1122    if(!fpc->is_nv4x)
1123       fp->fp_control |= (fpc->num_regs-1)/2;
1124    else
1125       fp->fp_control |= fpc->num_regs << NV40_3D_FP_CONTROL_TEMP_COUNT__SHIFT;
1126 
1127    /* Terminate final instruction */
1128    if(fp->insn)
1129       fp->insn[fpc->inst_offset] |= 0x00000001;
1130 
1131    /* Append NOP + END instruction for branches to the end of the program */
1132    fpc->inst_offset = fp->insn_len;
1133    grow_insns(fpc, 4);
1134    fp->insn[fpc->inst_offset + 0] = 0x00000001;
1135    fp->insn[fpc->inst_offset + 1] = 0x00000000;
1136    fp->insn[fpc->inst_offset + 2] = 0x00000000;
1137    fp->insn[fpc->inst_offset + 3] = 0x00000000;
1138 
1139    if(debug_get_option_nvfx_dump_fp())
1140    {
1141       debug_printf("\n");
1142       tgsi_dump(fp->pipe.tokens, 0);
1143 
1144       debug_printf("\n%s fragment program:\n", fpc->is_nv4x ? "nv4x" : "nv3x");
1145       for (unsigned i = 0; i < fp->insn_len; i += 4)
1146          debug_printf("%3u: %08x %08x %08x %08x\n", i >> 2, fp->insn[i], fp->insn[i + 1], fp->insn[i + 2], fp->insn[i + 3]);
1147       debug_printf("\n");
1148    }
1149 
1150    fp->translated = true;
1151 
1152 out:
1153    tgsi_parse_free(&parse);
1154    if (fpc)
1155    {
1156       FREE(fpc->r_temp);
1157       FREE(fpc->r_imm);
1158       util_dynarray_fini(&fpc->if_stack);
1159       util_dynarray_fini(&fpc->label_relocs);
1160       util_dynarray_fini(&fpc->imm_data);
1161       //util_dynarray_fini(&fpc->loop_stack);
1162       FREE(fpc);
1163    }
1164 
1165    return;
1166 
1167 out_err:
1168    _debug_printf("Error: failed to compile this fragment program:\n");
1169    tgsi_dump(fp->pipe.tokens, 0);
1170    goto out;
1171 }
1172