#include <float.h>
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_dynarray.h"
#include "util/u_inlines.h"
#include "util/u_debug.h"
#include "util/u_memory.h"

#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_dump.h"

#include "nouveau_debug.h"
#include "nv_object.xml.h"
#include "nv30/nv30-40_3d.xml.h"
#include "nv30/nvfx_shader.h"
#include "nv30/nv30_state.h"

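/* Per-compile state for translating a TGSI fragment program into
 * nv30/nv40 hardware instructions. Allocated and freed by
 * _nvfx_fragprog_translate(), so it only lives for one compile.
 */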
struct nvfx_fpc {
   struct nv30_fragprog *fp;

   unsigned max_temps;
   unsigned long long r_temps;
   unsigned long long r_temps_discard;
   struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS];
   struct nvfx_reg r_input[PIPE_MAX_SHADER_INPUTS];
   struct nvfx_reg *r_temp;

   int num_regs;

   unsigned inst_offset;
   unsigned have_const;
   unsigned is_nv4x;

   struct util_dynarray imm_data;

   struct nvfx_reg *r_imm;
   unsigned nr_imm;

   struct util_dynarray if_stack;
   //struct util_dynarray loop_stack;
   struct util_dynarray label_relocs;
};

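/* Allocate the lowest free hardware temp register. The register is
 * also recorded in r_temps_discard, so it is freed again at the end
 * of the current TGSI instruction by release_temps().
 */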
static inline struct nvfx_reg
temp(struct nvfx_fpc *fpc)
{
   int idx = __builtin_ctzll(~fpc->r_temps);

   if (idx >= fpc->max_temps) {
      NOUVEAU_ERR("out of temps!!\n");
      return nvfx_reg(NVFXSR_TEMP, 0);
   }

   fpc->r_temps |= (1ULL << idx);
   fpc->r_temps_discard |= (1ULL << idx);
   return nvfx_reg(NVFXSR_TEMP, idx);
}

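/* Free every temp allocated since the last call. Temps pre-allocated
 * for TGSI temporaries in nvfx_fragprog_prepare() are never marked
 * for discard, so they stay allocated for the whole program.
 */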
static inline void
release_temps(struct nvfx_fpc *fpc)
{
   fpc->r_temps &= ~fpc->r_temps_discard;
   fpc->r_temps_discard = 0ULL;
}

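/* Append a vec4 immediate to the immediate pool and return a register
 * handle for it; the index counts vec4 slots (16 bytes per entry).
 */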
static inline struct nvfx_reg
nvfx_fp_imm(struct nvfx_fpc *fpc, float a, float b, float c, float d)
{
   float v[4] = {a, b, c, d};
   int idx = fpc->imm_data.size >> 4;

   memcpy(util_dynarray_grow(&fpc->imm_data, float, 4), v, 4 * sizeof(float));
   return nvfx_reg(NVFXSR_IMM, idx);
}

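/* Grow the emitted instruction buffer by "size" 32-bit words. Note
 * that realloc may move fp->insn, so cached pointers into the buffer
 * must be refreshed after calling this.
 */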
static void
grow_insns(struct nvfx_fpc *fpc, int size)
{
   struct nv30_fragprog *fp = fpc->fp;

   fp->insn_len += size;
   fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len);
}

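/* Encode source operand "src" into slot "pos" (0-2) of the
 * instruction at fpc->inst_offset. Immediates and constants occupy an
 * extra 4-word constant slot appended to the instruction; immediates
 * are written there directly, while constants get a patch entry in
 * fp->consts so their value can be filled in at upload time.
 */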
static void
emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_src src)
{
   struct nv30_fragprog *fp = fpc->fp;
   uint32_t *hw = &fp->insn[fpc->inst_offset];
   uint32_t sr = 0;

   switch (src.reg.type) {
   case NVFXSR_INPUT:
      sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
      hw[0] |= (src.reg.index << NVFX_FP_OP_INPUT_SRC_SHIFT);
      break;
   case NVFXSR_OUTPUT:
      sr |= NVFX_FP_REG_SRC_HALF;
      FALLTHROUGH;
   case NVFXSR_TEMP:
      sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
      sr |= (src.reg.index << NVFX_FP_REG_SRC_SHIFT);
      break;
   case NVFXSR_IMM:
      if (!fpc->have_const) {
         grow_insns(fpc, 4);
         hw = &fp->insn[fpc->inst_offset];
         fpc->have_const = 1;
      }

      memcpy(&fp->insn[fpc->inst_offset + 4],
             (float *)fpc->imm_data.data + src.reg.index * 4,
             sizeof(uint32_t) * 4);

      sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
      break;
   case NVFXSR_CONST:
      if (!fpc->have_const) {
         grow_insns(fpc, 4);
         hw = &fp->insn[fpc->inst_offset];
         fpc->have_const = 1;
      }

      {
         struct nv30_fragprog_data *fpd;

         fp->consts = realloc(fp->consts, ++fp->nr_consts *
                              sizeof(*fpd));
         fpd = &fp->consts[fp->nr_consts - 1];
         fpd->offset = fpc->inst_offset + 4;
         fpd->index = src.reg.index;
         memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
      }

      sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
      break;
   case NVFXSR_NONE:
      sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
      break;
   default:
      assert(0);
   }

   if (src.negate)
      sr |= NVFX_FP_REG_NEGATE;

   if (src.abs)
      hw[1] |= (1 << (29 + pos));

   sr |= ((src.swz[0] << NVFX_FP_REG_SWZ_X_SHIFT) |
          (src.swz[1] << NVFX_FP_REG_SWZ_Y_SHIFT) |
          (src.swz[2] << NVFX_FP_REG_SWZ_Z_SHIFT) |
          (src.swz[3] << NVFX_FP_REG_SWZ_W_SHIFT));

   hw[pos + 1] |= sr;
}

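/* Encode the destination register of the instruction at
 * fpc->inst_offset, tracking the highest temp index used so the
 * register count can be programmed into fp_control at the end.
 */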
static void
emit_dst(struct nvfx_fpc *fpc, struct nvfx_reg dst)
{
   struct nv30_fragprog *fp = fpc->fp;
   uint32_t *hw = &fp->insn[fpc->inst_offset];

   switch (dst.type) {
   case NVFXSR_OUTPUT:
      if (dst.index == 1)
         fp->fp_control |= 0x0000000e;
      else {
         hw[0] |= NVFX_FP_OP_OUT_REG_HALF;
         dst.index <<= 1;
      }
      FALLTHROUGH;
   case NVFXSR_TEMP:
      if (fpc->num_regs < (dst.index + 1))
         fpc->num_regs = dst.index + 1;
      break;
   case NVFXSR_NONE:
      hw[0] |= (1 << 30);
      break;
   default:
      assert(0);
   }

   hw[0] |= (dst.index << NVFX_FP_OP_OUT_REG_SHIFT);
}

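/* Emit one complete hardware instruction (4 words, plus an optional
 * 4-word constant slot added by emit_src) from the given nvfx_insn.
 */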
static void
nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn)
{
   struct nv30_fragprog *fp = fpc->fp;
   uint32_t *hw;

   fpc->inst_offset = fp->insn_len;
   fpc->have_const = 0;
   grow_insns(fpc, 4);
   hw = &fp->insn[fpc->inst_offset];
   memset(hw, 0, sizeof(uint32_t) * 4);

   if (insn.op == NVFX_FP_OP_OPCODE_KIL)
      fp->fp_control |= NV30_3D_FP_CONTROL_USES_KIL;
   hw[0] |= (insn.op << NVFX_FP_OP_OPCODE_SHIFT);
   hw[0] |= (insn.mask << NVFX_FP_OP_OUTMASK_SHIFT);
   hw[2] |= (insn.scale << NVFX_FP_OP_DST_SCALE_SHIFT);

   if (insn.sat)
      hw[0] |= NVFX_FP_OP_OUT_SAT;

   if (insn.cc_update)
      hw[0] |= NVFX_FP_OP_COND_WRITE_ENABLE;
   hw[1] |= (insn.cc_test << NVFX_FP_OP_COND_SHIFT);
   hw[1] |= ((insn.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
             (insn.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
             (insn.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
             (insn.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));

   if (insn.unit >= 0)
      hw[0] |= (insn.unit << NVFX_FP_OP_TEX_UNIT_SHIFT);

   emit_dst(fpc, insn.dst);
   emit_src(fpc, 0, insn.src[0]);
   emit_src(fpc, 1, insn.src[1]);
   emit_src(fpc, 2, insn.src[2]);
}

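/* Shorthand for building ALU and texture nvfx_insns. Unit -1 in
 * arith() means "no texture unit"; tex() appears to ignore its s1/s2
 * arguments deliberately, since texture ops take a single source.
 */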
#define arith(s,o,d,m,s0,s1,s2) \
   nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, -1, \
             (d), (m), (s0), (s1), (s2))

#define tex(s,o,u,d,m,s0,s1,s2) \
   nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, (u), \
             (d), (m), (s0), none, none)

/* IF src.x != 0, as TGSI specifies */
static void
nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src)
{
   const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
   struct nvfx_insn insn = arith(0, MOV, none.reg, NVFX_FP_MASK_X, src, none, none);
   uint32_t *hw;
   insn.cc_update = 1;
   nvfx_fp_emit(fpc, insn);

   fpc->inst_offset = fpc->fp->insn_len;
   grow_insns(fpc, 4);
   hw = &fpc->fp->insn[fpc->inst_offset];
   /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
   hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
           NV40_FP_OP_OUT_NONE |
           (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
   /* Use .xxxx swizzle so that we check only src[0].x */
   hw[1] = (0 << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
           (0 << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
           (0 << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
           (0 << NVFX_FP_OP_COND_SWZ_W_SHIFT) |
           (NVFX_FP_OP_COND_NE << NVFX_FP_OP_COND_SHIFT);
   hw[2] = 0; /* | NV40_FP_OP_OPCODE_IS_BRANCH | else_offset */
   hw[3] = 0; /* | endif_offset */
   util_dynarray_append(&fpc->if_stack, unsigned, fpc->inst_offset);
}

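/* Emit a subroutine call to a TGSI instruction label; the real
 * instruction offset is patched in later via label_relocs.
 */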
static void
nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target)
{
   struct nvfx_relocation reloc;
   uint32_t *hw;
   fpc->inst_offset = fpc->fp->insn_len;
   grow_insns(fpc, 4);
   hw = &fpc->fp->insn[fpc->inst_offset];
   hw[0] = (NV40_FP_OP_BRA_OPCODE_CAL << NVFX_FP_OP_OPCODE_SHIFT);
   /* unconditional: identity condition swizzle, condition TR (always true) */
   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
           (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */
   hw[3] = 0;
   reloc.target = target;
   reloc.location = fpc->inst_offset + 2;
   util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
}

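/* Emit an unconditional subroutine return. */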
static void
nv40_fp_ret(struct nvfx_fpc *fpc)
{
   uint32_t *hw;
   fpc->inst_offset = fpc->fp->insn_len;
   grow_insns(fpc, 4);
   hw = &fpc->fp->insn[fpc->inst_offset];
   hw[0] = (NV40_FP_OP_BRA_OPCODE_RET << NVFX_FP_OP_OPCODE_SHIFT);
   /* unconditional: identity condition swizzle, condition TR (always true) */
   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
           (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH;
   hw[3] = 0;
}

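/* Emit a REP loop header that iterates "count" times (255 max per
 * level); the end-of-loop offset is patched in via label_relocs.
 */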
static void
nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
{
   struct nvfx_relocation reloc;
   uint32_t *hw;
   fpc->inst_offset = fpc->fp->insn_len;
   grow_insns(fpc, 4);
   hw = &fpc->fp->insn[fpc->inst_offset];
   /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
   hw[0] = (NV40_FP_OP_BRA_OPCODE_REP << NVFX_FP_OP_OPCODE_SHIFT) |
           NV40_FP_OP_OUT_NONE |
           (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
   /* unconditional: identity condition swizzle, condition TR (always true) */
   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) |
           (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH |
           (count << NV40_FP_OP_REP_COUNT1_SHIFT) |
           (count << NV40_FP_OP_REP_COUNT2_SHIFT) |
           (count << NV40_FP_OP_REP_COUNT3_SHIFT);
   hw[3] = 0; /* | end_offset */
   reloc.target = target;
   reloc.location = fpc->inst_offset + 3;
   util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
   //util_dynarray_append(&fpc->loop_stack, unsigned, target);
}

#if 0
/* documentation only */
/* warning: this only works forward, and probably only if not inside any IF */
static void
nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
{
   struct nvfx_relocation reloc;
   uint32_t *hw;
   fpc->inst_offset = fpc->fp->insn_len;
   grow_insns(fpc, 4);
   hw = &fpc->fp->insn[fpc->inst_offset];
   /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */
   hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) |
           NV40_FP_OP_OUT_NONE |
           (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT);
   /* condition FL (always false), so both the else and endif targets point at "target" */
   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
           (NVFX_FP_OP_COND_FL << NVFX_FP_OP_COND_SHIFT);
   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | else_offset */
   hw[3] = 0; /* | endif_offset */
   reloc.target = target;
   reloc.location = fpc->inst_offset + 2;
   util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
   reloc.target = target;
   reloc.location = fpc->inst_offset + 3;
   util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
}
#endif

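/* Emit an unconditional loop break; presumably only meaningful inside
 * a REP block.
 */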
static void
nv40_fp_brk(struct nvfx_fpc *fpc)
{
   uint32_t *hw;
   fpc->inst_offset = fpc->fp->insn_len;
   grow_insns(fpc, 4);
   hw = &fpc->fp->insn[fpc->inst_offset];
   hw[0] = (NV40_FP_OP_BRA_OPCODE_BRK << NVFX_FP_OP_OPCODE_SHIFT) |
           NV40_FP_OP_OUT_NONE;
   /* unconditional: identity condition swizzle, condition TR (always true) */
   hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
           (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT);
   hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH;
   hw[3] = 0;
}

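/* Translate a TGSI source operand into an nvfx_src, resolving the
 * register file to the hardware register assigned earlier.
 */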
static inline struct nvfx_src
tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
{
   struct nvfx_src src;

   switch (fsrc->Register.File) {
   case TGSI_FILE_INPUT:
      src.reg = fpc->r_input[fsrc->Register.Index];
      break;
   case TGSI_FILE_CONSTANT:
      src.reg = nvfx_reg(NVFXSR_CONST, fsrc->Register.Index);
      break;
   case TGSI_FILE_IMMEDIATE:
      assert(fsrc->Register.Index < fpc->nr_imm);
      src.reg = fpc->r_imm[fsrc->Register.Index];
      break;
   case TGSI_FILE_TEMPORARY:
      src.reg = fpc->r_temp[fsrc->Register.Index];
      break;
   /* NV40 fragprog result regs are just temps, so this is simple */
   case TGSI_FILE_OUTPUT:
      src.reg = fpc->r_result[fsrc->Register.Index];
      break;
   default:
      NOUVEAU_ERR("bad src file\n");
      src.reg.index = 0;
      src.reg.type = 0;
      break;
   }

   src.abs = fsrc->Register.Absolute;
   src.negate = fsrc->Register.Negate;
   src.swz[0] = fsrc->Register.SwizzleX;
   src.swz[1] = fsrc->Register.SwizzleY;
   src.swz[2] = fsrc->Register.SwizzleZ;
   src.swz[3] = fsrc->Register.SwizzleW;
   src.indirect = 0;
   src.indirect_reg = 0;
   src.indirect_swz = 0;
   return src;
}

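/* Translate a TGSI destination operand into the matching nvfx_reg. */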
static inline struct nvfx_reg
tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst)
{
   switch (fdst->Register.File) {
   case TGSI_FILE_OUTPUT:
      return fpc->r_result[fdst->Register.Index];
   case TGSI_FILE_TEMPORARY:
      return fpc->r_temp[fdst->Register.Index];
   case TGSI_FILE_NULL:
      return nvfx_reg(NVFXSR_NONE, 0);
   default:
      NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
      return nvfx_reg(NVFXSR_NONE, 0);
   }
}

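/* Convert a TGSI write mask into the hardware XYZW output mask. */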
static inline int
tgsi_mask(uint tgsi)
{
   int mask = 0;

   if (tgsi & TGSI_WRITEMASK_X) mask |= NVFX_FP_MASK_X;
   if (tgsi & TGSI_WRITEMASK_Y) mask |= NVFX_FP_MASK_Y;
   if (tgsi & TGSI_WRITEMASK_Z) mask |= NVFX_FP_MASK_Z;
   if (tgsi & TGSI_WRITEMASK_W) mask |= NVFX_FP_MASK_W;
   return mask;
}

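/* Translate one TGSI instruction into one or more hardware
 * instructions. Sources are copied into temps where the hardware has
 * operand restrictions (at most one distinct input register and one
 * distinct constant/immediate per instruction), and opcodes without a
 * native equivalent are lowered to short instruction sequences.
 */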
static bool
nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
                                const struct tgsi_full_instruction *finst)
{
   const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0));
   struct nvfx_insn insn;
   struct nvfx_src src[3], tmp;
   struct nvfx_reg dst;
   int mask, sat, unit = 0;
   int ai = -1, ci = -1, ii = -1;
   int i;

   if (finst->Instruction.Opcode == TGSI_OPCODE_END)
      return true;

   for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
      const struct tgsi_full_src_register *fsrc;

      fsrc = &finst->Src[i];
      if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
         src[i] = tgsi_src(fpc, fsrc);
      }
   }

   for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
      const struct tgsi_full_src_register *fsrc;

      fsrc = &finst->Src[i];

      switch (fsrc->Register.File) {
      case TGSI_FILE_INPUT:
         if (fpc->fp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG &&
             (fsrc->Register.SwizzleX == PIPE_SWIZZLE_W ||
              fsrc->Register.SwizzleY == PIPE_SWIZZLE_W ||
              fsrc->Register.SwizzleZ == PIPE_SWIZZLE_W ||
              fsrc->Register.SwizzleW == PIPE_SWIZZLE_W)) {
            /* hardware puts 0 in fogcoord.w, but GL/Gallium want 1 there */
            struct nvfx_src addend = nvfx_src(nvfx_fp_imm(fpc, 0, 0, 0, 1));
            addend.swz[0] = fsrc->Register.SwizzleX;
            addend.swz[1] = fsrc->Register.SwizzleY;
            addend.swz[2] = fsrc->Register.SwizzleZ;
            addend.swz[3] = fsrc->Register.SwizzleW;
            src[i] = nvfx_src(temp(fpc));
            nvfx_fp_emit(fpc, arith(0, ADD, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), addend, none));
         } else if (ai == -1 || ai == fsrc->Register.Index) {
            ai = fsrc->Register.Index;
            src[i] = tgsi_src(fpc, fsrc);
         } else {
            src[i] = nvfx_src(temp(fpc));
            nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
         }
         break;
      case TGSI_FILE_CONSTANT:
         if ((ci == -1 && ii == -1) ||
             ci == fsrc->Register.Index) {
            ci = fsrc->Register.Index;
            src[i] = tgsi_src(fpc, fsrc);
         } else {
            src[i] = nvfx_src(temp(fpc));
            nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
         }
         break;
      case TGSI_FILE_IMMEDIATE:
         if ((ci == -1 && ii == -1) ||
             ii == fsrc->Register.Index) {
            ii = fsrc->Register.Index;
            src[i] = tgsi_src(fpc, fsrc);
         } else {
            src[i] = nvfx_src(temp(fpc));
            nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none));
         }
         break;
      case TGSI_FILE_TEMPORARY:
         /* handled above */
         break;
      case TGSI_FILE_SAMPLER:
         unit = fsrc->Register.Index;
         break;
      case TGSI_FILE_OUTPUT:
         break;
      default:
         NOUVEAU_ERR("bad src file\n");
         return false;
      }
   }

   dst = tgsi_dst(fpc, &finst->Dst[0]);
   mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
   sat = finst->Instruction.Saturate;

   switch (finst->Instruction.Opcode) {
   case TGSI_OPCODE_ADD:
      nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], src[1], none));
      break;
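   /* CEIL has no native opcode; use ceil(x) = -floor(-x) */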
   case TGSI_OPCODE_CEIL:
      tmp = nvfx_src(temp(fpc));
      nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, neg(src[0]), none, none));
      nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, neg(tmp), none, none));
      break;
   case TGSI_OPCODE_CMP:
      insn = arith(0, MOV, none.reg, mask, src[0], none, none);
      insn.cc_update = 1;
      nvfx_fp_emit(fpc, insn);

      insn = arith(sat, MOV, dst, mask, src[2], none, none);
      insn.cc_test = NVFX_COND_GE;
      nvfx_fp_emit(fpc, insn);

      insn = arith(sat, MOV, dst, mask, src[1], none, none);
      insn.cc_test = NVFX_COND_LT;
      nvfx_fp_emit(fpc, insn);
      break;
   case TGSI_OPCODE_COS:
      nvfx_fp_emit(fpc, arith(sat, COS, dst, mask, src[0], none, none));
      break;
   case TGSI_OPCODE_DDX:
      if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
         tmp = nvfx_src(temp(fpc));
         nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
         nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
         nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
         nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
      } else {
         nvfx_fp_emit(fpc, arith(sat, DDX, dst, mask, src[0], none, none));
      }
      break;
   case TGSI_OPCODE_DDY:
      if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
         tmp = nvfx_src(temp(fpc));
         nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none));
         nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none));
         nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none));
         nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none));
      } else {
         nvfx_fp_emit(fpc, arith(sat, DDY, dst, mask, src[0], none, none));
      }
      break;
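   /* DP2 has no native opcode; emulate it with a two-component MUL
    * followed by an ADD of the .x and .y products */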
   case TGSI_OPCODE_DP2:
      tmp = nvfx_src(temp(fpc));
      nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], src[1], none));
      nvfx_fp_emit(fpc, arith(0, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none));
      break;
   case TGSI_OPCODE_DP3:
      nvfx_fp_emit(fpc, arith(sat, DP3, dst, mask, src[0], src[1], none));
      break;
   case TGSI_OPCODE_DP4:
      nvfx_fp_emit(fpc, arith(sat, DP4, dst, mask, src[0], src[1], none));
      break;
   case TGSI_OPCODE_DST:
      nvfx_fp_emit(fpc, arith(sat, DST, dst, mask, src[0], src[1], none));
      break;
   case TGSI_OPCODE_EX2:
      nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, src[0], none, none));
      break;
   case TGSI_OPCODE_FLR:
      nvfx_fp_emit(fpc, arith(sat, FLR, dst, mask, src[0], none, none));
      break;
   case TGSI_OPCODE_FRC:
      nvfx_fp_emit(fpc, arith(sat, FRC, dst, mask, src[0], none, none));
      break;
   case TGSI_OPCODE_KILL:
      nvfx_fp_emit(fpc, arith(0, KIL, none.reg, 0, none, none, none));
      break;
   case TGSI_OPCODE_KILL_IF:
      insn = arith(0, MOV, none.reg, NVFX_FP_MASK_ALL, src[0], none, none);
      insn.cc_update = 1;
      nvfx_fp_emit(fpc, insn);

      insn = arith(0, KIL, none.reg, 0, none, none, none);
      insn.cc_test = NVFX_COND_LT;
      nvfx_fp_emit(fpc, insn);
      break;
   case TGSI_OPCODE_LG2:
      nvfx_fp_emit(fpc, arith(sat, LG2, dst, mask, src[0], none, none));
      break;
   case TGSI_OPCODE_LIT:
      if (!fpc->is_nv4x)
         nvfx_fp_emit(fpc, arith(sat, LIT_NV30, dst, mask, src[0], none, none));
      else {
         /* we use FLT_MIN, so that log2 never gives -infinity, and thus multiplication by
          * specular 0 always gives 0, so that ex2 gives 1, to satisfy the 0^0 = 1 requirement
          *
          * NOTE: if we start using half precision, we might need an fp16 FLT_MIN here instead
          */
         struct nvfx_src maxs = nvfx_src(nvfx_fp_imm(fpc, 0, FLT_MIN, 0, 0));
         tmp = nvfx_src(temp(fpc));
         if (ci >= 0 || ii >= 0) {
            nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, maxs, none, none));
            maxs = tmp;
         }
         nvfx_fp_emit(fpc, arith(0, MAX, tmp.reg, NVFX_FP_MASK_Y | NVFX_FP_MASK_W, swz(src[0], X, X, X, Y), swz(maxs, X, X, Y, Y), none));
         nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), none, none));
         nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_W, swz(tmp, W, W, W, W), swz(src[0], W, W, W, W), none));
         nvfx_fp_emit(fpc, arith(sat, LITEX2_NV40, dst, mask, swz(tmp, Y, Y, W, W), none, none));
      }
      break;
   case TGSI_OPCODE_LRP:
      if (!fpc->is_nv4x)
         nvfx_fp_emit(fpc, arith(sat, LRP_NV30, dst, mask, src[0], src[1], src[2]));
      else {
         tmp = nvfx_src(temp(fpc));
         nvfx_fp_emit(fpc, arith(0, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2]));
         nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], tmp));
      }
      break;
   case TGSI_OPCODE_MAD:
      nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], src[2]));
      break;
   case TGSI_OPCODE_MAX:
      nvfx_fp_emit(fpc, arith(sat, MAX, dst, mask, src[0], src[1], none));
      break;
   case TGSI_OPCODE_MIN:
      nvfx_fp_emit(fpc, arith(sat, MIN, dst, mask, src[0], src[1], none));
      break;
   case TGSI_OPCODE_MOV:
      nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, src[0], none, none));
      break;
   case TGSI_OPCODE_MUL:
      nvfx_fp_emit(fpc, arith(sat, MUL, dst, mask, src[0], src[1], none));
      break;
   case TGSI_OPCODE_NOP:
      break;
   case TGSI_OPCODE_POW:
      if (!fpc->is_nv4x)
         nvfx_fp_emit(fpc, arith(sat, POW_NV30, dst, mask, src[0], src[1], none));
      else {
         tmp = nvfx_src(temp(fpc));
         nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none));
         nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none));
         nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, swz(tmp, X, X, X, X), none, none));
      }
      break;
   case TGSI_OPCODE_RCP:
      nvfx_fp_emit(fpc, arith(sat, RCP, dst, mask, src[0], none, none));
      break;
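   /* nv4x has no RSQ; compute 1/sqrt(|x|) as 2^(-log2(|x|)/2), using
    * the destination scale modifier for the divide by two */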
   case TGSI_OPCODE_RSQ:
      if (!fpc->is_nv4x)
         nvfx_fp_emit(fpc, arith(sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none));
      else {
         tmp = nvfx_src(temp(fpc));
         insn = arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, abs(swz(src[0], X, X, X, X)), none, none);
         insn.scale = NVFX_FP_OP_DST_SCALE_INV_2X;
         nvfx_fp_emit(fpc, insn);
         nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, neg(swz(tmp, X, X, X, X)), none, none));
      }
      break;
   case TGSI_OPCODE_SEQ:
      nvfx_fp_emit(fpc, arith(sat, SEQ, dst, mask, src[0], src[1], none));
      break;
   case TGSI_OPCODE_SGE:
      nvfx_fp_emit(fpc, arith(sat, SGE, dst, mask, src[0], src[1], none));
      break;
   case TGSI_OPCODE_SGT:
      nvfx_fp_emit(fpc, arith(sat, SGT, dst, mask, src[0], src[1], none));
      break;
   case TGSI_OPCODE_SIN:
      nvfx_fp_emit(fpc, arith(sat, SIN, dst, mask, src[0], none, none));
      break;
   case TGSI_OPCODE_SLE:
      nvfx_fp_emit(fpc, arith(sat, SLE, dst, mask, src[0], src[1], none));
      break;
   case TGSI_OPCODE_SLT:
      nvfx_fp_emit(fpc, arith(sat, SLT, dst, mask, src[0], src[1], none));
      break;
   case TGSI_OPCODE_SNE:
      nvfx_fp_emit(fpc, arith(sat, SNE, dst, mask, src[0], src[1], none));
      break;
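   /* SSG: copy the source while updating the condition code, then
    * overwrite with 1 where it was positive (STR always writes 1) and
    * with -1 where negative; the -1 store is skipped when saturating,
    * since the saturated MOV already clamped negatives to 0 */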
   case TGSI_OPCODE_SSG:
   {
      struct nvfx_src minones = swz(nvfx_src(nvfx_fp_imm(fpc, -1, -1, -1, -1)), X, X, X, X);

      insn = arith(sat, MOV, dst, mask, src[0], none, none);
      insn.cc_update = 1;
      nvfx_fp_emit(fpc, insn);

      insn = arith(0, STR, dst, mask, none, none, none);
      insn.cc_test = NVFX_COND_GT;
      nvfx_fp_emit(fpc, insn);

      if (!sat) {
         insn = arith(0, MOV, dst, mask, minones, none, none);
         insn.cc_test = NVFX_COND_LT;
         nvfx_fp_emit(fpc, insn);
      }
      break;
   }
   case TGSI_OPCODE_TEX:
      nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
      break;
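   /* TRUNC(x) = sign(x) * floor(|x|): floor the absolute value, then
    * negate the result where the condition code says x was negative */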
   case TGSI_OPCODE_TRUNC:
      tmp = nvfx_src(temp(fpc));
      insn = arith(0, MOV, none.reg, mask, src[0], none, none);
      insn.cc_update = 1;
      nvfx_fp_emit(fpc, insn);

      nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, abs(src[0]), none, none));
      nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, tmp, none, none));

      insn = arith(sat, MOV, dst, mask, neg(tmp), none, none);
      insn.cc_test = NVFX_COND_LT;
      nvfx_fp_emit(fpc, insn);
      break;
   case TGSI_OPCODE_TXB:
      nvfx_fp_emit(fpc, tex(sat, TXB, unit, dst, mask, src[0], none, none));
      break;
   case TGSI_OPCODE_TXL:
      if (fpc->is_nv4x)
         nvfx_fp_emit(fpc, tex(sat, TXL_NV40, unit, dst, mask, src[0], none, none));
      else /* unsupported on nv30, use TEX and hope they like it */
         nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none));
      break;
   case TGSI_OPCODE_TXP:
      nvfx_fp_emit(fpc, tex(sat, TXP, unit, dst, mask, src[0], none, none));
      break;

   case TGSI_OPCODE_IF:
      // MOVRC0 R31 (TR0.xyzw), R<src>:
      // IF (NE.xxxx) ELSE <else> END <end>
      if (!fpc->is_nv4x)
         goto nv3x_cflow;
      nv40_fp_if(fpc, src[0]);
      break;

   case TGSI_OPCODE_ELSE:
   {
      uint32_t *hw;
      if (!fpc->is_nv4x)
         goto nv3x_cflow;
      assert(util_dynarray_contains(&fpc->if_stack, unsigned));
      hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)];
      hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
      break;
   }

   case TGSI_OPCODE_ENDIF:
   {
      uint32_t *hw;
      if (!fpc->is_nv4x)
         goto nv3x_cflow;
      assert(util_dynarray_contains(&fpc->if_stack, unsigned));
      hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)];
      if (!hw[2])
         hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len;
      hw[3] = fpc->fp->insn_len;
      break;
   }

   case TGSI_OPCODE_BGNSUB:
   case TGSI_OPCODE_ENDSUB:
      /* nothing to do here */
      break;

   case TGSI_OPCODE_CAL:
      if (!fpc->is_nv4x)
         goto nv3x_cflow;
      nv40_fp_cal(fpc, finst->Label.Label);
      break;

   case TGSI_OPCODE_RET:
      if (!fpc->is_nv4x)
         goto nv3x_cflow;
      nv40_fp_ret(fpc);
      break;

   case TGSI_OPCODE_BGNLOOP:
      if (!fpc->is_nv4x)
         goto nv3x_cflow;
      /* TODO: we should support using two nested REPs to allow a > 255 iteration count */
      nv40_fp_rep(fpc, 255, finst->Label.Label);
      break;

   case TGSI_OPCODE_ENDLOOP:
      break;

   case TGSI_OPCODE_BRK:
      if (!fpc->is_nv4x)
         goto nv3x_cflow;
      nv40_fp_brk(fpc);
      break;

   case TGSI_OPCODE_CONT:
   {
      static int warned = 0;
      if (!warned) {
         NOUVEAU_ERR("Sorry, the continue keyword is not implemented: ignoring it.\n");
         warned = 1;
      }
      break;
   }

   default:
      NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
      return false;
   }

out:
   release_temps(fpc);
   return true;
nv3x_cflow:
   {
      static int warned = 0;
      if (!warned) {
         NOUVEAU_ERR(
            "Sorry, control flow instructions are not supported in hardware on nv3x: ignoring them\n"
            "If rendering is incorrect, try to disable GLSL support in the application.\n");
         warned = 1;
      }
   }
   goto out;
}

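/* Map a TGSI input declaration to the fixed hardware input slot for
 * its semantic. GENERIC and PCOORD inputs are deferred and assigned
 * to the remaining texcoord slots in nvfx_fragprog_assign_generic().
 */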
static bool
nvfx_fragprog_parse_decl_input(struct nvfx_fpc *fpc,
                               const struct tgsi_full_declaration *fdec)
{
   unsigned idx = fdec->Range.First;
   unsigned hw;

   switch (fdec->Semantic.Name) {
   case TGSI_SEMANTIC_POSITION:
      hw = NVFX_FP_OP_INPUT_SRC_POSITION;
      break;
   case TGSI_SEMANTIC_COLOR:
      hw = NVFX_FP_OP_INPUT_SRC_COL0 + fdec->Semantic.Index;
      break;
   case TGSI_SEMANTIC_FOG:
      hw = NVFX_FP_OP_INPUT_SRC_FOGC;
      break;
   case TGSI_SEMANTIC_FACE:
      hw = NV40_FP_OP_INPUT_SRC_FACING;
      break;
   case TGSI_SEMANTIC_TEXCOORD:
      assert(fdec->Semantic.Index < 8);
      fpc->fp->texcoord[fdec->Semantic.Index] = fdec->Semantic.Index;
      fpc->fp->texcoords |= (1 << fdec->Semantic.Index);
      fpc->fp->vp_or |= (0x00004000 << fdec->Semantic.Index);
      hw = NVFX_FP_OP_INPUT_SRC_TC(fdec->Semantic.Index);
      break;
   case TGSI_SEMANTIC_GENERIC:
   case TGSI_SEMANTIC_PCOORD:
      /* will be assigned to remaining TC slots later */
      return true;
   default:
      assert(0);
      return false;
   }

   fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
   return true;
}

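/* Assign a GENERIC or PCOORD input to the first free texcoord slot
 * (8 slots on nv30, 10 on nv4x). PCOORD slots are marked 0xfffe and
 * enabled in point_sprite_control; fails if all slots are taken.
 */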
static bool
nvfx_fragprog_assign_generic(struct nvfx_fpc *fpc,
                             const struct tgsi_full_declaration *fdec)
{
   unsigned num_texcoords = fpc->is_nv4x ? 10 : 8;
   unsigned idx = fdec->Range.First;
   unsigned hw;

   switch (fdec->Semantic.Name) {
   case TGSI_SEMANTIC_GENERIC:
   case TGSI_SEMANTIC_PCOORD:
      for (hw = 0; hw < num_texcoords; hw++) {
         if (fpc->fp->texcoord[hw] == 0xffff) {
            if (hw <= 7) {
               fpc->fp->texcoords |= (0x1 << hw);
               fpc->fp->vp_or |= (0x00004000 << hw);
            } else {
               fpc->fp->vp_or |= (0x00001000 << (hw - 8));
            }
            if (fdec->Semantic.Name == TGSI_SEMANTIC_PCOORD) {
               fpc->fp->texcoord[hw] = 0xfffe;
               fpc->fp->point_sprite_control |= (0x00000100 << hw);
            } else {
               fpc->fp->texcoord[hw] = fdec->Semantic.Index + 8;
            }
            hw = NVFX_FP_OP_INPUT_SRC_TC(hw);
            fpc->r_input[idx] = nvfx_reg(NVFXSR_INPUT, hw);
            return true;
         }
      }
      return false;
   default:
      return true;
   }
}

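/* Map a TGSI output declaration to its hardware result register:
 * position (depth) goes to register 1, colors to 0/2/3/4. Output
 * registers are ordinary temps, so they are also marked allocated in
 * r_temps.
 */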
static bool
nvfx_fragprog_parse_decl_output(struct nvfx_fpc *fpc,
                                const struct tgsi_full_declaration *fdec)
{
   unsigned idx = fdec->Range.First;
   unsigned hw;

   switch (fdec->Semantic.Name) {
   case TGSI_SEMANTIC_POSITION:
      hw = 1;
      break;
   case TGSI_SEMANTIC_COLOR:
      hw = ~0;
      switch (fdec->Semantic.Index) {
      case 0: hw = 0; break;
      case 1: hw = 2; break;
      case 2: hw = 3; break;
      case 3: hw = 4; break;
      }
      if (hw > ((fpc->is_nv4x) ? 4 : 2)) {
         NOUVEAU_ERR("bad rcol index\n");
         return false;
      }
      break;
   default:
      NOUVEAU_ERR("bad output semantic\n");
      return false;
   }

   fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw);
   fpc->r_temps |= (1ULL << hw);
   return true;
}

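/* First pass over the token stream: resolve input/output declarations
 * to hardware registers, collect immediates into the immediate pool,
 * and pre-allocate one hardware temp per declared TGSI temporary.
 */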
static bool
nvfx_fragprog_prepare(struct nvfx_fpc *fpc)
{
   struct tgsi_parse_context p;
   int high_temp = -1, i;

   fpc->r_imm = CALLOC(fpc->fp->info.immediate_count, sizeof(struct nvfx_reg));

   tgsi_parse_init(&p, fpc->fp->pipe.tokens);
   while (!tgsi_parse_end_of_tokens(&p)) {
      const union tgsi_full_token *tok = &p.FullToken;

      tgsi_parse_token(&p);
      switch (tok->Token.Type) {
      case TGSI_TOKEN_TYPE_DECLARATION:
      {
         const struct tgsi_full_declaration *fdec;
         fdec = &p.FullToken.FullDeclaration;
         switch (fdec->Declaration.File) {
         case TGSI_FILE_INPUT:
            if (!nvfx_fragprog_parse_decl_input(fpc, fdec))
               goto out_err;
            break;
         case TGSI_FILE_OUTPUT:
            if (!nvfx_fragprog_parse_decl_output(fpc, fdec))
               goto out_err;
            break;
         case TGSI_FILE_TEMPORARY:
            if (fdec->Range.Last > high_temp)
               high_temp = fdec->Range.Last;
            break;
         default:
            break;
         }
      }
      break;
      case TGSI_TOKEN_TYPE_IMMEDIATE:
      {
         struct tgsi_full_immediate *imm;

         imm = &p.FullToken.FullImmediate;
         assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
         assert(fpc->nr_imm < fpc->fp->info.immediate_count);

         fpc->r_imm[fpc->nr_imm++] = nvfx_fp_imm(fpc, imm->u[0].Float, imm->u[1].Float, imm->u[2].Float, imm->u[3].Float);
         break;
      }
      default:
         break;
      }
   }
   tgsi_parse_free(&p);

   tgsi_parse_init(&p, fpc->fp->pipe.tokens);
   while (!tgsi_parse_end_of_tokens(&p)) {
      const struct tgsi_full_declaration *fdec;
      tgsi_parse_token(&p);
      switch (p.FullToken.Token.Type) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         fdec = &p.FullToken.FullDeclaration;
         switch (fdec->Declaration.File) {
         case TGSI_FILE_INPUT:
            if (!nvfx_fragprog_assign_generic(fpc, fdec))
               goto out_err;
            break;
         default:
            break;
         }
         break;
      default:
         break;
      }
   }
   tgsi_parse_free(&p);

   if (++high_temp) {
      fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg));
      for (i = 0; i < high_temp; i++)
         fpc->r_temp[i] = temp(fpc);
      fpc->r_temps_discard = 0ULL;
   }

   return true;

out_err:
   FREE(fpc->r_temp);
   fpc->r_temp = NULL;

   tgsi_parse_free(&p);
   return false;
}

DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", false)

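/* Main entry point: translate the TGSI fragment program in fp->pipe
 * into hardware code for the given object class. On success fp->insn
 * holds the instructions and fp->translated is set; setting
 * NVFX_DUMP_FP=1 in the environment dumps both representations.
 */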
void
_nvfx_fragprog_translate(uint16_t oclass, struct nv30_fragprog *fp)
{
   struct tgsi_parse_context parse;
   struct nvfx_fpc *fpc = NULL;
   struct util_dynarray insns;

   fp->translated = false;
   fp->point_sprite_control = 0;
   fp->vp_or = 0;

   fpc = CALLOC_STRUCT(nvfx_fpc);
   if (!fpc)
      goto out_err;

   fpc->is_nv4x = (oclass >= NV40_3D_CLASS) ? ~0 : 0;
   fpc->max_temps = fpc->is_nv4x ? 48 : 32;
   fpc->fp = fp;
   fpc->num_regs = 2;
   memset(fp->texcoord, 0xff, sizeof(fp->texcoord));

   if (fp->info.properties[TGSI_PROPERTY_FS_COORD_ORIGIN])
      fp->coord_conventions |= NV30_3D_COORD_CONVENTIONS_ORIGIN_INVERTED;
   if (fp->info.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER])
      fp->coord_conventions |= NV30_3D_COORD_CONVENTIONS_CENTER_INTEGER;
   if (fp->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
      fp->rt_enable |= NV30_3D_RT_ENABLE_MRT;

   if (!nvfx_fragprog_prepare(fpc))
      goto out_err;

   tgsi_parse_init(&parse, fp->pipe.tokens);
   util_dynarray_init(&insns, NULL);

   while (!tgsi_parse_end_of_tokens(&parse)) {
      tgsi_parse_token(&parse);

      switch (parse.FullToken.Token.Type) {
      case TGSI_TOKEN_TYPE_INSTRUCTION:
      {
         const struct tgsi_full_instruction *finst;

         util_dynarray_append(&insns, unsigned, fp->insn_len);
         finst = &parse.FullToken.FullInstruction;
         if (!nvfx_fragprog_parse_instruction(fpc, finst))
            goto out_err;
      }
      break;
      default:
         break;
      }
   }
   util_dynarray_append(&insns, unsigned, fp->insn_len);

   for (unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_relocation))
   {
      struct nvfx_relocation *label_reloc = (struct nvfx_relocation *)((char *)fpc->label_relocs.data + i);
      fp->insn[label_reloc->location] |= ((unsigned *)insns.data)[label_reloc->target];
   }
   util_dynarray_fini(&insns);

   if (!fpc->is_nv4x)
      fp->fp_control |= (fpc->num_regs - 1) / 2;
   else
      fp->fp_control |= fpc->num_regs << NV40_3D_FP_CONTROL_TEMP_COUNT__SHIFT;

   /* Terminate final instruction */
   if (fp->insn)
      fp->insn[fpc->inst_offset] |= 0x00000001;

   /* Append NOP + END instruction for branches to the end of the program */
   fpc->inst_offset = fp->insn_len;
   grow_insns(fpc, 4);
   fp->insn[fpc->inst_offset + 0] = 0x00000001;
   fp->insn[fpc->inst_offset + 1] = 0x00000000;
   fp->insn[fpc->inst_offset + 2] = 0x00000000;
   fp->insn[fpc->inst_offset + 3] = 0x00000000;

   if (debug_get_option_nvfx_dump_fp())
   {
      debug_printf("\n");
      tgsi_dump(fp->pipe.tokens, 0);

      debug_printf("\n%s fragment program:\n", fpc->is_nv4x ? "nv4x" : "nv3x");
      for (unsigned i = 0; i < fp->insn_len; i += 4)
         debug_printf("%3u: %08x %08x %08x %08x\n", i >> 2, fp->insn[i], fp->insn[i + 1], fp->insn[i + 2], fp->insn[i + 3]);
      debug_printf("\n");
   }

   fp->translated = true;

out:
   tgsi_parse_free(&parse);
   if (fpc)
   {
      FREE(fpc->r_temp);
      FREE(fpc->r_imm);
      util_dynarray_fini(&fpc->if_stack);
      util_dynarray_fini(&fpc->label_relocs);
      util_dynarray_fini(&fpc->imm_data);
      //util_dynarray_fini(&fpc->loop_stack);
      FREE(fpc);
   }

   return;

out_err:
   _debug_printf("Error: failed to compile this fragment program:\n");
   tgsi_dump(fp->pipe.tokens, 0);
   goto out;
}