xref: /aosp_15_r20/external/mesa3d/src/gallium/frontends/nine/nine_ff.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2011 Joakim Sindholt <[email protected]>
3  * Copyright Axel Davy <[email protected]>
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "device9.h"
8 #include "basetexture9.h"
9 #include "vertexdeclaration9.h"
10 #include "vertexshader9.h"
11 #include "pixelshader9.h"
12 #include "nine_ff.h"
13 #include "nine_defines.h"
14 #include "nine_helpers.h"
15 #include "nine_pipe.h"
16 #include "nine_dump.h"
17 
18 #include "pipe/p_context.h"
19 #include "tgsi/tgsi_ureg.h"
20 #include "tgsi/tgsi_dump.h"
21 #include "util/bitscan.h"
22 #include "util/box.h"
23 #include "util/u_hash_table.h"
24 #include "util/u_upload_mgr.h"
25 
26 #define DBG_CHANNEL DBG_FF
27 
28 #define NINE_FF_NUM_VS_CONST 204
29 #define NINE_FF_NUM_PS_CONST 24
30 
/* Four packed floats, laid out in x, y, z, w order. */
struct fvec4
{
    float x;
    float y;
    float z;
    float w;
};
35 
/*
 * Key describing one fixed-function vertex shader variant.
 *
 * The named bitfields overlay value32[]/value64[], which the hash and
 * compare callbacks in this file operate on; the explicit pad fields round
 * each group of bitfields up to a full 32-bit word.
 */
struct nine_ff_vs_key
{
    union {
        struct {
            /* --- word 0: transform, lighting and fog switches --- */
            uint32_t position_t          : 1;
            uint32_t lighting            : 1;
            uint32_t darkness            : 1; /* lighting enabled but no active lights */
            uint32_t localviewer         : 1;
            uint32_t vertexpointsize     : 1;
            uint32_t pointscale          : 1;
            uint32_t vertexblend         : 3;
            uint32_t vertexblend_indexed : 1;
            uint32_t vertextween         : 1;
            /* material sources: 0 = material, 1 = color1, 2 = color2 */
            uint32_t mtl_diffuse         : 2;
            uint32_t mtl_ambient         : 2;
            uint32_t mtl_specular        : 2;
            uint32_t mtl_emissive        : 2;
            uint32_t fog_mode            : 2;
            uint32_t fog_range           : 1;
            uint32_t color0in_one        : 1;
            uint32_t color1in_zero       : 1;
            uint32_t has_normal          : 1;
            uint32_t fog                 : 1;
            uint32_t normalizenormals    : 1;
            uint32_t ucp                 : 1;
            uint32_t pad1                : 4;

            /* --- word 1: texcoord input dimensions, 8 * 2 bits --- */
            uint32_t tc_dim_input        : 16;
            uint32_t pad2                : 16;

            /* --- word 2: texcoord output dimensions, 8 * 3 bits --- */
            uint32_t tc_dim_output       : 24;
            uint32_t pad3                : 8;

            /* --- word 3: texcoord generation mode, 8 * 3 bits --- */
            uint32_t tc_gen              : 24;
            uint32_t pad4                : 8;

            /* --- word 4: texcoord index (8 * 3 bits) + clip plane emulation --- */
            uint32_t tc_idx              : 24;
            uint32_t clipplane_emulate   : 8;

            /* --- word 5: tested against (1 << NINE_DECLUSAGE_*) bits --- */
            uint32_t passthrough;
        };
        /* don't forget to resize VertexShader9.ff_key */
        uint64_t value64[3];
        uint32_t value32[6];
    };
};
76 
77 /* Texture stage state:
78  *
79  * COLOROP       D3DTOP 5 bit
80  * ALPHAOP       D3DTOP 5 bit
81  * COLORARG0     D3DTA  3 bit
82  * COLORARG1     D3DTA  3 bit
83  * COLORARG2     D3DTA  3 bit
84  * ALPHAARG0     D3DTA  3 bit
85  * ALPHAARG1     D3DTA  3 bit
86  * ALPHAARG2     D3DTA  3 bit
87  * RESULTARG     D3DTA  1 bit (CURRENT:0 or TEMP:1)
88  * TEXCOORDINDEX 0 - 7  3 bit
89  * ===========================
90  *                     32 bit per stage
91  */
/*
 * Key describing one fixed-function pixel shader variant.
 *
 * Eight packed 32-bit texture stage descriptors come first, followed by one
 * word of global flags and three byte arrays. The whole key is also
 * reachable as value32[]/value64[], which the hash and compare callbacks in
 * this file operate on.
 */
struct nine_ff_ps_key
{
    union {
        struct {
            /* One texture stage: exactly 32 bits. */
            struct {
                uint32_t colorop   : 5;
                uint32_t alphaop   : 5;
                uint32_t colorarg0 : 3;
                uint32_t colorarg1 : 3;
                uint32_t colorarg2 : 3;
                uint32_t alphaarg0 : 3;
                uint32_t alphaarg1 : 3;
                uint32_t alphaarg2 : 3;
                uint32_t resultarg : 1; /* CURRENT:0 or TEMP:1 */
                uint32_t textarget : 2; /* 1D/2D/3D/CUBE */
                uint32_t pad       : 1;
            } ts[8];

            /* Global flags; this completes the 9th 32-bit word. */
            uint32_t projected            : 16;
            uint32_t fog                  : 1; /* for vFog coming from VS */
            uint32_t fog_mode             : 2;
            uint32_t fog_source           : 1; /* 0: Z, 1: W */
            uint32_t specular             : 1;
            uint32_t alpha_test_emulation : 3;
            uint32_t flatshade            : 1;
            uint32_t pad1                 : 7;

            /* 9 bytes of data: 11 32-bit words plus a byte so far ... */
            uint8_t colorarg_b4[3];
            uint8_t colorarg_b5[3];
            uint8_t alphaarg_b4[3];
            /* ... padded out to 12 full words. */
            uint8_t pad2[3];
        };
        /* don't forget to resize PixelShader9.ff_key */
        uint64_t value64[6];
        uint32_t value32[12];
    };
};
127 
nine_ff_vs_key_hash(const void * key)128 static uint32_t nine_ff_vs_key_hash(const void *key)
129 {
130     const struct nine_ff_vs_key *vs = key;
131     unsigned i;
132     uint32_t hash = vs->value32[0];
133     for (i = 1; i < ARRAY_SIZE(vs->value32); ++i)
134         hash ^= vs->value32[i];
135     return hash;
136 }
nine_ff_vs_key_comp(const void * key1,const void * key2)137 static bool nine_ff_vs_key_comp(const void *key1, const void *key2)
138 {
139     struct nine_ff_vs_key *a = (struct nine_ff_vs_key *)key1;
140     struct nine_ff_vs_key *b = (struct nine_ff_vs_key *)key2;
141 
142     return memcmp(a->value64, b->value64, sizeof(a->value64)) == 0;
143 }
nine_ff_ps_key_hash(const void * key)144 static uint32_t nine_ff_ps_key_hash(const void *key)
145 {
146     const struct nine_ff_ps_key *ps = key;
147     unsigned i;
148     uint32_t hash = ps->value32[0];
149     for (i = 1; i < ARRAY_SIZE(ps->value32); ++i)
150         hash ^= ps->value32[i];
151     return hash;
152 }
nine_ff_ps_key_comp(const void * key1,const void * key2)153 static bool nine_ff_ps_key_comp(const void *key1, const void *key2)
154 {
155     struct nine_ff_ps_key *a = (struct nine_ff_ps_key *)key1;
156     struct nine_ff_ps_key *b = (struct nine_ff_ps_key *)key2;
157 
158     return memcmp(a->value64, b->value64, sizeof(a->value64)) == 0;
159 }
nine_ff_fvf_key_hash(const void * key)160 static uint32_t nine_ff_fvf_key_hash(const void *key)
161 {
162     return *(DWORD *)key;
163 }
nine_ff_fvf_key_comp(const void * key1,const void * key2)164 static bool nine_ff_fvf_key_comp(const void *key1, const void *key2)
165 {
166     return *(DWORD *)key1 == *(DWORD *)key2;
167 }
168 
169 static void nine_ff_prune_vs(struct NineDevice9 *);
170 static void nine_ff_prune_ps(struct NineDevice9 *);
171 
/*
 * Dump the TGSI tokens of the program under construction.
 *
 * Nothing is dumped unless the NINE_FF_DUMP boolean debug option is set or
 * the caller forces it with `override`.
 */
static void nine_ureg_tgsi_dump(struct ureg_program *ureg, bool override)
{
    const struct tgsi_token *tokens;

    /* The debug option is queried first, exactly as before. */
    if (!debug_get_bool_option("NINE_FF_DUMP", false) && !override)
        return;

    tokens = ureg_get_tokens(ureg, NULL);
    tgsi_dump(tokens, 0);
    ureg_free_tokens(tokens);
}
180 
/* Single-component selectors for a destination-style register r: r is first
 * turned into a source with ureg_src(), then ureg_scalar() picks the
 * component. */
#define _X(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_X)
#define _Y(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_Y)
#define _Z(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_Z)
#define _W(r) ureg_scalar(ureg_src(r), TGSI_SWIZZLE_W)

/* Same selectors for an r that is already a source register. */
#define _XXXX(r) ureg_scalar(r, TGSI_SWIZZLE_X)
#define _YYYY(r) ureg_scalar(r, TGSI_SWIZZLE_Y)
#define _ZZZZ(r) ureg_scalar(r, TGSI_SWIZZLE_Z)
#define _WWWW(r) ureg_scalar(r, TGSI_SWIZZLE_W)

/* Identity: use the source register unchanged. */
#define _XYZW(r) (r)

/* The macros below expand to code using a local variable named `ureg`. */

/* AL should contain base address of lights table. */
/* Constant i, addressed indirectly through the X component of AL. */
#define LIGHT_CONST(i)                                                \
    ureg_src_indirect(ureg_DECL_constant(ureg, i), _X(AL))

/* Constant 19 + i; CONST[19] is where the material constants start. */
#define MATERIAL_CONST(i) \
    ureg_DECL_constant(ureg, 19 + (i))

/* Declare constant n and yield it as a source register. */
#define _CONST(n) ureg_DECL_constant(ureg, n)
201 
202 /* VS FF constants layout:
203  *
204  * CONST[ 0.. 3] D3DTS_WORLD * D3DTS_VIEW * D3DTS_PROJECTION
205  * CONST[ 4.. 7] D3DTS_WORLD * D3DTS_VIEW
206  * CONST[ 8..11] D3DTS_PROJECTION
207  * CONST[12..15] D3DTS_VIEW^(-1)
208  * CONST[16..18] Normal matrix
209  *
210  * CONST[19].xyz  MATERIAL.Emissive + Material.Ambient * RS.Ambient
211  * CONST[20]      MATERIAL.Diffuse
212  * CONST[21]      MATERIAL.Ambient
213  * CONST[22]      MATERIAL.Specular
214  * CONST[23].x___ MATERIAL.Power
215  * CONST[24]      MATERIAL.Emissive
216  * CONST[25]      RS.Ambient
217  *
218  * CONST[26].x___ RS.PointSizeMin
219  * CONST[26]._y__ RS.PointSizeMax
220  * CONST[26].__z_ RS.PointSize
221  * CONST[26].___w RS.PointScaleA
222  * CONST[27].x___ RS.PointScaleB
223  * CONST[27]._y__ RS.PointScaleC
224  *
225  * CONST[28].x___ RS.FogEnd
226  * CONST[28]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
227  * CONST[28].__z_ RS.FogDensity
 *
229  * CONST[30].x___ TWEENFACTOR
230  *
231  * CONST[32].x___ LIGHT[0].Type
232  * CONST[32]._yzw LIGHT[0].Attenuation0,1,2
233  * CONST[33]      LIGHT[0].Diffuse
234  * CONST[34]      LIGHT[0].Specular
235  * CONST[35]      LIGHT[0].Ambient
236  * CONST[36].xyz_ LIGHT[0].Position
237  * CONST[36].___w LIGHT[0].Range
238  * CONST[37].xyz_ LIGHT[0].Direction
239  * CONST[37].___w LIGHT[0].Falloff
240  * CONST[38].x___ cos(LIGHT[0].Theta / 2)
241  * CONST[38]._y__ cos(LIGHT[0].Phi / 2)
242  * CONST[38].__z_ 1.0f / (cos(LIGHT[0].Theta / 2) - cos(Light[0].Phi / 2))
243  * CONST[39].xyz_ LIGHT[0].HalfVector (for directional lights)
244  * CONST[39].___w 1 if this is the last active light, 0 if not
245  * CONST[40]      LIGHT[1]
246  * CONST[48]      LIGHT[2]
247  * CONST[56]      LIGHT[3]
248  * CONST[64]      LIGHT[4]
249  * CONST[72]      LIGHT[5]
250  * CONST[80]      LIGHT[6]
251  * CONST[88]      LIGHT[7]
252  * NOTE: no lighting code is generated if there are no active lights
253  *
254  * CONST[100].x___ Viewport 2/width
255  * CONST[100]._y__ Viewport 2/height
256  * CONST[100].__z_ Viewport 1/(zmax - zmin)
257  * CONST[100].___w Viewport width
258  * CONST[101].x___ Viewport x0
259  * CONST[101]._y__ Viewport y0
260  * CONST[101].__z_ Viewport z0
261  *
262  * CONST[128..131] D3DTS_TEXTURE0
263  * CONST[132..135] D3DTS_TEXTURE1
264  * CONST[136..139] D3DTS_TEXTURE2
265  * CONST[140..143] D3DTS_TEXTURE3
266  * CONST[144..147] D3DTS_TEXTURE4
267  * CONST[148..151] D3DTS_TEXTURE5
268  * CONST[152..155] D3DTS_TEXTURE6
269  * CONST[156..159] D3DTS_TEXTURE7
270  *
271  * CONST[160] D3DTS_WORLDMATRIX[0] * D3DTS_VIEW
272  * CONST[164] D3DTS_WORLDMATRIX[1] * D3DTS_VIEW
273  * ...
274  * CONST[192] D3DTS_WORLDMATRIX[8] * D3DTS_VIEW
275  * CONST[196] UCP0
 * ...
277  * CONST[203] UCP7
278  */
279 struct vs_build_ctx
280 {
281     struct ureg_program *ureg;
282     const struct nine_ff_vs_key *key;
283 
284     uint16_t input[PIPE_MAX_ATTRIBS];
285     unsigned num_inputs;
286 
287     struct ureg_src aVtx;
288     struct ureg_src aNrm;
289     struct ureg_src aCol[2];
290     struct ureg_src aTex[8];
291     struct ureg_src aPsz;
292     struct ureg_src aInd;
293     struct ureg_src aWgt;
294 
295     struct ureg_src aVtx1; /* tweening */
296     struct ureg_src aNrm1;
297 
298     struct ureg_src mtlA;
299     struct ureg_src mtlD;
300     struct ureg_src mtlS;
301     struct ureg_src mtlE;
302 };
303 
304 static inline unsigned
get_texcoord_sn(struct pipe_screen * screen)305 get_texcoord_sn(struct pipe_screen *screen)
306 {
307     if (screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD))
308         return TGSI_SEMANTIC_TEXCOORD;
309     return TGSI_SEMANTIC_GENERIC;
310 }
311 
312 static inline struct ureg_src
build_vs_add_input(struct vs_build_ctx * vs,uint16_t ndecl)313 build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
314 {
315     const unsigned i = vs->num_inputs++;
316     assert(i < PIPE_MAX_ATTRIBS);
317     vs->input[i] = ndecl;
318     return ureg_DECL_vs_input(vs->ureg, i);
319 }
320 
321 /* NOTE: dst may alias src */
322 static inline void
ureg_normalize3(struct ureg_program * ureg,struct ureg_dst dst,struct ureg_src src)323 ureg_normalize3(struct ureg_program *ureg,
324                 struct ureg_dst dst, struct ureg_src src)
325 {
326     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
327     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
328 
329     ureg_DP3(ureg, tmp_x, src, src);
330     ureg_RSQ(ureg, tmp_x, _X(tmp));
331     ureg_MUL(ureg, dst, src, _X(tmp));
332     ureg_release_temporary(ureg, tmp);
333 }
334 
335 static void *
nine_ff_build_vs(struct NineDevice9 * device,struct vs_build_ctx * vs)336 nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
337 {
338     const struct nine_ff_vs_key *key = vs->key;
339     struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
340     struct ureg_dst oPos, oCol[2], oPsz, oFog;
341     struct ureg_dst AR;
342     unsigned i, c;
343     unsigned label[32], l = 0;
344     bool need_aNrm = key->lighting || key->passthrough & (1 << NINE_DECLUSAGE_NORMAL);
345     bool has_aNrm;
346     bool need_aVtx = key->lighting || key->fog_mode || key->pointscale || key->ucp;
347     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
348 
349     vs->ureg = ureg;
350 
351     /* Check which inputs we should transform. */
352     for (i = 0; i < 8 * 3; i += 3) {
353         switch ((key->tc_gen >> i) & 0x7) {
354         case NINED3DTSS_TCI_CAMERASPACENORMAL:
355             need_aNrm = true;
356             break;
357         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
358             need_aVtx = true;
359             break;
360         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
361             need_aVtx = need_aNrm = true;
362             break;
363         case NINED3DTSS_TCI_SPHEREMAP:
364             need_aVtx = need_aNrm = true;
365             break;
366         default:
367             break;
368         }
369     }
370 
371     has_aNrm = need_aNrm && key->has_normal;
372 
373     /* Declare and record used inputs (needed for linkage with vertex format):
374      * (texture coordinates handled later)
375      */
376     vs->aVtx = build_vs_add_input(vs,
377         key->position_t ? NINE_DECLUSAGE_POSITIONT : NINE_DECLUSAGE_POSITION);
378 
379     vs->aNrm = ureg_imm1f(ureg, 0.0f);
380     if (has_aNrm)
381         vs->aNrm = build_vs_add_input(vs, NINE_DECLUSAGE_NORMAL);
382 
383     vs->aCol[0] = ureg_imm1f(ureg, 1.0f);
384     vs->aCol[1] = ureg_imm1f(ureg, 0.0f);
385 
386     if (key->lighting || key->darkness) {
387         const unsigned mask = key->mtl_diffuse | key->mtl_specular |
388                               key->mtl_ambient | key->mtl_emissive;
389         if ((mask & 0x1) && !key->color0in_one)
390             vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
391         if ((mask & 0x2) && !key->color1in_zero)
392             vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
393 
394         vs->mtlD = MATERIAL_CONST(1);
395         vs->mtlA = MATERIAL_CONST(2);
396         vs->mtlS = MATERIAL_CONST(3);
397         vs->mtlE = MATERIAL_CONST(5);
398         if (key->mtl_diffuse  == 1) vs->mtlD = vs->aCol[0]; else
399         if (key->mtl_diffuse  == 2) vs->mtlD = vs->aCol[1];
400         if (key->mtl_ambient  == 1) vs->mtlA = vs->aCol[0]; else
401         if (key->mtl_ambient  == 2) vs->mtlA = vs->aCol[1];
402         if (key->mtl_specular == 1) vs->mtlS = vs->aCol[0]; else
403         if (key->mtl_specular == 2) vs->mtlS = vs->aCol[1];
404         if (key->mtl_emissive == 1) vs->mtlE = vs->aCol[0]; else
405         if (key->mtl_emissive == 2) vs->mtlE = vs->aCol[1];
406     } else {
407         if (!key->color0in_one) vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
408         if (!key->color1in_zero) vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
409     }
410 
411     if (key->vertexpointsize)
412         vs->aPsz = build_vs_add_input(vs, NINE_DECLUSAGE_PSIZE);
413 
414     if (key->vertexblend_indexed || key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES))
415         vs->aInd = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDINDICES);
416     if (key->vertexblend || key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT))
417         vs->aWgt = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDWEIGHT);
418     if (key->vertextween) {
419         vs->aVtx1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(POSITION,1));
420         vs->aNrm1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(NORMAL,1));
421     }
422 
423     /* Declare outputs:
424      */
425     oPos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); /* HPOS */
426     oCol[0] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0));
427     oCol[1] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1));
428     if (key->fog || key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
429         oFog = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 16);
430         oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
431     }
432 
433     if (key->vertexpointsize || key->pointscale || device->driver_caps.always_output_pointsize) {
434         oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0,
435                                        TGSI_WRITEMASK_X, 0, 1);
436         oPsz = ureg_writemask(oPsz, TGSI_WRITEMASK_X);
437     }
438 
439     if (key->lighting || key->vertexblend)
440         AR = ureg_DECL_address(ureg);
441 
442     /* === Vertex transformation / vertex blending:
443      */
444 
445     if (key->position_t) {
446         if (device->driver_caps.window_space_position_support) {
447             ureg_MOV(ureg, oPos, vs->aVtx);
448         } else {
449             struct ureg_dst tmp = ureg_DECL_temporary(ureg);
450             /* vs->aVtx contains the coordinates buffer wise.
451             * later in the pipeline, clipping, viewport and division
452             * by w (rhw = 1/w) are going to be applied, so do the reverse
453             * of these transformations (except clipping) to have the good
454             * position at the end.*/
455             ureg_MOV(ureg, tmp, vs->aVtx);
456             /* X from [X_min, X_min + width] to [-1, 1], same for Y. Z to [0, 1] */
457             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), ureg_negate(_CONST(101)));
458             ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _CONST(100));
459             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, -1.0f));
460             /* Y needs to be reversed */
461             ureg_MOV(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_negate(ureg_src(tmp)));
462             /* Replace w by 1 if it equals to 0 */
463             ureg_CMP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W), ureg_negate(ureg_abs(ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_W))),
464                      ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_W), ureg_imm1f(ureg, 1.0f));
465             /* inverse rhw */
466             ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W), _W(tmp));
467             /* multiply X, Y, Z by w */
468             ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _W(tmp));
469             ureg_MOV(ureg, oPos, ureg_src(tmp));
470             ureg_release_temporary(ureg, tmp);
471         }
472     } else if (key->vertexblend) {
473         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
474         struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
475         struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
476         struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
477         struct ureg_dst sum_blendweights = ureg_DECL_temporary(ureg);
478         struct ureg_src cWM[4];
479 
480         for (i = 160; i <= 195; ++i)
481             ureg_DECL_constant(ureg, i);
482 
483         /* translate world matrix index to constant file index */
484         if (key->vertexblend_indexed) {
485             ureg_MAD(ureg, tmp, vs->aInd, ureg_imm1f(ureg, 4.0f), ureg_imm1f(ureg, 160.0f));
486             ureg_ARL(ureg, AR, ureg_src(tmp));
487         }
488 
489         ureg_MOV(ureg, aVtx_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
490         ureg_MOV(ureg, aNrm_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
491         ureg_MOV(ureg, sum_blendweights, ureg_imm4f(ureg, 1.0f, 1.0f, 1.0f, 1.0f));
492 
493         for (i = 0; i < key->vertexblend; ++i) {
494             for (c = 0; c < 4; ++c) {
495                 cWM[c] = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, (160 + i * 4) * !key->vertexblend_indexed + c), 0);
496                 if (key->vertexblend_indexed)
497                     cWM[c] = ureg_src_indirect(cWM[c], ureg_scalar(ureg_src(AR), i));
498             }
499 
500             /* multiply by WORLD(index) */
501             ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), cWM[0]);
502             ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), cWM[1], ureg_src(tmp));
503             ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), cWM[2], ureg_src(tmp));
504             ureg_MAD(ureg, tmp, _WWWW(vs->aVtx), cWM[3], ureg_src(tmp));
505 
506             if (has_aNrm) {
507                 /* Note: the spec says the transpose of the inverse of the
508                  * WorldView matrices should be used, but all tests show
509                  * otherwise.
510                  * Only case unknown: D3DVBF_0WEIGHTS */
511                 ureg_MUL(ureg, tmp2, _XXXX(vs->aNrm), cWM[0]);
512                 ureg_MAD(ureg, tmp2, _YYYY(vs->aNrm), cWM[1], ureg_src(tmp2));
513                 ureg_MAD(ureg, tmp2, _ZZZZ(vs->aNrm), cWM[2], ureg_src(tmp2));
514             }
515 
516             if (i < (key->vertexblend - 1)) {
517                 /* accumulate weighted position value */
518                 ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(vs->aWgt, i), ureg_src(aVtx_dst));
519                 if (has_aNrm)
520                     ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(vs->aWgt, i), ureg_src(aNrm_dst));
521                 /* subtract weighted position value for last value */
522                 ureg_ADD(ureg, sum_blendweights, ureg_src(sum_blendweights), ureg_negate(ureg_scalar(vs->aWgt, i)));
523             }
524         }
525 
526         /* the last weighted position is always 1 - sum_of_previous_weights */
527         ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aVtx_dst));
528         if (has_aNrm)
529             ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aNrm_dst));
530 
531         /* multiply by VIEW_PROJ */
532         ureg_MUL(ureg, tmp, _X(aVtx_dst), _CONST(8));
533         ureg_MAD(ureg, tmp, _Y(aVtx_dst), _CONST(9),  ureg_src(tmp));
534         ureg_MAD(ureg, tmp, _Z(aVtx_dst), _CONST(10), ureg_src(tmp));
535         ureg_MAD(ureg, oPos, _W(aVtx_dst), _CONST(11), ureg_src(tmp));
536 
537         if (need_aVtx)
538             vs->aVtx = ureg_src(aVtx_dst);
539 
540         ureg_release_temporary(ureg, tmp);
541         ureg_release_temporary(ureg, tmp2);
542         ureg_release_temporary(ureg, sum_blendweights);
543         if (!need_aVtx)
544             ureg_release_temporary(ureg, aVtx_dst);
545 
546         if (has_aNrm) {
547             if (key->normalizenormals)
548                ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
549             vs->aNrm = ureg_src(aNrm_dst);
550         } else
551             ureg_release_temporary(ureg, aNrm_dst);
552     } else {
553         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
554 
555         if (key->vertextween) {
556             struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
557             ureg_LRP(ureg, aVtx_dst, _XXXX(_CONST(30)), vs->aVtx1, vs->aVtx);
558             vs->aVtx = ureg_src(aVtx_dst);
559             if (has_aNrm) {
560                 struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
561                 ureg_LRP(ureg, aNrm_dst, _XXXX(_CONST(30)), vs->aNrm1, vs->aNrm);
562                 vs->aNrm = ureg_src(aNrm_dst);
563             }
564         }
565 
566         /* position = vertex * WORLD_VIEW_PROJ */
567         ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(0));
568         ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(1), ureg_src(tmp));
569         ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(2), ureg_src(tmp));
570         ureg_MAD(ureg, oPos, _WWWW(vs->aVtx), _CONST(3), ureg_src(tmp));
571         ureg_release_temporary(ureg, tmp);
572 
573         if (need_aVtx) {
574             struct ureg_dst aVtx_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
575             ureg_MUL(ureg, aVtx_dst, _XXXX(vs->aVtx), _CONST(4));
576             ureg_MAD(ureg, aVtx_dst, _YYYY(vs->aVtx), _CONST(5), ureg_src(aVtx_dst));
577             ureg_MAD(ureg, aVtx_dst, _ZZZZ(vs->aVtx), _CONST(6), ureg_src(aVtx_dst));
578             ureg_MAD(ureg, aVtx_dst, _WWWW(vs->aVtx), _CONST(7), ureg_src(aVtx_dst));
579             vs->aVtx = ureg_src(aVtx_dst);
580         }
581         if (has_aNrm) {
582             struct ureg_dst aNrm_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
583             ureg_MUL(ureg, aNrm_dst, _XXXX(vs->aNrm), _CONST(16));
584             ureg_MAD(ureg, aNrm_dst, _YYYY(vs->aNrm), _CONST(17), ureg_src(aNrm_dst));
585             ureg_MAD(ureg, aNrm_dst, _ZZZZ(vs->aNrm), _CONST(18), ureg_src(aNrm_dst));
586             if (key->normalizenormals)
587                ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
588             vs->aNrm = ureg_src(aNrm_dst);
589         }
590     }
591 
592     /* === Process point size:
593      */
594     if (key->vertexpointsize || key->pointscale || device->driver_caps.always_output_pointsize) {
595         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
596         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
597         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
598         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
599         if (key->vertexpointsize) {
600             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
601             ureg_MAX(ureg, tmp_z, _XXXX(vs->aPsz), _XXXX(cPsz1));
602             ureg_MIN(ureg, tmp_z, _Z(tmp), _YYYY(cPsz1));
603         } else {
604             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
605             ureg_MOV(ureg, tmp_z, _ZZZZ(cPsz1));
606         }
607 
608         if (key->pointscale) {
609             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
610             struct ureg_src cPsz2 = ureg_DECL_constant(ureg, 27);
611 
612             ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
613             ureg_RSQ(ureg, tmp_y, _X(tmp));
614             ureg_MUL(ureg, tmp_y, _Y(tmp), _X(tmp));
615             ureg_CMP(ureg, tmp_y, ureg_negate(_Y(tmp)), _Y(tmp), ureg_imm1f(ureg, 0.0f));
616             ureg_MAD(ureg, tmp_x, _Y(tmp), _YYYY(cPsz2), _XXXX(cPsz2));
617             ureg_MAD(ureg, tmp_x, _Y(tmp), _X(tmp), _WWWW(cPsz1));
618             ureg_RSQ(ureg, tmp_x, _X(tmp));
619             ureg_MUL(ureg, tmp_x, _X(tmp), _Z(tmp));
620             ureg_MUL(ureg, tmp_x, _X(tmp), _WWWW(_CONST(100)));
621             ureg_MAX(ureg, tmp_x, _X(tmp), _XXXX(cPsz1));
622             ureg_MIN(ureg, tmp_z, _X(tmp), _YYYY(cPsz1));
623         }
624 
625         ureg_MOV(ureg, oPsz, _Z(tmp));
626         ureg_release_temporary(ureg, tmp);
627     }
628 
629     for (i = 0; i < 8; ++i) {
630         struct ureg_dst tmp, tmp_x, tmp2;
631         struct ureg_dst oTex, input_coord, transformed, t, aVtx_normed;
632         unsigned c, writemask;
633         const unsigned tci = (key->tc_gen >> (i * 3)) & 0x7;
634         const unsigned idx = (key->tc_idx >> (i * 3)) & 0x7;
635         unsigned dim_input = 1 + ((key->tc_dim_input >> (i * 2)) & 0x3);
636         const unsigned dim_output = (key->tc_dim_output >> (i * 3)) & 0x7;
637 
638         /* No texture output of index s */
639         if (tci == NINED3DTSS_TCI_DISABLE)
640             continue;
641         oTex = ureg_DECL_output(ureg, texcoord_sn, i);
642         tmp = ureg_DECL_temporary(ureg);
643         tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
644         input_coord = ureg_DECL_temporary(ureg);
645         transformed = ureg_DECL_temporary(ureg);
646 
647         /* Get the coordinate */
648         switch (tci) {
649         case NINED3DTSS_TCI_PASSTHRU:
650             /* NINED3DTSS_TCI_PASSTHRU => Use texcoord coming from index idx *
651              * Else the idx is used only to determine wrapping mode. */
652             vs->aTex[idx] = build_vs_add_input(vs, NINE_DECLUSAGE_i(TEXCOORD,idx));
653             ureg_MOV(ureg, input_coord, vs->aTex[idx]);
654             break;
655         case NINED3DTSS_TCI_CAMERASPACENORMAL:
656             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aNrm);
657             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
658             dim_input = 4;
659             break;
660         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
661             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aVtx);
662             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
663             dim_input = 4;
664             break;
665         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
666             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
667             aVtx_normed = ureg_DECL_temporary(ureg);
668             ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
669             ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
670             ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
671             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
672             ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
673             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
674             ureg_release_temporary(ureg, aVtx_normed);
675             dim_input = 4;
676             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
677             break;
678         case NINED3DTSS_TCI_SPHEREMAP:
679             /* Implement the formula of GL_SPHERE_MAP */
680             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
681             aVtx_normed = ureg_DECL_temporary(ureg);
682             tmp2 = ureg_DECL_temporary(ureg);
683             ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
684             ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
685             ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
686             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
687             ureg_ADD(ureg, tmp, ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
688             /* now tmp = normed(Vtx) - 2 dot3(normed(Vtx), Nrm) Nrm */
689             ureg_MOV(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_XYZ), ureg_src(tmp));
690             ureg_MUL(ureg, tmp2, ureg_src(tmp2), ureg_src(tmp2));
691             ureg_DP3(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_src(tmp2));
692             ureg_RSQ(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2));
693             ureg_MUL(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_imm1f(ureg, 0.5f));
694             /* tmp2 = 0.5 / sqrt(tmp.x^2 + tmp.y^2 + (tmp.z+1)^2)
695              * TODO: z coordinates are a bit different gl vs d3d, should the formula be adapted ? */
696             ureg_MUL(ureg, tmp, ureg_src(tmp), _X(tmp2));
697             ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, 0.5f));
698             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_ZW), ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
699             ureg_release_temporary(ureg, aVtx_normed);
700             ureg_release_temporary(ureg, tmp2);
701             dim_input = 4;
702             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
703             break;
704         default:
705             assert(0);
706             break;
707         }
708 
709         /* Apply the transformation */
710         /* dim_output == 0 => do not transform the components.
711          * XYZRHW also disables transformation */
712         if (!dim_output || key->position_t) {
713             ureg_release_temporary(ureg, transformed);
714             transformed = input_coord;
715             writemask = TGSI_WRITEMASK_XYZW;
716         } else {
717             for (c = 0; c < dim_output; c++) {
718                 t = ureg_writemask(transformed, 1 << c);
719                 switch (dim_input) {
720                 /* dim_input = 1 2 3: -> we add trailing 1 to input*/
721                 case 1: ureg_MAD(ureg, t, _X(input_coord), _XXXX(_CONST(128 + i * 4 + c)), _YYYY(_CONST(128 + i * 4 + c)));
722                         break;
723                 case 2: ureg_DP2(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
724                         ureg_ADD(ureg, t, ureg_src(transformed), _ZZZZ(_CONST(128 + i * 4 + c)));
725                         break;
726                 case 3: ureg_DP3(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
727                         ureg_ADD(ureg, t, ureg_src(transformed), _WWWW(_CONST(128 + i * 4 + c)));
728                         break;
729                 case 4: ureg_DP4(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c)); break;
730                 default:
731                     assert(0);
732                 }
733             }
734             writemask = (1 << dim_output) - 1;
735             ureg_release_temporary(ureg, input_coord);
736         }
737 
738         ureg_MOV(ureg, ureg_writemask(oTex, writemask), ureg_src(transformed));
739         ureg_release_temporary(ureg, transformed);
740         ureg_release_temporary(ureg, tmp);
741     }
742 
743     /* === Lighting:
744      *
745      * DIRECTIONAL:  Light at infinite distance, parallel rays, no attenuation.
746      * POINT: Finite distance to scene, divergent rays, isotropic, attenuation.
747      * SPOT: Finite distance, divergent rays, angular dependence, attenuation.
748      *
749      * vec3 normal = normalize(in.Normal * NormalMatrix);
750      * vec3 hitDir = light.direction;
751      * float atten = 1.0;
752      *
753      * if (light.type != DIRECTIONAL)
754      * {
755      *     vec3 hitVec = light.position - eyeVertex;
756      *     float d = length(hitVec);
757      *     hitDir = hitVec / d;
758      *     atten = 1 / ((light.atten2 * d + light.atten1) * d + light.atten0);
759      * }
760      *
761      * if (light.type == SPOTLIGHT)
762      * {
763      *     float rho = dp3(-hitVec, light.direction);
764      *     if (rho < cos(light.phi / 2))
765      *         atten = 0;
766      *     if (rho < cos(light.theta / 2))
767      *         atten *= pow(some_func(rho), light.falloff);
768      * }
769      *
770      * float nDotHit = dp3_sat(normal, hitVec);
771      * float powFact = 0.0;
772      *
773      * if (nDotHit > 0.0)
774      * {
775      *     vec3 midVec = normalize(hitDir + eye);
776      *     float nDotMid = dp3_sat(normal, midVec);
777      *     powFact = pow(nDotMid, material.power);
778      * }
779      *
780      * ambient += light.ambient * atten;
781      * diffuse += light.diffuse * atten * nDotHit;
782      * specular += light.specular * atten * powFact;
783      */
784     if (key->lighting) {
785         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
786         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
787         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
788         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
789         struct ureg_dst rAtt = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
790         struct ureg_dst rHit = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
791         struct ureg_dst rMid = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
792 
793         struct ureg_dst rCtr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
794 
795         struct ureg_dst AL = ureg_writemask(AR, TGSI_WRITEMASK_X);
796 
797         /* Light.*.Alpha is not used. */
798         struct ureg_dst rD = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
799         struct ureg_dst rA = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
800         struct ureg_dst rS = ureg_DECL_temporary(ureg);
801 
802         struct ureg_src mtlP = _XXXX(MATERIAL_CONST(4));
803 
804         struct ureg_src cLKind = _XXXX(LIGHT_CONST(0));
805         struct ureg_src cLAtt0 = _YYYY(LIGHT_CONST(0));
806         struct ureg_src cLAtt1 = _ZZZZ(LIGHT_CONST(0));
807         struct ureg_src cLAtt2 = _WWWW(LIGHT_CONST(0));
808         struct ureg_src cLColD = _XYZW(LIGHT_CONST(1));
809         struct ureg_src cLColS = _XYZW(LIGHT_CONST(2));
810         struct ureg_src cLColA = _XYZW(LIGHT_CONST(3));
811         struct ureg_src cLPos  = _XYZW(LIGHT_CONST(4));
812         struct ureg_src cLRng  = _WWWW(LIGHT_CONST(4));
813         struct ureg_src cLDir  = _XYZW(LIGHT_CONST(5));
814         struct ureg_src cLFOff = _WWWW(LIGHT_CONST(5));
815         struct ureg_src cLTht  = _XXXX(LIGHT_CONST(6));
816         struct ureg_src cLPhi  = _YYYY(LIGHT_CONST(6));
817         struct ureg_src cLSDiv = _ZZZZ(LIGHT_CONST(6));
818         struct ureg_src cLLast = _WWWW(LIGHT_CONST(7));
819 
820         const unsigned loop_label = l++;
821 
822         /* Declare all light constants to allow indirect addressing */
823         for (i = 32; i < 96; i++)
824             ureg_DECL_constant(ureg, i);
825 
826         ureg_MOV(ureg, rCtr, ureg_imm1f(ureg, 32.0f)); /* &lightconst(0) */
827         ureg_MOV(ureg, rD, ureg_imm1f(ureg, 0.0f));
828         ureg_MOV(ureg, rA, ureg_imm1f(ureg, 0.0f));
829         ureg_MOV(ureg, rS, ureg_imm1f(ureg, 0.0f));
830 
831         /* loop management */
832         ureg_BGNLOOP(ureg, &label[loop_label]);
833         ureg_ARL(ureg, AL, _W(rCtr));
834 
835         /* if (not DIRECTIONAL light): */
836         ureg_SNE(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_DIRECTIONAL));
837         ureg_MOV(ureg, rHit, ureg_negate(cLDir));
838         ureg_MOV(ureg, rAtt, ureg_imm1f(ureg, 1.0f));
839         ureg_IF(ureg, _X(tmp), &label[l++]);
840         {
841             /* hitDir = light.position - eyeVtx
842              * d = length(hitDir)
843              */
844             ureg_ADD(ureg, rHit, cLPos, ureg_negate(vs->aVtx));
845             ureg_DP3(ureg, tmp_x, ureg_src(rHit), ureg_src(rHit));
846             ureg_RSQ(ureg, tmp_y, _X(tmp));
847             ureg_MUL(ureg, tmp_x, _X(tmp), _Y(tmp)); /* length */
848 
849             /* att = 1.0 / (light.att0 + (light.att1 + light.att2 * d) * d) */
850             ureg_MAD(ureg, rAtt, _X(tmp), cLAtt2, cLAtt1);
851             ureg_MAD(ureg, rAtt, _X(tmp), _W(rAtt), cLAtt0);
852             ureg_RCP(ureg, rAtt, _W(rAtt));
853             /* cut-off if distance exceeds Light.Range */
854             ureg_SLT(ureg, tmp_x, _X(tmp), cLRng);
855             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
856         }
857         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
858         ureg_ENDIF(ureg);
859 
860         /* normalize hitDir */
861         ureg_normalize3(ureg, rHit, ureg_src(rHit));
862 
863         /* if (SPOT light) */
864         ureg_SEQ(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_SPOT));
865         ureg_IF(ureg, _X(tmp), &label[l++]);
866         {
867             /* rho = dp3(-hitDir, light.spotDir)
868              *
869              * if (rho  > light.ctht2) NOTE: 0 <= phi <= pi, 0 <= theta <= phi
870              *     spotAtt = 1
871              * else
872              * if (rho <= light.cphi2)
873              *     spotAtt = 0
874              * else
875              *     spotAtt = (rho - light.cphi2) / (light.ctht2 - light.cphi2) ^ light.falloff
876              */
877             ureg_DP3(ureg, tmp_y, ureg_negate(ureg_src(rHit)), cLDir); /* rho */
878             ureg_ADD(ureg, tmp_x, _Y(tmp), ureg_negate(cLPhi));
879             ureg_MUL(ureg, tmp_x, _X(tmp), cLSDiv);
880             ureg_POW(ureg, tmp_x, _X(tmp), cLFOff); /* spotAtten */
881             ureg_SGE(ureg, tmp_z, _Y(tmp), cLTht); /* if inside theta && phi */
882             ureg_SGE(ureg, tmp_y, _Y(tmp), cLPhi); /* if inside phi */
883             ureg_MAD(ureg, ureg_saturate(tmp_x), _X(tmp), _Y(tmp), _Z(tmp));
884             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
885         }
886         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
887         ureg_ENDIF(ureg);
888 
889         /* directional factors, let's not use LIT because of clarity */
890 
891         if (has_aNrm) {
892             if (key->localviewer) {
893                 ureg_normalize3(ureg, rMid, vs->aVtx);
894                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_negate(ureg_src(rMid)));
895             } else {
896                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_imm3f(ureg, 0.0f, 0.0f, -1.0f));
897             }
898             ureg_normalize3(ureg, rMid, ureg_src(rMid));
899             ureg_DP3(ureg, ureg_saturate(tmp_x), vs->aNrm, ureg_src(rHit));
900             ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
901             ureg_MUL(ureg, tmp_z, _X(tmp), _Y(tmp));
902             /* Tests show that specular is computed only if (dp3(normal,hitDir) > 0).
903              * For front facing, it is more restrictive than test (dp3(normal,mid) > 0).
904              * No tests were made for backfacing, so add the two conditions */
905             ureg_IF(ureg, _Z(tmp), &label[l++]);
906             {
907                 ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
908                 ureg_POW(ureg, tmp_y, _Y(tmp), mtlP);
909                 ureg_MUL(ureg, tmp_y, _W(rAtt), _Y(tmp)); /* power factor * att */
910                 ureg_MAD(ureg, rS, cLColS, _Y(tmp), ureg_src(rS)); /* accumulate specular */
911             }
912             ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
913             ureg_ENDIF(ureg);
914 
915             ureg_MUL(ureg, tmp_x, _W(rAtt), _X(tmp)); /* dp3(normal,hitDir) * att */
916             ureg_MAD(ureg, rD, cLColD, _X(tmp), ureg_src(rD)); /* accumulate diffuse */
917         }
918 
919         ureg_MAD(ureg, rA, cLColA, _W(rAtt), ureg_src(rA)); /* accumulate ambient */
920 
921         /* break if this was the last light */
922         ureg_IF(ureg, cLLast, &label[l++]);
923         ureg_BRK(ureg);
924         ureg_ENDIF(ureg);
925         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
926 
927         ureg_ADD(ureg, rCtr, _W(rCtr), ureg_imm1f(ureg, 8.0f));
928         ureg_fixup_label(ureg, label[loop_label], ureg_get_instruction_number(ureg));
929         ureg_ENDLOOP(ureg, &label[loop_label]);
930 
931         /* Apply to material:
932          *
933          * oCol[0] = (material.emissive + material.ambient * rs.ambient) +
934          *           material.ambient * ambient +
935          *           material.diffuse * diffuse +
936          * oCol[1] = material.specular * specular;
937          */
938         if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
939             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), vs->mtlA, _CONST(19));
940         else {
941             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), _CONST(25));
942             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), vs->mtlA, ureg_src(tmp), vs->mtlE);
943         }
944 
945         ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), ureg_src(rD), vs->mtlD, ureg_src(tmp));
946         ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
947         ureg_MUL(ureg, oCol[1], ureg_src(rS), vs->mtlS);
948         ureg_release_temporary(ureg, rAtt);
949         ureg_release_temporary(ureg, rHit);
950         ureg_release_temporary(ureg, rMid);
951         ureg_release_temporary(ureg, rCtr);
952         ureg_release_temporary(ureg, rD);
953         ureg_release_temporary(ureg, rA);
954         ureg_release_temporary(ureg, rS);
955         ureg_release_temporary(ureg, rAtt);
956         ureg_release_temporary(ureg, tmp);
957     } else
958     /* COLOR */
959     if (key->darkness) {
960         if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
961             ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), _CONST(19));
962         else
963             ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), vs->mtlA, _CONST(25), vs->mtlE);
964         ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
965         ureg_MOV(ureg, oCol[1], ureg_imm1f(ureg, 0.0f));
966     } else {
967         ureg_MOV(ureg, oCol[0], vs->aCol[0]);
968         ureg_MOV(ureg, oCol[1], vs->aCol[1]);
969     }
970 
971     /* === Process fog.
972      *
973      * exp(x) = ex2(log2(e) * x)
974      */
975     if (key->fog_mode) {
976         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
977         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
978         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
979         if (key->fog_range) {
980             ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
981             ureg_RSQ(ureg, tmp_z, _X(tmp));
982             ureg_MUL(ureg, tmp_z, _Z(tmp), _X(tmp));
983         } else {
984             ureg_MOV(ureg, tmp_z, ureg_abs(_ZZZZ(vs->aVtx)));
985         }
986 
987         if (key->fog_mode == D3DFOG_EXP) {
988             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
989             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
990             ureg_EX2(ureg, tmp_x, _X(tmp));
991         } else
992         if (key->fog_mode == D3DFOG_EXP2) {
993             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
994             ureg_MUL(ureg, tmp_x, _X(tmp), _X(tmp));
995             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
996             ureg_EX2(ureg, tmp_x, _X(tmp));
997         } else
998         if (key->fog_mode == D3DFOG_LINEAR) {
999             ureg_ADD(ureg, tmp_x, _XXXX(_CONST(28)), ureg_negate(_Z(tmp)));
1000             ureg_MUL(ureg, ureg_saturate(tmp_x), _X(tmp), _YYYY(_CONST(28)));
1001         }
1002         ureg_MOV(ureg, oFog, _X(tmp));
1003         ureg_release_temporary(ureg, tmp);
1004     } else if (key->fog && !(key->passthrough & (1 << NINE_DECLUSAGE_FOG))) {
1005         ureg_MOV(ureg, oFog, ureg_scalar(vs->aCol[1], TGSI_SWIZZLE_W));
1006     }
1007 
1008     if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT)) {
1009         struct ureg_src input;
1010         struct ureg_dst output;
1011         input = vs->aWgt;
1012         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 19);
1013         ureg_MOV(ureg, output, input);
1014     }
1015     if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES)) {
1016         struct ureg_src input;
1017         struct ureg_dst output;
1018         input = vs->aInd;
1019         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 20);
1020         ureg_MOV(ureg, output, input);
1021     }
1022     if (key->passthrough & (1 << NINE_DECLUSAGE_NORMAL)) {
1023         struct ureg_src input;
1024         struct ureg_dst output;
1025         input = vs->aNrm;
1026         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 21);
1027         ureg_MOV(ureg, output, input);
1028     }
1029     if (key->passthrough & (1 << NINE_DECLUSAGE_TANGENT)) {
1030         struct ureg_src input;
1031         struct ureg_dst output;
1032         input = build_vs_add_input(vs, NINE_DECLUSAGE_TANGENT);
1033         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 22);
1034         ureg_MOV(ureg, output, input);
1035     }
1036     if (key->passthrough & (1 << NINE_DECLUSAGE_BINORMAL)) {
1037         struct ureg_src input;
1038         struct ureg_dst output;
1039         input = build_vs_add_input(vs, NINE_DECLUSAGE_BINORMAL);
1040         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 23);
1041         ureg_MOV(ureg, output, input);
1042     }
1043     if (key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
1044         struct ureg_src input;
1045         struct ureg_dst output;
1046         input = build_vs_add_input(vs, NINE_DECLUSAGE_FOG);
1047         input = ureg_scalar(input, TGSI_SWIZZLE_X);
1048         output = oFog;
1049         ureg_MOV(ureg, output, input);
1050     }
1051     if (key->passthrough & (1 << NINE_DECLUSAGE_DEPTH)) {
1052         (void) 0; /* TODO: replace z of position output ? */
1053     }
1054 
1055     /* ucp for ff applies on world coordinates.
1056      * aVtx is in worldview coordinates. */
1057     if (key->ucp) {
1058         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1059         ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(12));
1060         ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(13),  ureg_src(tmp));
1061         ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(14), ureg_src(tmp));
1062         if (!key->clipplane_emulate) {
1063             struct ureg_dst clipVect = ureg_DECL_output(ureg, TGSI_SEMANTIC_CLIPVERTEX, 0);
1064             ureg_ADD(ureg, clipVect, _CONST(15), ureg_src(tmp));
1065         } else {
1066             struct ureg_dst clipdist[2] = {ureg_dst_undef(), ureg_dst_undef()};
1067             int num_clipdist = ffs(key->clipplane_emulate);
1068             ureg_ADD(ureg, tmp, _CONST(15), ureg_src(tmp));
1069             clipdist[0] = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_CLIPDIST, 0,
1070                                                       ((1 << num_clipdist) - 1) & 0xf, 0, 1);
1071             if (num_clipdist >= 5)
1072                 clipdist[1] = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_CLIPDIST, 1,
1073                                                       ((1 << (num_clipdist - 4)) - 1) & 0xf, 0, 1);
1074             ureg_property(ureg, TGSI_PROPERTY_NUM_CLIPDIST_ENABLED, num_clipdist);
1075             for (i = 0; i < num_clipdist; i++) {
1076                 assert(!ureg_dst_is_undef(clipdist[i>>2]));
1077                 if (!(key->clipplane_emulate & (1 << i)))
1078                     ureg_MOV(ureg, ureg_writemask(clipdist[i>>2], 1 << (i & 0x2)), ureg_imm1f(ureg, 0.f));
1079                 else
1080                     ureg_DP4(ureg, ureg_writemask(clipdist[i>>2], 1 << (i & 0x2)),
1081                              ureg_src(tmp), _CONST(196+i));
1082             }
1083         }
1084         ureg_release_temporary(ureg, tmp);
1085     }
1086 
1087     if (key->position_t && device->driver_caps.window_space_position_support)
1088         ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true);
1089 
1090     ureg_END(ureg);
1091     nine_ureg_tgsi_dump(ureg, false);
1092     return nine_create_shader_with_so_and_destroy(ureg, device->context.pipe, NULL);
1093 }
1094 
1095 /* PS FF constants layout:
1096  *
1097  * CONST[ 0.. 7]      stage[i].D3DTSS_CONSTANT
1098  * CONST[ 8..15].x___ stage[i].D3DTSS_BUMPENVMAT00
1099  * CONST[ 8..15]._y__ stage[i].D3DTSS_BUMPENVMAT01
1100  * CONST[ 8..15].__z_ stage[i].D3DTSS_BUMPENVMAT10
1101  * CONST[ 8..15].___w stage[i].D3DTSS_BUMPENVMAT11
1102  * CONST[16..19].x_z_ stage[i].D3DTSS_BUMPENVLSCALE
1103  * CONST[16..19]._y_w stage[i].D3DTSS_BUMPENVLOFFSET
1104  *
1105  * CONST[20] D3DRS_TEXTUREFACTOR
1106  * CONST[21] D3DRS_FOGCOLOR
1107  * CONST[22].x___ RS.FogEnd
1108  * CONST[22]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
1109  * CONST[22].__z_ RS.FogDensity
1110  * CONST[22].___w Alpha ref
1111  */
1112 struct ps_build_ctx
1113 {
1114     struct ureg_program *ureg;
1115     unsigned color_interpolate_flag;
1116 
1117     struct ureg_src vC[2]; /* DIFFUSE, SPECULAR */
1118     struct ureg_src vT[8]; /* TEXCOORD[i] */
1119     struct ureg_dst rCur; /* D3DTA_CURRENT */
1120     struct ureg_dst rMod;
1121     struct ureg_src rCurSrc;
1122     struct ureg_dst rTmp; /* D3DTA_TEMP */
1123     struct ureg_src rTmpSrc;
1124     struct ureg_dst rTex;
1125     struct ureg_src rTexSrc;
1126     struct ureg_src cBEM[8];
1127     struct ureg_src s[8];
1128 
1129     struct {
1130         unsigned index;
1131         unsigned index_pre_mod;
1132     } stage;
1133 };
1134 
1135 static struct ureg_src
ps_get_ts_arg(struct ps_build_ctx * ps,unsigned ta)1136 ps_get_ts_arg(struct ps_build_ctx *ps, unsigned ta)
1137 {
1138     struct ureg_src reg;
1139 
1140     switch (ta & D3DTA_SELECTMASK) {
1141     case D3DTA_CONSTANT:
1142         reg = ureg_DECL_constant(ps->ureg, ps->stage.index);
1143         break;
1144     case D3DTA_CURRENT:
1145         reg = (ps->stage.index == ps->stage.index_pre_mod) ? ureg_src(ps->rMod) : ps->rCurSrc;
1146         break;
1147     case D3DTA_DIFFUSE:
1148         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 0, ps->color_interpolate_flag);
1149         break;
1150     case D3DTA_SPECULAR:
1151         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 1, ps->color_interpolate_flag);
1152         break;
1153     case D3DTA_TEMP:
1154         reg = ps->rTmpSrc;
1155         break;
1156     case D3DTA_TEXTURE:
1157         reg = ps->rTexSrc;
1158         break;
1159     case D3DTA_TFACTOR:
1160         reg = ureg_DECL_constant(ps->ureg, 20);
1161         break;
1162     default:
1163         assert(0);
1164         reg = ureg_src_undef();
1165         break;
1166     }
1167     if (ta & D3DTA_COMPLEMENT) {
1168         struct ureg_dst dst = ureg_DECL_temporary(ps->ureg);
1169         ureg_ADD(ps->ureg, dst, ureg_imm1f(ps->ureg, 1.0f), ureg_negate(reg));
1170         reg = ureg_src(dst);
1171     }
1172     if (ta & D3DTA_ALPHAREPLICATE)
1173         reg = _WWWW(reg);
1174     return reg;
1175 }
1176 
1177 static struct ureg_dst
ps_get_ts_dst(struct ps_build_ctx * ps,unsigned ta)1178 ps_get_ts_dst(struct ps_build_ctx *ps, unsigned ta)
1179 {
1180     assert(!(ta & (D3DTA_COMPLEMENT | D3DTA_ALPHAREPLICATE)));
1181 
1182     switch (ta & D3DTA_SELECTMASK) {
1183     case D3DTA_CURRENT:
1184         return ps->rCur;
1185     case D3DTA_TEMP:
1186         return ps->rTmp;
1187     default:
1188         assert(0);
1189         return ureg_dst_undef();
1190     }
1191 }
1192 
/* Return which stage arguments a texture op reads, as a bitmask:
 * bit 0 = arg0, bit 1 = arg1, bit 2 = arg2.
 * Ops not listed explicitly take arg1 and arg2.
 */
static uint8_t ps_d3dtop_args_mask(D3DTEXTUREOP top)
{
    uint8_t used;

    switch (top) {
    case D3DTOP_DISABLE:
        used = 0x0;
        break;
    case D3DTOP_SELECTARG1:
    case D3DTOP_PREMODULATE:
        used = 0x2;
        break;
    case D3DTOP_SELECTARG2:
        used = 0x4;
        break;
    case D3DTOP_MULTIPLYADD:
    case D3DTOP_LERP:
        used = 0x7;
        break;
    default:
        used = 0x6;
        break;
    }

    return used;
}
1210 
1211 static inline bool
is_MOV_no_op(struct ureg_dst dst,struct ureg_src src)1212 is_MOV_no_op(struct ureg_dst dst, struct ureg_src src)
1213 {
1214     return !dst.WriteMask ||
1215         (dst.File == src.File &&
1216          dst.Index == src.Index &&
1217          !dst.Indirect &&
1218          !dst.Saturate &&
1219          !src.Indirect &&
1220          !src.Negate &&
1221          !src.Absolute &&
1222          (!(dst.WriteMask & TGSI_WRITEMASK_X) || (src.SwizzleX == TGSI_SWIZZLE_X)) &&
1223          (!(dst.WriteMask & TGSI_WRITEMASK_Y) || (src.SwizzleY == TGSI_SWIZZLE_Y)) &&
1224          (!(dst.WriteMask & TGSI_WRITEMASK_Z) || (src.SwizzleZ == TGSI_SWIZZLE_Z)) &&
1225          (!(dst.WriteMask & TGSI_WRITEMASK_W) || (src.SwizzleW == TGSI_SWIZZLE_W)));
1226 
1227 }
1228 
1229 static void
ps_do_ts_op(struct ps_build_ctx * ps,unsigned top,struct ureg_dst dst,struct ureg_src * arg)1230 ps_do_ts_op(struct ps_build_ctx *ps, unsigned top, struct ureg_dst dst, struct ureg_src *arg)
1231 {
1232     struct ureg_program *ureg = ps->ureg;
1233     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1234     struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
1235     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
1236 
1237     tmp.WriteMask = dst.WriteMask;
1238 
1239     if (top != D3DTOP_SELECTARG1 && top != D3DTOP_SELECTARG2 &&
1240         top != D3DTOP_MODULATE && top != D3DTOP_PREMODULATE &&
1241         top != D3DTOP_BLENDDIFFUSEALPHA && top != D3DTOP_BLENDTEXTUREALPHA &&
1242         top != D3DTOP_BLENDFACTORALPHA && top != D3DTOP_BLENDCURRENTALPHA &&
1243         top != D3DTOP_BUMPENVMAP && top != D3DTOP_BUMPENVMAPLUMINANCE &&
1244         top != D3DTOP_LERP)
1245         dst = ureg_saturate(dst);
1246 
1247     switch (top) {
1248     case D3DTOP_SELECTARG1:
1249         if (!is_MOV_no_op(dst, arg[1]))
1250             ureg_MOV(ureg, dst, arg[1]);
1251         break;
1252     case D3DTOP_SELECTARG2:
1253         if (!is_MOV_no_op(dst, arg[2]))
1254             ureg_MOV(ureg, dst, arg[2]);
1255         break;
1256     case D3DTOP_MODULATE:
1257         ureg_MUL(ureg, dst, arg[1], arg[2]);
1258         break;
1259     case D3DTOP_MODULATE2X:
1260         ureg_MUL(ureg, tmp, arg[1], arg[2]);
1261         ureg_ADD(ureg, dst, ureg_src(tmp), ureg_src(tmp));
1262         break;
1263     case D3DTOP_MODULATE4X:
1264         ureg_MUL(ureg, tmp, arg[1], arg[2]);
1265         ureg_MUL(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 4.0f));
1266         break;
1267     case D3DTOP_ADD:
1268         ureg_ADD(ureg, dst, arg[1], arg[2]);
1269         break;
1270     case D3DTOP_ADDSIGNED:
1271         ureg_ADD(ureg, tmp, arg[1], arg[2]);
1272         ureg_ADD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, -0.5f));
1273         break;
1274     case D3DTOP_ADDSIGNED2X:
1275         ureg_ADD(ureg, tmp, arg[1], arg[2]);
1276         ureg_MAD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 2.0f), ureg_imm1f(ureg, -1.0f));
1277         break;
1278     case D3DTOP_SUBTRACT:
1279         ureg_ADD(ureg, dst, arg[1], ureg_negate(arg[2]));
1280         break;
1281     case D3DTOP_ADDSMOOTH:
1282         ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
1283         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], arg[1]);
1284         break;
1285     case D3DTOP_BLENDDIFFUSEALPHA:
1286         ureg_LRP(ureg, dst, _WWWW(ps->vC[0]), arg[1], arg[2]);
1287         break;
1288     case D3DTOP_BLENDTEXTUREALPHA:
1289         /* XXX: alpha taken from previous stage, texture or result ? */
1290         ureg_LRP(ureg, dst, _W(ps->rTex), arg[1], arg[2]);
1291         break;
1292     case D3DTOP_BLENDFACTORALPHA:
1293         ureg_LRP(ureg, dst, _WWWW(_CONST(20)), arg[1], arg[2]);
1294         break;
1295     case D3DTOP_BLENDTEXTUREALPHAPM:
1296         ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_W(ps->rTex)));
1297         ureg_MAD(ureg, dst, arg[2], _X(tmp), arg[1]);
1298         break;
1299     case D3DTOP_BLENDCURRENTALPHA:
1300         ureg_LRP(ureg, dst, _WWWW(ps->rCurSrc), arg[1], arg[2]);
1301         break;
1302     case D3DTOP_PREMODULATE:
1303         ureg_MOV(ureg, dst, arg[1]);
1304         ps->stage.index_pre_mod = ps->stage.index + 1;
1305         break;
1306     case D3DTOP_MODULATEALPHA_ADDCOLOR:
1307         ureg_MAD(ureg, dst, _WWWW(arg[1]), arg[2], arg[1]);
1308         break;
1309     case D3DTOP_MODULATECOLOR_ADDALPHA:
1310         ureg_MAD(ureg, dst, arg[1], arg[2], _WWWW(arg[1]));
1311         break;
1312     case D3DTOP_MODULATEINVALPHA_ADDCOLOR:
1313         ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_WWWW(arg[1])));
1314         ureg_MAD(ureg, dst, _X(tmp), arg[2], arg[1]);
1315         break;
1316     case D3DTOP_MODULATEINVCOLOR_ADDALPHA:
1317         ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
1318         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], _WWWW(arg[1]));
1319         break;
1320     case D3DTOP_BUMPENVMAP:
1321         break;
1322     case D3DTOP_BUMPENVMAPLUMINANCE:
1323         break;
1324     case D3DTOP_DOTPRODUCT3:
1325         ureg_ADD(ureg, tmp, arg[1], ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
1326         ureg_ADD(ureg, tmp2, arg[2] , ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
1327         ureg_DP3(ureg, tmp, ureg_src(tmp), ureg_src(tmp2));
1328         ureg_MUL(ureg, ureg_saturate(dst), ureg_src(tmp), ureg_imm4f(ureg,4.0,4.0,4.0,4.0));
1329         break;
1330     case D3DTOP_MULTIPLYADD:
1331         ureg_MAD(ureg, dst, arg[1], arg[2], arg[0]);
1332         break;
1333     case D3DTOP_LERP:
1334         ureg_LRP(ureg, dst, arg[0], arg[1], arg[2]);
1335         break;
1336     case D3DTOP_DISABLE:
1337         /* no-op ? */
1338         break;
1339     default:
1340         assert(!"invalid D3DTOP");
1341         break;
1342     }
1343     ureg_release_temporary(ureg, tmp);
1344     ureg_release_temporary(ureg, tmp2);
1345 }
1346 
1347 static void *
nine_ff_build_ps(struct NineDevice9 * device,struct nine_ff_ps_key * key)1348 nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
1349 {
1350     struct ps_build_ctx ps;
1351     struct ureg_program *ureg = ureg_create(PIPE_SHADER_FRAGMENT);
1352     struct ureg_dst oCol;
1353     unsigned s;
1354     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
1355 
1356     memset(&ps, 0, sizeof(ps));
1357     ps.ureg = ureg;
1358     ps.color_interpolate_flag = key->flatshade ? TGSI_INTERPOLATE_CONSTANT : TGSI_INTERPOLATE_PERSPECTIVE;
1359     ps.stage.index_pre_mod = -1;
1360 
1361     ps.vC[0] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, ps.color_interpolate_flag);
1362 
1363     ps.rCur = ureg_DECL_temporary(ureg);
1364     ps.rTmp = ureg_DECL_temporary(ureg);
1365     ps.rTex = ureg_DECL_temporary(ureg);
1366     ps.rCurSrc = ureg_src(ps.rCur);
1367     ps.rTmpSrc = ureg_src(ps.rTmp);
1368     ps.rTexSrc = ureg_src(ps.rTex);
1369 
1370     /* Initial values */
1371     ureg_MOV(ureg, ps.rCur, ps.vC[0]);
1372     ureg_MOV(ureg, ps.rTmp, ureg_imm1f(ureg, 0.0f));
1373     ureg_MOV(ureg, ps.rTex, ureg_imm1f(ureg, 0.0f));
1374 
1375     for (s = 0; s < 8; ++s) {
1376         ps.s[s] = ureg_src_undef();
1377 
1378         if (key->ts[s].colorop != D3DTOP_DISABLE) {
1379             if (key->ts[s].colorarg0 == D3DTA_SPECULAR ||
1380                 key->ts[s].colorarg1 == D3DTA_SPECULAR ||
1381                 key->ts[s].colorarg2 == D3DTA_SPECULAR)
1382                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, ps.color_interpolate_flag);
1383 
1384             if (key->ts[s].colorarg0 == D3DTA_TEXTURE ||
1385                 key->ts[s].colorarg1 == D3DTA_TEXTURE ||
1386                 key->ts[s].colorarg2 == D3DTA_TEXTURE ||
1387                 key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHA ||
1388                 key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHAPM) {
1389                 ps.s[s] = ureg_DECL_sampler(ureg, s);
1390                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
1391             }
1392             if (s && (key->ts[s - 1].colorop == D3DTOP_PREMODULATE ||
1393                       key->ts[s - 1].alphaop == D3DTOP_PREMODULATE))
1394                 ps.s[s] = ureg_DECL_sampler(ureg, s);
1395         }
1396 
1397         if (key->ts[s].alphaop != D3DTOP_DISABLE) {
1398             if (key->ts[s].alphaarg0 == D3DTA_SPECULAR ||
1399                 key->ts[s].alphaarg1 == D3DTA_SPECULAR ||
1400                 key->ts[s].alphaarg2 == D3DTA_SPECULAR)
1401                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, ps.color_interpolate_flag);
1402 
1403             if (key->ts[s].alphaarg0 == D3DTA_TEXTURE ||
1404                 key->ts[s].alphaarg1 == D3DTA_TEXTURE ||
1405                 key->ts[s].alphaarg2 == D3DTA_TEXTURE ||
1406                 key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHA ||
1407                 key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHAPM) {
1408                 ps.s[s] = ureg_DECL_sampler(ureg, s);
1409                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
1410             }
1411         }
1412     }
1413     if (key->specular)
1414         ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, ps.color_interpolate_flag);
1415 
1416     oCol = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
1417 
1418     /* Run stages.
1419      */
1420     for (s = 0; s < 8; ++s) {
1421         unsigned colorarg[3];
1422         unsigned alphaarg[3];
1423         const uint8_t used_c = ps_d3dtop_args_mask(key->ts[s].colorop);
1424         const uint8_t used_a = ps_d3dtop_args_mask(key->ts[s].alphaop);
1425         struct ureg_dst dst;
1426         struct ureg_src arg[3];
1427 
1428         if (key->ts[s].colorop == D3DTOP_DISABLE) {
1429             assert (key->ts[s].alphaop == D3DTOP_DISABLE);
1430             continue;
1431         }
1432         ps.stage.index = s;
1433 
1434         DBG("STAGE[%u]: colorop=%s alphaop=%s\n", s,
1435             nine_D3DTOP_to_str(key->ts[s].colorop),
1436             nine_D3DTOP_to_str(key->ts[s].alphaop));
1437 
1438         if (!ureg_src_is_undef(ps.s[s])) {
1439             unsigned target;
1440             struct ureg_src texture_coord = ps.vT[s];
1441             struct ureg_dst delta;
1442             switch (key->ts[s].textarget) {
1443             case 0: target = TGSI_TEXTURE_1D; break;
1444             case 1: target = TGSI_TEXTURE_2D; break;
1445             case 2: target = TGSI_TEXTURE_3D; break;
1446             case 3: target = TGSI_TEXTURE_CUBE; break;
1447             /* this is a 2 bit bitfield, do I really need a default case ? */
1448             }
1449 
1450             /* Modify coordinates */
1451             if (s >= 1 &&
1452                 (key->ts[s-1].colorop == D3DTOP_BUMPENVMAP ||
1453                  key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)) {
1454                 delta = ureg_DECL_temporary(ureg);
1455                 /* Du' = D3DTSS_BUMPENVMAT00(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT10(stage s-1)*t(s-1)G */
1456                 ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _X(ps.rTex), _XXXX(_CONST(8 + s - 1)));
1457                 ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _Y(ps.rTex), _ZZZZ(_CONST(8 + s - 1)), ureg_src(delta));
1458                 /* Dv' = D3DTSS_BUMPENVMAT01(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT11(stage s-1)*t(s-1)G */
1459                 ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _X(ps.rTex), _YYYY(_CONST(8 + s - 1)));
1460                 ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _Y(ps.rTex), _WWWW(_CONST(8 + s - 1)), ureg_src(delta));
1461                 texture_coord = ureg_src(ureg_DECL_temporary(ureg));
1462                 ureg_MOV(ureg, ureg_writemask(ureg_dst(texture_coord), ureg_dst(ps.vT[s]).WriteMask), ps.vT[s]);
1463                 ureg_ADD(ureg, ureg_writemask(ureg_dst(texture_coord), TGSI_WRITEMASK_XY), texture_coord, ureg_src(delta));
1464                 /* Prepare luminance multiplier
1465                  * t(s)RGBA = t(s)RGBA * clamp[(t(s-1)B * D3DTSS_BUMPENVLSCALE(stage s-1)) + D3DTSS_BUMPENVLOFFSET(stage s-1)] */
1466                 if (key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE) {
1467                     struct ureg_src bumpenvlscale = ((s-1) & 1) ? _ZZZZ(_CONST(16 + (s-1) / 2)) : _XXXX(_CONST(16 + (s-1) / 2));
1468                     struct ureg_src bumpenvloffset = ((s-1) & 1) ? _WWWW(_CONST(16 + (s-1) / 2)) : _YYYY(_CONST(16 + (s-1) / 2));
1469 
1470                     ureg_MAD(ureg, ureg_saturate(ureg_writemask(delta, TGSI_WRITEMASK_X)), _Z(ps.rTex), bumpenvlscale, bumpenvloffset);
1471                 }
1472             }
1473             if (key->projected & (3 << (s *2))) {
1474                 unsigned dim = 1 + ((key->projected >> (2 * s)) & 3);
1475                 if (dim == 4)
1476                     ureg_TXP(ureg, ps.rTex, target, texture_coord, ps.s[s]);
1477                 else {
1478                     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1479                     ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(texture_coord, dim-1));
1480                     ureg_MUL(ureg, ps.rTmp, _X(tmp), texture_coord);
1481                     ureg_TEX(ureg, ps.rTex, target, ps.rTmpSrc, ps.s[s]);
1482                     ureg_release_temporary(ureg, tmp);
1483                 }
1484             } else {
1485                 ureg_TEX(ureg, ps.rTex, target, texture_coord, ps.s[s]);
1486             }
1487             if (s >= 1 && key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
1488                 ureg_MUL(ureg, ps.rTex, ureg_src(ps.rTex), _X(delta));
1489         }
1490 
1491         if (key->ts[s].colorop == D3DTOP_BUMPENVMAP ||
1492             key->ts[s].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
1493             continue;
1494 
1495         dst = ps_get_ts_dst(&ps, key->ts[s].resultarg ? D3DTA_TEMP : D3DTA_CURRENT);
1496 
1497         if (ps.stage.index_pre_mod == ps.stage.index) {
1498             ps.rMod = ureg_DECL_temporary(ureg);
1499             ureg_MUL(ureg, ps.rMod, ps.rCurSrc, ps.rTexSrc);
1500         }
1501 
1502         colorarg[0] = (key->ts[s].colorarg0 | (((key->colorarg_b4[0] >> s) & 0x1) << 4) | ((key->colorarg_b5[0] >> s) << 5)) & 0x3f;
1503         colorarg[1] = (key->ts[s].colorarg1 | (((key->colorarg_b4[1] >> s) & 0x1) << 4) | ((key->colorarg_b5[1] >> s) << 5)) & 0x3f;
1504         colorarg[2] = (key->ts[s].colorarg2 | (((key->colorarg_b4[2] >> s) & 0x1) << 4) | ((key->colorarg_b5[2] >> s) << 5)) & 0x3f;
1505         alphaarg[0] = (key->ts[s].alphaarg0 | ((key->alphaarg_b4[0] >> s) << 4)) & 0x1f;
1506         alphaarg[1] = (key->ts[s].alphaarg1 | ((key->alphaarg_b4[1] >> s) << 4)) & 0x1f;
1507         alphaarg[2] = (key->ts[s].alphaarg2 | ((key->alphaarg_b4[2] >> s) << 4)) & 0x1f;
1508 
1509         if (key->ts[s].colorop != key->ts[s].alphaop ||
1510             colorarg[0] != alphaarg[0] ||
1511             colorarg[1] != alphaarg[1] ||
1512             colorarg[2] != alphaarg[2])
1513             dst.WriteMask = TGSI_WRITEMASK_XYZ;
1514 
1515         /* Special DOTPRODUCT behaviour (see wine tests) */
1516         if (key->ts[s].colorop == D3DTOP_DOTPRODUCT3)
1517             dst.WriteMask = TGSI_WRITEMASK_XYZW;
1518 
1519         if (used_c & 0x1) arg[0] = ps_get_ts_arg(&ps, colorarg[0]);
1520         if (used_c & 0x2) arg[1] = ps_get_ts_arg(&ps, colorarg[1]);
1521         if (used_c & 0x4) arg[2] = ps_get_ts_arg(&ps, colorarg[2]);
1522         ps_do_ts_op(&ps, key->ts[s].colorop, dst, arg);
1523 
1524         if (dst.WriteMask != TGSI_WRITEMASK_XYZW) {
1525             dst.WriteMask = TGSI_WRITEMASK_W;
1526 
1527             if (used_a & 0x1) arg[0] = ps_get_ts_arg(&ps, alphaarg[0]);
1528             if (used_a & 0x2) arg[1] = ps_get_ts_arg(&ps, alphaarg[1]);
1529             if (used_a & 0x4) arg[2] = ps_get_ts_arg(&ps, alphaarg[2]);
1530             ps_do_ts_op(&ps, key->ts[s].alphaop, dst, arg);
1531         }
1532     }
1533 
1534     if (key->specular)
1535         ureg_ADD(ureg, ureg_writemask(ps.rCur, TGSI_WRITEMASK_XYZ), ps.rCurSrc, ps.vC[1]);
1536 
1537     if (key->alpha_test_emulation == PIPE_FUNC_NEVER) {
1538         ureg_KILL(ureg);
1539     } else if (key->alpha_test_emulation != PIPE_FUNC_ALWAYS) {
1540         unsigned cmp_op;
1541         struct ureg_src src[2];
1542         struct ureg_dst tmp = ps.rTmp;
1543         cmp_op = pipe_comp_to_tgsi_opposite(key->alpha_test_emulation);
1544         src[0] = ureg_scalar(ps.rCurSrc, TGSI_SWIZZLE_W); /* Read color alpha channel */
1545         src[1] = _WWWW(_CONST(22)); /* Read alpha ref */
1546         ureg_insn(ureg, cmp_op, &tmp, 1, src, 2, 0);
1547         ureg_KILL_IF(ureg, ureg_negate(ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X))); /* if opposite test passes, discard */
1548     }
1549 
1550     /* Fog.
1551      */
1552     if (key->fog_mode) {
1553         struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
1554         struct ureg_src vPos;
1555         if (device->screen->get_param(device->screen,
1556                                       PIPE_CAP_FS_POSITION_IS_SYSVAL)) {
1557             vPos = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
1558         } else {
1559             vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
1560                                       TGSI_INTERPOLATE_LINEAR);
1561         }
1562 
1563         /* Source is either W or Z.
1564          * Z is when an orthogonal projection matrix is detected,
1565          * W (WFOG) else.
1566          */
1567         if (!key->fog_source)
1568             ureg_MOV(ureg, rFog, _ZZZZ(vPos));
1569         else
1570             /* Position's w is 1/w */
1571             ureg_RCP(ureg, rFog, _WWWW(vPos));
1572 
1573         if (key->fog_mode == D3DFOG_EXP) {
1574             ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
1575             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
1576             ureg_EX2(ureg, rFog, _X(rFog));
1577         } else
1578         if (key->fog_mode == D3DFOG_EXP2) {
1579             ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
1580             ureg_MUL(ureg, rFog, _X(rFog), _X(rFog));
1581             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
1582             ureg_EX2(ureg, rFog, _X(rFog));
1583         } else
1584         if (key->fog_mode == D3DFOG_LINEAR) {
1585             ureg_ADD(ureg, rFog, _XXXX(_CONST(22)), ureg_negate(_X(rFog)));
1586             ureg_MUL(ureg, ureg_saturate(rFog), _X(rFog), _YYYY(_CONST(22)));
1587         }
1588         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _X(rFog), ps.rCurSrc, _CONST(21));
1589         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
1590     } else
1591     if (key->fog) {
1592         struct ureg_src vFog = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_GENERIC, 16, TGSI_INTERPOLATE_PERSPECTIVE);
1593         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _XXXX(vFog), ps.rCurSrc, _CONST(21));
1594         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
1595     } else {
1596         ureg_MOV(ureg, oCol, ps.rCurSrc);
1597     }
1598 
1599     ureg_END(ureg);
1600     nine_ureg_tgsi_dump(ureg, false);
1601     return nine_create_shader_with_so_and_destroy(ureg, device->context.pipe, NULL);
1602 }
1603 
1604 static struct NineVertexShader9 *
nine_ff_get_vs(struct NineDevice9 * device)1605 nine_ff_get_vs(struct NineDevice9 *device)
1606 {
1607     const struct nine_context *context = &device->context;
1608     struct NineVertexShader9 *vs;
1609     struct vs_build_ctx bld;
1610     struct nine_ff_vs_key key;
1611     unsigned s, i;
1612     bool has_indexes = false;
1613     bool has_weights = false;
1614     int8_t input_texture_coord[8];
1615 
1616     assert(sizeof(key) <= sizeof(key.value32));
1617 
1618     memset(&key, 0, sizeof(key));
1619     memset(&bld, 0, sizeof(bld));
1620     memset(&input_texture_coord, 0, sizeof(input_texture_coord));
1621 
1622     bld.key = &key;
1623 
1624     /* FIXME: this shouldn't be NULL, but it is on init */
1625     if (context->vdecl) {
1626         key.color0in_one = 1;
1627         key.color1in_zero = 1;
1628         for (i = 0; i < context->vdecl->nelems; i++) {
1629             uint16_t usage = context->vdecl->usage_map[i];
1630             if (usage == NINE_DECLUSAGE_POSITIONT)
1631                 key.position_t = 1;
1632             else if (usage == NINE_DECLUSAGE_i(COLOR, 0))
1633                 key.color0in_one = 0;
1634             else if (usage == NINE_DECLUSAGE_i(COLOR, 1))
1635                 key.color1in_zero = 0;
1636             else if (usage == NINE_DECLUSAGE_i(BLENDINDICES, 0)) {
1637                 has_indexes = true;
1638                 key.passthrough |= 1 << usage;
1639             } else if (usage == NINE_DECLUSAGE_i(BLENDWEIGHT, 0)) {
1640                 has_weights = true;
1641                 key.passthrough |= 1 << usage;
1642             } else if (usage == NINE_DECLUSAGE_i(NORMAL, 0)) {
1643                 key.has_normal = 1;
1644                 key.passthrough |= 1 << usage;
1645             } else if (usage == NINE_DECLUSAGE_PSIZE)
1646                 key.vertexpointsize = 1;
1647             else if (usage % NINE_DECLUSAGE_COUNT == NINE_DECLUSAGE_TEXCOORD) {
1648                 s = usage / NINE_DECLUSAGE_COUNT;
1649                 if (s < 8)
1650                     input_texture_coord[s] = nine_decltype_get_dim(context->vdecl->decls[i].Type);
1651                 else
1652                     DBG("FF given texture coordinate >= 8. Ignoring\n");
1653             } else if (usage < NINE_DECLUSAGE_NONE)
1654                 key.passthrough |= 1 << usage;
1655         }
1656     }
1657     /* ff vs + ps 3.0: some elements are passed to the ps (wine test).
1658      * We do restrict to indices 0 */
1659     key.passthrough &= ~((1 << NINE_DECLUSAGE_POSITION) | (1 << NINE_DECLUSAGE_PSIZE) |
1660                          (1 << NINE_DECLUSAGE_TEXCOORD) | (1 << NINE_DECLUSAGE_POSITIONT) |
1661                          (1 << NINE_DECLUSAGE_TESSFACTOR) | (1 << NINE_DECLUSAGE_SAMPLE));
1662     if (!key.position_t)
1663         key.passthrough = 0;
1664     key.pointscale = !!context->rs[D3DRS_POINTSCALEENABLE];
1665 
1666     key.lighting = !!context->rs[D3DRS_LIGHTING] &&  context->ff.num_lights_active;
1667     key.darkness = !!context->rs[D3DRS_LIGHTING] && !context->ff.num_lights_active;
1668     if (key.position_t) {
1669         key.darkness = 0; /* |= key.lighting; */ /* XXX ? */
1670         key.lighting = 0;
1671     }
1672     if ((key.lighting | key.darkness) && context->rs[D3DRS_COLORVERTEX]) {
1673         uint32_t mask = (key.color0in_one ? 0 : 1) | (key.color1in_zero ? 0 : 2);
1674         key.mtl_diffuse = context->rs[D3DRS_DIFFUSEMATERIALSOURCE] & mask;
1675         key.mtl_ambient = context->rs[D3DRS_AMBIENTMATERIALSOURCE] & mask;
1676         key.mtl_specular = context->rs[D3DRS_SPECULARMATERIALSOURCE] & mask;
1677         key.mtl_emissive = context->rs[D3DRS_EMISSIVEMATERIALSOURCE] & mask;
1678     }
1679     key.fog = !!context->rs[D3DRS_FOGENABLE];
1680     key.fog_mode = (!key.position_t && context->rs[D3DRS_FOGENABLE]) ? context->rs[D3DRS_FOGVERTEXMODE] : 0;
1681     if (key.fog_mode)
1682         key.fog_range = context->rs[D3DRS_RANGEFOGENABLE];
1683 
1684     key.localviewer = !!context->rs[D3DRS_LOCALVIEWER];
1685     key.normalizenormals = !!context->rs[D3DRS_NORMALIZENORMALS];
1686     key.ucp = !!context->rs[D3DRS_CLIPPLANEENABLE];
1687     key.clipplane_emulate = device->driver_caps.emulate_ucp ? (context->rs[D3DRS_CLIPPLANEENABLE] & 0xff) : 0;
1688 
1689     if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
1690         key.vertexblend_indexed = !!context->rs[D3DRS_INDEXEDVERTEXBLENDENABLE] && has_indexes;
1691 
1692         switch (context->rs[D3DRS_VERTEXBLEND]) {
1693         case D3DVBF_0WEIGHTS: key.vertexblend = key.vertexblend_indexed; break;
1694         case D3DVBF_1WEIGHTS: key.vertexblend = 2; break;
1695         case D3DVBF_2WEIGHTS: key.vertexblend = 3; break;
1696         case D3DVBF_3WEIGHTS: key.vertexblend = 4; break;
1697         case D3DVBF_TWEENING: key.vertextween = 1; break;
1698         default:
1699             assert(!"invalid D3DVBF");
1700             break;
1701         }
1702         if (!has_weights && context->rs[D3DRS_VERTEXBLEND] != D3DVBF_0WEIGHTS)
1703             key.vertexblend = 0; /* TODO: if key.vertexblend_indexed, perhaps it should use 1.0 as weight, or revert to D3DVBF_0WEIGHTS */
1704     }
1705 
1706     for (s = 0; s < 8; ++s) {
1707         unsigned gen = (context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] >> 16) + 1;
1708         unsigned idx = context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] & 7;
1709         unsigned dim;
1710 
1711         if (key.position_t && gen > NINED3DTSS_TCI_PASSTHRU)
1712             gen = NINED3DTSS_TCI_PASSTHRU;
1713 
1714         if (!input_texture_coord[idx] && gen == NINED3DTSS_TCI_PASSTHRU)
1715             gen = NINED3DTSS_TCI_DISABLE;
1716 
1717         key.tc_gen |= gen << (s * 3);
1718         key.tc_idx |= idx << (s * 3);
1719         key.tc_dim_input |= ((input_texture_coord[idx]-1) & 0x3) << (s * 2);
1720 
1721         dim = context->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7;
1722         if (dim > 4)
1723             dim = input_texture_coord[idx];
1724         if (dim == 1) /* NV behaviour */
1725             dim = 0;
1726         key.tc_dim_output |= dim << (s * 3);
1727     }
1728 
1729     DBG("VS ff key hash: %x\n", nine_ff_vs_key_hash(&key));
1730     vs = util_hash_table_get(device->ff.ht_vs, &key);
1731     if (vs)
1732         return vs;
1733     NineVertexShader9_new(device, &vs, NULL, nine_ff_build_vs(device, &bld));
1734 
1735     nine_ff_prune_vs(device);
1736     if (vs) {
1737         unsigned n;
1738 
1739         memcpy(&vs->ff_key, &key, sizeof(vs->ff_key));
1740 
1741         _mesa_hash_table_insert(device->ff.ht_vs, &vs->ff_key, vs);
1742         device->ff.num_vs++;
1743 
1744         vs->num_inputs = bld.num_inputs;
1745         for (n = 0; n < bld.num_inputs; ++n)
1746             vs->input_map[n].ndecl = bld.input[n];
1747 
1748         vs->position_t = key.position_t;
1749         vs->point_size = key.vertexpointsize | key.pointscale | device->driver_caps.always_output_pointsize;
1750     }
1751     return vs;
1752 }
1753 
/* Fetch the D3DTS_<n> transform of the local `context` variable (which must be
 * in scope at the point of use) through nine_state_access_transform(). */
#define GET_D3DTS(n) nine_state_access_transform(&context->ff, D3DTS_##n, FALSE)
/* Non-zero when the dirty bit of transform D3DTS_<n> is set in
 * (s)->ff.changed.transform. The mask is built from an unsigned 1 so that a
 * transform landing on bit 31 of a word does not shift into the sign bit. */
#define IS_D3DTS_DIRTY(s,n) ((s)->ff.changed.transform[(D3DTS_##n) / 32] & (1u << ((D3DTS_##n) % 32)))
1756 
1757 static struct NinePixelShader9 *
nine_ff_get_ps(struct NineDevice9 * device)1758 nine_ff_get_ps(struct NineDevice9 *device)
1759 {
1760     struct nine_context *context = &device->context;
1761     struct NinePixelShader9 *ps;
1762     struct nine_ff_ps_key key;
1763     unsigned s;
1764     uint8_t sampler_mask = 0;
1765 
1766     assert(sizeof(key) <= sizeof(key.value32));
1767 
1768     memset(&key, 0, sizeof(key));
1769     for (s = 0; s < 8; ++s) {
1770         key.ts[s].colorop = context->ff.tex_stage[s][D3DTSS_COLOROP];
1771         key.ts[s].alphaop = context->ff.tex_stage[s][D3DTSS_ALPHAOP];
1772         const uint8_t used_c = ps_d3dtop_args_mask(key.ts[s].colorop);
1773         const uint8_t used_a = ps_d3dtop_args_mask(key.ts[s].alphaop);
1774         /* MSDN says D3DTOP_DISABLE disables this and all subsequent stages.
1775          * ALPHAOP cannot be enabled if COLOROP is disabled.
1776          * Verified on Windows. */
1777         if (key.ts[s].colorop == D3DTOP_DISABLE) {
1778             key.ts[s].alphaop = D3DTOP_DISABLE; /* DISABLE == 1, avoid degenerate keys */
1779             break;
1780         }
1781 
1782         if (!context->texture[s].enabled &&
1783             ((context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE &&
1784               used_c & 0x1) ||
1785              (context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE &&
1786               used_c & 0x2) ||
1787              (context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE &&
1788               used_c & 0x4))) {
1789             /* Tested on Windows: Invalid texture read disables the stage
1790              * and the subsequent ones, but only for colorop. For alpha,
1791              * it's as if the texture had alpha of 1.0, which is what
1792              * has our dummy texture in that case. Invalid color also
1793              * disabled the following alpha stages. */
1794             key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
1795             break;
1796         }
1797 
1798         if (context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE ||
1799             context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE ||
1800             context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE ||
1801             context->ff.tex_stage[s][D3DTSS_ALPHAARG0] == D3DTA_TEXTURE ||
1802             context->ff.tex_stage[s][D3DTSS_ALPHAARG1] == D3DTA_TEXTURE ||
1803             context->ff.tex_stage[s][D3DTSS_ALPHAARG2] == D3DTA_TEXTURE)
1804             sampler_mask |= (1 << s);
1805 
1806         if (key.ts[s].colorop != D3DTOP_DISABLE) {
1807             if (used_c & 0x1) key.ts[s].colorarg0 = context->ff.tex_stage[s][D3DTSS_COLORARG0] & 0x7;
1808             if (used_c & 0x2) key.ts[s].colorarg1 = context->ff.tex_stage[s][D3DTSS_COLORARG1] & 0x7;
1809             if (used_c & 0x4) key.ts[s].colorarg2 = context->ff.tex_stage[s][D3DTSS_COLORARG2] & 0x7;
1810             if (used_c & 0x1) key.colorarg_b4[0] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 4) & 0x1) << s;
1811             if (used_c & 0x1) key.colorarg_b5[0] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 5) & 0x1) << s;
1812             if (used_c & 0x2) key.colorarg_b4[1] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 4) & 0x1) << s;
1813             if (used_c & 0x2) key.colorarg_b5[1] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 5) & 0x1) << s;
1814             if (used_c & 0x4) key.colorarg_b4[2] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 4) & 0x1) << s;
1815             if (used_c & 0x4) key.colorarg_b5[2] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 5) & 0x1) << s;
1816         }
1817         if (key.ts[s].alphaop != D3DTOP_DISABLE) {
1818             if (used_a & 0x1) key.ts[s].alphaarg0 = context->ff.tex_stage[s][D3DTSS_ALPHAARG0] & 0x7;
1819             if (used_a & 0x2) key.ts[s].alphaarg1 = context->ff.tex_stage[s][D3DTSS_ALPHAARG1] & 0x7;
1820             if (used_a & 0x4) key.ts[s].alphaarg2 = context->ff.tex_stage[s][D3DTSS_ALPHAARG2] & 0x7;
1821             if (used_a & 0x1) key.alphaarg_b4[0] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG0] >> 4) & 0x1) << s;
1822             if (used_a & 0x2) key.alphaarg_b4[1] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG1] >> 4) & 0x1) << s;
1823             if (used_a & 0x4) key.alphaarg_b4[2] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG2] >> 4) & 0x1) << s;
1824         }
1825         key.ts[s].resultarg = context->ff.tex_stage[s][D3DTSS_RESULTARG] == D3DTA_TEMP;
1826 
1827         if (context->texture[s].enabled) {
1828             switch (context->texture[s].type) {
1829             case D3DRTYPE_TEXTURE:       key.ts[s].textarget = 1; break;
1830             case D3DRTYPE_VOLUMETEXTURE: key.ts[s].textarget = 2; break;
1831             case D3DRTYPE_CUBETEXTURE:   key.ts[s].textarget = 3; break;
1832             default:
1833                 assert(!"unexpected texture type");
1834                 break;
1835             }
1836         } else {
1837             key.ts[s].textarget = 1;
1838         }
1839     }
1840 
1841     /* Note: If colorop is D3DTOP_DISABLE for the first stage
1842      * (which implies alphaop is too), nothing particular happens,
1843      * that is, current is equal to diffuse (which is the case anyway,
1844      * because it is how it is initialized).
1845      * Special case seems if alphaop is D3DTOP_DISABLE and not colorop,
1846      * because then if the resultarg is TEMP, then diffuse alpha is written
1847      * to it. */
1848     if (key.ts[0].colorop != D3DTOP_DISABLE &&
1849         key.ts[0].alphaop == D3DTOP_DISABLE &&
1850         key.ts[0].resultarg != 0) {
1851         key.ts[0].alphaop = D3DTOP_SELECTARG1;
1852         key.ts[0].alphaarg1 = D3DTA_DIFFUSE;
1853     }
1854     /* When no alpha stage writes to current, diffuse alpha is taken.
1855      * Since we initialize current to diffuse, we have the behaviour. */
1856 
1857     /* Last stage always writes to Current */
1858     if (s >= 1)
1859         key.ts[s-1].resultarg = 0;
1860 
1861     key.projected = nine_ff_get_projected_key_ff(context);
1862     key.specular = !!context->rs[D3DRS_SPECULARENABLE];
1863     key.flatshade = context->rs[D3DRS_SHADEMODE] == D3DSHADE_FLAT;
1864 
1865     for (; s < 8; ++s)
1866         key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
1867     if (context->rs[D3DRS_FOGENABLE])
1868         key.fog_mode = context->rs[D3DRS_FOGTABLEMODE];
1869     key.fog = !!context->rs[D3DRS_FOGENABLE];
1870     if (key.fog_mode && key.fog)
1871         key.fog_source = !context->zfog;
1872     key.alpha_test_emulation = context->rs[NINED3DRS_EMULATED_ALPHATEST] & 0x7;
1873 
1874     DBG("PS ff key hash: %x\n", nine_ff_ps_key_hash(&key));
1875     ps = util_hash_table_get(device->ff.ht_ps, &key);
1876     if (ps)
1877         return ps;
1878     NinePixelShader9_new(device, &ps, NULL, nine_ff_build_ps(device, &key));
1879 
1880     nine_ff_prune_ps(device);
1881     if (ps) {
1882         memcpy(&ps->ff_key, &key, sizeof(ps->ff_key));
1883 
1884         _mesa_hash_table_insert(device->ff.ht_ps, &ps->ff_key, ps);
1885         device->ff.num_ps++;
1886 
1887         ps->rt_mask = 0x1;
1888         ps->sampler_mask = sampler_mask;
1889     }
1890     return ps;
1891 }
1892 
1893 static void
nine_ff_load_vs_transforms(struct NineDevice9 * device)1894 nine_ff_load_vs_transforms(struct NineDevice9 *device)
1895 {
1896     struct nine_context *context = &device->context;
1897     D3DMATRIX T;
1898     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
1899     unsigned i;
1900 
1901     /* TODO: make this nicer, and only upload the ones we need */
1902     /* TODO: use ff.vs_const as storage of W, V, P matrices */
1903 
1904     if (IS_D3DTS_DIRTY(context, WORLD) ||
1905         IS_D3DTS_DIRTY(context, VIEW) ||
1906         IS_D3DTS_DIRTY(context, PROJECTION)) {
1907         /* WVP, WV matrices */
1908         nine_d3d_matrix_matrix_mul(&M[1], GET_D3DTS(WORLD), GET_D3DTS(VIEW));
1909         nine_d3d_matrix_matrix_mul(&M[0], &M[1], GET_D3DTS(PROJECTION));
1910 
1911         /* normal matrix == transpose(inverse(WV)) */
1912         nine_d3d_matrix_inverse(&T, &M[1]);
1913         nine_d3d_matrix_transpose(&M[4], &T);
1914 
1915         /* P matrix */
1916         M[2] = *GET_D3DTS(PROJECTION);
1917 
1918         /* V and W matrix */
1919         nine_d3d_matrix_inverse(&M[3], GET_D3DTS(VIEW));
1920         M[40] = M[1];
1921     }
1922 
1923     if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
1924         /* load other world matrices */
1925         for (i = 1; i <= 8; ++i) {
1926             nine_d3d_matrix_matrix_mul(&M[40 + i], GET_D3DTS(WORLDMATRIX(i)), GET_D3DTS(VIEW));
1927         }
1928     }
1929 
1930     device->ff.vs_const[30 * 4] = asfloat(context->rs[D3DRS_TWEENFACTOR]);
1931 }
1932 
1933 static void
nine_ff_load_lights(struct NineDevice9 * device)1934 nine_ff_load_lights(struct NineDevice9 *device)
1935 {
1936     struct nine_context *context = &device->context;
1937     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
1938     unsigned l;
1939 
1940     if (context->changed.group & NINE_STATE_FF_MATERIAL) {
1941         const D3DMATERIAL9 *mtl = &context->ff.material;
1942 
1943         memcpy(&dst[20], &mtl->Diffuse, 4 * sizeof(float));
1944         memcpy(&dst[21], &mtl->Ambient, 4 * sizeof(float));
1945         memcpy(&dst[22], &mtl->Specular, 4 * sizeof(float));
1946         dst[23].x = mtl->Power;
1947         memcpy(&dst[24], &mtl->Emissive, 4 * sizeof(float));
1948         d3dcolor_to_rgba(&dst[25].x, context->rs[D3DRS_AMBIENT]);
1949         dst[19].x = dst[25].x * mtl->Ambient.r + mtl->Emissive.r;
1950         dst[19].y = dst[25].y * mtl->Ambient.g + mtl->Emissive.g;
1951         dst[19].z = dst[25].z * mtl->Ambient.b + mtl->Emissive.b;
1952     }
1953 
1954     if (!(context->changed.group & NINE_STATE_FF_LIGHTING) && !IS_D3DTS_DIRTY(context, VIEW))
1955         return;
1956 
1957     for (l = 0; l < context->ff.num_lights_active; ++l) {
1958         const D3DLIGHT9 *light = &context->ff.light[context->ff.active_light[l]];
1959 
1960         dst[32 + l * 8].x = light->Type;
1961         dst[32 + l * 8].y = light->Attenuation0;
1962         dst[32 + l * 8].z = light->Attenuation1;
1963         dst[32 + l * 8].w = light->Attenuation2;
1964         memcpy(&dst[33 + l * 8].x, &light->Diffuse, sizeof(light->Diffuse));
1965         memcpy(&dst[34 + l * 8].x, &light->Specular, sizeof(light->Specular));
1966         memcpy(&dst[35 + l * 8].x, &light->Ambient, sizeof(light->Ambient));
1967         nine_d3d_vector4_matrix_mul((D3DVECTOR *)&dst[36 + l * 8].x, &light->Position, GET_D3DTS(VIEW));
1968         nine_d3d_vector3_matrix_mul((D3DVECTOR *)&dst[37 + l * 8].x, &light->Direction, GET_D3DTS(VIEW));
1969         dst[36 + l * 8].w = light->Type == D3DLIGHT_DIRECTIONAL ? 1e9f : light->Range;
1970         dst[37 + l * 8].w = light->Falloff;
1971         dst[38 + l * 8].x = cosf(light->Theta * 0.5f);
1972         dst[38 + l * 8].y = cosf(light->Phi * 0.5f);
1973         dst[38 + l * 8].z = 1.0f / (dst[38 + l * 8].x - dst[38 + l * 8].y);
1974         dst[39 + l * 8].w = (float)((l + 1) == context->ff.num_lights_active);
1975     }
1976 }
1977 
1978 static void
nine_ff_load_point_and_fog_params(struct NineDevice9 * device)1979 nine_ff_load_point_and_fog_params(struct NineDevice9 *device)
1980 {
1981     struct nine_context *context = &device->context;
1982     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
1983 
1984     if (!(context->changed.group & NINE_STATE_FF_VS_OTHER))
1985         return;
1986     dst[26].x = asfloat(context->rs[D3DRS_POINTSIZE_MIN]);
1987     dst[26].y = asfloat(context->rs[D3DRS_POINTSIZE_MAX]);
1988     dst[26].z = CLAMP(asfloat(context->rs[D3DRS_POINTSIZE]),
1989                 asfloat(context->rs[D3DRS_POINTSIZE_MIN]),
1990                 asfloat(context->rs[D3DRS_POINTSIZE_MAX]));
1991     dst[26].w = asfloat(context->rs[D3DRS_POINTSCALE_A]);
1992     dst[27].x = asfloat(context->rs[D3DRS_POINTSCALE_B]);
1993     dst[27].y = asfloat(context->rs[D3DRS_POINTSCALE_C]);
1994     dst[28].x = asfloat(context->rs[D3DRS_FOGEND]);
1995     dst[28].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
1996     if (isinf(dst[28].y))
1997         dst[28].y = 0.0f;
1998     dst[28].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
1999     if (device->driver_caps.emulate_ucp)
2000         memcpy(&dst[196], &context->clip.ucp, sizeof(context->clip));
2001 }
2002 
2003 static void
nine_ff_load_tex_matrices(struct NineDevice9 * device)2004 nine_ff_load_tex_matrices(struct NineDevice9 *device)
2005 {
2006     struct nine_context *context = &device->context;
2007     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
2008     unsigned s;
2009 
2010     if (!(context->ff.changed.transform[0] & 0xff0000))
2011         return;
2012     for (s = 0; s < 8; ++s) {
2013         if (IS_D3DTS_DIRTY(context, TEXTURE0 + s))
2014             nine_d3d_matrix_transpose(&M[32 + s], nine_state_access_transform(&context->ff, D3DTS_TEXTURE0 + s, false));
2015     }
2016 }
2017 
2018 static void
nine_ff_load_ps_params(struct NineDevice9 * device)2019 nine_ff_load_ps_params(struct NineDevice9 *device)
2020 {
2021     struct nine_context *context = &device->context;
2022     struct fvec4 *dst = (struct fvec4 *)device->ff.ps_const;
2023     unsigned s;
2024 
2025     if (!(context->changed.group & NINE_STATE_FF_PS_CONSTS))
2026         return;
2027 
2028     for (s = 0; s < 8; ++s)
2029         d3dcolor_to_rgba(&dst[s].x, context->ff.tex_stage[s][D3DTSS_CONSTANT]);
2030 
2031     for (s = 0; s < 8; ++s) {
2032         dst[8 + s].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT00]);
2033         dst[8 + s].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT01]);
2034         dst[8 + s].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT10]);
2035         dst[8 + s].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT11]);
2036         if (s & 1) {
2037             dst[16 + s / 2].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
2038             dst[16 + s / 2].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
2039         } else {
2040             dst[16 + s / 2].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
2041             dst[16 + s / 2].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
2042         }
2043     }
2044 
2045     d3dcolor_to_rgba(&dst[20].x, context->rs[D3DRS_TEXTUREFACTOR]);
2046     d3dcolor_to_rgba(&dst[21].x, context->rs[D3DRS_FOGCOLOR]);
2047     dst[22].x = asfloat(context->rs[D3DRS_FOGEND]);
2048     dst[22].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
2049     dst[22].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
2050     dst[22].w = (float)context->rs[D3DRS_ALPHAREF] / 255.f;
2051 }
2052 
2053 static void
nine_ff_load_viewport_info(struct NineDevice9 * device)2054 nine_ff_load_viewport_info(struct NineDevice9 *device)
2055 {
2056     D3DVIEWPORT9 *viewport = &device->context.viewport;
2057     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
2058     float diffZ = viewport->MaxZ - viewport->MinZ;
2059 
2060     /* Note: the other functions avoids to fill the const again if nothing changed.
2061      * But we don't have much to fill, and adding code to allow that may be complex
2062      * so just fill it always */
2063     dst[100].x = 2.0f / (float)(viewport->Width);
2064     dst[100].y = 2.0f / (float)(viewport->Height);
2065     dst[100].z = (diffZ == 0.0f) ? 0.0f : (1.0f / diffZ);
2066     dst[100].w = (float)(viewport->Width);
2067     dst[101].x = (float)(viewport->X);
2068     dst[101].y = (float)(viewport->Y);
2069     dst[101].z = (float)(viewport->MinZ);
2070 }
2071 
2072 void
nine_ff_update(struct NineDevice9 * device)2073 nine_ff_update(struct NineDevice9 *device)
2074 {
2075     struct nine_context *context = &device->context;
2076     struct pipe_constant_buffer cb;
2077 
2078     DBG("vs=%p ps=%p\n", context->vs, context->ps);
2079 
2080     /* NOTE: the only reference belongs to the hash table */
2081     if (!context->programmable_vs) {
2082         device->ff.vs = nine_ff_get_vs(device);
2083         context->changed.group |= NINE_STATE_VS;
2084     }
2085     if (!context->ps) {
2086         device->ff.ps = nine_ff_get_ps(device);
2087         context->changed.group |= NINE_STATE_PS;
2088     }
2089 
2090     if (!context->programmable_vs) {
2091         nine_ff_load_vs_transforms(device);
2092         nine_ff_load_tex_matrices(device);
2093         nine_ff_load_lights(device);
2094         nine_ff_load_point_and_fog_params(device);
2095         nine_ff_load_viewport_info(device);
2096 
2097         memset(context->ff.changed.transform, 0, sizeof(context->ff.changed.transform));
2098 
2099         cb.buffer_offset = 0;
2100         cb.buffer = NULL;
2101         cb.user_buffer = device->ff.vs_const;
2102         cb.buffer_size = NINE_FF_NUM_VS_CONST * 4 * sizeof(float);
2103 
2104         context->pipe_data.cb_vs_ff = cb;
2105         context->commit |= NINE_STATE_COMMIT_CONST_VS;
2106 
2107         context->changed.group &= ~NINE_STATE_FF_VS;
2108     }
2109 
2110     if (!context->ps) {
2111         nine_ff_load_ps_params(device);
2112 
2113         cb.buffer_offset = 0;
2114         cb.buffer = NULL;
2115         cb.user_buffer = device->ff.ps_const;
2116         cb.buffer_size = NINE_FF_NUM_PS_CONST * 4 * sizeof(float);
2117 
2118         context->pipe_data.cb_ps_ff = cb;
2119         context->commit |= NINE_STATE_COMMIT_CONST_PS;
2120 
2121         context->changed.group &= ~NINE_STATE_FF_PS;
2122     }
2123 }
2124 
2125 
2126 bool
nine_ff_init(struct NineDevice9 * device)2127 nine_ff_init(struct NineDevice9 *device)
2128 {
2129     device->ff.ht_vs = _mesa_hash_table_create(NULL, nine_ff_vs_key_hash,
2130                                                nine_ff_vs_key_comp);
2131     device->ff.ht_ps = _mesa_hash_table_create(NULL, nine_ff_ps_key_hash,
2132                                                nine_ff_ps_key_comp);
2133 
2134     device->ff.ht_fvf = _mesa_hash_table_create(NULL, nine_ff_fvf_key_hash,
2135                                                 nine_ff_fvf_key_comp);
2136 
2137     device->ff.vs_const = CALLOC(NINE_FF_NUM_VS_CONST, 4 * sizeof(float));
2138     device->ff.ps_const = CALLOC(NINE_FF_NUM_PS_CONST, 4 * sizeof(float));
2139 
2140     return device->ff.ht_vs && device->ff.ht_ps &&
2141         device->ff.ht_fvf &&
2142         device->ff.vs_const && device->ff.ps_const;
2143 }
2144 
nine_ff_ht_delete_cb(void * key,void * value,void * data)2145 static enum pipe_error nine_ff_ht_delete_cb(void *key, void *value, void *data)
2146 {
2147     NineUnknown_Unbind(NineUnknown(value));
2148     return PIPE_OK;
2149 }
2150 
2151 void
nine_ff_fini(struct NineDevice9 * device)2152 nine_ff_fini(struct NineDevice9 *device)
2153 {
2154     if (device->ff.ht_vs) {
2155         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
2156         _mesa_hash_table_destroy(device->ff.ht_vs, NULL);
2157     }
2158     if (device->ff.ht_ps) {
2159         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
2160         _mesa_hash_table_destroy(device->ff.ht_ps, NULL);
2161     }
2162     if (device->ff.ht_fvf) {
2163         util_hash_table_foreach(device->ff.ht_fvf, nine_ff_ht_delete_cb, NULL);
2164         _mesa_hash_table_destroy(device->ff.ht_fvf, NULL);
2165     }
2166     device->ff.vs = NULL; /* destroyed by unbinding from hash table */
2167     device->ff.ps = NULL;
2168 
2169     FREE(device->ff.vs_const);
2170     FREE(device->ff.ps_const);
2171 }
2172 
2173 static void
nine_ff_prune_vs(struct NineDevice9 * device)2174 nine_ff_prune_vs(struct NineDevice9 *device)
2175 {
2176     struct nine_context *context = &device->context;
2177 
2178     if (device->ff.num_vs > 1024) {
2179         /* could destroy the bound one here, so unbind */
2180         context->pipe->bind_vs_state(context->pipe, NULL);
2181         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
2182         _mesa_hash_table_clear(device->ff.ht_vs, NULL);
2183         device->ff.num_vs = 0;
2184         context->changed.group |= NINE_STATE_VS;
2185     }
2186 }
2187 static void
nine_ff_prune_ps(struct NineDevice9 * device)2188 nine_ff_prune_ps(struct NineDevice9 *device)
2189 {
2190     struct nine_context *context = &device->context;
2191 
2192     if (device->ff.num_ps > 1024) {
2193         /* could destroy the bound one here, so unbind */
2194         context->pipe->bind_fs_state(context->pipe, NULL);
2195         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
2196         _mesa_hash_table_clear(device->ff.ht_ps, NULL);
2197         device->ff.num_ps = 0;
2198         context->changed.group |= NINE_STATE_PS;
2199     }
2200 }
2201 
2202 /* ========================================================================== */
2203 
2204 /* Matrix multiplication:
2205  *
2206  * in memory: 0 1 2 3 (row major)
2207  *            4 5 6 7
2208  *            8 9 a b
2209  *            c d e f
2210  *
2211  *    cA cB cC cD
2212  * r0             = (r0 * cA) (r0 * cB) . .
2213  * r1             = (r1 * cA) (r1 * cB)
2214  * r2             = (r2 * cA) .
2215  * r3             = (r3 * cA) .
2216  *
2217  *               r: (11) (12) (13) (14)
2218  *                  (21) (22) (23) (24)
2219  *                  (31) (32) (33) (34)
2220  *                  (41) (42) (43) (44)
2221  * l: (11 12 13 14)
2222  *    (21 22 23 24)
2223  *    (31 32 33 34)
2224  *    (41 42 43 44)
2225  *
2226  * v: (x  y  z  1 )
2227  *
2228  * t.xyzw = MUL(v.xxxx, r[0]);
2229  * t.xyzw = MAD(v.yyyy, r[1], t.xyzw);
2230  * t.xyzw = MAD(v.zzzz, r[2], t.xyzw);
2231  * v.xyzw = MAD(v.wwww, r[3], t.xyzw);
2232  *
2233  * v.x = DP4(v, c[0]);
2234  * v.y = DP4(v, c[1]);
2235  * v.z = DP4(v, c[2]);
2236  * v.w = DP4(v, c[3]) = 1
2237  */
2238 
2239 /*
2240 static void
2241 nine_D3DMATRIX_print(const D3DMATRIX *M)
2242 {
2243     DBG("\n(%f %f %f %f)\n"
2244         "(%f %f %f %f)\n"
2245         "(%f %f %f %f)\n"
2246         "(%f %f %f %f)\n",
2247         M->m[0][0], M->m[0][1], M->m[0][2], M->m[0][3],
2248         M->m[1][0], M->m[1][1], M->m[1][2], M->m[1][3],
2249         M->m[2][0], M->m[2][1], M->m[2][2], M->m[2][3],
2250         M->m[3][0], M->m[3][1], M->m[3][2], M->m[3][3]);
2251 }
2252 */
2253 
2254 static inline float
nine_DP4_row_col(const D3DMATRIX * A,int r,const D3DMATRIX * B,int c)2255 nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
2256 {
2257     return A->m[r][0] * B->m[0][c] +
2258            A->m[r][1] * B->m[1][c] +
2259            A->m[r][2] * B->m[2][c] +
2260            A->m[r][3] * B->m[3][c];
2261 }
2262 
2263 static inline float
nine_DP4_vec_col(const D3DVECTOR * v,const D3DMATRIX * M,int c)2264 nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
2265 {
2266     return v->x * M->m[0][c] +
2267            v->y * M->m[1][c] +
2268            v->z * M->m[2][c] +
2269            1.0f * M->m[3][c];
2270 }
2271 
2272 static inline float
nine_DP3_vec_col(const D3DVECTOR * v,const D3DMATRIX * M,int c)2273 nine_DP3_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
2274 {
2275     return v->x * M->m[0][c] +
2276            v->y * M->m[1][c] +
2277            v->z * M->m[2][c];
2278 }
2279 
2280 void
nine_d3d_matrix_matrix_mul(D3DMATRIX * D,const D3DMATRIX * L,const D3DMATRIX * R)2281 nine_d3d_matrix_matrix_mul(D3DMATRIX *D, const D3DMATRIX *L, const D3DMATRIX *R)
2282 {
2283     D->_11 = nine_DP4_row_col(L, 0, R, 0);
2284     D->_12 = nine_DP4_row_col(L, 0, R, 1);
2285     D->_13 = nine_DP4_row_col(L, 0, R, 2);
2286     D->_14 = nine_DP4_row_col(L, 0, R, 3);
2287 
2288     D->_21 = nine_DP4_row_col(L, 1, R, 0);
2289     D->_22 = nine_DP4_row_col(L, 1, R, 1);
2290     D->_23 = nine_DP4_row_col(L, 1, R, 2);
2291     D->_24 = nine_DP4_row_col(L, 1, R, 3);
2292 
2293     D->_31 = nine_DP4_row_col(L, 2, R, 0);
2294     D->_32 = nine_DP4_row_col(L, 2, R, 1);
2295     D->_33 = nine_DP4_row_col(L, 2, R, 2);
2296     D->_34 = nine_DP4_row_col(L, 2, R, 3);
2297 
2298     D->_41 = nine_DP4_row_col(L, 3, R, 0);
2299     D->_42 = nine_DP4_row_col(L, 3, R, 1);
2300     D->_43 = nine_DP4_row_col(L, 3, R, 2);
2301     D->_44 = nine_DP4_row_col(L, 3, R, 3);
2302 }
2303 
2304 void
nine_d3d_vector4_matrix_mul(D3DVECTOR * d,const D3DVECTOR * v,const D3DMATRIX * M)2305 nine_d3d_vector4_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
2306 {
2307     d->x = nine_DP4_vec_col(v, M, 0);
2308     d->y = nine_DP4_vec_col(v, M, 1);
2309     d->z = nine_DP4_vec_col(v, M, 2);
2310 }
2311 
2312 void
nine_d3d_vector3_matrix_mul(D3DVECTOR * d,const D3DVECTOR * v,const D3DMATRIX * M)2313 nine_d3d_vector3_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
2314 {
2315     d->x = nine_DP3_vec_col(v, M, 0);
2316     d->y = nine_DP3_vec_col(v, M, 1);
2317     d->z = nine_DP3_vec_col(v, M, 2);
2318 }
2319 
2320 void
nine_d3d_matrix_transpose(D3DMATRIX * D,const D3DMATRIX * M)2321 nine_d3d_matrix_transpose(D3DMATRIX *D, const D3DMATRIX *M)
2322 {
2323     unsigned i, j;
2324     for (i = 0; i < 4; ++i)
2325     for (j = 0; j < 4; ++j)
2326         D->m[i][j] = M->m[j][i];
2327 }
2328 
2329 #define _M_ADD_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
2330     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
2331     if (t > 0.0f) pos += t; else neg += t; } while(0)
2332 
2333 #define _M_SUB_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
2334     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
2335     if (t > 0.0f) neg -= t; else pos -= t; } while(0)
2336 float
nine_d3d_matrix_det(const D3DMATRIX * M)2337 nine_d3d_matrix_det(const D3DMATRIX *M)
2338 {
2339     float pos = 0.0f;
2340     float neg = 0.0f;
2341 
2342     _M_ADD_PROD_1i_2j_3k_4l(1, 2, 3, 4);
2343     _M_ADD_PROD_1i_2j_3k_4l(1, 3, 4, 2);
2344     _M_ADD_PROD_1i_2j_3k_4l(1, 4, 2, 3);
2345 
2346     _M_ADD_PROD_1i_2j_3k_4l(2, 1, 4, 3);
2347     _M_ADD_PROD_1i_2j_3k_4l(2, 3, 1, 4);
2348     _M_ADD_PROD_1i_2j_3k_4l(2, 4, 3, 1);
2349 
2350     _M_ADD_PROD_1i_2j_3k_4l(3, 1, 2, 4);
2351     _M_ADD_PROD_1i_2j_3k_4l(3, 2, 4, 1);
2352     _M_ADD_PROD_1i_2j_3k_4l(3, 4, 1, 2);
2353 
2354     _M_ADD_PROD_1i_2j_3k_4l(4, 1, 3, 2);
2355     _M_ADD_PROD_1i_2j_3k_4l(4, 2, 1, 3);
2356     _M_ADD_PROD_1i_2j_3k_4l(4, 3, 2, 1);
2357 
2358     _M_SUB_PROD_1i_2j_3k_4l(1, 2, 4, 3);
2359     _M_SUB_PROD_1i_2j_3k_4l(1, 3, 2, 4);
2360     _M_SUB_PROD_1i_2j_3k_4l(1, 4, 3, 2);
2361 
2362     _M_SUB_PROD_1i_2j_3k_4l(2, 1, 3, 4);
2363     _M_SUB_PROD_1i_2j_3k_4l(2, 3, 4, 1);
2364     _M_SUB_PROD_1i_2j_3k_4l(2, 4, 1, 3);
2365 
2366     _M_SUB_PROD_1i_2j_3k_4l(3, 1, 4, 2);
2367     _M_SUB_PROD_1i_2j_3k_4l(3, 2, 1, 4);
2368     _M_SUB_PROD_1i_2j_3k_4l(3, 4, 2, 1);
2369 
2370     _M_SUB_PROD_1i_2j_3k_4l(4, 1, 2, 3);
2371     _M_SUB_PROD_1i_2j_3k_4l(4, 2, 3, 1);
2372     _M_SUB_PROD_1i_2j_3k_4l(4, 3, 1, 2);
2373 
2374     return pos + neg;
2375 }
2376 
2377 /* XXX: Probably better to just use src/mesa/math/m_matrix.c because
2378  * I have no idea where this code came from.
2379  */
2380 void
nine_d3d_matrix_inverse(D3DMATRIX * D,const D3DMATRIX * M)2381 nine_d3d_matrix_inverse(D3DMATRIX *D, const D3DMATRIX *M)
2382 {
2383     int i, k;
2384     float det;
2385 
2386     D->m[0][0] =
2387         M->m[1][1] * M->m[2][2] * M->m[3][3] -
2388         M->m[1][1] * M->m[3][2] * M->m[2][3] -
2389         M->m[1][2] * M->m[2][1] * M->m[3][3] +
2390         M->m[1][2] * M->m[3][1] * M->m[2][3] +
2391         M->m[1][3] * M->m[2][1] * M->m[3][2] -
2392         M->m[1][3] * M->m[3][1] * M->m[2][2];
2393 
2394     D->m[0][1] =
2395        -M->m[0][1] * M->m[2][2] * M->m[3][3] +
2396         M->m[0][1] * M->m[3][2] * M->m[2][3] +
2397         M->m[0][2] * M->m[2][1] * M->m[3][3] -
2398         M->m[0][2] * M->m[3][1] * M->m[2][3] -
2399         M->m[0][3] * M->m[2][1] * M->m[3][2] +
2400         M->m[0][3] * M->m[3][1] * M->m[2][2];
2401 
2402     D->m[0][2] =
2403         M->m[0][1] * M->m[1][2] * M->m[3][3] -
2404         M->m[0][1] * M->m[3][2] * M->m[1][3] -
2405         M->m[0][2] * M->m[1][1] * M->m[3][3] +
2406         M->m[0][2] * M->m[3][1] * M->m[1][3] +
2407         M->m[0][3] * M->m[1][1] * M->m[3][2] -
2408         M->m[0][3] * M->m[3][1] * M->m[1][2];
2409 
2410     D->m[0][3] =
2411        -M->m[0][1] * M->m[1][2] * M->m[2][3] +
2412         M->m[0][1] * M->m[2][2] * M->m[1][3] +
2413         M->m[0][2] * M->m[1][1] * M->m[2][3] -
2414         M->m[0][2] * M->m[2][1] * M->m[1][3] -
2415         M->m[0][3] * M->m[1][1] * M->m[2][2] +
2416         M->m[0][3] * M->m[2][1] * M->m[1][2];
2417 
2418     D->m[1][0] =
2419        -M->m[1][0] * M->m[2][2] * M->m[3][3] +
2420         M->m[1][0] * M->m[3][2] * M->m[2][3] +
2421         M->m[1][2] * M->m[2][0] * M->m[3][3] -
2422         M->m[1][2] * M->m[3][0] * M->m[2][3] -
2423         M->m[1][3] * M->m[2][0] * M->m[3][2] +
2424         M->m[1][3] * M->m[3][0] * M->m[2][2];
2425 
2426     D->m[1][1] =
2427         M->m[0][0] * M->m[2][2] * M->m[3][3] -
2428         M->m[0][0] * M->m[3][2] * M->m[2][3] -
2429         M->m[0][2] * M->m[2][0] * M->m[3][3] +
2430         M->m[0][2] * M->m[3][0] * M->m[2][3] +
2431         M->m[0][3] * M->m[2][0] * M->m[3][2] -
2432         M->m[0][3] * M->m[3][0] * M->m[2][2];
2433 
2434     D->m[1][2] =
2435        -M->m[0][0] * M->m[1][2] * M->m[3][3] +
2436         M->m[0][0] * M->m[3][2] * M->m[1][3] +
2437         M->m[0][2] * M->m[1][0] * M->m[3][3] -
2438         M->m[0][2] * M->m[3][0] * M->m[1][3] -
2439         M->m[0][3] * M->m[1][0] * M->m[3][2] +
2440         M->m[0][3] * M->m[3][0] * M->m[1][2];
2441 
2442     D->m[1][3] =
2443         M->m[0][0] * M->m[1][2] * M->m[2][3] -
2444         M->m[0][0] * M->m[2][2] * M->m[1][3] -
2445         M->m[0][2] * M->m[1][0] * M->m[2][3] +
2446         M->m[0][2] * M->m[2][0] * M->m[1][3] +
2447         M->m[0][3] * M->m[1][0] * M->m[2][2] -
2448         M->m[0][3] * M->m[2][0] * M->m[1][2];
2449 
2450     D->m[2][0] =
2451         M->m[1][0] * M->m[2][1] * M->m[3][3] -
2452         M->m[1][0] * M->m[3][1] * M->m[2][3] -
2453         M->m[1][1] * M->m[2][0] * M->m[3][3] +
2454         M->m[1][1] * M->m[3][0] * M->m[2][3] +
2455         M->m[1][3] * M->m[2][0] * M->m[3][1] -
2456         M->m[1][3] * M->m[3][0] * M->m[2][1];
2457 
2458     D->m[2][1] =
2459        -M->m[0][0] * M->m[2][1] * M->m[3][3] +
2460         M->m[0][0] * M->m[3][1] * M->m[2][3] +
2461         M->m[0][1] * M->m[2][0] * M->m[3][3] -
2462         M->m[0][1] * M->m[3][0] * M->m[2][3] -
2463         M->m[0][3] * M->m[2][0] * M->m[3][1] +
2464         M->m[0][3] * M->m[3][0] * M->m[2][1];
2465 
2466     D->m[2][2] =
2467         M->m[0][0] * M->m[1][1] * M->m[3][3] -
2468         M->m[0][0] * M->m[3][1] * M->m[1][3] -
2469         M->m[0][1] * M->m[1][0] * M->m[3][3] +
2470         M->m[0][1] * M->m[3][0] * M->m[1][3] +
2471         M->m[0][3] * M->m[1][0] * M->m[3][1] -
2472         M->m[0][3] * M->m[3][0] * M->m[1][1];
2473 
2474     D->m[2][3] =
2475        -M->m[0][0] * M->m[1][1] * M->m[2][3] +
2476         M->m[0][0] * M->m[2][1] * M->m[1][3] +
2477         M->m[0][1] * M->m[1][0] * M->m[2][3] -
2478         M->m[0][1] * M->m[2][0] * M->m[1][3] -
2479         M->m[0][3] * M->m[1][0] * M->m[2][1] +
2480         M->m[0][3] * M->m[2][0] * M->m[1][1];
2481 
2482     D->m[3][0] =
2483        -M->m[1][0] * M->m[2][1] * M->m[3][2] +
2484         M->m[1][0] * M->m[3][1] * M->m[2][2] +
2485         M->m[1][1] * M->m[2][0] * M->m[3][2] -
2486         M->m[1][1] * M->m[3][0] * M->m[2][2] -
2487         M->m[1][2] * M->m[2][0] * M->m[3][1] +
2488         M->m[1][2] * M->m[3][0] * M->m[2][1];
2489 
2490     D->m[3][1] =
2491         M->m[0][0] * M->m[2][1] * M->m[3][2] -
2492         M->m[0][0] * M->m[3][1] * M->m[2][2] -
2493         M->m[0][1] * M->m[2][0] * M->m[3][2] +
2494         M->m[0][1] * M->m[3][0] * M->m[2][2] +
2495         M->m[0][2] * M->m[2][0] * M->m[3][1] -
2496         M->m[0][2] * M->m[3][0] * M->m[2][1];
2497 
2498     D->m[3][2] =
2499        -M->m[0][0] * M->m[1][1] * M->m[3][2] +
2500         M->m[0][0] * M->m[3][1] * M->m[1][2] +
2501         M->m[0][1] * M->m[1][0] * M->m[3][2] -
2502         M->m[0][1] * M->m[3][0] * M->m[1][2] -
2503         M->m[0][2] * M->m[1][0] * M->m[3][1] +
2504         M->m[0][2] * M->m[3][0] * M->m[1][1];
2505 
2506     D->m[3][3] =
2507         M->m[0][0] * M->m[1][1] * M->m[2][2] -
2508         M->m[0][0] * M->m[2][1] * M->m[1][2] -
2509         M->m[0][1] * M->m[1][0] * M->m[2][2] +
2510         M->m[0][1] * M->m[2][0] * M->m[1][2] +
2511         M->m[0][2] * M->m[1][0] * M->m[2][1] -
2512         M->m[0][2] * M->m[2][0] * M->m[1][1];
2513 
2514     det =
2515         M->m[0][0] * D->m[0][0] +
2516         M->m[1][0] * D->m[0][1] +
2517         M->m[2][0] * D->m[0][2] +
2518         M->m[3][0] * D->m[0][3];
2519 
2520     if (fabsf(det) < 1e-30) {/* non inversible */
2521         *D = *M; /* wine tests */
2522         return;
2523     }
2524 
2525     det = 1.0 / det;
2526 
2527     for (i = 0; i < 4; i++)
2528     for (k = 0; k < 4; k++)
2529         D->m[i][k] *= det;
2530 
2531 #if MESA_DEBUG || !defined(NDEBUG)
2532     {
2533         D3DMATRIX I;
2534 
2535         nine_d3d_matrix_matrix_mul(&I, D, M);
2536 
2537         for (i = 0; i < 4; ++i)
2538         for (k = 0; k < 4; ++k)
2539             if (fabsf(I.m[i][k] - (float)(i == k)) > 1e-3)
2540                 DBG("Matrix inversion check FAILED !\n");
2541     }
2542 #endif
2543 }
2544