xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/elk/elk_compile_ff_gs.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <[email protected]>
30   */
31 
32 #include "elk_compiler.h"
33 #include "elk_disasm.h"
34 #include "elk_eu.h"
35 #include "elk_prim.h"
36 
37 #include "dev/intel_debug.h"
38 
/* Capacity of the per-vertex register table in struct elk_ff_gs_compile.
 * The largest primitive handled in this file (a quad) has four vertices.
 */
#define MAX_GS_VERTS (4)

/**
 * State shared by all the code-generation helpers in this file while a
 * single fixed-function GS program is being built.
 */
struct elk_ff_gs_compile {
   /* Code generator that every emitted instruction is appended to. */
   struct elk_codegen func;
   /* Private copy of the program key supplied by the caller. */
   struct elk_ff_gs_prog_key key;
   /* Caller-owned output; urb_read_length, total_grf and (for SOL programs)
    * svbi_postincrement_value are filled in during compilation.
    */
   struct elk_ff_gs_prog_data *prog_data;

   /* Static register assignment, computed by elk_ff_gs_alloc_regs(). */
   struct {
      /* GRF 0, holding the thread payload header. */
      struct elk_reg R0;

      /**
       * Register holding streamed vertex buffer pointers -- see the Sandy
       * Bridge PRM, volume 2 part 1, section 4.4.2 (GS Thread Payload
       * [DevSNB]).  These pointers are delivered in GRF 1.
       */
      struct elk_reg SVBI;

      /* First register of each input vertex; every vertex occupies nr_regs
       * consecutive registers.
       */
      struct elk_reg vertex[MAX_GS_VERTS];
      /* Message header used for URB_WRITE, FF_SYNC and SVB write messages. */
      struct elk_reg header;
      /* Scratch register; also receives message responses such as newly
       * allocated URB handles.
       */
      struct elk_reg temp;

      /**
       * Register holding destination indices for streamed buffer writes.
       * Only used for SOL programs.
       */
      struct elk_reg destination_indices;
   } reg;

   /* Number of registers used to store vertex data (per vertex; two VUE
    * slots fit in one register).
    */
   GLuint nr_regs;

   /* Private copy of the VUE map; used to translate varyings to slots. */
   struct intel_vue_map vue_map;
};
72 
73 /**
74  * Allocate registers for GS.
75  *
76  * If sol_program is true, then:
77  *
78  * - The thread will be spawned with the "SVBI Payload Enable" bit set, so GRF
79  *   1 needs to be set aside to hold the streamed vertex buffer indices.
80  *
81  * - The thread will need to use the destination_indices register.
82  */
elk_ff_gs_alloc_regs(struct elk_ff_gs_compile * c,GLuint nr_verts,bool sol_program)83 static void elk_ff_gs_alloc_regs(struct elk_ff_gs_compile *c,
84                                  GLuint nr_verts,
85                                  bool sol_program)
86 {
87    GLuint i = 0,j;
88 
89    /* Register usage is static, precompute here:
90     */
91    c->reg.R0 = retype(elk_vec8_grf(i, 0), ELK_REGISTER_TYPE_UD); i++;
92 
93    /* Streamed vertex buffer indices */
94    if (sol_program)
95       c->reg.SVBI = retype(elk_vec8_grf(i++, 0), ELK_REGISTER_TYPE_UD);
96 
97    /* Payload vertices plus space for more generated vertices:
98     */
99    for (j = 0; j < nr_verts; j++) {
100       c->reg.vertex[j] = elk_vec4_grf(i, 0);
101       i += c->nr_regs;
102    }
103 
104    c->reg.header = retype(elk_vec8_grf(i++, 0), ELK_REGISTER_TYPE_UD);
105    c->reg.temp = retype(elk_vec8_grf(i++, 0), ELK_REGISTER_TYPE_UD);
106 
107    if (sol_program) {
108       c->reg.destination_indices =
109          retype(elk_vec4_grf(i++, 0), ELK_REGISTER_TYPE_UD);
110    }
111 
112    c->prog_data->urb_read_length = c->nr_regs;
113    c->prog_data->total_grf = i;
114 }
115 
116 
117 /**
118  * Set up the initial value of c->reg.header register based on c->reg.R0.
119  *
120  * The following information is passed to the GS thread in R0, and needs to be
121  * included in the first URB_WRITE or FF_SYNC message sent by the GS:
122  *
123  * - DWORD 0 [31:0] handle info (Gen4 only)
124  * - DWORD 5 [7:0] FFTID
125  * - DWORD 6 [31:0] Debug info
126  * - DWORD 7 [31:0] Debug info
127  *
128  * This function sets up the above data by copying by copying the contents of
129  * R0 to the header register.
130  */
elk_ff_gs_initialize_header(struct elk_ff_gs_compile * c)131 static void elk_ff_gs_initialize_header(struct elk_ff_gs_compile *c)
132 {
133    struct elk_codegen *p = &c->func;
134    elk_MOV(p, c->reg.header, c->reg.R0);
135 }
136 
137 /**
138  * Overwrite DWORD 2 of c->reg.header with the given immediate unsigned value.
139  *
140  * In URB_WRITE messages, DWORD 2 contains the fields PrimType, PrimStart,
141  * PrimEnd, Increment CL_INVOCATIONS, and SONumPrimsWritten, many of which we
142  * need to be able to update on a per-vertex basis.
143  */
elk_ff_gs_overwrite_header_dw2(struct elk_ff_gs_compile * c,unsigned dw2)144 static void elk_ff_gs_overwrite_header_dw2(struct elk_ff_gs_compile *c,
145                                            unsigned dw2)
146 {
147    struct elk_codegen *p = &c->func;
148    elk_MOV(p, get_element_ud(c->reg.header, 2), elk_imm_ud(dw2));
149 }
150 
151 /**
152  * Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0.
153  *
154  * When the thread is spawned, GRF 0 contains the primitive type in bits 4:0
155  * of DWORD 2.  URB_WRITE messages need the primitive type in bits 6:2 of
156  * DWORD 2.  So this function extracts the primitive type field, bitshifts it
157  * appropriately, and stores it in c->reg.header.
158  */
elk_ff_gs_overwrite_header_dw2_from_r0(struct elk_ff_gs_compile * c)159 static void elk_ff_gs_overwrite_header_dw2_from_r0(struct elk_ff_gs_compile *c)
160 {
161    struct elk_codegen *p = &c->func;
162    elk_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2),
163            elk_imm_ud(0x1f));
164    elk_SHL(p, get_element_ud(c->reg.header, 2),
165            get_element_ud(c->reg.header, 2), elk_imm_ud(2));
166 }
167 
168 /**
169  * Apply an additive offset to DWORD 2 of c->reg.header.
170  *
171  * This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately
172  * for each vertex.
173  */
elk_ff_gs_offset_header_dw2(struct elk_ff_gs_compile * c,int offset)174 static void elk_ff_gs_offset_header_dw2(struct elk_ff_gs_compile *c,
175                                         int offset)
176 {
177    struct elk_codegen *p = &c->func;
178    elk_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2),
179            elk_imm_d(offset));
180 }
181 
182 
183 /**
184  * Emit a vertex using the URB_WRITE message.  Use the contents of
185  * c->reg.header for the message header, and the registers starting at \c vert
186  * for the vertex data.
187  *
188  * If \c last is true, then this is the last vertex, so no further URB space
189  * should be allocated, and this message should end the thread.
190  *
191  * If \c last is false, then a new URB entry will be allocated, and its handle
192  * will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE
193  * message.
194  */
elk_ff_gs_emit_vue(struct elk_ff_gs_compile * c,struct elk_reg vert,bool last)195 static void elk_ff_gs_emit_vue(struct elk_ff_gs_compile *c,
196                                struct elk_reg vert,
197                                bool last)
198 {
199    struct elk_codegen *p = &c->func;
200    int write_offset = 0;
201    bool complete = false;
202 
203    do {
204       /* We can't write more than 14 registers at a time to the URB */
205       int write_len = MIN2(c->nr_regs - write_offset, 14);
206       if (write_len == c->nr_regs - write_offset)
207          complete = true;
208 
209       /* Copy the vertex from vertn into m1..mN+1:
210        */
211       elk_copy8(p, elk_message_reg(1), offset(vert, write_offset), write_len);
212 
213       /* Send the vertex data to the URB.  If this is the last write for this
214        * vertex, then we mark it as complete, and either end the thread or
215        * allocate another vertex URB entry (depending whether this is the last
216        * vertex).
217        */
218       enum elk_urb_write_flags flags;
219       if (!complete)
220          flags = ELK_URB_WRITE_NO_FLAGS;
221       else if (last)
222          flags = ELK_URB_WRITE_EOT_COMPLETE;
223       else
224          flags = ELK_URB_WRITE_ALLOCATE_COMPLETE;
225       elk_urb_WRITE(p,
226                     (flags & ELK_URB_WRITE_ALLOCATE) ? c->reg.temp
227                     : retype(elk_null_reg(), ELK_REGISTER_TYPE_UD),
228                     0,
229                     c->reg.header,
230                     flags,
231                     write_len + 1, /* msg length */
232                     (flags & ELK_URB_WRITE_ALLOCATE) ? 1
233                     : 0, /* response length */
234                     write_offset,  /* urb offset */
235                     ELK_URB_SWIZZLE_NONE);
236       write_offset += write_len;
237    } while (!complete);
238 
239    if (!last) {
240       elk_MOV(p, get_element_ud(c->reg.header, 0),
241               get_element_ud(c->reg.temp, 0));
242    }
243 }
244 
245 /**
246  * Send an FF_SYNC message to ensure that all previously spawned GS threads
247  * have finished sending primitives down the pipeline, and to allocate a URB
248  * entry for the first output vertex.  Only needed on Ironlake+.
249  *
250  * This function modifies c->reg.header: in DWORD 1, it stores num_prim (which
251  * is needed by the FF_SYNC message), and in DWORD 0, it stores the handle to
252  * the allocated URB entry (which will be needed by the URB_WRITE meesage that
253  * follows).
254  */
elk_ff_gs_ff_sync(struct elk_ff_gs_compile * c,int num_prim)255 static void elk_ff_gs_ff_sync(struct elk_ff_gs_compile *c, int num_prim)
256 {
257    struct elk_codegen *p = &c->func;
258 
259    elk_MOV(p, get_element_ud(c->reg.header, 1), elk_imm_ud(num_prim));
260    elk_ff_sync(p,
261                c->reg.temp,
262                0,
263                c->reg.header,
264                1, /* allocate */
265                1, /* response length */
266                0 /* eot */);
267    elk_MOV(p, get_element_ud(c->reg.header, 0),
268            get_element_ud(c->reg.temp, 0));
269 }
270 
271 
272 static void
elk_ff_gs_quads(struct elk_ff_gs_compile * c,const struct elk_ff_gs_prog_key * key)273 elk_ff_gs_quads(struct elk_ff_gs_compile *c,
274 		const struct elk_ff_gs_prog_key *key)
275 {
276    elk_ff_gs_alloc_regs(c, 4, false);
277    elk_ff_gs_initialize_header(c);
278    /* Use polygons for correct edgeflag behaviour. Note that vertex 3
279     * is the PV for quads, but vertex 0 for polygons:
280     */
281    if (c->func.devinfo->ver == 5)
282       elk_ff_gs_ff_sync(c, 1);
283    elk_ff_gs_overwrite_header_dw2(
284       c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
285           | URB_WRITE_PRIM_START));
286    if (key->pv_first) {
287       elk_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
288       elk_ff_gs_overwrite_header_dw2(
289          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
290       elk_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
291       elk_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
292       elk_ff_gs_overwrite_header_dw2(
293          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
294              | URB_WRITE_PRIM_END));
295       elk_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
296    }
297    else {
298       elk_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
299       elk_ff_gs_overwrite_header_dw2(
300          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
301       elk_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
302       elk_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
303       elk_ff_gs_overwrite_header_dw2(
304          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
305              | URB_WRITE_PRIM_END));
306       elk_ff_gs_emit_vue(c, c->reg.vertex[2], 1);
307    }
308 }
309 
310 static void
elk_ff_gs_quad_strip(struct elk_ff_gs_compile * c,const struct elk_ff_gs_prog_key * key)311 elk_ff_gs_quad_strip(struct elk_ff_gs_compile *c,
312                      const struct elk_ff_gs_prog_key *key)
313 {
314    elk_ff_gs_alloc_regs(c, 4, false);
315    elk_ff_gs_initialize_header(c);
316 
317    if (c->func.devinfo->ver == 5)
318       elk_ff_gs_ff_sync(c, 1);
319    elk_ff_gs_overwrite_header_dw2(
320       c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
321           | URB_WRITE_PRIM_START));
322    if (key->pv_first) {
323       elk_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
324       elk_ff_gs_overwrite_header_dw2(
325          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
326       elk_ff_gs_emit_vue(c, c->reg.vertex[1], 0);
327       elk_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
328       elk_ff_gs_overwrite_header_dw2(
329          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
330              | URB_WRITE_PRIM_END));
331       elk_ff_gs_emit_vue(c, c->reg.vertex[3], 1);
332    }
333    else {
334       elk_ff_gs_emit_vue(c, c->reg.vertex[2], 0);
335       elk_ff_gs_overwrite_header_dw2(
336          c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
337       elk_ff_gs_emit_vue(c, c->reg.vertex[3], 0);
338       elk_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
339       elk_ff_gs_overwrite_header_dw2(
340          c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
341              | URB_WRITE_PRIM_END));
342       elk_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
343    }
344 }
345 
elk_ff_gs_lines(struct elk_ff_gs_compile * c)346 static void elk_ff_gs_lines(struct elk_ff_gs_compile *c)
347 {
348    elk_ff_gs_alloc_regs(c, 2, false);
349    elk_ff_gs_initialize_header(c);
350 
351    if (c->func.devinfo->ver == 5)
352       elk_ff_gs_ff_sync(c, 1);
353    elk_ff_gs_overwrite_header_dw2(
354       c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
355           | URB_WRITE_PRIM_START));
356    elk_ff_gs_emit_vue(c, c->reg.vertex[0], 0);
357    elk_ff_gs_overwrite_header_dw2(
358       c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
359           | URB_WRITE_PRIM_END));
360    elk_ff_gs_emit_vue(c, c->reg.vertex[1], 1);
361 }
362 
/**
 * Generate the geometry shader program used on Gen6 to perform stream output
 * (transform feedback).
 *
 * The generated program has two phases:
 *
 * 1. If the key has any transform feedback bindings, every bound varying of
 *    every vertex is written out with elk_svb_write().  The whole phase is
 *    skipped at run time when SVBI[0] + num_verts exceeds DWORD 4 of the
 *    SVBI register (presumably the maximum index -- confirm against the
 *    PRM).
 *
 * 2. The vertices are passed on down the pipeline: an FF_SYNC message
 *    followed by one URB_WRITE sequence per vertex, reusing the primitive
 *    type delivered in R0.
 *
 * \param c                 compile state; registers are allocated here
 * \param key               program key (bindings, swizzles, pv_first)
 * \param num_verts         number of vertices per input primitive; the
 *                          pass-through phase handles 1, 2 and 3.  Also
 *                          stored in prog_data->svbi_postincrement_value.
 * \param check_edge_flags  only consulted when num_verts is 3: make the
 *                          emission of vertices 0 and 1, and the PrimEnd
 *                          flag on vertex 2, depend on the edge indicator
 *                          bits in DWORD 2 of R0.
 */
static void
gfx6_sol_program(struct elk_ff_gs_compile *c, const struct elk_ff_gs_prog_key *key,
                 unsigned num_verts, bool check_edge_flags)
{
   struct elk_codegen *p = &c->func;
   elk_inst *inst;
   c->prog_data->svbi_postincrement_value = num_verts;

   elk_ff_gs_alloc_regs(c, num_verts, true);
   elk_ff_gs_initialize_header(c);

   if (key->num_transform_feedback_bindings > 0) {
      unsigned vertex, binding;
      /* 8-wide word view of destination_indices, used as the destination of
       * the packed-word immediate moves below.
       */
      struct elk_reg destination_indices_uw =
         vec8(retype(c->reg.destination_indices, ELK_REGISTER_TYPE_UW));

      /* Note: since we use the binding table to keep track of buffer offsets
       * and stride, the GS doesn't need to keep track of a separate pointer
       * into each buffer; it uses a single pointer which increments by 1 for
       * each vertex.  So we use SVBI0 for this pointer, regardless of whether
       * transform feedback is in interleaved or separate attribs mode.
       *
       * Make sure that the buffers have enough room for all the vertices.
       */
      elk_ADD(p, get_element_ud(c->reg.temp, 0),
                 get_element_ud(c->reg.SVBI, 0), elk_imm_ud(num_verts));
      elk_CMP(p, vec1(elk_null_reg()), ELK_CONDITIONAL_LE,
                 get_element_ud(c->reg.temp, 0),
                 get_element_ud(c->reg.SVBI, 4));
      /* Everything up to the matching elk_ENDIF() only runs when the check
       * above passed.
       */
      elk_IF(p, ELK_EXECUTE_1);

      /* Compute the destination indices to write to.  Usually we use SVBI[0]
       * + (0, 1, 2).  However, for odd-numbered triangles in tristrips, the
       * vertices come down the pipeline in reversed winding order, so we need
       * to flip the order when writing to the transform feedback buffer.  To
       * ensure that flatshading accuracy is preserved, we need to write them
       * in order SVBI[0] + (0, 2, 1) if we're using the first provoking
       * vertex convention, and in order SVBI[0] + (1, 0, 2) if we're using
       * the last provoking vertex convention.
       *
       * Note: since elk_imm_v can only be used in instructions in
       * packed-word execution mode, and SVBI is a double-word, we need to
       * first move the appropriate immediate constant ((0, 1, 2), (0, 2, 1),
       * or (1, 0, 2)) to the destination_indices register, and then add SVBI
       * using a separate instruction.  Also, since the immediate constant is
       * expressed as packed words, and we need to load double-words into
       * destination_indices, we need to intersperse zeros to fill the upper
       * halves of each double-word.
       */
      elk_MOV(p, destination_indices_uw,
              elk_imm_v(0x00020100)); /* (0, 1, 2) */
      if (num_verts == 3) {
         /* Get primitive type into temp register. */
         elk_AND(p, get_element_ud(c->reg.temp, 0),
                 get_element_ud(c->reg.R0, 2), elk_imm_ud(0x1f));

         /* Test if primitive type is TRISTRIP_REVERSE.  We need to do this as
          * an 8-wide comparison so that the conditional MOV that follows
          * moves all 8 words correctly.
          */
         elk_CMP(p, vec8(elk_null_reg()), ELK_CONDITIONAL_EQ,
                 get_element_ud(c->reg.temp, 0),
                 elk_imm_ud(_3DPRIM_TRISTRIP_REVERSE));

         /* If so, then overwrite destination_indices_uw with the appropriate
          * reordering.
          */
         inst = elk_MOV(p, destination_indices_uw,
                        elk_imm_v(key->pv_first ? 0x00010200    /* (0, 2, 1) */
                                                : 0x00020001)); /* (1, 0, 2) */
         /* Predicate the MOV on the result of the comparison above. */
         elk_inst_set_pred_control(p->devinfo, inst, ELK_PREDICATE_NORMAL);
      }

      /* Add SVBI[0] to each of the per-vertex offsets, 4 channels wide. */
      assert(c->reg.destination_indices.width == ELK_EXECUTE_4);
      elk_push_insn_state(p);
      elk_set_default_exec_size(p, ELK_EXECUTE_4);
      elk_ADD(p, c->reg.destination_indices,
              c->reg.destination_indices, get_element_ud(c->reg.SVBI, 0));
      elk_pop_insn_state(p);
      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
      for (vertex = 0; vertex < num_verts; ++vertex) {
         /* Set up the correct destination index for this vertex */
         elk_MOV(p, get_element_ud(c->reg.header, 5),
                 get_element_ud(c->reg.destination_indices, vertex));

         for (binding = 0; binding < key->num_transform_feedback_bindings;
              ++binding) {
            unsigned char varying =
               key->transform_feedback_bindings[binding];
            unsigned char slot = c->vue_map.varying_to_slot[varying];
            /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
             *
             *   "Prior to End of Thread with a URB_WRITE, the kernel must
             *   ensure that all writes are complete by sending the final
             *   write as a committed write."
             */
            bool final_write =
               binding == key->num_transform_feedback_bindings - 1 &&
               vertex == num_verts - 1;
            /* Two VUE slots share one register: select the register and the
             * 16-byte half of it that holds this slot.
             */
            struct elk_reg vertex_slot = c->reg.vertex[vertex];
            vertex_slot.nr += slot / 2;
            vertex_slot.subnr = (slot % 2) * 16;
            /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */
            vertex_slot.swizzle = varying == VARYING_SLOT_PSIZ
               ? ELK_SWIZZLE_WWWW : key->transform_feedback_swizzles[binding];
            /* The swizzled copy into the header is emitted with Align16 as
             * the default access mode; Align1 is restored before the send.
             */
            elk_set_default_access_mode(p, ELK_ALIGN_16);
            elk_push_insn_state(p);
            elk_set_default_exec_size(p, ELK_EXECUTE_4);

            elk_MOV(p, stride(c->reg.header, 4, 4, 1),
                    retype(vertex_slot, ELK_REGISTER_TYPE_UD));
            elk_pop_insn_state(p);

            elk_set_default_access_mode(p, ELK_ALIGN_1);
            /* Only the very last write is a committed write with a real
             * destination register.
             */
            elk_svb_write(p,
                          final_write ? c->reg.temp : elk_null_reg(), /* dest */
                          1, /* msg_reg_nr */
                          c->reg.header, /* src0 */
                          ELK_GFX6_SOL_BINDING_START + binding, /* binding_table_index */
                          final_write); /* send_commit_msg */
         }
      }
      elk_ENDIF(p);

      /* Now, reinitialize the header register from R0 to restore the parts of
       * the register that we overwrote while streaming out transform feedback
       * data.
       */
      elk_ff_gs_initialize_header(c);

      /* Finally, wait for the write commit to occur so that we can proceed to
       * other things safely.
       *
       * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
       *
       *   The write commit does not modify the destination register, but
       *   merely clears the dependency associated with the destination
       *   register. Thus, a simple "mov" instruction using the register as a
       *   source is sufficient to wait for the write commit to occur.
       */
      elk_MOV(p, c->reg.temp, c->reg.temp);
   }

   /* Second phase: pass the vertices on down the pipeline. */
   elk_ff_gs_ff_sync(c, 1);

   /* Start from the primitive type delivered in R0; the PrimStart/PrimEnd
    * flags are then added to (and removed from) header DWORD 2 per vertex.
    */
   elk_ff_gs_overwrite_header_dw2_from_r0(c);
   switch (num_verts) {
   case 1:
      /* A lone vertex both starts and ends the primitive. */
      elk_ff_gs_offset_header_dw2(c,
                                  URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
      elk_ff_gs_emit_vue(c, c->reg.vertex[0], true);
      break;
   case 2:
      elk_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
      elk_ff_gs_emit_vue(c, c->reg.vertex[0], false);
      /* One addition both removes PrimStart and adds PrimEnd. */
      elk_ff_gs_offset_header_dw2(c,
                                  URB_WRITE_PRIM_END - URB_WRITE_PRIM_START);
      elk_ff_gs_emit_vue(c, c->reg.vertex[1], true);
      break;
   case 3:
      if (check_edge_flags) {
         /* Only emit vertices 0 and 1 if this is the first triangle of the
          * polygon.  Otherwise they are redundant.
          */
         elk_AND(p, retype(elk_null_reg(), ELK_REGISTER_TYPE_UD),
                 get_element_ud(c->reg.R0, 2),
                 elk_imm_ud(ELK_GS_EDGE_INDICATOR_0));
         elk_inst_set_cond_modifier(p->devinfo, elk_last_inst, ELK_CONDITIONAL_NZ);
         elk_IF(p, ELK_EXECUTE_1);
      }
      elk_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
      elk_ff_gs_emit_vue(c, c->reg.vertex[0], false);
      /* Take PrimStart back out for the middle vertex. */
      elk_ff_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START);
      elk_ff_gs_emit_vue(c, c->reg.vertex[1], false);
      if (check_edge_flags) {
         elk_ENDIF(p);
         /* Only emit vertex 2 in PRIM_END mode if this is the last triangle
          * of the polygon.  Otherwise leave the primitive incomplete because
          * there are more polygon vertices coming.
          */
         elk_AND(p, retype(elk_null_reg(), ELK_REGISTER_TYPE_UD),
                 get_element_ud(c->reg.R0, 2),
                 elk_imm_ud(ELK_GS_EDGE_INDICATOR_1));
         elk_inst_set_cond_modifier(p->devinfo, elk_last_inst, ELK_CONDITIONAL_NZ);
         /* Predicate the PrimEnd addition below on the test just emitted. */
         elk_set_default_predicate_control(p, ELK_PREDICATE_NORMAL);
      }
      elk_ff_gs_offset_header_dw2(c, URB_WRITE_PRIM_END);
      /* Predication must be off again before vertex 2 itself is emitted. */
      elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
      elk_ff_gs_emit_vue(c, c->reg.vertex[2], true);
      break;
   }
}
561 
562 const unsigned *
elk_compile_ff_gs_prog(struct elk_compiler * compiler,void * mem_ctx,const struct elk_ff_gs_prog_key * key,struct elk_ff_gs_prog_data * prog_data,struct intel_vue_map * vue_map,unsigned * final_assembly_size)563 elk_compile_ff_gs_prog(struct elk_compiler *compiler,
564 		       void *mem_ctx,
565 		       const struct elk_ff_gs_prog_key *key,
566 		       struct elk_ff_gs_prog_data *prog_data,
567 		       struct intel_vue_map *vue_map,
568 		       unsigned *final_assembly_size)
569 {
570    struct elk_ff_gs_compile c;
571    const GLuint *program;
572 
573    memset(&c, 0, sizeof(c));
574 
575    c.key = *key;
576    c.vue_map = *vue_map;
577    c.nr_regs = (c.vue_map.num_slots + 1)/2;
578    c.prog_data = prog_data;
579 
580    mem_ctx = ralloc_context(NULL);
581 
582    /* Begin the compilation:
583     */
584    elk_init_codegen(&compiler->isa, &c.func, mem_ctx);
585 
586    c.func.single_program_flow = 1;
587 
588    /* For some reason the thread is spawned with only 4 channels
589     * unmasked.
590     */
591    elk_set_default_mask_control(&c.func, ELK_MASK_DISABLE);
592 
593    if (compiler->devinfo->ver >= 6) {
594       unsigned num_verts;
595       bool check_edge_flag;
596       /* On Sandybridge, we use the GS for implementing transform feedback
597        * (called "Stream Out" in the PRM).
598        */
599       switch (key->primitive) {
600       case _3DPRIM_POINTLIST:
601          num_verts = 1;
602          check_edge_flag = false;
603          break;
604       case _3DPRIM_LINELIST:
605       case _3DPRIM_LINESTRIP:
606       case _3DPRIM_LINELOOP:
607          num_verts = 2;
608          check_edge_flag = false;
609          break;
610       case _3DPRIM_TRILIST:
611       case _3DPRIM_TRIFAN:
612       case _3DPRIM_TRISTRIP:
613       case _3DPRIM_RECTLIST:
614          num_verts = 3;
615          check_edge_flag = false;
616          break;
617       case _3DPRIM_QUADLIST:
618       case _3DPRIM_QUADSTRIP:
619       case _3DPRIM_POLYGON:
620          num_verts = 3;
621          check_edge_flag = true;
622          break;
623       default:
624          unreachable("Unexpected primitive type in Gen6 SOL program.");
625       }
626       gfx6_sol_program(&c, key, num_verts, check_edge_flag);
627    } else {
628       /* On Gen4-5, we use the GS to decompose certain types of primitives.
629        * Note that primitives which don't require a GS program have already
630        * been weeded out by now.
631        */
632       switch (key->primitive) {
633       case _3DPRIM_QUADLIST:
634          elk_ff_gs_quads( &c, key );
635          break;
636       case _3DPRIM_QUADSTRIP:
637          elk_ff_gs_quad_strip( &c, key );
638          break;
639       case _3DPRIM_LINELOOP:
640          elk_ff_gs_lines( &c );
641          break;
642       default:
643          return NULL;
644       }
645    }
646 
647    elk_compact_instructions(&c.func, 0, NULL);
648 
649    /* get the program
650     */
651    program = elk_get_program(&c.func, final_assembly_size);
652 
653    if (INTEL_DEBUG(DEBUG_GS)) {
654       fprintf(stderr, "gs:\n");
655       elk_disassemble_with_labels(&compiler->isa, c.func.store,
656                                   0, *final_assembly_size, stderr);
657       fprintf(stderr, "\n");
658     }
659 
660    return program;
661 }
662 
663