xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/brw_ir_performance.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2020 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_cfg.h"
27 
28 using namespace brw;
29 
30 namespace {
31    /**
32     * Enumeration representing the various asynchronous units that can run
33     * computations in parallel on behalf of a shader thread.
34     */
35    enum intel_eu_unit {
36       /** EU front-end. */
37       EU_UNIT_FE,
38       /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
39       EU_UNIT_FPU,
40       /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
41       EU_UNIT_EM,
42       /** Sampler shared function. */
43       EU_UNIT_SAMPLER,
44       /** Pixel Interpolator shared function. */
45       EU_UNIT_PI,
46       /** Unified Return Buffer shared function. */
47       EU_UNIT_URB,
48       /** Data Port Data Cache shared function. */
49       EU_UNIT_DP_DC,
50       /** Data Port Render Cache shared function. */
51       EU_UNIT_DP_RC,
52       /** Data Port Constant Cache shared function. */
53       EU_UNIT_DP_CC,
54       /** Message Gateway shared function. */
55       EU_UNIT_GATEWAY,
56       /** Thread Spawner shared function. */
57       EU_UNIT_SPAWNER,
58       /* EU_UNIT_VME, */
59       /* EU_UNIT_CRE, */
60       /** Number of asynchronous units currently tracked. */
61       EU_NUM_UNITS,
62       /** Dummy unit for instructions that don't consume runtime from the above. */
63       EU_UNIT_NULL = EU_NUM_UNITS
64    };
65 
66    /**
67     * Enumeration representing a computation result another computation can
68     * potentially depend on.
69     */
70    enum intel_eu_dependency_id {
71       /* Register part of the GRF. */
72       EU_DEPENDENCY_ID_GRF0 = 0,
73       /* Address register part of the ARF. */
74       EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_GRF0 + XE2_MAX_GRF,
75       /* Accumulator register part of the ARF. */
76       EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
77       /* Flag register part of the ARF. */
78       EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
79       /* SBID token write completion.  Only used on Gfx12+. */
80       EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
81       /* SBID token read completion.  Only used on Gfx12+. */
82       EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
83       /* Number of computation dependencies currently tracked. */
84       EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
85    };
86 
87    /**
88     * State of our modeling of the program execution.
89     */
90    struct state {
state__anona6b15da60111::state91       state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
92       /**
93        * Time at which a given unit will be ready to execute the next
94        * computation, in clock units.
95        */
96       unsigned unit_ready[EU_NUM_UNITS];
97       /**
98        * Time at which an instruction dependent on a given dependency ID will
99        * be ready to execute, in clock units.
100        */
101       unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
102       /**
103        * Aggregated utilization of a given unit excluding idle cycles,
104        * in clock units.
105        */
106       float unit_busy[EU_NUM_UNITS];
107       /**
108        * Factor of the overhead of a computation accounted for in the
109        * aggregated utilization calculation.
110        */
111       float weight;
112    };
113 
114    /**
115     * Information derived from an IR instruction used to compute performance
116     * estimates.  Allows the timing calculation to work on both FS and VEC4
117     * instructions.
118     */
119    struct instruction_info {
instruction_info__anona6b15da60111::instruction_info120       instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
121          isa(isa), devinfo(isa->devinfo), op(inst->opcode),
122          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
123          tx(get_exec_type(inst)), sx(0), ss(0),
124          sc(has_bank_conflict(isa, inst) ? sd : 0),
125          desc(inst->desc), sfid(inst->sfid)
126       {
127          /* We typically want the maximum source size, except for split send
128           * messages which require the total size.
129           */
130          if (inst->opcode == SHADER_OPCODE_SEND) {
131             ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
132                  DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
133          } else {
134             for (unsigned i = 0; i < inst->sources; i++)
135                ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
136          }
137 
138          /* Convert the execution size to GRF units. */
139          sx = DIV_ROUND_UP(inst->exec_size * brw_type_size_bytes(tx), REG_SIZE);
140 
141          /* 32x32 integer multiplication has half the usual ALU throughput.
142           * Treat it as double-precision.
143           */
144          if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
145              !brw_type_is_float(tx) && brw_type_size_bytes(tx) == 4 &&
146              brw_type_size_bytes(inst->src[0].type) == brw_type_size_bytes(inst->src[1].type))
147             tx = brw_int_type(8, tx == BRW_TYPE_D);
148 
149          rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0;
150       }
151 
152       /** ISA encoding information */
153       const struct brw_isa_info *isa;
154       /** Device information. */
155       const struct intel_device_info *devinfo;
156       /** Instruction opcode. */
157       opcode op;
158       /** Destination type. */
159       brw_reg_type td;
160       /** Destination size in GRF units. */
161       unsigned sd;
162       /** Execution type. */
163       brw_reg_type tx;
164       /** Execution size in GRF units. */
165       unsigned sx;
166       /** Source size. */
167       unsigned ss;
168       /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
169       unsigned sc;
170       /** Send message descriptor. */
171       uint32_t desc;
172       /** Send message shared function ID. */
173       uint8_t sfid;
174       /** Repeat count for DPAS instructions. */
175       uint8_t rcount;
176    };
177 
178    /**
179     * Timing information of an instruction used to estimate the performance of
180     * the program.
181     */
182    struct perf_desc {
perf_desc__anona6b15da60111::perf_desc183       perf_desc(enum intel_eu_unit u, int df, int db,
184                 int ls, int ld, int la, int lf) :
185          u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
186 
187       /**
188        * Back-end unit its runtime shall be accounted to, in addition to the
189        * EU front-end which is always assumed to be involved.
190        */
191       enum intel_eu_unit u;
192       /**
193        * Overhead cycles from the time that the EU front-end starts executing
194        * the instruction until it's ready to execute the next instruction.
195        */
196       int df;
197       /**
198        * Overhead cycles from the time that the back-end starts executing the
199        * instruction until it's ready to execute the next instruction.
200        */
201       int db;
202       /**
203        * Latency cycles from the time that the back-end starts executing the
204        * instruction until its sources have been read from the register file.
205        */
206       int ls;
207       /**
208        * Latency cycles from the time that the back-end starts executing the
209        * instruction until its regular destination has been written to the
210        * register file.
211        */
212       int ld;
213       /**
214        * Latency cycles from the time that the back-end starts executing the
215        * instruction until its accumulator destination has been written to the
216        * ARF file.
217        *
218        * Note that this is an approximation of the real behavior of
219        * accumulating instructions in the hardware: Instead of modeling a pair
220        * of back-to-back accumulating instructions as a first computation with
221        * latency equal to ld followed by another computation with a
222        * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
223        * model the stall as if it occurred at the top of the pipeline, with
224        * the latency of the accumulator computation offset accordingly.
225        */
226       int la;
227       /**
228        * Latency cycles from the time that the back-end starts executing the
229        * instruction until its flag destination has been written to the ARF
230        * file.
231        */
232       int lf;
233    };
234 
235    /**
236     * Compute the timing information of an instruction based on any relevant
237     * information from the IR and a number of parameters specifying a linear
238     * approximation: Parameter X_Y specifies the derivative of timing X
239     * relative to info field Y, while X_1 specifies the independent term of
240     * the approximation of timing X.
241     */
242    perf_desc
calculate_desc(const instruction_info & info,enum intel_eu_unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)243    calculate_desc(const instruction_info &info, enum intel_eu_unit u,
244                   int df_1, int df_sd, int df_sc,
245                   int db_1, int db_sx,
246                   int ls_1, int ld_1, int la_1, int lf_1,
247                   int l_ss, int l_sd)
248    {
249       return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
250                           db_1 + db_sx * int(info.sx),
251                           ls_1 + l_ss * int(info.ss),
252                           ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
253                           la_1, lf_1);
254    }
255 
256    /**
257     * Compute the timing information of an instruction based on any relevant
258     * information from the IR and a number of linear approximation parameters
259     * hard-coded for each IR instruction.
260     *
261     * Most timing parameters are obtained from the multivariate linear
262     * regression of a sample of empirical timings measured using the tm0
263     * register (as can be done today by using the shader_time debugging
264     * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
265     * "Shared Functions - Extended Math", Section 3.2 "Performance".
266     * Parameters marked XXX shall be considered low-quality, they're possibly
267     * high variance or completely guessed in cases where experimental data was
268     * unavailable.
269     */
270    const perf_desc
instruction_desc(const instruction_info & info)271    instruction_desc(const instruction_info &info)
272    {
273       const struct intel_device_info *devinfo = info.devinfo;
274 
275       switch (info.op) {
276       case BRW_OPCODE_SYNC:
277       case BRW_OPCODE_SEL:
278       case BRW_OPCODE_NOT:
279       case BRW_OPCODE_AND:
280       case BRW_OPCODE_OR:
281       case BRW_OPCODE_XOR:
282       case BRW_OPCODE_SHR:
283       case BRW_OPCODE_SHL:
284       case BRW_OPCODE_ASR:
285       case BRW_OPCODE_CMPN:
286       case BRW_OPCODE_BFREV:
287       case BRW_OPCODE_BFI1:
288       case BRW_OPCODE_AVG:
289       case BRW_OPCODE_FRC:
290       case BRW_OPCODE_RNDU:
291       case BRW_OPCODE_RNDD:
292       case BRW_OPCODE_RNDE:
293       case BRW_OPCODE_RNDZ:
294       case BRW_OPCODE_MAC:
295       case BRW_OPCODE_MACH:
296       case BRW_OPCODE_LZD:
297       case BRW_OPCODE_FBH:
298       case BRW_OPCODE_FBL:
299       case BRW_OPCODE_CBIT:
300       case BRW_OPCODE_ADDC:
301       case BRW_OPCODE_ROR:
302       case BRW_OPCODE_ROL:
303       case BRW_OPCODE_SUBB:
304       case BRW_OPCODE_LINE:
305       case BRW_OPCODE_NOP:
306       case SHADER_OPCODE_CLUSTER_BROADCAST:
307       case SHADER_OPCODE_SCRATCH_HEADER:
308       case FS_OPCODE_DDX_COARSE:
309       case FS_OPCODE_DDX_FINE:
310       case FS_OPCODE_DDY_COARSE:
311       case FS_OPCODE_PIXEL_X:
312       case FS_OPCODE_PIXEL_Y:
313          if (devinfo->ver >= 11) {
314             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
315                                   0, 10, 6 /* XXX */, 14, 0, 0);
316          } else {
317             if (brw_type_size_bytes(info.tx) > 4)
318                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
319                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
320             else
321                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
322                                      0, 8, 4, 12, 0, 0);
323          }
324 
325       case BRW_OPCODE_MOV:
326       case BRW_OPCODE_CMP:
327       case BRW_OPCODE_ADD:
328       case BRW_OPCODE_ADD3:
329       case BRW_OPCODE_MUL:
330       case SHADER_OPCODE_MOV_RELOC_IMM:
331          if (devinfo->ver >= 11) {
332             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
333                                   0, 10, 6, 14, 0, 0);
334          } else {
335             if (brw_type_size_bytes(info.tx) > 4)
336                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
337                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
338             else
339                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
340                                      0, 8, 4, 12, 0, 0);
341          }
342 
343       case BRW_OPCODE_BFE:
344       case BRW_OPCODE_BFI2:
345       case BRW_OPCODE_CSEL:
346          if (devinfo->ver >= 11)
347             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
348                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
349          else
350             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
351                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
352 
353       case BRW_OPCODE_MAD:
354          if (devinfo->ver >= 11) {
355             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
356                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
357          } else {
358             if (brw_type_size_bytes(info.tx) > 4)
359                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
360                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
361             else
362                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
363                                      0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
364          }
365 
366       case BRW_OPCODE_DP4:
367       case BRW_OPCODE_DPH:
368       case BRW_OPCODE_DP3:
369       case BRW_OPCODE_DP2:
370          return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
371                                0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
372 
373       case BRW_OPCODE_DP4A:
374          if (devinfo->ver >= 12)
375             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
376                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
377          else
378             abort();
379 
380       case BRW_OPCODE_DPAS: {
381          unsigned ld;
382 
383          switch (info.rcount) {
384          case 1:
385             ld = 21;
386             break;
387          case 2:
388             ld = 22;
389             break;
390          case 8:
391          default:
392             ld = 32;
393             break;
394          }
395 
396          /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX
397           * for la and lf.
398           */
399          if (devinfo->verx10 >= 125)
400             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
401                                   0, ld, UINT_MAX, UINT_MAX, 0, 0);
402          else
403             abort();
404       }
405 
406       case SHADER_OPCODE_RCP:
407       case SHADER_OPCODE_RSQ:
408       case SHADER_OPCODE_SQRT:
409       case SHADER_OPCODE_EXP2:
410       case SHADER_OPCODE_LOG2:
411       case SHADER_OPCODE_SIN:
412       case SHADER_OPCODE_COS:
413          return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
414                                0, 16, 0, 0, 0, 0);
415 
416       case SHADER_OPCODE_POW:
417          return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
418                                0, 24, 0, 0, 0, 0);
419 
420       case SHADER_OPCODE_INT_QUOTIENT:
421       case SHADER_OPCODE_INT_REMAINDER:
422          return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
423                                0, 28 /* XXX */, 0, 0, 0, 0);
424 
425       case BRW_OPCODE_DO:
426          return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
427                                0, 0, 0, 0, 0, 0);
428 
429       case BRW_OPCODE_IF:
430       case BRW_OPCODE_ELSE:
431       case BRW_OPCODE_ENDIF:
432       case BRW_OPCODE_WHILE:
433       case BRW_OPCODE_BREAK:
434       case BRW_OPCODE_CONTINUE:
435       case BRW_OPCODE_HALT:
436          return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
437                                0, 0, 0, 0, 0, 0);
438 
439       case BRW_OPCODE_PLN:
440          return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
441                                0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
442 
443       case BRW_OPCODE_LRP:
444          return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
445                                0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
446 
447       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
448          if (devinfo->ver >= 11)
449             return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
450                                   0, 10 /* XXX */, 6 /* XXX */,
451                                   14 /* XXX */, 0, 0);
452          else
453             return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
454                                   0, 8 /* XXX */, 4 /* XXX */,
455                                   12 /* XXX */, 0, 0);
456 
457       case SHADER_OPCODE_READ_ARCH_REG:
458          if (devinfo->ver >= 12) {
459             return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
460                                   0, 10, 6 /* XXX */, 14, 0, 0);
461          } else {
462             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
463                                   0, 8, 4, 12, 0, 0);
464          }
465 
466       case SHADER_OPCODE_MOV_INDIRECT:
467          if (devinfo->ver >= 11)
468             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
469                                   0, 10 /* XXX */, 6 /* XXX */,
470                                   14 /* XXX */, 0, 0);
471          else
472             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
473                                   0, 8 /* XXX */, 4 /* XXX */,
474                                   12 /* XXX */, 0, 0);
475 
476       case SHADER_OPCODE_BROADCAST:
477          if (devinfo->ver >= 11)
478             return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
479                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
480          else
481             return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
482                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
483 
484       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
485       case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
486       case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
487          if (devinfo->ver >= 11)
488             return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
489                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
490          else
491             return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
492                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
493 
494       case SHADER_OPCODE_RND_MODE:
495       case SHADER_OPCODE_FLOAT_CONTROL_MODE:
496          if (devinfo->ver >= 11)
497             return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
498                                   4 /* XXX */, 0,
499                                   0, 0, 0, 0, 0, 0);
500          else
501             return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
502                                   4 /* XXX */, 0,
503                                   0, 0, 0, 0, 0, 0);
504 
505       case SHADER_OPCODE_SHUFFLE:
506          if (devinfo->ver >= 11)
507             return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
508                                   44 /* XXX */, 0,
509                                   0, 10 /* XXX */, 6 /* XXX */,
510                                   14 /* XXX */, 0, 0);
511          else
512             return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
513                                   42 /* XXX */, 0,
514                                   0, 8 /* XXX */, 4 /* XXX */,
515                                   12 /* XXX */, 0, 0);
516 
517       case SHADER_OPCODE_SEL_EXEC:
518          if (devinfo->ver >= 11)
519             return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
520                                   0, 4 /* XXX */,
521                                   0, 10 /* XXX */, 6 /* XXX */,
522                                   14 /* XXX */, 0, 0);
523          else
524             return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
525                                   0, 4 /* XXX */,
526                                   0, 8 /* XXX */, 4 /* XXX */,
527                                   12 /* XXX */, 0, 0);
528 
529       case SHADER_OPCODE_QUAD_SWIZZLE:
530          if (devinfo->ver >= 11)
531             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
532                                   0, 8 /* XXX */,
533                                   0, 10 /* XXX */, 6 /* XXX */,
534                                   14 /* XXX */, 0, 0);
535          else
536             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
537                                   0, 8 /* XXX */,
538                                   0, 8 /* XXX */, 4 /* XXX */,
539                                   12 /* XXX */, 0, 0);
540 
541       case FS_OPCODE_DDY_FINE:
542          if (devinfo->ver >= 11)
543             return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
544                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
545          else
546             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
547                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
548 
549       case FS_OPCODE_LOAD_LIVE_CHANNELS:
550          if (devinfo->ver >= 11)
551             return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
552                                   2 /* XXX */, 0,
553                                   0, 0, 0, 10 /* XXX */, 0, 0);
554          else
555             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
556                                   0, 2 /* XXX */,
557                                   0, 0, 0, 8 /* XXX */, 0, 0);
558 
559       case SHADER_OPCODE_GET_BUFFER_SIZE:
560          return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
561                                8 /* XXX */, 750 /* XXX */, 0, 0,
562                                2 /* XXX */, 0);
563 
564       case SHADER_OPCODE_MEMORY_FENCE:
565       case SHADER_OPCODE_INTERLOCK:
566          switch (info.sfid) {
567          case GFX6_SFID_DATAPORT_RENDER_CACHE:
568             return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
569                                   10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
570 
571          case BRW_SFID_URB:
572          case GFX7_SFID_DATAPORT_DATA_CACHE:
573          case GFX12_SFID_SLM:
574          case GFX12_SFID_TGM:
575          case GFX12_SFID_UGM:
576          case HSW_SFID_DATAPORT_DATA_CACHE_1:
577             return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
578                                   10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
579 
580          default:
581             abort();
582          }
583 
584       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
585          return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
586                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
587 
588       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
589       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
590       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
591          return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
592                                0, 90 /* XXX */, 0, 0, 0, 0);
593 
594       case SHADER_OPCODE_BARRIER:
595          return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
596                                0 /* XXX */, 0,
597                                0, 0, 0, 0, 0, 0);
598 
599       case SHADER_OPCODE_SEND:
600          switch (info.sfid) {
601          case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
602             /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
603             return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
604                                   10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
605          case GFX6_SFID_DATAPORT_RENDER_CACHE:
606             switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
607             case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
608                return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
609                                      30 /* XXX */, 450 /* XXX */,
610                                      10 /* XXX */, 100 /* XXX */,
611                                      0, 0, 0, 400 /* XXX */);
612             default:
613                return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
614                                      0, 450 /* XXX */,
615                                      10 /* XXX */, 300 /* XXX */, 0, 0,
616                                      0, 0);
617             }
618          case BRW_SFID_SAMPLER: {
619             return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
620                                   8, 750, 0, 0, 2, 0);
621          }
622          case GFX7_SFID_DATAPORT_DATA_CACHE:
623          case HSW_SFID_DATAPORT_DATA_CACHE_1:
624             switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
625             case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
626             case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
627             case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
628             case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
629                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
630                                      30 /* XXX */, 400 /* XXX */,
631                                      10 /* XXX */, 100 /* XXX */, 0, 0,
632                                      0, 400 /* XXX */);
633 
634             default:
635                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
636                                      0, 20 /* XXX */,
637                                      10 /* XXX */, 100 /* XXX */, 0, 0,
638                                      0, 0);
639             }
640 
641          case GFX7_SFID_PIXEL_INTERPOLATOR:
642             return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
643                                   0, 90 /* XXX */, 0, 0, 0, 0);
644 
645          case GFX12_SFID_UGM:
646          case GFX12_SFID_TGM:
647          case GFX12_SFID_SLM:
648             switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
649             case LSC_OP_LOAD:
650             case LSC_OP_STORE:
651             case LSC_OP_LOAD_CMASK:
652             case LSC_OP_STORE_CMASK:
653                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
654                                      0, 20 /* XXX */,
655                                      10 /* XXX */, 100 /* XXX */, 0, 0,
656                                      0, 0);
657 
658             case LSC_OP_FENCE:
659             case LSC_OP_ATOMIC_INC:
660             case LSC_OP_ATOMIC_DEC:
661             case LSC_OP_ATOMIC_LOAD:
662             case LSC_OP_ATOMIC_STORE:
663             case LSC_OP_ATOMIC_ADD:
664             case LSC_OP_ATOMIC_SUB:
665             case LSC_OP_ATOMIC_MIN:
666             case LSC_OP_ATOMIC_MAX:
667             case LSC_OP_ATOMIC_UMIN:
668             case LSC_OP_ATOMIC_UMAX:
669             case LSC_OP_ATOMIC_CMPXCHG:
670             case LSC_OP_ATOMIC_FADD:
671             case LSC_OP_ATOMIC_FSUB:
672             case LSC_OP_ATOMIC_FMIN:
673             case LSC_OP_ATOMIC_FMAX:
674             case LSC_OP_ATOMIC_FCMPXCHG:
675             case LSC_OP_ATOMIC_AND:
676             case LSC_OP_ATOMIC_OR:
677             case LSC_OP_ATOMIC_XOR:
678                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
679                                      30 /* XXX */, 400 /* XXX */,
680                                      10 /* XXX */, 100 /* XXX */, 0, 0,
681                                      0, 400 /* XXX */);
682             default:
683                abort();
684             }
685 
686          case BRW_SFID_MESSAGE_GATEWAY:
687          case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: /* or THREAD_SPAWNER */
688          case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
689             return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
690                                   10 /* XXX */, 0, 0, 0, 0, 0);
691 
692          case BRW_SFID_URB:
693             return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
694                                   32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
695 
696          default:
697             abort();
698          }
699 
700       case SHADER_OPCODE_UNDEF:
701       case SHADER_OPCODE_HALT_TARGET:
702       case FS_OPCODE_SCHEDULING_FENCE:
703          return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
704                                0, 0, 0, 0, 0, 0);
705 
706       default:
707          abort();
708       }
709    }
710 
711    /**
712     * Model the performance behavior of a stall on the specified dependency
713     * ID.
714     */
715    void
stall_on_dependency(state & st,enum intel_eu_dependency_id id)716    stall_on_dependency(state &st, enum intel_eu_dependency_id id)
717    {
718       if (id < ARRAY_SIZE(st.dep_ready))
719          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
720                                        st.dep_ready[id]);
721    }
722 
723    /**
724     * Model the performance behavior of the front-end and back-end while
725     * executing an instruction with the specified timing information, assuming
726     * all dependencies are already clear.
727     */
728    void
execute_instruction(state & st,const perf_desc & perf)729    execute_instruction(state &st, const perf_desc &perf)
730    {
731       /* Compute the time at which the front-end will be ready to execute the
732        * next instruction.
733        */
734       st.unit_ready[EU_UNIT_FE] += perf.df;
735 
736       if (perf.u < EU_NUM_UNITS) {
737          /* Wait for the back-end to be ready to execute this instruction. */
738          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
739                                        st.unit_ready[perf.u]);
740 
741          /* Compute the time at which the back-end will be ready to execute
742           * the next instruction, and update the back-end utilization.
743           */
744          st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
745          st.unit_busy[perf.u] += perf.db * st.weight;
746       }
747    }
748 
749    /**
750     * Model the performance behavior of a read dependency provided by an
751     * instruction.
752     */
753    void
mark_read_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)754    mark_read_dependency(state &st, const perf_desc &perf,
755                         enum intel_eu_dependency_id id)
756    {
757       if (id < ARRAY_SIZE(st.dep_ready))
758          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
759    }
760 
761    /**
762     * Model the performance behavior of a write dependency provided by an
763     * instruction.
764     */
765    void
mark_write_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)766    mark_write_dependency(state &st, const perf_desc &perf,
767                          enum intel_eu_dependency_id id)
768    {
769       if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
770          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
771       else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
772          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
773       else if (id < ARRAY_SIZE(st.dep_ready))
774          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
775    }
776 
777    /**
778     * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
779     */
780    enum intel_eu_dependency_id
reg_dependency_id(const intel_device_info * devinfo,const brw_reg & r,const int delta)781    reg_dependency_id(const intel_device_info *devinfo, const brw_reg &r,
782                      const int delta)
783    {
784       if (r.file == VGRF) {
785          const unsigned i = r.nr + r.offset / REG_SIZE + delta;
786          assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
787          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
788 
789       } else if (r.file == FIXED_GRF) {
790          const unsigned i = r.nr + delta;
791          assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
792          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
793 
794       } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
795                  r.nr < BRW_ARF_ACCUMULATOR) {
796          assert(delta == 0);
797          return EU_DEPENDENCY_ID_ADDR0;
798 
799       } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
800                  r.nr < BRW_ARF_FLAG) {
801          const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
802          assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
803          return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
804 
805       } else {
806          return EU_NUM_DEPENDENCY_IDS;
807       }
808    }
809 
810    /**
811     * Return the dependency ID of flag register starting at offset \p i.
812     */
813    enum intel_eu_dependency_id
flag_dependency_id(unsigned i)814    flag_dependency_id(unsigned i)
815    {
816       assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
817       return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
818    }
819 
820    /**
821     * Return the dependency ID corresponding to the SBID read completion
822     * condition of a Gfx12+ SWSB.
823     */
824    enum intel_eu_dependency_id
tgl_swsb_rd_dependency_id(tgl_swsb swsb)825    tgl_swsb_rd_dependency_id(tgl_swsb swsb)
826    {
827       if (swsb.mode) {
828          assert(swsb.sbid <
829                 EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
830          return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
831       } else {
832          return EU_NUM_DEPENDENCY_IDS;
833       }
834    }
835 
836    /**
837     * Return the dependency ID corresponding to the SBID write completion
838     * condition of a Gfx12+ SWSB.
839     */
840    enum intel_eu_dependency_id
tgl_swsb_wr_dependency_id(tgl_swsb swsb)841    tgl_swsb_wr_dependency_id(tgl_swsb swsb)
842    {
843       if (swsb.mode) {
844          assert(swsb.sbid <
845                 EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
846          return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
847       } else {
848          return EU_NUM_DEPENDENCY_IDS;
849       }
850    }
851 
852    /**
853     * Return the implicit accumulator register accessed by channel \p i of the
854     * instruction.
855     */
856    unsigned
accum_reg_of_channel(const intel_device_info * devinfo,const fs_inst * inst,brw_reg_type tx,unsigned i)857    accum_reg_of_channel(const intel_device_info *devinfo,
858                         const fs_inst *inst,
859                         brw_reg_type tx, unsigned i)
860    {
861       assert(inst->reads_accumulator_implicitly() ||
862              inst->writes_accumulator_implicitly(devinfo));
863       const unsigned offset = (inst->group + i) * brw_type_size_bytes(tx) *
864          (brw_type_is_float(tx) ? 1 : 2);
865       return offset / (reg_unit(devinfo) * REG_SIZE) % 2;
866    }
867 
868    /**
869     * Model the performance behavior of an FS back-end instruction.
870     */
871    void
issue_inst(state & st,const struct brw_isa_info * isa,const fs_inst * inst)872    issue_inst(state &st, const struct brw_isa_info *isa,
873               const fs_inst *inst)
874    {
875       const struct intel_device_info *devinfo = isa->devinfo;
876       const instruction_info info(isa, inst);
877       const perf_desc perf = instruction_desc(info);
878 
879       /* Stall on any source dependencies. */
880       for (unsigned i = 0; i < inst->sources; i++) {
881          for (unsigned j = 0; j < regs_read(inst, i); j++)
882             stall_on_dependency(
883                st, reg_dependency_id(devinfo, inst->src[i], j));
884       }
885 
886       if (inst->reads_accumulator_implicitly()) {
887          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
888               j <= accum_reg_of_channel(devinfo, inst, info.tx,
889                                         inst->exec_size - 1); j++)
890             stall_on_dependency(
891                st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
892       }
893 
894       if (const unsigned mask = inst->flags_read(devinfo)) {
895          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
896             if (mask & (1 << i))
897                stall_on_dependency(st, flag_dependency_id(i));
898          }
899       }
900 
901       /* Stall on any write dependencies. */
902       if (!inst->no_dd_check) {
903          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
904             for (unsigned j = 0; j < regs_written(inst); j++)
905                stall_on_dependency(
906                   st, reg_dependency_id(devinfo, inst->dst, j));
907          }
908 
909          if (inst->writes_accumulator_implicitly(devinfo)) {
910             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
911                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
912                                            inst->exec_size - 1); j++)
913                stall_on_dependency(
914                   st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
915          }
916 
917          if (const unsigned mask = inst->flags_written(devinfo)) {
918             for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
919                if (mask & (1 << i))
920                   stall_on_dependency(st, flag_dependency_id(i));
921             }
922          }
923       }
924 
925       /* Stall on any SBID dependencies. */
926       if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
927          stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
928       else if (inst->sched.mode & TGL_SBID_SRC)
929          stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
930 
931       /* Execute the instruction. */
932       execute_instruction(st, perf);
933 
934       /* Mark any source dependencies. */
935       if (inst->is_send_from_grf()) {
936          for (unsigned i = 0; i < inst->sources; i++) {
937             if (inst->is_payload(i)) {
938                for (unsigned j = 0; j < regs_read(inst, i); j++)
939                   mark_read_dependency(
940                      st, perf, reg_dependency_id(devinfo, inst->src[i], j));
941             }
942          }
943       }
944 
945       /* Mark any destination dependencies. */
946       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
947          for (unsigned j = 0; j < regs_written(inst); j++) {
948             mark_write_dependency(st, perf,
949                                   reg_dependency_id(devinfo, inst->dst, j));
950          }
951       }
952 
953       if (inst->writes_accumulator_implicitly(devinfo)) {
954          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
955               j <= accum_reg_of_channel(devinfo, inst, info.tx,
956                                         inst->exec_size - 1); j++)
957             mark_write_dependency(st, perf,
958                                   reg_dependency_id(devinfo, brw_acc_reg(8), j));
959       }
960 
961       if (const unsigned mask = inst->flags_written(devinfo)) {
962          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
963             if (mask & (1 << i))
964                mark_write_dependency(st, perf, flag_dependency_id(i));
965          }
966       }
967 
968       /* Mark any SBID dependencies. */
969       if (inst->sched.mode & TGL_SBID_SET) {
970          mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
971          mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
972       }
973    }
974 
975    /**
976     * Calculate the maximum possible throughput of the program compatible with
977     * the cycle-count utilization estimated for each asynchronous unit, in
978     * threads-per-cycle units.
979     */
980    float
calculate_thread_throughput(const state & st,float busy)981    calculate_thread_throughput(const state &st, float busy)
982    {
983       for (unsigned i = 0; i < EU_NUM_UNITS; i++)
984          busy = MAX2(busy, st.unit_busy[i]);
985 
986       return 1.0 / busy;
987    }
988 
989    /**
990     * Estimate the performance of the specified shader.
991     */
992    void
calculate_performance(performance & p,const fs_visitor * s,unsigned dispatch_width)993    calculate_performance(performance &p, const fs_visitor *s,
994                          unsigned dispatch_width)
995    {
996       /* XXX - Note that the previous version of this code used worst-case
997        *       scenario estimation of branching divergence for SIMD32 shaders,
998        *       but this heuristic was removed to improve performance in common
999        *       scenarios. Wider shader variants are less optimal when divergence
1000        *       is high, e.g. when application renders complex scene on a small
1001        *       surface. It is assumed that such renders are short, so their
1002        *       time doesn't matter and when it comes to the overall performance,
1003        *       they are dominated by more optimal larger renders.
1004        *
1005        *       It's possible that we could do better with divergence analysis
1006        *       by isolating branches which are 100% uniform.
1007        *
1008        *       Plumbing the trip counts from NIR loop analysis would allow us
1009        *       to do a better job regarding the loop weights.
1010        *
1011        *       In the meantime use values that roughly match the control flow
1012        *       weights used elsewhere in the compiler back-end.
1013        *
1014        *       Note that we provide slightly more pessimistic weights on
1015        *       Gfx12+ for SIMD32, since the effective warp size on that
1016        *       platform is 2x the SIMD width due to EU fusion, which increases
1017        *       the likelihood of divergent control flow in comparison to
1018        *       previous generations, giving narrower SIMD modes a performance
1019        *       advantage in several test-cases with non-uniform discard jumps.
1020        */
1021       const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
1022                                     1.0 : 0.5);
1023       const float loop_weight = 10;
1024       unsigned halt_count = 0;
1025       unsigned elapsed = 0;
1026       state st;
1027 
1028       foreach_block(block, s->cfg) {
1029          const unsigned elapsed0 = elapsed;
1030 
1031          foreach_inst_in_block(fs_inst, inst, block) {
1032             const unsigned clock0 = st.unit_ready[EU_UNIT_FE];
1033 
1034             issue_inst(st, &s->compiler->isa, inst);
1035 
1036             if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
1037                st.weight /= discard_weight;
1038 
1039             elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;
1040 
1041             if (inst->opcode == BRW_OPCODE_DO)
1042                st.weight *= loop_weight;
1043             else if (inst->opcode == BRW_OPCODE_WHILE)
1044                st.weight /= loop_weight;
1045             else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
1046                st.weight *= discard_weight;
1047          }
1048 
1049          p.block_latency[block->num] = elapsed - elapsed0;
1050       }
1051 
1052       p.latency = elapsed;
1053       p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1054    }
1055 }
1056 
performance(const fs_visitor * v)1057 brw::performance::performance(const fs_visitor *v) :
1058    block_latency(new unsigned[v->cfg->num_blocks])
1059 {
1060    calculate_performance(*this, v, v->dispatch_width);
1061 }
1062 
~performance()1063 brw::performance::~performance()
1064 {
1065    delete[] block_latency;
1066 }
1067