/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * Enumeration representing the various asynchronous units that can run
    * computations in parallel on behalf of a shader thread.
    */
   enum intel_eu_unit {
      /** EU front-end. */
      EU_UNIT_FE,
      /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
      EU_UNIT_FPU,
      /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
      EU_UNIT_EM,
      /** Sampler shared function. */
      EU_UNIT_SAMPLER,
      /** Pixel Interpolator shared function. */
      EU_UNIT_PI,
      /** Unified Return Buffer shared function. */
      EU_UNIT_URB,
      /** Data Port Data Cache shared function. */
      EU_UNIT_DP_DC,
      /** Data Port Render Cache shared function. */
      EU_UNIT_DP_RC,
      /** Data Port Constant Cache shared function. */
      EU_UNIT_DP_CC,
      /** Message Gateway shared function. */
      EU_UNIT_GATEWAY,
      /** Thread Spawner shared function. */
      EU_UNIT_SPAWNER,
      /* EU_UNIT_VME, */
      /* EU_UNIT_CRE, */
      /** Number of asynchronous units currently tracked. */
      EU_NUM_UNITS,
      /** Dummy unit for instructions that don't consume runtime from the above. */
      EU_UNIT_NULL = EU_NUM_UNITS
   };
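
   /* Note that EU_UNIT_NULL aliases EU_NUM_UNITS, so the simple
    * "perf.u < EU_NUM_UNITS" bounds check done in execute_instruction()
    * below is enough to skip back-end bookkeeping for instructions that
    * only occupy the front-end.
    */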

   /**
    * Enumeration representing a computation result another computation can
    * potentially depend on.
    */
   enum intel_eu_dependency_id {
      /* Register part of the GRF. */
      EU_DEPENDENCY_ID_GRF0 = 0,
      /* Address register part of the ARF. */
      EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_GRF0 + XE2_MAX_GRF,
      /* Accumulator register part of the ARF. */
      EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
      /* Flag register part of the ARF. */
      EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
      /* SBID token write completion. Only used on Gfx12+. */
      EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
      /* SBID token read completion. Only used on Gfx12+. */
      EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
      /* Number of computation dependencies currently tracked. */
      EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
   };
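
   /* For reference, the dependency ID space laid out by the offsets in the
    * enum above is:
    *
    *   [GRF0, GRF0 + XE2_MAX_GRF)  - one ID per GRF register
    *   ADDR0                       - the single address register
    *   [ACCUM0, ACCUM0 + 12)       - accumulator registers
    *   [FLAG0, FLAG0 + 8)          - flag subregisters
    *   [SBID_WR0, SBID_WR0 + 32)   - Gfx12+ SBID write-completion tokens
    *   [SBID_RD0, SBID_RD0 + 32)   - Gfx12+ SBID read-completion tokens
    */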

   /**
    * State of our modeling of the program execution.
    */
   struct state {
      state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
      /**
       * Time at which a given unit will be ready to execute the next
       * computation, in clock units.
       */
      unsigned unit_ready[EU_NUM_UNITS];
      /**
       * Time at which an instruction dependent on a given dependency ID will
       * be ready to execute, in clock units.
       */
      unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
      /**
       * Aggregated utilization of a given unit excluding idle cycles,
       * in clock units.
       */
      float unit_busy[EU_NUM_UNITS];
      /**
       * Factor of the overhead of a computation accounted for in the
       * aggregated utilization calculation.
       */
      float weight;
   };

   /**
    * Information derived from an IR instruction used to compute performance
    * estimates. Allows the timing calculation to work on both FS and VEC4
    * instructions.
    */
   struct instruction_info {
      instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
         isa(isa), devinfo(isa->devinfo), op(inst->opcode),
         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
         tx(get_exec_type(inst)), sx(0), ss(0),
         sc(has_bank_conflict(isa, inst) ? sd : 0),
         desc(inst->desc), sfid(inst->sfid)
      {
         /* We typically want the maximum source size, except for split send
          * messages which require the total size.
          */
         if (inst->opcode == SHADER_OPCODE_SEND) {
            ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
                 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
         } else {
            for (unsigned i = 0; i < inst->sources; i++)
               ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
         }

         /* Convert the execution size to GRF units. */
         sx = DIV_ROUND_UP(inst->exec_size * brw_type_size_bytes(tx), REG_SIZE);

         /* 32x32 integer multiplication has half the usual ALU throughput.
          * Treat it as double-precision.
          */
         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
             !brw_type_is_float(tx) && brw_type_size_bytes(tx) == 4 &&
             brw_type_size_bytes(inst->src[0].type) == brw_type_size_bytes(inst->src[1].type))
            tx = brw_int_type(8, tx == BRW_TYPE_D);

         rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0;
      }
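
      /* As a worked example of the conversions above, assuming a 32B
       * REG_SIZE: a SIMD16 ADD on 32-bit floats covers 16 * 4 = 64B of
       * execution, i.e. sx = 2 GRF units, and a SIMD8 source of 64-bit
       * data likewise gives ss = DIV_ROUND_UP(8 * 8, 32) = 2.
       */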

      /** ISA encoding information */
      const struct brw_isa_info *isa;
      /** Device information. */
      const struct intel_device_info *devinfo;
      /** Instruction opcode. */
      opcode op;
      /** Destination type. */
      brw_reg_type td;
      /** Destination size in GRF units. */
      unsigned sd;
      /** Execution type. */
      brw_reg_type tx;
      /** Execution size in GRF units. */
      unsigned sx;
      /** Source size. */
      unsigned ss;
      /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
      unsigned sc;
      /** Send message descriptor. */
      uint32_t desc;
      /** Send message shared function ID. */
      uint8_t sfid;
      /** Repeat count for DPAS instructions. */
      uint8_t rcount;
   };

   /**
    * Timing information of an instruction used to estimate the performance of
    * the program.
    */
   struct perf_desc {
      perf_desc(enum intel_eu_unit u, int df, int db,
                int ls, int ld, int la, int lf) :
         u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}

      /**
       * Back-end unit its runtime shall be accounted to, in addition to the
       * EU front-end which is always assumed to be involved.
       */
      enum intel_eu_unit u;
      /**
       * Overhead cycles from the time that the EU front-end starts executing
       * the instruction until it's ready to execute the next instruction.
       */
      int df;
      /**
       * Overhead cycles from the time that the back-end starts executing the
       * instruction until it's ready to execute the next instruction.
       */
      int db;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its sources have been read from the register file.
       */
      int ls;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its regular destination has been written to the
       * register file.
       */
      int ld;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its accumulator destination has been written to the
       * ARF file.
       *
       * Note that this is an approximation of the real behavior of
       * accumulating instructions in the hardware: Instead of modeling a pair
       * of back-to-back accumulating instructions as a first computation with
       * latency equal to ld followed by another computation with a
       * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
       * model the stall as if it occurred at the top of the pipeline, with
       * the latency of the accumulator computation offset accordingly.
       */
      int la;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its flag destination has been written to the ARF
       * file.
       */
      int lf;
   };
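
   /* Roughly speaking, the df/db overheads above determine the occupancy of
    * the front-end and back-end units (and hence the throughput of the
    * program), while the ls/ld/la/lf latencies determine how long a
    * dependent instruction will stall on this instruction's sources and
    * destinations (and hence the critical-path latency).
    */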

   /**
    * Compute the timing information of an instruction based on any relevant
    * information from the IR and a number of parameters specifying a linear
    * approximation: Parameter X_Y specifies the derivative of timing X
    * relative to info field Y, while X_1 specifies the independent term of
    * the approximation of timing X.
    */
   perf_desc
   calculate_desc(const instruction_info &info, enum intel_eu_unit u,
                  int df_1, int df_sd, int df_sc,
                  int db_1, int db_sx,
                  int ls_1, int ld_1, int la_1, int lf_1,
                  int l_ss, int l_sd)
   {
      return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
                       db_1 + db_sx * int(info.sx),
                       ls_1 + l_ss * int(info.ss),
                       ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
                       la_1, lf_1);
   }
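
   /* In other words, the linear approximation computed above is:
    *
    *   df = df_1 + df_sd * sd + df_sc * sc
    *   db = db_1 + db_sx * sx
    *   ls = ls_1 + l_ss * ss
    *   ld = ld_1 + l_ss * ss + l_sd * sd
    *   la = la_1
    *   lf = lf_1
    *
    * with sd, sc, sx and ss taken from the instruction_info fields of the
    * same name.
    */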

   /**
    * Compute the timing information of an instruction based on any relevant
    * information from the IR and a number of linear approximation parameters
    * hard-coded for each IR instruction.
    *
    * Most timing parameters are obtained from the multivariate linear
    * regression of a sample of empirical timings measured using the tm0
    * register (as can be done today by using the shader_time debugging
    * option). The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
    * "Shared Functions - Extended Math", Section 3.2 "Performance".
    * Parameters marked XXX shall be considered low-quality; they're possibly
    * high-variance or completely guessed in cases where experimental data was
    * unavailable.
    */
   const perf_desc
   instruction_desc(const instruction_info &info)
   {
      const struct intel_device_info *devinfo = info.devinfo;

      switch (info.op) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_SEL:
      case BRW_OPCODE_NOT:
      case BRW_OPCODE_AND:
      case BRW_OPCODE_OR:
      case BRW_OPCODE_XOR:
      case BRW_OPCODE_SHR:
      case BRW_OPCODE_SHL:
      case BRW_OPCODE_ASR:
      case BRW_OPCODE_CMPN:
      case BRW_OPCODE_BFREV:
      case BRW_OPCODE_BFI1:
      case BRW_OPCODE_AVG:
      case BRW_OPCODE_FRC:
      case BRW_OPCODE_RNDU:
      case BRW_OPCODE_RNDD:
      case BRW_OPCODE_RNDE:
      case BRW_OPCODE_RNDZ:
      case BRW_OPCODE_MAC:
      case BRW_OPCODE_MACH:
      case BRW_OPCODE_LZD:
      case BRW_OPCODE_FBH:
      case BRW_OPCODE_FBL:
      case BRW_OPCODE_CBIT:
      case BRW_OPCODE_ADDC:
      case BRW_OPCODE_ROR:
      case BRW_OPCODE_ROL:
      case BRW_OPCODE_SUBB:
      case BRW_OPCODE_LINE:
      case BRW_OPCODE_NOP:
      case SHADER_OPCODE_CLUSTER_BROADCAST:
      case SHADER_OPCODE_SCRATCH_HEADER:
      case FS_OPCODE_DDX_COARSE:
      case FS_OPCODE_DDX_FINE:
      case FS_OPCODE_DDY_COARSE:
      case FS_OPCODE_PIXEL_X:
      case FS_OPCODE_PIXEL_Y:
         if (devinfo->ver >= 11) {
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                  0, 10, 6 /* XXX */, 14, 0, 0);
         } else {
            if (brw_type_size_bytes(info.tx) > 4)
               return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
            else
               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                     0, 8, 4, 12, 0, 0);
         }

      case BRW_OPCODE_MOV:
      case BRW_OPCODE_CMP:
      case BRW_OPCODE_ADD:
      case BRW_OPCODE_ADD3:
      case BRW_OPCODE_MUL:
      case SHADER_OPCODE_MOV_RELOC_IMM:
         if (devinfo->ver >= 11) {
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                  0, 10, 6, 14, 0, 0);
         } else {
            if (brw_type_size_bytes(info.tx) > 4)
               return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
            else
               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                     0, 8, 4, 12, 0, 0);
         }

      case BRW_OPCODE_BFE:
      case BRW_OPCODE_BFI2:
      case BRW_OPCODE_CSEL:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

      case BRW_OPCODE_MAD:
         if (devinfo->ver >= 11) {
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         } else {
            if (brw_type_size_bytes(info.tx) > 4)
               return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
            else
               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                     0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
         }

      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH:
      case BRW_OPCODE_DP3:
      case BRW_OPCODE_DP2:
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                               0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);

      case BRW_OPCODE_DP4A:
         if (devinfo->ver >= 12)
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         else
            abort();

      case BRW_OPCODE_DPAS: {
         unsigned ld;

         switch (info.rcount) {
         case 1:
            ld = 21;
            break;
         case 2:
            ld = 22;
            break;
         case 8:
         default:
            ld = 32;
            break;
         }

         /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX
          * for la and lf.
          */
         if (devinfo->verx10 >= 125)
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                  0, ld, UINT_MAX, UINT_MAX, 0, 0);
         else
            abort();
      }

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
                               0, 16, 0, 0, 0, 0);

      case SHADER_OPCODE_POW:
         return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
                               0, 24, 0, 0, 0, 0);

      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
                               0, 28 /* XXX */, 0, 0, 0, 0);

      case BRW_OPCODE_DO:
         return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0);

      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_BREAK:
      case BRW_OPCODE_CONTINUE:
      case BRW_OPCODE_HALT:
         return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0);

      case BRW_OPCODE_PLN:
         return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                               0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);

      case BRW_OPCODE_LRP:
         return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
                               0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
                                  0, 10 /* XXX */, 6 /* XXX */,
                                  14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
                                  0, 8 /* XXX */, 4 /* XXX */,
                                  12 /* XXX */, 0, 0);

      case SHADER_OPCODE_READ_ARCH_REG:
         if (devinfo->ver >= 12) {
            return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
                                  0, 10, 6 /* XXX */, 14, 0, 0);
         } else {
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                  0, 8, 4, 12, 0, 0);
         }

      case SHADER_OPCODE_MOV_INDIRECT:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
                                  0, 10 /* XXX */, 6 /* XXX */,
                                  14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
                                  0, 8 /* XXX */, 4 /* XXX */,
                                  12 /* XXX */, 0, 0);

      case SHADER_OPCODE_BROADCAST:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
      case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

      case SHADER_OPCODE_RND_MODE:
      case SHADER_OPCODE_FLOAT_CONTROL_MODE:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
                                  4 /* XXX */, 0,
                                  0, 0, 0, 0, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
                                  4 /* XXX */, 0,
                                  0, 0, 0, 0, 0, 0);

      case SHADER_OPCODE_SHUFFLE:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
                                  44 /* XXX */, 0,
                                  0, 10 /* XXX */, 6 /* XXX */,
                                  14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
                                  42 /* XXX */, 0,
                                  0, 8 /* XXX */, 4 /* XXX */,
                                  12 /* XXX */, 0, 0);

      case SHADER_OPCODE_SEL_EXEC:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
                                  0, 4 /* XXX */,
                                  0, 10 /* XXX */, 6 /* XXX */,
                                  14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
                                  0, 4 /* XXX */,
                                  0, 8 /* XXX */, 4 /* XXX */,
                                  12 /* XXX */, 0, 0);

      case SHADER_OPCODE_QUAD_SWIZZLE:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
                                  0, 8 /* XXX */,
                                  0, 10 /* XXX */, 6 /* XXX */,
                                  14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
                                  0, 8 /* XXX */,
                                  0, 8 /* XXX */, 4 /* XXX */,
                                  12 /* XXX */, 0, 0);

      case FS_OPCODE_DDY_FINE:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

      case FS_OPCODE_LOAD_LIVE_CHANNELS:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
                                  2 /* XXX */, 0,
                                  0, 0, 0, 10 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
                                  0, 2 /* XXX */,
                                  0, 0, 0, 8 /* XXX */, 0, 0);

      case SHADER_OPCODE_GET_BUFFER_SIZE:
         return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
                               8 /* XXX */, 750 /* XXX */, 0, 0,
                               2 /* XXX */, 0);

      case SHADER_OPCODE_MEMORY_FENCE:
      case SHADER_OPCODE_INTERLOCK:
         switch (info.sfid) {
         case GFX6_SFID_DATAPORT_RENDER_CACHE:
            return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
                                  10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);

         case BRW_SFID_URB:
         case GFX7_SFID_DATAPORT_DATA_CACHE:
         case GFX12_SFID_SLM:
         case GFX12_SFID_TGM:
         case GFX12_SFID_UGM:
         case HSW_SFID_DATAPORT_DATA_CACHE_1:
            return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
                                  10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);

         default:
            abort();
         }

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
                               10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);

      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
                               0, 90 /* XXX */, 0, 0, 0, 0);

      case SHADER_OPCODE_BARRIER:
         return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
                               0 /* XXX */, 0,
                               0, 0, 0, 0, 0, 0);

      case SHADER_OPCODE_SEND:
         switch (info.sfid) {
         case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
            /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
            return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
                                  10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
         case GFX6_SFID_DATAPORT_RENDER_CACHE:
            switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
            case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
               return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
                                     30 /* XXX */, 450 /* XXX */,
                                     10 /* XXX */, 100 /* XXX */,
                                     0, 0, 0, 400 /* XXX */);
            default:
               return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
                                     0, 450 /* XXX */,
                                     10 /* XXX */, 300 /* XXX */, 0, 0,
                                     0, 0);
            }
         case BRW_SFID_SAMPLER: {
            return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
                                  8, 750, 0, 0, 2, 0);
         }
         case GFX7_SFID_DATAPORT_DATA_CACHE:
         case HSW_SFID_DATAPORT_DATA_CACHE_1:
            switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
            case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
            case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
            case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
            case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
               return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                     30 /* XXX */, 400 /* XXX */,
                                     10 /* XXX */, 100 /* XXX */, 0, 0,
                                     0, 400 /* XXX */);

            default:
               return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                     0, 20 /* XXX */,
                                     10 /* XXX */, 100 /* XXX */, 0, 0,
                                     0, 0);
            }

         case GFX7_SFID_PIXEL_INTERPOLATOR:
            return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
                                  0, 90 /* XXX */, 0, 0, 0, 0);

         case GFX12_SFID_UGM:
         case GFX12_SFID_TGM:
         case GFX12_SFID_SLM:
            switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
            case LSC_OP_LOAD:
            case LSC_OP_STORE:
            case LSC_OP_LOAD_CMASK:
            case LSC_OP_STORE_CMASK:
               return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                     0, 20 /* XXX */,
                                     10 /* XXX */, 100 /* XXX */, 0, 0,
                                     0, 0);

            case LSC_OP_FENCE:
            case LSC_OP_ATOMIC_INC:
            case LSC_OP_ATOMIC_DEC:
            case LSC_OP_ATOMIC_LOAD:
            case LSC_OP_ATOMIC_STORE:
            case LSC_OP_ATOMIC_ADD:
            case LSC_OP_ATOMIC_SUB:
            case LSC_OP_ATOMIC_MIN:
            case LSC_OP_ATOMIC_MAX:
            case LSC_OP_ATOMIC_UMIN:
            case LSC_OP_ATOMIC_UMAX:
            case LSC_OP_ATOMIC_CMPXCHG:
            case LSC_OP_ATOMIC_FADD:
            case LSC_OP_ATOMIC_FSUB:
            case LSC_OP_ATOMIC_FMIN:
            case LSC_OP_ATOMIC_FMAX:
            case LSC_OP_ATOMIC_FCMPXCHG:
            case LSC_OP_ATOMIC_AND:
            case LSC_OP_ATOMIC_OR:
            case LSC_OP_ATOMIC_XOR:
               return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                     30 /* XXX */, 400 /* XXX */,
                                     10 /* XXX */, 100 /* XXX */, 0, 0,
                                     0, 400 /* XXX */);
            default:
               abort();
            }

         case BRW_SFID_MESSAGE_GATEWAY:
         case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: /* or THREAD_SPAWNER */
         case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
            return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
                                  10 /* XXX */, 0, 0, 0, 0, 0);

         case BRW_SFID_URB:
            return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
                                  32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);

         default:
            abort();
         }

      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
         return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0);

      default:
         abort();
      }
   }

   /**
    * Model the performance behavior of a stall on the specified dependency
    * ID.
    */
   void
   stall_on_dependency(state &st, enum intel_eu_dependency_id id)
   {
      if (id < ARRAY_SIZE(st.dep_ready))
         st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
                                          st.dep_ready[id]);
   }

   /**
    * Model the performance behavior of the front-end and back-end while
    * executing an instruction with the specified timing information, assuming
    * all dependencies are already clear.
    */
   void
   execute_instruction(state &st, const perf_desc &perf)
   {
      /* Compute the time at which the front-end will be ready to execute the
       * next instruction.
       */
      st.unit_ready[EU_UNIT_FE] += perf.df;

      if (perf.u < EU_NUM_UNITS) {
         /* Wait for the back-end to be ready to execute this instruction. */
         st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
                                          st.unit_ready[perf.u]);

         /* Compute the time at which the back-end will be ready to execute
          * the next instruction, and update the back-end utilization.
          */
         st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
         st.unit_busy[perf.u] += perf.db * st.weight;
      }
   }
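
   /* As a small worked example of the model above: with the front-end clock
    * at cycle 0 and the modeled unit ready at cycle 10, an instruction with
    * df = 2 and db = 16 first advances the front-end clock to 2, then stalls
    * it to 10 waiting for the unit, and finally marks the unit busy until
    * cycle 10 + 16 = 26, accumulating 16 weighted cycles of utilization.
    */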

   /**
    * Model the performance behavior of a read dependency provided by an
    * instruction.
    */
   void
   mark_read_dependency(state &st, const perf_desc &perf,
                        enum intel_eu_dependency_id id)
   {
      if (id < ARRAY_SIZE(st.dep_ready))
         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
   }

   /**
    * Model the performance behavior of a write dependency provided by an
    * instruction.
    */
   void
   mark_write_dependency(state &st, const perf_desc &perf,
                         enum intel_eu_dependency_id id)
   {
      if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
      else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
      else if (id < ARRAY_SIZE(st.dep_ready))
         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
   }

   /**
    * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
    */
   enum intel_eu_dependency_id
   reg_dependency_id(const intel_device_info *devinfo, const brw_reg &r,
                     const int delta)
   {
      if (r.file == VGRF) {
         const unsigned i = r.nr + r.offset / REG_SIZE + delta;
         assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
         return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);

      } else if (r.file == FIXED_GRF) {
         const unsigned i = r.nr + delta;
         assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
         return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);

      } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
                 r.nr < BRW_ARF_ACCUMULATOR) {
         assert(delta == 0);
         return EU_DEPENDENCY_ID_ADDR0;

      } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
                 r.nr < BRW_ARF_FLAG) {
         const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
         assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
         return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);

      } else {
         return EU_NUM_DEPENDENCY_IDS;
      }
   }

   /**
    * Return the dependency ID of the flag register starting at offset \p i.
    */
   enum intel_eu_dependency_id
   flag_dependency_id(unsigned i)
   {
      assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
      return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
   }

   /**
    * Return the dependency ID corresponding to the SBID read completion
    * condition of a Gfx12+ SWSB.
    */
   enum intel_eu_dependency_id
   tgl_swsb_rd_dependency_id(tgl_swsb swsb)
   {
      if (swsb.mode) {
         assert(swsb.sbid <
                EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
         return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
      } else {
         return EU_NUM_DEPENDENCY_IDS;
      }
   }

   /**
    * Return the dependency ID corresponding to the SBID write completion
    * condition of a Gfx12+ SWSB.
    */
   enum intel_eu_dependency_id
   tgl_swsb_wr_dependency_id(tgl_swsb swsb)
   {
      if (swsb.mode) {
         assert(swsb.sbid <
                EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
         return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
      } else {
         return EU_NUM_DEPENDENCY_IDS;
      }
   }

   /**
    * Return the implicit accumulator register accessed by channel \p i of the
    * instruction.
    */
   unsigned
   accum_reg_of_channel(const intel_device_info *devinfo,
                        const fs_inst *inst,
                        brw_reg_type tx, unsigned i)
   {
      assert(inst->reads_accumulator_implicitly() ||
             inst->writes_accumulator_implicitly(devinfo));
      const unsigned offset = (inst->group + i) * brw_type_size_bytes(tx) *
         (brw_type_is_float(tx) ? 1 : 2);
      return offset / (reg_unit(devinfo) * REG_SIZE) % 2;
   }
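
   /* For instance, assuming reg_unit() of 1 and a 32B REG_SIZE (pre-Xe2),
    * a SIMD16 float instruction with group 0 gets channels 0-7 mapped to
    * acc0 and channels 8-15 mapped to acc1, while 32-bit integer types take
    * twice the space per channel and therefore alternate accumulators every
    * four channels.
    */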

   /**
    * Model the performance behavior of an FS back-end instruction.
    */
   void
   issue_inst(state &st, const struct brw_isa_info *isa,
              const fs_inst *inst)
   {
      const struct intel_device_info *devinfo = isa->devinfo;
      const instruction_info info(isa, inst);
      const perf_desc perf = instruction_desc(info);

      /* Stall on any source dependencies. */
      for (unsigned i = 0; i < inst->sources; i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->src[i], j));
      }

      if (inst->reads_accumulator_implicitly()) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_read(devinfo)) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               stall_on_dependency(st, flag_dependency_id(i));
         }
      }

      /* Stall on any write dependencies. */
      if (!inst->no_dd_check) {
         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
            for (unsigned j = 0; j < regs_written(inst); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, inst->dst, j));
         }

         if (inst->writes_accumulator_implicitly(devinfo)) {
            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                           inst->exec_size - 1); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
         }

         if (const unsigned mask = inst->flags_written(devinfo)) {
            for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
               if (mask & (1 << i))
                  stall_on_dependency(st, flag_dependency_id(i));
            }
         }
      }

      /* Stall on any SBID dependencies. */
      if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
         stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
      else if (inst->sched.mode & TGL_SBID_SRC)
         stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));

      /* Execute the instruction. */
      execute_instruction(st, perf);

      /* Mark any source dependencies. */
      if (inst->is_send_from_grf()) {
         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->is_payload(i)) {
               for (unsigned j = 0; j < regs_read(inst, i); j++)
                  mark_read_dependency(
                     st, perf, reg_dependency_id(devinfo, inst->src[i], j));
            }
         }
      }

      /* Mark any destination dependencies. */
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++) {
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, inst->dst, j));
         }
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_written(devinfo)) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               mark_write_dependency(st, perf, flag_dependency_id(i));
         }
      }

      /* Mark any SBID dependencies. */
      if (inst->sched.mode & TGL_SBID_SET) {
         mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
         mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
      }
   }

   /**
    * Calculate the maximum possible throughput of the program compatible with
    * the cycle-count utilization estimated for each asynchronous unit, in
    * threads-per-cycle units.
    */
   float
   calculate_thread_throughput(const state &st, float busy)
   {
      for (unsigned i = 0; i < EU_NUM_UNITS; i++)
         busy = MAX2(busy, st.unit_busy[i]);

      return 1.0 / busy;
   }
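
   /* I.e. the estimated throughput is the reciprocal of the largest
    * aggregated utilization among all units, in threads per cycle. The
    * caller seeds \p busy with the estimated latency of a single thread,
    * which bounds the throughput achievable by one thread on its own.
    */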

   /**
    * Estimate the performance of the specified shader.
    */
   void
   calculate_performance(performance &p, const fs_visitor *s,
                         unsigned dispatch_width)
   {
      /* XXX - Note that the previous version of this code used worst-case
       *       scenario estimation of branching divergence for SIMD32 shaders,
       *       but this heuristic was removed to improve performance in common
       *       scenarios. Wider shader variants are less optimal when
       *       divergence is high, e.g. when the application renders a complex
       *       scene on a small surface. It is assumed that such renders are
       *       short, so their time doesn't matter and, when it comes to the
       *       overall performance, they are dominated by more optimal larger
       *       renders.
       *
       *       It's possible that we could do better with divergence analysis
       *       by isolating branches which are 100% uniform.
       *
       *       Plumbing the trip counts from NIR loop analysis would allow us
       *       to do a better job regarding the loop weights.
       *
       *       In the meantime use values that roughly match the control flow
       *       weights used elsewhere in the compiler back-end.
       *
       *       Note that we provide slightly more pessimistic weights on
       *       Gfx12+ for SIMD32, since the effective warp size on that
       *       platform is 2x the SIMD width due to EU fusion, which increases
       *       the likelihood of divergent control flow in comparison to
       *       previous generations, giving narrower SIMD modes a performance
       *       advantage in several test-cases with non-uniform discard jumps.
       */
      const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
                                    1.0 : 0.5);
      const float loop_weight = 10;
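      /* Note that st.weight is multiplied by loop_weight on every DO below
       * and divided back on the matching WHILE, so an instruction nested in
       * n loops gets weighted by loop_weight^n, i.e. as if each loop body
       * executed for roughly ten iterations.
       */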
      unsigned halt_count = 0;
      unsigned elapsed = 0;
      state st;

      foreach_block(block, s->cfg) {
         const unsigned elapsed0 = elapsed;

         foreach_inst_in_block(fs_inst, inst, block) {
            const unsigned clock0 = st.unit_ready[EU_UNIT_FE];

            issue_inst(st, &s->compiler->isa, inst);

            if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
               st.weight /= discard_weight;

            elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;

            if (inst->opcode == BRW_OPCODE_DO)
               st.weight *= loop_weight;
            else if (inst->opcode == BRW_OPCODE_WHILE)
               st.weight /= loop_weight;
            else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
               st.weight *= discard_weight;
         }

         p.block_latency[block->num] = elapsed - elapsed0;
      }

      p.latency = elapsed;
      p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
   }
}

brw::performance::performance(const fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, v->dispatch_width);
}

brw::performance::~performance()
{
   delete[] block_latency;
}