xref: /aosp_15_r20/external/mesa3d/src/asahi/compiler/agx_performance.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2023 Alyssa Rosenzweig
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "agx_compile.h"
7 #include "agx_compiler.h"
8 #include "agx_opcodes.h"
9 
10 /* Table describing the relationship between registers pressure and thread
11  * count. Each entry describes a maximum number of registers and the associated
12  * best-case thread count.
13  *
14  * Sorted in ascending order of maximum registers for easy lookup.
15  */
16 static const struct agx_occupancy occupancies[] = {
17    {104, 1024}, {112, 896}, {128, 832}, {136, 768}, {144, 704},
18    {160, 640},  {184, 576}, {208, 512}, {232, 448}, {256, 384},
19 };
20 
21 struct agx_occupancy
agx_occupancy_for_register_count(unsigned halfregs)22 agx_occupancy_for_register_count(unsigned halfregs)
23 {
24    for (unsigned i = 0; i < ARRAY_SIZE(occupancies); ++i) {
25       unsigned max = occupancies[i].max_registers;
26       assert((i == 0 || max > occupancies[i - 1].max_registers) && "ascending");
27 
28       if (halfregs <= max)
29          return occupancies[i];
30    }
31 
32    unreachable("Register count must be less than the maximum");
33 }
34 
35 unsigned
agx_max_registers_for_occupancy(unsigned occupancy)36 agx_max_registers_for_occupancy(unsigned occupancy)
37 {
38    unsigned max_regs = 0;
39 
40    for (unsigned i = 0; i < ARRAY_SIZE(occupancies); ++i) {
41       if (occupancy <= occupancies[i].max_threads)
42          max_regs = occupancies[i].max_registers;
43       else
44          break;
45    }
46 
47    assert(max_regs > 0 && "Thread count must be less than the maximum");
48    return max_regs;
49 }
50 
/* Crude cycle model for G13G */

/* Unit that an instruction is charged to by the cycle model. */
enum alu_unit {
   /* No unit assigned. This must stay zero: opcodes without an explicit
    * entry in op_timings are zero-initialized, and agx_estimate_cycles
    * treats a zero unit as "not modelled".
    */
   NONE = 0,
   SCIB,
   IC,
   F32,
   F16,
};
59 
60 struct alu_timing {
61    enum alu_unit unit;
62    unsigned latency;
63    unsigned tp;
64 };
65 
66 /* clang-format off */
67 struct alu_timing op_timings[] = {
68    [AGX_OPCODE_FMA]           = { F32, 2, 1 },
69    [AGX_OPCODE_FADD]          = { F32, 2, 1 },
70    [AGX_OPCODE_FMUL]          = { F32, 2, 1 },
71 
72    [AGX_OPCODE_MOV_IMM]       = { SCIB, 1, 1 },
73    [AGX_OPCODE_BITOP]         = { SCIB, 2, 1 }, /* tp might be 2 for 32-bit / no $? */
74    [AGX_OPCODE_ICMPSEL]       = { SCIB, 2, 1 },
75    [AGX_OPCODE_FCMPSEL]       = { SCIB, 2, 1 },
76    [AGX_OPCODE_IADD]          = { SCIB, 2, 1 },
77 
78    [AGX_OPCODE_GET_SR]          = { SCIB, 2, 2 },
79    [AGX_OPCODE_GET_SR_BARRIER]  = { SCIB, 2, 2 },
80    [AGX_OPCODE_GET_SR_COVERAGE] = { SCIB, 2, 2 },
81 
82    [AGX_OPCODE_IMAD]          = { IC, 3, 2 },
83    [AGX_OPCODE_BFI]           = { IC, 3, 2 },
84    [AGX_OPCODE_EXTR]          = { IC, 3, 2 },
85    [AGX_OPCODE_ASR]           = { IC, 3, 2 },
86    [AGX_OPCODE_FLOOR]         = { IC, 3, 2 },
87    [AGX_OPCODE_SIN_PT_1]      = { IC, 3, 2 },
88    [AGX_OPCODE_SIN_PT_2]      = { IC, 5, 2 },
89    [AGX_OPCODE_LOG2]          = { IC, 5, 2 },
90    [AGX_OPCODE_EXP2]          = { IC, 5, 2 },
91    [AGX_OPCODE_RCP]           = { IC, 5, 3 },
92    [AGX_OPCODE_RSQRT]         = { IC, 6, 4 },
93    [AGX_OPCODE_SRSQRT]        = { IC, 6, 4 },
94 
95    // XXX: check this
96    [AGX_OPCODE_SIMD_PREFIX]      = { SCIB, 18, 18 },
97    [AGX_OPCODE_SIMD_REDUCE]      = { SCIB, 24, 24 },
98    [AGX_OPCODE_QUAD_PREFIX]      = { SCIB, 18, 18 },
99    [AGX_OPCODE_QUAD_REDUCE]      = { SCIB, 24, 24 },
100    [AGX_OPCODE_SHUFFLE]          = { SCIB, 5, 2   },
101    [AGX_OPCODE_QUAD_SHUFFLE]     = { SCIB, 5, 2   },
102 
103    [AGX_OPCODE_ICMP_BALLOT]      = { SCIB, 5, 2   },
104    [AGX_OPCODE_FCMP_BALLOT]      = { SCIB, 5, 2   },
105    [AGX_OPCODE_ICMP_QUAD_BALLOT] = { SCIB, 4, 2   },
106    [AGX_OPCODE_FCMP_QUAD_BALLOT] = { SCIB, 4, 2   },
107 };
108 /* clang-format on */
109 
110 /*
111  * TODO: Model non-ALU instructions, latency, register cache, 64-bit, etc.
112  */
113 struct agx_cycle_estimate
agx_estimate_cycles(agx_context * ctx)114 agx_estimate_cycles(agx_context *ctx)
115 {
116    struct agx_cycle_estimate est = {0};
117 
118    agx_foreach_instr_global(ctx, I) {
119       struct alu_timing alu = I->op < ARRAY_SIZE(op_timings)
120                                  ? op_timings[I->op]
121                                  : (struct alu_timing){0};
122 
123       if (alu.unit == IC) {
124          est.ic += alu.tp * 2;
125       } else if (alu.unit) {
126          est.f_scib += alu.tp;
127       } else {
128          /* TODO */
129       }
130    }
131 
132    /* IC and F/SCIB run in parallel across warps */
133    est.alu = MAX2(est.ic, est.f_scib);
134    return est;
135 }
136