/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "elk_private.h"
#include "compiler/shader_info.h"
#include "intel/dev/intel_debug.h"
#include "intel/dev/intel_device_info.h"
#include "util/ralloc.h"

unsigned
elk_required_dispatch_width(const struct shader_info *info)
{
   if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
      assert(gl_shader_stage_uses_workgroup(info->stage));
      /* These enum values are expressly chosen to be equal to the subgroup
       * size that they require.
       */
      return (unsigned)info->subgroup_size;
   } else {
      return 0;
   }
}
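
/* For example, a compute shader declared with SUBGROUP_SIZE_REQUIRE_16 yields
 * 16 here, since the SUBGROUP_SIZE_REQUIRE_* values equal the subgroup size
 * they require, while a shader with no required size yields 0.
 */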

static inline bool
test_bit(unsigned mask, unsigned bit) {
   return mask & (1u << bit);
}
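/* E.g. test_bit(0b0101u, 2) is true and test_bit(0b0101u, 1) is false. */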

namespace {

/* Helpers to narrow the std::variant held in state.prog_data; both return
 * nullptr when the state does not carry compute-shader prog data.
 */
struct elk_cs_prog_data *
get_cs_prog_data(elk_simd_selection_state &state)
{
   if (std::holds_alternative<struct elk_cs_prog_data *>(state.prog_data))
      return std::get<struct elk_cs_prog_data *>(state.prog_data);
   else
      return nullptr;
}

struct elk_stage_prog_data *
get_prog_data(elk_simd_selection_state &state)
{
   if (std::holds_alternative<struct elk_cs_prog_data *>(state.prog_data))
      return &std::get<struct elk_cs_prog_data *>(state.prog_data)->base;
   else
      return nullptr;
}

}

bool
elk_simd_should_compile(elk_simd_selection_state &state, unsigned simd)
{
   assert(simd < SIMD_COUNT);
   assert(!state.compiled[simd]);

   const auto cs_prog_data = get_cs_prog_data(state);
   const auto prog_data = get_prog_data(state);
   const unsigned width = 8u << simd;

   /* For shaders with a variable workgroup size, we can compile all the
    * variants in most cases (the exceptions are bindless dispatch and ray
    * queries), since the choice happens only at dispatch time.
    */
   const bool workgroup_size_variable = cs_prog_data && cs_prog_data->local_size[0] == 0;

   if (!workgroup_size_variable) {
      if (state.spilled[simd]) {
         state.error[simd] = "Would spill";
         return false;
      }

      if (state.required_width && state.required_width != width) {
         state.error[simd] = "Different than required dispatch width";
         return false;
      }

      if (cs_prog_data) {
         const unsigned workgroup_size = cs_prog_data->local_size[0] *
                                         cs_prog_data->local_size[1] *
                                         cs_prog_data->local_size[2];

         unsigned max_threads = state.devinfo->max_cs_workgroup_threads;

         if (simd > 0 && state.compiled[simd - 1] &&
             workgroup_size <= (width / 2)) {
            state.error[simd] = "Workgroup size already fits in smaller SIMD";
            return false;
         }

         if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
            state.error[simd] = "Would need more than max_threads to fit all invocations";
            return false;
         }
      }
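
      /* Worked example: a fixed 8x8x1 workgroup (64 invocations) needs
       * DIV_ROUND_UP(64, 8) == 8 SIMD8 threads, well under a typical
       * max_cs_workgroup_threads of 64.  A 1x8x1 workgroup (8 invocations),
       * by contrast, fits in a single SIMD8 thread, so once SIMD8 has
       * compiled the SIMD16 variant is rejected above.
       */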

      /* SIMD32 is only enabled when it is needed, unless it is forced.
       *
       * TODO: Use performance_analysis and drop this rule.
       */
      if (width == 32) {
         if (!INTEL_DEBUG(DEBUG_DO32) && (state.compiled[0] || state.compiled[1])) {
            state.error[simd] = "SIMD32 not required (use INTEL_DEBUG=do32 to force)";
            return false;
         }
      }
   }

   uint64_t start;
   switch (prog_data->stage) {
   case MESA_SHADER_COMPUTE:
      start = DEBUG_CS_SIMD8;
      break;
   default:
      unreachable("unknown shader stage in elk_simd_should_compile");
   }

   /* The per-stage SIMD8/16/32 debug flags are consecutive bits, so shifting
    * the SIMD8 flag by the variant index selects the matching flag.
    */
   const bool env_skip[] = {
      (intel_simd & (start << 0)) == 0,
      (intel_simd & (start << 1)) == 0,
      (intel_simd & (start << 2)) == 0,
   };

   static_assert(ARRAY_SIZE(env_skip) == SIMD_COUNT);

   if (unlikely(env_skip[simd])) {
      state.error[simd] = "Disabled by INTEL_DEBUG environment variable";
      return false;
   }

   return true;
}
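
/* Typical compile-time use, sketched with a hypothetical per-width compile
 * helper (compile_cs_variant is illustrative, not a real entry point):
 *
 *    for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
 *       if (!elk_simd_should_compile(state, simd))
 *          continue;
 *       bool spilled = compile_cs_variant(state, 8u << simd);
 *       elk_simd_mark_compiled(state, simd, spilled);
 *    }
 *    int selected = elk_simd_select(state);
 */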

void
elk_simd_mark_compiled(elk_simd_selection_state &state, unsigned simd, bool spilled)
{
   assert(simd < SIMD_COUNT);
   assert(!state.compiled[simd]);

   auto cs_prog_data = get_cs_prog_data(state);

   state.compiled[simd] = true;
   if (cs_prog_data)
      cs_prog_data->prog_mask |= 1u << simd;

   /* If a SIMD spilled, all the larger ones would spill too. */
   if (spilled) {
      for (unsigned i = simd; i < SIMD_COUNT; i++) {
         state.spilled[i] = true;
         if (cs_prog_data)
            cs_prog_data->prog_spilled |= 1u << i;
      }
   }
}
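
/* E.g. elk_simd_mark_compiled(state, 0, true) for a spilling SIMD8 variant
 * also records SIMD16 and SIMD32 as spilling, since widening the dispatch
 * only increases register pressure.
 */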

/* Pick the widest compiled variant that did not spill; failing that, the
 * widest compiled variant; or -1 if nothing was compiled.
 */
int
elk_simd_select(const struct elk_simd_selection_state &state)
{
   for (int i = SIMD_COUNT - 1; i >= 0; i--) {
      if (state.compiled[i] && !state.spilled[i])
         return i;
   }
   for (int i = SIMD_COUNT - 1; i >= 0; i--) {
      if (state.compiled[i])
         return i;
   }
   return -1;
}
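
/* E.g. with SIMD8 and SIMD16 compiled but SIMD16 spilled, this returns 0
 * (SIMD8); if every compiled variant spilled, the widest one is returned.
 */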

int
elk_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
                                   const struct elk_cs_prog_data *prog_data,
                                   const unsigned *sizes)
{
   if (!sizes || (prog_data->local_size[0] == sizes[0] &&
                  prog_data->local_size[1] == sizes[1] &&
                  prog_data->local_size[2] == sizes[2])) {
      elk_simd_selection_state simd_state{
         .prog_data = const_cast<struct elk_cs_prog_data *>(prog_data),
      };

      /* Propagate the prog_data information back to the simd_state,
       * so we can use select() directly.
       */
      for (int i = 0; i < SIMD_COUNT; i++) {
         simd_state.compiled[i] = test_bit(prog_data->prog_mask, i);
         simd_state.spilled[i] = test_bit(prog_data->prog_spilled, i);
      }

      return elk_simd_select(simd_state);
   }

   struct elk_cs_prog_data cloned = *prog_data;
   for (unsigned i = 0; i < 3; i++)
      cloned.local_size[i] = sizes[i];

   cloned.prog_mask = 0;
   cloned.prog_spilled = 0;

   elk_simd_selection_state simd_state{
      .devinfo = devinfo,
      .prog_data = &cloned,
   };

   for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
      /* We are not recompiling, so reuse the original prog_mask and
       * prog_spilled results; they already cover all possible compilations.
       */
      if (elk_simd_should_compile(simd_state, simd) &&
          test_bit(prog_data->prog_mask, simd)) {
         elk_simd_mark_compiled(simd_state, simd, test_bit(prog_data->prog_spilled, simd));
      }
   }

   return elk_simd_select(simd_state);
}
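
/* Dispatch-time sketch for a variable-workgroup-size shader, where sizes
 * comes from the dispatch parameters (dispatch_width is illustrative):
 *
 *    const unsigned sizes[3] = {8, 8, 1};
 *    int simd = elk_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
 *    assert(simd >= 0);
 *    const unsigned dispatch_width = 8u << simd;
 */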
237