xref: /aosp_15_r20/external/mesa3d/src/intel/common/intel_tiled_render.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2023 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 #ifndef INTEL_TILED_RENDER_H
24 #define INTEL_TILED_RENDER_H
25 
26 #include "intel/common/intel_l3_config.h"
27 #include "intel/dev/intel_device_info.h"
28 #include "intel/isl/isl.h"
29 
30 /**
31  * Return the tile cache space used as target by the tiling parameter
32  * calculation algorithm below.  Cache space units are in bits.
33  *
34  * \sa intel_calculate_tile_dimensions()
35  */
36 UNUSED static unsigned
intel_calculate_tile_cache_size(const struct intel_device_info * devinfo,const struct intel_l3_config * cfg)37 intel_calculate_tile_cache_size(const struct intel_device_info *devinfo,
38                                 const struct intel_l3_config *cfg)
39 {
40    const unsigned tc_l3_partition_size = 1024 * 8 *
41       intel_get_l3_partition_size(devinfo, cfg, INTEL_L3P_TC);
42    const unsigned all_l3_partition_size = 1024 * 8 *
43       intel_get_l3_partition_size(devinfo, cfg, INTEL_L3P_ALL);
44    /* Target half of the total L3 space as simple heuristic, could be
45     * improved by adjusting the target dynamically.
46     */
47    const unsigned target_all_l3_partition_size = all_l3_partition_size / 2;
48    /* If there's a tile cache partition on the L3, use its size as
49     * target, otherwise (e.g. in unified L3 cache mode) use a fraction
50     * of the total L3 available.
51     *
52     * XXX - Note that this assumes TBIMR in pixel hashing mode is in use.
53     */
54    const unsigned tile_cache_size = tc_l3_partition_size ? tc_l3_partition_size :
55                                     target_all_l3_partition_size;
56    assert(tile_cache_size > 0);
57    return tile_cache_size;
58 }
59 
60 /**
61  * Return the amount of bits per pixel used to store an ISL surface in
62  * memory.  This can be used as helper to estimate the value of the \p
63  * pixel_size argument of intel_calculate_tile_dimensions() below.
64  */
65 UNUSED static unsigned
intel_calculate_surface_pixel_size(const struct isl_surf * surf)66 intel_calculate_surface_pixel_size(const struct isl_surf *surf)
67 {
68    const struct isl_format_layout *layout = isl_format_get_layout(surf->format);
69    const unsigned num_samples = MAX2(1, surf->samples);
70    if (surf->size_B > 0)
71       return DIV_ROUND_UP(layout->bpb * num_samples,
72                           layout->bw * layout->bh * layout->bd);
73    else
74       return 0;
75 }
76 
77 /**
78  * Estimate tiling parameters that yield a reasonable balance between
79  * tile cache utilization and avoidance of thrashing, based on the
80  * device's current caching configuration, the framebuffer dimensions
81  * and an estimate of the tile cache footprint per fragment in bits (\p
82  * pixel_size).
83  *
84  * The calculated tile dimensions are guaranteed to be a multiple of
85  * the block dimensions \p block_width and \p block_height, which for
86  * TBIMR in pixel hashing mode must be equal to the pixel hashing
87  * block size, typically 16x16 or 32x32.
88  */
89 UNUSED static void
intel_calculate_tile_dimensions(const struct intel_device_info * devinfo,const struct intel_l3_config * cfg,unsigned block_width,unsigned block_height,unsigned fb_width,unsigned fb_height,unsigned pixel_size,unsigned * tile_width,unsigned * tile_height)90 intel_calculate_tile_dimensions(const struct intel_device_info *devinfo,
91                                 const struct intel_l3_config *cfg,
92                                 unsigned block_width, unsigned block_height,
93                                 unsigned fb_width, unsigned fb_height,
94                                 unsigned pixel_size,
95                                 unsigned *tile_width, unsigned *tile_height)
96 {
97    /* Maximum number of tiles supported by the TBIMR tile sequencing
98     * hardware.
99     */
100    const unsigned max_horiz_tiles = 32;
101    const unsigned max_vert_tiles = 32;
102 
103    /* Represent dimensions in hashing block units, which guarantees
104     * that the resulting tile dimensions are a multiple of the hashing
105     * block dimensions, a requirement of TBIMR in pixel hashing mode.
106     */
107    const unsigned fb_block_width = DIV_ROUND_UP(fb_width, block_width);
108    const unsigned fb_block_height = DIV_ROUND_UP(fb_height, block_height);
109 
110    /* Amount of tile cache space for the workload to target. */
111    const unsigned tile_cache_size = intel_calculate_tile_cache_size(devinfo, cfg);
112    /* Cache footprint of a single hashing block worth of threads. */
113    const unsigned block_size = MAX2(1, pixel_size * block_width * block_height);
114    /* Calculate the desired tile surface (in block units) that fully
115     * utilizes the target portion of the tile cache, which in an ideal
116     * world where an oracle has given us the tile cache footprint per
117     * block is just the ratio of the two.
118     */
119    const unsigned desired_tile_surf = MAX2(1, tile_cache_size / block_size);
120    /* Clamp the desired tile surface to be between the surface of the
121     * whole framebuffer and the surface of the smallest tile possible
122     * at the maximum suported tile count.
123     */
124    const unsigned tile_surf = CLAMP(desired_tile_surf,
125                                     (DIV_ROUND_UP(fb_block_width, max_horiz_tiles) *
126                                      DIV_ROUND_UP(fb_block_height, max_vert_tiles)),
127                                     fb_block_width * fb_block_height);
128    /* XXX - If the tile_surf calculated above is smaller than the
129     *       number of pixel pipes on the GPU, the pipeline is so
130     *       cache-heavy that the parallelism of the GPU will have to
131     *       be constrained in order to avoid thrashing the tile cache.
132     *       Possibly emit a performance warning, or better, return an
133     *       error indicating that the pixel pipe hashing config needs
134     *       to be adjusted to use a finer hashing mode in order to
135     *       spread out the workload evenly across the available slices.
136     */
137 
138    /* Select the tile aspect ratio that minimizes the number of passes
139     * required to render the whole framebuffer.  The search starts at
140     * an approximately square tile size of the desired surface and
141     * increases the ratio between its major and minor axes in a
142     * sequence of finite increments.
143     *
144     * The algorithm is biased in favor of the squarest possible tiling
145     * config since it starts with a tile shape closest to a square and
146     * early-exits when a global minimum is detected.  This bias is
147     * intentional since cache locality may suffer at high tile aspect
148     * ratios.
149     */
150    const float base_major = sqrtf(tile_surf);
151    /* Make sure that the minimum major axis where the search starts
152     * isn't so small (due to a small framebuffer or rounding) that the
153     * tile would have to be larger than the framebuffer in the
154     * opposite "minor" direction.
155     */
156    const unsigned min_major = MAX3(1, floorf(base_major),
157 				   tile_surf / MIN2(fb_block_width, fb_block_height));
158    /* Stop search at a an aspect ratio of approximately 2 (A major
159     * axis equal to 'base_major * M_SQRT2' would give an aspect ratio
160     * of exactly 2 if it was a valid integer number).  Aspect ratios
161     * higher than 2 could technically be useful, the upper bound is
162     * intended as a heuristic in order to set a low limit to the
163     * number of iterations the loop below may execute.
164     */
165    const unsigned max_major = ceilf(MAX2(base_major, min_major) * M_SQRT2);
166    assert(max_major < INT_MAX);
167 
168    /* Best tile dimensions found so far. */
169    unsigned best_count = UINT_MAX;
170    unsigned best_block_width = 0;
171    unsigned best_block_height = 0;
172 
173    for (unsigned major = min_major; major <= max_major;) {
174       /* Minor axis that yields the desired tile surface for the
175        * present major parameter.
176        */
177       const unsigned minor = MAX2(1, tile_surf / major);
178 
179       /* Calculate the total number of tiles if this aspect ratio is
180        * used in the X-major orientation.
181        */
182       const unsigned horiz_tiles_x = DIV_ROUND_UP(fb_block_width, major);
183       const unsigned vert_tiles_x = DIV_ROUND_UP(fb_block_height, minor);
184       const unsigned count_x = horiz_tiles_x * vert_tiles_x;
185 
186       /* Calculate the number of blocks we need to add to the major
187        * axis for the number of X-major tile columns (horiz_tiles_x)
188        * to drop by one.  This avoids many useless iterations relative
189        * to exhaustive search, since an increase in major can only
190        * decrease the total tile count if it decreases horiz_tiles_x
191        * as well, vert_tiles_x is monotonically increasing with major.
192        *
193        * If the number of tile columns is already 1 the X-major
194        * solution cannot be improved further, use "infinity" so the
195        * increment for the next iteration is only determined by the
196        * Y-major search -- If the Y-major solution cannot be improved
197        * either the search will be terminated.
198        */
199       const unsigned delta_x = horiz_tiles_x == 1 ? INT_MAX :
200          DIV_ROUND_UP(fb_block_width - major * (horiz_tiles_x - 1),
201                       horiz_tiles_x - 1);
202 
203       /* Update the best known solution with the present X-major one
204        * if it's allowed by the hardware and requires a lower total
205        * number of tiles to cover the whole framebuffer.
206        */
207       if (horiz_tiles_x <= max_horiz_tiles && vert_tiles_x <= max_vert_tiles &&
208           count_x < best_count) {
209          best_count = count_x;
210          best_block_width = major;
211          best_block_height = minor;
212 
213          /* The array of tiles is fully covered by the framebuffer, a
214           * global minimum has been found, terminate the search.
215           */
216          if (count_x * tile_surf == fb_block_width * fb_block_height)
217             break;
218       }
219 
220       /* Calculate the total number of tiles if this aspect ratio is
221        * used in the Y-major orientation.
222        */
223       const unsigned horiz_tiles_y = DIV_ROUND_UP(fb_block_width, minor);
224       const unsigned vert_tiles_y = DIV_ROUND_UP(fb_block_height, major);
225       const unsigned count_y =  horiz_tiles_y * vert_tiles_y;
226 
227       /* Calculate the number of blocks we need to add to the major
228        * axis for the number of Y-major tile rows (vert_tiles_y) to
229        * drop by one.  Analogous to the delta_x described above after
230        * a flip of the X and Y axes.
231        */
232       const unsigned delta_y = vert_tiles_y == 1 ? INT_MAX :
233          DIV_ROUND_UP(fb_block_height - major * (vert_tiles_y - 1),
234                       vert_tiles_y - 1);
235 
236       /* Update the best known solution with the present Y-major one
237        * if it's allowed by the hardware and requires a lower total
238        * number of tiles to cover the whole framebuffer.
239        */
240       if (horiz_tiles_y <= max_horiz_tiles && vert_tiles_y <= max_vert_tiles &&
241           count_y < best_count) {
242          best_count = count_y;
243          best_block_width = minor;
244          best_block_height = major;
245 
246          /* The array of tiles is fully covered by the framebuffer, a
247           * global minimum has been found, terminate the search.
248           */
249          if (count_y * tile_surf == fb_block_width * fb_block_height)
250             break;
251       }
252 
253       /* Use the smallest of the computed major increments in order to
254        * visit the closest subsequent solution candidate.  If both the
255        * X-major and Y-major searches have terminated major will be
256        * pushed above the upper bound of the search, causing immediate
257        * termination.
258        */
259       const unsigned delta = MIN2(delta_x, delta_y);
260       assert(major + delta > major);
261       major += delta;
262    }
263 
264    /* Sanity-check and return the result, scaling it back to pixel
265     * units.
266     */
267    assert(best_block_width > 0 && best_block_height > 0);
268    assert(DIV_ROUND_UP(fb_block_width, best_block_width) <= max_horiz_tiles);
269    assert(DIV_ROUND_UP(fb_block_height, best_block_height) <= max_vert_tiles);
270 
271    *tile_width = best_block_width * block_width;
272    *tile_height = best_block_height * block_height;
273 }
274 
275 #endif
276