xref: /aosp_15_r20/external/mesa3d/src/intel/common/intel_pixel_hash.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2021 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 #ifndef INTEL_PIXEL_HASH_H
24 #define INTEL_PIXEL_HASH_H
25 
26 /**
27  * Compute an \p n x \p m pixel hashing table usable as slice, subslice or
28  * pixel pipe hashing table.  The resulting table is the cyclic repetition of
29  * a fixed pattern with periodicity equal to \p period.
30  *
31  * If \p index is specified to be equal to \p period, a 2-way hashing table
32  * will be generated such that indices 0 and 1 are returned for the following
33  * fractions of entries respectively:
34  *
35  *   p_0 = ceil(period / 2) / period
36  *   p_1 = floor(period / 2) / period
37  *
38  * If \p index is even and less than \p period, a 3-way hashing table will be
39  * generated such that indices 0, 1 and 2 are returned for the following
40  * fractions of entries:
41  *
42  *   p_0 = (ceil(period / 2) - 1) / period
43  *   p_1 = floor(period / 2) / period
44  *   p_2 = 1 / period
45  *
46  * The equations above apply if \p flip is equal to 0, if it is equal to 1 p_0
47  * and p_1 will be swapped for the result.  Note that in the context of pixel
48  * pipe hashing this can be always 0 on Gfx12 platforms, since the hardware
49  * transparently remaps logical indices found on the table to physical pixel
50  * pipe indices from the highest to lowest EU count.
51  */
52 UNUSED static void
intel_compute_pixel_hash_table_3way(unsigned n,unsigned m,unsigned period,unsigned index,bool flip,uint32_t * p)53 intel_compute_pixel_hash_table_3way(unsigned n, unsigned m,
54                                     unsigned period, unsigned index, bool flip,
55                                     uint32_t *p)
56 {
57    for (unsigned i = 0; i < n; i++) {
58       for (unsigned j = 0; j < m; j++) {
59          const unsigned k = (i + j) % period;
60          p[j + m * i] = (k == index ? 2 : (k & 1) ^ flip);
61       }
62    }
63 }
64 
65 /**
66  * Compute an \p n x \p m pixel hashing table usable as slice,
67  * subslice or pixel pipe hashing table.  This generalizes the
68  * previous 3-way hash table function to an arbitrary number of ways
69  * given by the number of bits set in the expression "mask1 | mask2".
70  * If a way is only set in one of the two mask arguments it will
71  * appear on the table with half the frequency as a way set on both
72  * masks.
73  */
74 UNUSED static void
intel_compute_pixel_hash_table_nway(unsigned n,unsigned m,uint32_t mask1,uint32_t mask2,uint32_t * p)75 intel_compute_pixel_hash_table_nway(unsigned n, unsigned m,
76                                     uint32_t mask1, uint32_t mask2,
77                                     uint32_t *p)
78 {
79    /* If both masks are equal all ways are expected to show up with
80     * the same frequency on the final table, so we can zero out one of
81     * the masks in order to halve the number of IDs we need to handle.
82     */
83    if (mask1 == mask2)
84       mask2 = 0;
85 
86    /* Construct a table mapping consecutive indices to the physical
87     * indices given by the bits set on the mask arguments.  Ways
88     * enabled on both masks will appear twice on the mapping, so
89     * they'll show up with twice the frequency on the final table.
90     */
91    unsigned phys_ids[(sizeof(mask1) + sizeof(mask2)) * CHAR_BIT];
92    unsigned num_ids = 0;
93 
94    for (unsigned i = 0; i < sizeof(mask1) * CHAR_BIT; i++) {
95       if (mask1 & (1u << i))
96          phys_ids[num_ids++] = i;
97       if (mask2 & (1u << i))
98          phys_ids[num_ids++] = i;
99    }
100 
101    assert(num_ids > 0);
102 
103    /* Compute a permutation of the above indices that assigns indices
104     * as far as possible to adjacent entries.  This permutation is
105     * designed to be equivalent to the bit reversal of each index in
106     * cases where num_ids is a power of two, but doesn't actually
107     * require it to be a power of two in order to satisfy the required
108     * properties (which is necessary to handle configurations with
109     * arbitrary non-power of two fusing).  By construction, flipping
110     * bit l of its input will lead to a change in its result of the
111     * order of num_ids/2^(l+1) (see variable t below).  The
112     * bijectivity of this permutation can be verified easily by
113     * induction.  This permutation is applied cyclically to the
114     * vertical indices of the hashing table constructed below.
115     */
116    const unsigned bits = util_logbase2_ceil(num_ids);
117    unsigned swzy[ARRAY_SIZE(phys_ids)];
118 
119    for (unsigned k = 0; k < num_ids; k++) {
120       unsigned t = num_ids;
121       unsigned s = 0;
122 
123       for (unsigned l = 0; l < bits; l++) {
124          if (k & (1u << l)) {
125             s += (t + 1) >> 1;
126             t >>= 1;
127          } else {
128             t = (t + 1) >> 1;
129          }
130       }
131 
132       swzy[k] = s;
133    }
134 
135    /* Compute a second permutation applied cyclically to the
136     * horizontal indices of the hashing table.  In cases where a
137     * single mask is present (which means that all ways are expected
138     * to have the same frequency) this permutation will be the
139     * identity and will have no effect.
140     *
141     * In cases where some ways have twice the frequency of the others,
142     * use a similar iterative halving of the range of the permutation
143     * as in the the swzy[] permutation defined above, but instead of
144     * scanning the bits of its argument (the "k" variable above) in
145     * the opposite order (from LSB to MSB), proceed by halving the
146     * domain of the permutation in the same order as its range, which
147     * would lead to an identity permutation if it wasn't because the
148     * LSB of its range is adjusted as early as possible instead of at
149     * the last iteration.
150     *
151     * The reason for the special casing of the LSB is that we want to
152     * avoid assigning adjacent IDs to adjacent elements of the table,
153     * since ways that appear duplicated in the phys_ids mapping above
154     * would then appear duplicated in adjacent positions of the final
155     * table, which would lead to poor utilization for small primitives
156     * that only cover a small contiguous portion of the hashing table
157     * and would have twice as much work as necessary submitted to the
158     * same way instead of spreading its processing over a larger
159     * number of ways.
160     */
161    unsigned swzx[ARRAY_SIZE(phys_ids)];
162 
163    if (mask1 && mask2) {
164       for (unsigned k = 0; k < num_ids; k++) {
165          unsigned l = k;
166          unsigned t = num_ids;
167          unsigned s = 0;
168          bool in_range = false;
169 
170          while (t > 1) {
171             const bool first_in_range = t <= m && !in_range;
172             in_range |= first_in_range;
173 
174             if (l >= (t + 1) >> 1) {
175                /* Apply the s++ increment (which could only occur in
176                 * the last t == 2 iteration if we were constructing an
177                 * identity permutation) as soon as the domain of the
178                 * permutation has been decomposed into a chunk smaller
179                 * than the width of the hashing table \p m (which
180                 * causes in_range to be first set to true), since
181                 * doing it earlier would prevent any alternation
182                 * between even and odd indices in the first \p m
183                 * elements of swzx[], which are the only ones actually
184                 * used.
185                 *
186                 * Subsequent (in_range == true) increments of s need
187                 * to be doubled since they are selecting between
188                 * indices of the same parity.
189                 */
190                if (!in_range)
191                   s += (t + 1) >> 1;
192                else if (first_in_range)
193                   s++;
194                else
195                   s += (t + 1) >> 1 << 1;
196 
197                l -= (t + 1) >> 1;
198                t >>= 1;
199             } else {
200                t = (t + 1) >> 1;
201             }
202          }
203 
204          swzx[k] = s;
205       }
206    } else {
207       for (unsigned k = 0; k < num_ids; k++)
208          swzx[k] = k;
209    }
210 
211    /* Initialize the table with the cyclic repetition of a
212     * num_ids-periodic pattern.
213     *
214     * Note that the horizontal and vertical permutations (swzx and
215     * swzy respectively) are different, and the former is either an
216     * identity permutation or close to the identity.  This asymmetry
217     * is intentional in order to minimize the size of the contiguous
218     * area that needs to be rendered in parallel in order to utilize
219     * the whole GPU: In cases where swzx is the identity a rendering
220     * rectangle of width W will need to be at least H blocks high,
221     * where H is bounded by 2^ceil(log2(num_ids/W)) thanks to the
222     * above definition of the swzy permutation.
223     */
224    for (unsigned i = 0; i < n; i++) {
225       const unsigned k = i % num_ids;
226       for (unsigned j = 0; j < m; j++) {
227          const unsigned l = j % num_ids;
228          p[j + m * i] = phys_ids[(swzx[l] + swzy[k]) % num_ids];
229       }
230    }
231 }
232 
233 #endif
234