/*
 * Copyright 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "intel_compute_slm.h"

#include <assert.h>

#include "util/macros.h"
#include "util/u_math.h"

/* One row of an SLM size table: an encoded value and the size, in kB, that
 * the encoding stands for.
 */
struct slm_encode {
   uint32_t encode;
   uint32_t size_in_kb;
};

/**
 * Find the first entry of \p table that is large enough for \p bytes.
 *
 * \p bytes is rounded up to whole kB before the comparison.  \p table must
 * hold \p table_len (> 0) entries sorted by ascending size_in_kb, so that
 * the first match is also the smallest one.
 *
 * A request larger than the last entry trips an assert; when asserts are
 * compiled out the last (largest) entry is returned instead.
 */
static inline const struct slm_encode *
slm_encode_lookup(const struct slm_encode *table, unsigned int table_len,
                  uint32_t bytes)
{
   const uint32_t kbytes = DIV_ROUND_UP(bytes, 1024);

   assert(table_len > 0);
   assert(kbytes <= table[table_len - 1].size_in_kb);

   for (unsigned int i = 0; i < table_len; i++) {
      if (table[i].size_in_kb >= kbytes)
         return &table[i];
   }

   return &table[table_len - 1];
}

/* SLM allocation sizes and their encodings used for gen >= 20.  Entries are
 * ordered by size, not by encoding, because slm_encode_lookup() returns the
 * first entry that fits.
 */
static const struct slm_encode xe2_slm_allocation_size_table[] = {
   { .encode = 0x0, .size_in_kb = 0, },
   { .encode = 0x1, .size_in_kb = 1, },
   { .encode = 0x2, .size_in_kb = 2, },
   { .encode = 0x3, .size_in_kb = 4, },
   { .encode = 0x4, .size_in_kb = 8, },
   { .encode = 0x5, .size_in_kb = 16, },
   { .encode = 0x8, .size_in_kb = 24, },
   { .encode = 0x6, .size_in_kb = 32, },
   { .encode = 0x9, .size_in_kb = 48, },
   { .encode = 0x7, .size_in_kb = 64, },
   { .encode = 0xA, .size_in_kb = 96, },
   { .encode = 0xB, .size_in_kb = 128, },
   { .encode = 0xC, .size_in_kb = 192, },
   { .encode = 0xD, .size_in_kb = 256, },
   { .encode = 0xE, .size_in_kb = 384, },
};

/**
 * Round a requested SLM size up to the size that will actually be allocated.
 *
 * For gen >= 20 the result is the smallest entry of
 * xe2_slm_allocation_size_table that fits \p bytes.  For older gens the
 * size is a power of two with a gen-dependent minimum when non-zero: 1 kB
 * for gen >= 9 and 4 kB before that.  Those gens assert that \p bytes does
 * not exceed 64 kB.
 *
 * \return the allocation size in bytes; 0 when \p bytes is 0.
 */
uint32_t
intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes)
{
   if (gen >= 20) {
      const struct slm_encode *entry =
         slm_encode_lookup(xe2_slm_allocation_size_table,
                           ARRAY_SIZE(xe2_slm_allocation_size_table),
                           bytes);

      return entry->size_in_kb * 1024;
   }

   assert(bytes <= 64 * 1024);

   if (bytes == 0)
      return 0;

   return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 1024 : 4096);
}

/**
 * Translate a requested SLM size in bytes into its encoded representation.
 *
 * \return 0 when \p bytes is 0; for gen >= 20 the encoding from
 *         xe2_slm_allocation_size_table; otherwise the power-of-two based
 *         encoding described in the table below.
 */
uint32_t
intel_compute_slm_encode_size(unsigned gen, uint32_t bytes)
{
   if (bytes == 0)
      return 0;

   if (gen >= 20) {
      const struct slm_encode *entry =
         slm_encode_lookup(xe2_slm_allocation_size_table,
                           ARRAY_SIZE(xe2_slm_allocation_size_table),
                           bytes);

      return entry->encode;
   }

   /* Shared Local Memory is specified as powers of two, and encoded in
    * INTERFACE_DESCRIPTOR_DATA with the following representations:
    *
    * Size   | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
    * -------------------------------------------------------------------
    * Gfx7-8 |    0 | none | none |    1 |    2 |     4 |     8 |    16 |
    * -------------------------------------------------------------------
    * Gfx9+  |    0 |    1 |    2 |    3 |    4 |     5 |     6 |     7 |
    */
   uint32_t slm_size = intel_compute_slm_calculate_size(gen, bytes);
   assert(util_is_power_of_two_nonzero(slm_size));

   if (gen >= 9) {
      /* Turn an exponent of 10 (1024 B, i.e. 1 kB) into 1.  ffs() is
       * 1-based, so it returns exponent + 1 for a power of two.
       */
      assert(slm_size >= 1024);
      slm_size = ffs(slm_size) - 10;
   } else {
      assert(slm_size >= 4096);
      /* Convert to the pre-Gfx9 representation: the size in 4 kB units. */
      slm_size = slm_size / 4096;
   }

   return slm_size;
}

/* Preferred SLM allocation sizes for gen < 20.
 *
 * encode = 0 sets to largest SLM size supported in subslice
 */
static const struct slm_encode preferred_slm_allocation_size_table[] = {
   { .encode = 0x8, .size_in_kb = 0, },
   { .encode = 0x9, .size_in_kb = 16, },
   { .encode = 0xa, .size_in_kb = 32, },
   { .encode = 0xb, .size_in_kb = 64, },
   { .encode = 0xc, .size_in_kb = 96, },
   { .encode = 0xd, .size_in_kb = 128, },
};

/* Preferred SLM allocation sizes for gen >= 20. */
static const struct slm_encode xe2_preferred_slm_allocation_size_table[] = {
   { .encode = 0x0, .size_in_kb = 0, },
   { .encode = 0x1, .size_in_kb = 16, },
   { .encode = 0x2, .size_in_kb = 32, },
   { .encode = 0x3, .size_in_kb = 64, },
   { .encode = 0x4, .size_in_kb = 96, },
   { .encode = 0x5, .size_in_kb = 128, },
   { .encode = 0x6, .size_in_kb = 160, },
   { .encode = 0x7, .size_in_kb = 192, },
   { .encode = 0x8, .size_in_kb = 224, },
   { .encode = 0x9, .size_in_kb = 256, },
   { .encode = 0xA, .size_in_kb = 384, },
};

/**
 * Encode a preferred SLM size given in bytes, picking the table that
 * matches \p gen (the Xe2 table for gen >= 20, the older one otherwise).
 */
static uint32_t
intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes)
{
   const struct slm_encode *table;
   unsigned int table_len;

   if (gen >= 20) {
      table = xe2_preferred_slm_allocation_size_table;
      table_len = ARRAY_SIZE(xe2_preferred_slm_allocation_size_table);
   } else {
      table = preferred_slm_allocation_size_table;
      table_len = ARRAY_SIZE(preferred_slm_allocation_size_table);
   }

   return slm_encode_lookup(table, table_len, bytes)->encode;
}

/**
 * Compute the encoded shared local memory size to be allocated for each
 * sub-slice.
 *
 * It estimates how many workgroups will run concurrently per sub-slice and
 * multiplies that by the SLM size of one workgroup.  The result is capped
 * at the device's maximum preferred SLM size and then encoded for
 * devinfo->ver.
 *
 * \param devinfo                    device description
 * \param slm_size_per_workgroup     SLM bytes used by one workgroup; 0 gives
 *                                   the encoding for a size of 0
 * \param invocations_per_workgroup  invocations in one workgroup; expected
 *                                   to be non-zero
 * \param cs_simd                    SIMD width of the compute shader
 */
uint32_t
intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo,
                                             const uint32_t slm_size_per_workgroup,
                                             const uint32_t invocations_per_workgroup,
                                             const uint8_t cs_simd)
{
   const uint32_t max_preferred_slm_size =
      intel_device_info_get_max_preferred_slm_size(devinfo);
   const uint32_t invocations_per_ss =
      intel_device_info_get_eu_count_first_subslice(devinfo) *
      devinfo->num_thread_per_eu * cs_simd;
   uint32_t preferred_slm_size = 0;

   if (slm_size_per_workgroup) {
      assert(invocations_per_workgroup > 0);

      /* Guard the division so builds without asserts cannot divide by
       * zero.
       */
      uint32_t workgroups_per_ss = invocations_per_workgroup ?
         invocations_per_ss / invocations_per_workgroup : 1u;

      /* A workgroup with more invocations than a sub-slice runs at once
       * makes the division truncate to 0.  The sub-slice still has to hold
       * the SLM of the workgroup it is running, so never go below one.
       */
      workgroups_per_ss = MAX2(workgroups_per_ss, 1u);

      preferred_slm_size = workgroups_per_ss * slm_size_per_workgroup;
      preferred_slm_size = MIN2(preferred_slm_size, max_preferred_slm_size);
   }

   assert(preferred_slm_size >= slm_size_per_workgroup);
   return intel_compute_preferred_slm_encode_size(devinfo->ver,
                                                  preferred_slm_size);
}