1 // 2 // Copyright (C) 2009-2021 Intel Corporation 3 // 4 // SPDX-License-Identifier: MIT 5 // 6 // 7 8 // 9 // This file contains structure definitions shared by GRL OCL kernels and host code 10 // 11 12 #pragma once 13 14 #include "GRLGen12.h" 15 16 // NOTE: 17 // MSB(Most significant byte) - here I refer to it as a part of sorting that does MSB Radix sort, which can spawn additional work 18 // BLS(Bottom level sort) - here I refer to it as a last part of sorting a particular range(currently Bitonic), which cannot spawn additional work 19 // 20 21 #define MSB_RADIX_NUM_BINS 256 22 #define MSB_BITS_PER_ITERATION 8 // how many bits are sorted per iteration 23 #define MSB_SHIFT_BYTE_START_OFFSET 56 // start offset for byte shifting, first iteration will start from here 24 25 #define MSB_RADIX_NUM_VCONTEXTS 8 // NOTE: mkulikow: maybe expand/shrink? More means more MSB processed in parallel but more memory used 26 27 #define MSB_STACK_ENTRIES_NUM (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS * 7) // first level doesn't get spawned, so 7 iterations must fit here, 28 // since at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS we need 7 of these 29 30 #define MSB_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS) // one per context 31 32 #define BLS_DISPATCH_QUEUE_NUM_RECORDS (MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS) // each context can spawn MSB_RADIX_NUM_BINS, 33 // so at max one algorithm iteration can spawn MSB_RADIX_NUM_VCONTEXTS * MSB_RADIX_NUM_BINS 34 35 #define MSB_WG_SORT_ELEMENTS_THRESHOLD 256 // This tells us how many elements at max we can process in a single workgroup. 36 // If a single MSB entry needs more, then it will spawn more WGs 37 // after updating this also needs to update msb_radix_bitonic_sort.grl's computation of initial workgroups num 38 39 #define BOTTOM_LEVEL_SORT_THRESHOLD 512 // TODO: is 4096 best value? ON skl gives best performance 40 // Right now we use 256 workitems in simd16 which give us 16 hw threads, assuming 2KB per thread, we have 32KB SLM to play with. 41 // Since we use ulong(8bytes) we can store 4096 elements 42 // This also tells us that if number of elements to sort is less than this, we don't need to allocate scheduler 43 // Need to keep in sync with the GRL const BOTTOM_LEVEL_SORT_THRESHOLD 44 45 #define BOTTOM_LEVEL_SORT_MERGING_THRESHOLD 512 // This is the amount till which we'll merge small BLS'es produced by MSB into a single bigger BLS 46 47 GRL_NAMESPACE_BEGIN(GRL) 48 49 50 51 52 GRL_NAMESPACE_BEGIN(RTAS) 53 GRL_NAMESPACE_BEGIN(MORTON_MSB_RADIX_BITONIC_SORT) 54 55 struct MSBStackEntry 56 { 57 uint start_offset; 58 uint count; 59 uint iteration; 60 }; 61 62 struct MSBStack 63 { 64 dword num_entries; 65 struct MSBStackEntry entries[MSB_STACK_ENTRIES_NUM]; 66 }; 67 68 struct MSBRadixContext 69 { 70 uint start[MSB_RADIX_NUM_BINS]; 71 uint count[MSB_RADIX_NUM_BINS]; 72 uint num_wgs_in_flight; // this is used to identify which msb wg is last 73 uint num_keys; // number of keys to process 74 uint iteration; 75 ulong* keys_in; 76 ulong* keys_out; 77 78 uint start_offset; //offset from the beginning of the buffer 79 }; 80 81 struct MSBDispatchRecord 82 { 83 uint wgs_to_dispatch; // amount of workgroups to dispatch for this current record 84 }; 85 86 struct MSBDispatchQueue 87 { 88 dword num_records; 89 struct MSBDispatchRecord records[MSB_RADIX_NUM_VCONTEXTS]; // each context have its own record 90 }; 91 92 // BLS(Bottom Level Sort) - last stage of sorting which will not spawn any new tasks 93 struct BLSDispatchRecord 94 { 95 uint start_offset; // offset from the beginning of the buffer 96 uint count; 97 ulong* keys_in; // we don't need keys_out since we will write always to the same output buffer 98 }; 99 100 struct BLSDispatchQueue 101 { 102 dword num_records; 103 struct BLSDispatchRecord records[BLS_DISPATCH_QUEUE_NUM_RECORDS]; 104 }; 105 106 struct VContextScheduler 107 { 108 ///////////////////////////////////////////////////////////// 109 // State data used for communication with command streamer 110 // NOTE: This part must match definition in 'msb_radix_bitonic_sort.grl' 111 ///////////////////////////////////////////////////////////// 112 113 dword num_wgs_msb; // number of MSB workgroups being processed by current iteration 114 dword num_wgs_bls; // number of BLS workgroups being processed by current iteration 115 116 dword scheduler_postsync; 117 dword _pad1; 118 119 ///////////////////////////////////////////////////////////// 120 121 struct MSBDispatchQueue msb_queue; 122 struct BLSDispatchQueue bls_queue0; 123 struct BLSDispatchQueue bls_queue1; 124 125 struct BLSDispatchQueue* curr_bls_queue; 126 struct BLSDispatchQueue* next_bls_queue; 127 128 struct MSBStack msb_stack; 129 130 struct MSBRadixContext contexts[MSB_RADIX_NUM_VCONTEXTS]; 131 }; 132 133 GRL_NAMESPACE_END(MORTON_MSB_RADIX_BITONIC_SORT) 134 GRL_NAMESPACE_END(RTAS) 135 GRL_NAMESPACE_END(GRL) 136