xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/grl/gpu/intrinsics.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 //
2 // Copyright (C) 2009-2021 Intel Corporation
3 //
4 // SPDX-License-Identifier: MIT
5 //
6 //
7 
8 #pragma once
9 
10 // TODO: AABB_work_group_reduce is super slow, remove !!!
11 
12 #pragma cl_intel_subgroups : enable
13 #pragma cl_khr_fp16        : enable
14 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
15 
16 
17 uint intel_sub_group_ballot(bool valid);
18 
19 // atom_min
20 float __attribute__((overloadable)) atom_min(volatile __global float *p, float val);
21 float __attribute__((overloadable)) atom_min(volatile __local float *p, float val);
22 float __attribute__((overloadable)) atomic_min(volatile __global float *p, float val);
23 float __attribute__((overloadable)) atomic_min(volatile __local float *p, float val);
24 // atom_max
25 float __attribute__((overloadable)) atom_max(volatile __global float *p, float val);
26 float __attribute__((overloadable)) atom_max(volatile __local float *p, float val);
27 float __attribute__((overloadable)) atomic_max(volatile __global float *p, float val);
28 float __attribute__((overloadable)) atomic_max(volatile __local float *p, float val);
29 // atom_cmpxchg
30 float __attribute__((overloadable)) atom_cmpxchg(volatile __global float *p, float cmp, float val);
31 float __attribute__((overloadable)) atom_cmpxchg(volatile __local float *p, float cmp, float val);
32 float __attribute__((overloadable)) atomic_cmpxchg(volatile __global float *p, float cmp, float val);
33 float __attribute__((overloadable)) atomic_cmpxchg(volatile __local float *p, float cmp, float val);
34 
35 
36 
// Performs a single atomic add of 'val' to *p for the whole subgroup (only
// lane 0 issues the atomic) and returns the pre-add counter value, uniform
// across all lanes.
inline uint subgroup_single_atomic_add(global uint *p, uint val)
{
    const uint lane = get_sub_group_local_id();
    int old = 0;
    if (lane == 0)
        old = atomic_add(p, val);
    return sub_group_broadcast(old, 0);
}
43 
// Half the surface area of an axis-aligned box with edge lengths 'd':
// d.x*(d.y + d.z) + d.y*d.z, computed with a fused multiply-add.
inline float halfarea(const float3 d)
{
    const float yz = d.y * d.z;
    return fma(d.x, d.y + d.z, yz);
}

// Full surface area of an axis-aligned box with edge lengths 'd'.
inline float area(const float3 d)
{
    return 2.0f * halfarea(d);
}
53 
// Index (0, 1 or 2) of the component of 'a' with the largest magnitude.
// Ties between |x| and |y| resolve to y; a tie with |z| resolves to the
// earlier winner (strict '>' comparisons, matching the original selection).
inline uint maxDim(const float3 a)
{
    const float3 m = fabs(a);
    uint idx;
    float best;
    if (m.x > m.y)
    {
        idx = 0;
        best = m.x;
    }
    else
    {
        idx = 1;
        best = m.y;
    }
    if (m.z > best)
        idx = 2;
    return idx;
}

// Returns a permutation (kx, ky, kz) of {0,1,2} where kz is the axis of
// largest magnitude; kx and ky are swapped when a[kz] is non-negative.
inline uint3 sortByMaxDim(const float3 a)
{
    const uint kz = maxDim(a);
    uint kx = (kz + 1) % 3;
    uint ky = (kx + 1) % 3;
    if (a[kz] >= 0.0f)
    {
        const uint tmp = kx;
        kx = ky;
        ky = tmp;
    }
    return (uint3)(kx, ky, kz);
}
74 
// Sorts the four components of 'dist' into ascending order with a fixed
// comparator network (no cross-lane communication).
inline uint4 sort4_ascending(const uint4 dist)
{
    // Stage 1: compare pairs (0,2) and (1,3).
    const uint lo02 = min(dist.s0, dist.s2);
    const uint hi02 = max(dist.s0, dist.s2);
    const uint lo13 = min(dist.s1, dist.s3);
    const uint hi13 = max(dist.s1, dist.s3);
    // Stage 2: smallest and largest are now determined.
    const uint s0 = min(lo02, lo13);
    const uint mid_a = max(lo02, lo13);
    const uint mid_b = min(hi02, hi13);
    const uint s3 = max(hi02, hi13);
    // Stage 3: order the two middle elements.
    return (uint4)(s0, min(mid_a, mid_b), max(mid_a, mid_b), s3);
}
95 
// Partner-lane tables for the SIMD8 compare-exchange sorting networks below:
// shuffleX[slot] is the lane index that slot 'slot' exchanges values with in
// the corresponding network stage (see compare_exchange_* / sort8_*).
__constant const uint shuffleA[8] = {1, 0, 3, 2, 5, 4, 7, 6};
__constant const uint shuffleB[8] = {2, 3, 0, 1, 7, 6, 5, 4};
__constant const uint shuffleC[8] = {1, 0, 3, 2, 5, 4, 7, 6};
__constant const uint shuffleD[8] = {7, 6, 5, 4, 3, 2, 1, 0};
__constant const uint shuffleE[8] = {2, 3, 0, 1, 6, 7, 4, 5};
__constant const uint shuffleF[8] = {1, 0, 3, 2, 5, 4, 7, 6};
__constant const uint shuffleG[8] = {0, 2, 1, 3, 5, 4, 7, 6};

// Per-slot select masks paired with the shuffles above; each entry chooses
// which of the exchanged pair (min or max) the slot keeps — see the select()
// call in compare_exchange_descending/ascending.
__constant const uint selAA[8] = {0, 1, 0, 1, 0, 1, 0, 1};
__constant const uint selCC[8] = {0, 0, 1, 1, 0, 0, 1, 1};
__constant const uint selF0[8] = {0, 0, 0, 0, 1, 1, 1, 1};

// Select mask used only by the final stage of sort4_descending.
__constant const uint selGG[8] = {0, 0, 1, 0, 1, 1, 1, 1};
109 
// One compare-exchange step of a subgroup sorting network: fetch the value
// held by lane 'shuffleMask' and keep either the max (selectMask == 0) or the
// min (selectMask != 0) of the pair. Used to build descending sorts.
inline uint compare_exchange_descending(const uint a0, const uint shuffleMask, const uint selectMask)
{
    const uint partner = intel_sub_group_shuffle(a0, shuffleMask);
    const uint lo = min(a0, partner);
    const uint hi = max(a0, partner);
    return select(hi, lo, selectMask);
}

// Mirror of compare_exchange_descending: keeps the min when selectMask == 0
// and the max otherwise. Used to build ascending sorts.
inline uint compare_exchange_ascending(const uint a0, const uint shuffleMask, const uint selectMask)
{
    const uint partner = intel_sub_group_shuffle(a0, shuffleMask);
    const uint lo = min(a0, partner);
    const uint hi = max(a0, partner);
    return select(lo, hi, selectMask);
}
125 
// Sorts the values held by each group of 8 subgroup lanes into descending
// order using a six-stage compare-exchange network driven by the shuffle/
// select tables above.
inline uint sort8_descending(const uint aa)
{
    const uint slot = get_sub_group_local_id() % 8;
    uint v = compare_exchange_descending(aa, shuffleA[slot], selAA[slot]);
    v = compare_exchange_descending(v, shuffleB[slot], selCC[slot]);
    v = compare_exchange_descending(v, shuffleC[slot], selAA[slot]);
    v = compare_exchange_descending(v, shuffleD[slot], selF0[slot]);
    v = compare_exchange_descending(v, shuffleE[slot], selCC[slot]);
    v = compare_exchange_descending(v, shuffleF[slot], selAA[slot]);
    return v;
}

// Ascending counterpart of sort8_descending (same network, ascending
// comparators).
inline uint sort8_ascending(const uint aa)
{
    const uint slot = get_sub_group_local_id() % 8;
    uint v = compare_exchange_ascending(aa, shuffleA[slot], selAA[slot]);
    v = compare_exchange_ascending(v, shuffleB[slot], selCC[slot]);
    v = compare_exchange_ascending(v, shuffleC[slot], selAA[slot]);
    v = compare_exchange_ascending(v, shuffleD[slot], selF0[slot]);
    v = compare_exchange_ascending(v, shuffleE[slot], selCC[slot]);
    v = compare_exchange_ascending(v, shuffleF[slot], selAA[slot]);
    return v;
}

// Three-stage descending sort over groups of 4 lanes (operates on 8 slots,
// final stage uses the shuffleG/selGG tables).
inline uint sort4_descending(const uint aa)
{
    const uint slot = get_sub_group_local_id() % 8;
    uint v = compare_exchange_descending(aa, shuffleA[slot], selAA[slot]);
    v = compare_exchange_descending(v, shuffleB[slot], selCC[slot]);
    v = compare_exchange_descending(v, shuffleG[slot], selGG[slot]);
    return v;
}
158 
// 64-bit variant of compare_exchange_descending: exchange with lane
// 'shuffleMask', keep max when selectMask == 0, min otherwise.
inline ulong compare_exchange_descending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
{
    const ulong partner = intel_sub_group_shuffle(a0, shuffleMask);
    const ulong lo = min(a0, partner);
    const ulong hi = max(a0, partner);
    return select(hi, lo, (ulong)selectMask);
}

// 64-bit variant of compare_exchange_ascending.
inline ulong compare_exchange_ascending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
{
    const ulong partner = intel_sub_group_shuffle(a0, shuffleMask);
    const ulong lo = min(a0, partner);
    const ulong hi = max(a0, partner);
    return select(lo, hi, (ulong)selectMask);
}

// Ascending 8-wide sort of 64-bit values; same network as sort8_ascending.
inline ulong sort8_ascending_ulong(const ulong aa)
{
    const uint slot = get_sub_group_local_id() % 8;
    ulong v = compare_exchange_ascending_ulong(aa, shuffleA[slot], selAA[slot]);
    v = compare_exchange_ascending_ulong(v, shuffleB[slot], selCC[slot]);
    v = compare_exchange_ascending_ulong(v, shuffleC[slot], selAA[slot]);
    v = compare_exchange_ascending_ulong(v, shuffleD[slot], selF0[slot]);
    v = compare_exchange_ascending_ulong(v, shuffleE[slot], selCC[slot]);
    v = compare_exchange_ascending_ulong(v, shuffleF[slot], selAA[slot]);
    return v;
}
186 
// Spreads the low 10 bits of 'v' so that bit i moves to bit 3*i
// (shift/mask cascade shared by all three components; the original
// duplicated this cascade inline three times).
inline uint grl_bitExpand3D(uint v)
{
    v = (v | (v << 16)) & 0x030000FF;
    v = (v | (v << 8)) & 0x0300F00F;
    v = (v | (v << 4)) & 0x030C30C3;
    v = (v | (v << 2)) & 0x09249249;
    return v;
}

// 30-bit 3D Morton code: interleaves the low 10 bits of in.x, in.y and in.z,
// with x in the least-significant position of each 3-bit group (in.w unused).
inline uint bitInterleave3D(const uint4 in)
{
    return grl_bitExpand3D(in.x)
         | (grl_bitExpand3D(in.y) << 1)
         | (grl_bitExpand3D(in.z) << 2);
}
207 
// Spreads the low 8 bits of 'v' so that bit i moves to bit 4*i
// (shift/mask cascade shared by all four components; the original
// duplicated this cascade inline four times).
inline uint grl_bitExpand4D(uint v)
{
    v = v & 0x000000ff;
    v = (v ^ (v << 16)) & 0x00c0003f;
    v = (v ^ (v << 8)) & 0x00c03807;
    v = (v ^ (v << 4)) & 0x08530853;
    v = (v ^ (v << 2)) & 0x09090909;
    v = (v ^ (v << 1)) & 0x11111111;
    return v;
}

// 32-bit 4D Morton code: interleaves the low 8 bits of in.x/y/z/w, with x in
// the least-significant position of each 4-bit group.
inline uint bitInterleave4D(const uint4 in)
{
    return grl_bitExpand4D(in.x)
         | (grl_bitExpand4D(in.y) << 1)
         | (grl_bitExpand4D(in.z) << 2)
         | (grl_bitExpand4D(in.w) << 3);
}
242 
// Spreads the low 16 bits of 'v' so that bit i moves to bit 4*i in a 64-bit
// result (cascade shared by all four components; the original duplicated it
// inline four times).
inline ulong grl_bitExpand4D_ulong(ulong v)
{
    v = v & 0x0000ffff;
    v = (v ^ (v << 32)) & 0x0000f800000007ff;
    v = (v ^ (v << 16)) & 0x0000f80007c0003f;
    v = (v ^ (v << 8)) & 0x00c0380700c03807;
    v = (v ^ (v << 4)) & 0x0843084308430843;
    v = (v ^ (v << 2)) & 0x0909090909090909;
    v = (v ^ (v << 1)) & 0x1111111111111111;
    return v;
}

// 64-bit 4D Morton code: interleaves the low 16 bits of in.x/y/z/w, with x in
// the least-significant position of each 4-bit group.
inline ulong ulong_bitInterleave4D(const uint4 in)
{
    const ulong x = grl_bitExpand4D_ulong(in.x);
    const ulong y = grl_bitExpand4D_ulong(in.y);
    const ulong z = grl_bitExpand4D_ulong(in.z);
    const ulong w = grl_bitExpand4D_ulong(in.w);
    return x | (y << 1) | (z << 2) | (w << 3);
}
281 
// Inverse of the 3D Morton spread: gathers every 3rd bit of 'x'
// (bits 0, 3, 6, ...) back into the low 10 bits of the result.
inline uint bitCompact(uint x)
{
    // Keep only the bits belonging to this component's 3-bit stride.
    x &= 0x09249249;
    // Successively fold the strided bits back together (reverse of the
    // shift/mask expansion cascade in bitInterleave3D).
    x = (x ^ (x >> 2)) & 0x030c30c3;
    x = (x ^ (x >> 4)) & 0x0300f00f;
    x = (x ^ (x >> 8)) & 0xff0000ff;
    x = (x ^ (x >> 16)) & 0x000003ff;
    return x;
}
291 
// Decodes a 30-bit 3D Morton code 'in' back into its x/y/z components.
//
// BUG FIX: the original read the uninitialized locals in their own
// initializers ("const uint x = bitCompact(x >> 0);" etc.) instead of the
// 'in' parameter — undefined behavior returning garbage. Decode from 'in'.
inline uint3 bitCompact3D(const uint in)
{
    const uint x = bitCompact(in >> 0);
    const uint y = bitCompact(in >> 1);
    const uint z = bitCompact(in >> 2);
    return (uint3)(x, y, z);
}
299 
// For each subgroup lane, returns the lowest lane index whose 'ID' equals
// this lane's slot number — i.e. inverts the lane->ID mapping for IDs in
// [0,8). Slots with no matching lane get ctz(0).
inline uint convertToPushIndices8(uint ID)
{
    const uint slot = get_sub_group_local_id();
    uint index = 0;
    for (uint i = 0; i < 8; i++)
    {
        const uint matches = intel_sub_group_ballot(ID == i);
        if (i == slot)
            index = ctz(matches);
    }
    return index;
}

// 16-wide variant of convertToPushIndices8 (IDs in [0,16)).
inline uint convertToPushIndices16(uint ID)
{
    const uint slot = get_sub_group_local_id();
    uint index = 0;
    for (uint i = 0; i < 16; i++)
    {
        const uint matches = intel_sub_group_ballot(ID == i);
        if (i == slot)
            index = ctz(matches);
    }
    return index;
}
325 
326 #define FLOAT_EXPONENT_MASK     (0x7F800000)  // used to be EXPONENT_MASK
327 #define FLOAT_MANTISSA_MASK     (0x007FFFFF)  // used to be MANTISSA_MASK
328 #define FLOAT_NEG_ONE_EXP_MASK  (0x3F000000)
329 #define FLOAT_BIAS              (127)
330 #define FLOAT_MANTISSA_BITS     (23)
331 
// Component-wise frexp() for float3: splits each component of 'len' into a
// mantissa (returned, magnitude in [0.5, 1.0)) and an integer exponent
// (written to *exp) such that len = mant * 2^exp.
// NOTE(review): operates on the raw IEEE-754 bit pattern; assumes normalized,
// non-zero inputs (zeros/denormals/inf/NaN are not special-cased) — confirm
// against callers.
inline float3 frexp_vec3(float3 len, int3* exp)
{
    // Keep the mantissa bits and splice in the exponent field of 0.5f
    // (0x3F000000), mapping each |component| into [0.5, 1.0).
    float3 mant = as_float3((int3)((as_int3(len) & (int3)FLOAT_MANTISSA_MASK) + (int3)FLOAT_NEG_ONE_EXP_MASK));
    // Fold an exact 1.0 result back to 0.5 (defensive; with a pure 23-bit
    // mantissa field this case looks unreachable — TODO confirm).
    mant = select(mant, (float3)(0.5f), (int3)(mant == (float3)(1.0f)));
    // frexp keeps the sign on the mantissa.
    mant = copysign(mant, len);
    // Unbias the exponent, rebased by +1 so mant lies in [0.5,1) not [1,2).
    *exp = ((as_int3(len) & (int3)FLOAT_EXPONENT_MASK) >> (int3)FLOAT_MANTISSA_BITS) - ((int3)FLOAT_BIAS - (int3)(1));
    return mant;
}
340 
341 
342 #ifndef uniform
343 #define uniform
344 #endif
345 
346 #ifndef varying
347 #define varying
348 #endif
349 
// Globally unique subgroup index across the dispatch (dimension 0 only).
uint get_sub_group_global_id()
{
    return get_num_sub_groups() * get_group_id(0) + get_sub_group_id();
}

// Each lane receives the number of set bits in 'mask' strictly below its own
// lane position (exclusive bit prefix count).
uint subgroup_bit_prefix_exclusive(uniform uint mask)
{
    varying ushort lane = get_sub_group_local_id();
    varying uint below = mask & ((1 << lane) - 1);
    return popcount(below);
}
363 
// Number of set bits in 'mask' strictly below bit position 'lane_idx'.
uint bit_prefix_exclusive(uniform uint mask, varying uint lane_idx )
{
    varying uint below = mask & ((1 << lane_idx) - 1);
    return popcount(below);
}
370 
371 
// Broadcasts all three components of 'v' from lane 'idx' to every lane.
uint3 sub_group_broadcast_uint3(uint3 v, uniform ushort idx)
{
    uint3 r;
    r.x = sub_group_broadcast(v.x, idx);
    r.y = sub_group_broadcast(v.y, idx);
    r.z = sub_group_broadcast(v.z, idx);
    return r;
}

// float3 variant of sub_group_broadcast_uint3.
float3 sub_group_broadcast_float3(float3 v, uniform ushort idx)
{
    float3 r;
    r.x = sub_group_broadcast(v.x, idx);
    r.y = sub_group_broadcast(v.y, idx);
    r.z = sub_group_broadcast(v.z, idx);
    return r;
}

// Component-wise subgroup min reduction of a float3.
float3 sub_group_reduce_min_float3(float3 v)
{
    float3 r;
    r.x = sub_group_reduce_min(v.x);
    r.y = sub_group_reduce_min(v.y);
    r.z = sub_group_reduce_min(v.z);
    return r;
}
// Component-wise subgroup max reduction of a float3.
float3 sub_group_reduce_max_float3(float3 v)
{
    float3 r;
    r.x = sub_group_reduce_max(v.x);
    r.y = sub_group_reduce_max(v.y);
    r.z = sub_group_reduce_max(v.z);
    return r;
}

// Reads all three components of 'v' from lane 'idx' (shuffle form, 'idx' may
// vary per lane).
float3 sub_group_shuffle_float3(float3 v, uniform ushort idx)
{
    float3 r;
    r.x = intel_sub_group_shuffle(v.x, idx);
    r.y = intel_sub_group_shuffle(v.y, idx);
    r.z = intel_sub_group_shuffle(v.z, idx);
    return r;
}
// uint3 variant of sub_group_shuffle_float3.
uint3 sub_group_shuffle_uint3(uint3 v, uniform ushort idx)
{
    uint3 r;
    r.x = intel_sub_group_shuffle(v.x, idx);
    r.y = intel_sub_group_shuffle(v.y, idx);
    r.z = intel_sub_group_shuffle(v.z, idx);
    return r;
}
411 
412 
// OR-reduction over the first lanes of the subgroup using a shuffle-down
// tree over 8 slots (named N6 by its callers); the result is broadcast from
// lane 0 to every lane.
inline uchar sub_group_reduce_or_N6(uchar val)
{
    uchar acc = val;
    acc |= intel_sub_group_shuffle_down(acc, acc, 4);
    acc |= intel_sub_group_shuffle_down(acc, acc, 2);
    acc |= intel_sub_group_shuffle_down(acc, acc, 1);
    return sub_group_broadcast(acc, 0);
}

// Same OR-reduction performed independently in each SIMD8 half of a SIMD16
// subgroup; each half reads the result from its own first lane (0 or 8).
inline uchar sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(uchar val)
{
    const uint half_base = (get_sub_group_local_id() / 8) * 8;
    uchar acc = val;
    acc |= intel_sub_group_shuffle_down(acc, acc, 4);
    acc |= intel_sub_group_shuffle_down(acc, acc, 2);
    acc |= intel_sub_group_shuffle_down(acc, acc, 1);
    return intel_sub_group_shuffle(acc, half_base);
}
430 
431 
// ---------------------------------------------------------------------------
// Relaxed, work-group-scope atomics on local (SLM) memory, wrapping the
// OpenCL 2.0 explicit atomic builtins. Each returns the previous value.
// ---------------------------------------------------------------------------

inline __attribute__((overloadable)) uint atomic_inc_local( local uint* p )
{
    return atomic_fetch_add_explicit((volatile local atomic_uint*)p, 1u,
                                     memory_order_relaxed, memory_scope_work_group);
}

inline __attribute__((overloadable)) int atomic_inc_local(local int* p)
{
    return atomic_fetch_add_explicit((volatile local atomic_int*)p, 1,
                                     memory_order_relaxed, memory_scope_work_group);
}

inline __attribute__((overloadable)) uint atomic_dec_local(local uint* p)
{
    return atomic_fetch_sub_explicit((volatile local atomic_uint*)p, 1u,
                                     memory_order_relaxed, memory_scope_work_group);
}

inline __attribute__((overloadable)) int atomic_dec_local(local int* p)
{
    return atomic_fetch_sub_explicit((volatile local atomic_int*)p, 1,
                                     memory_order_relaxed, memory_scope_work_group);
}

inline __attribute__((overloadable)) uint atomic_sub_local(local uint* p, uint n)
{
    return atomic_fetch_sub_explicit((volatile local atomic_uint*)p, n,
                                     memory_order_relaxed, memory_scope_work_group);
}

inline __attribute__((overloadable)) int atomic_sub_local(local int* p, int n )
{
    return atomic_fetch_sub_explicit((volatile local atomic_int*)p, n,
                                     memory_order_relaxed, memory_scope_work_group);
}

inline uint atomic_add_local( local uint* p, uint n )
{
    return atomic_fetch_add_explicit((volatile local atomic_uint*)p, n,
                                     memory_order_relaxed, memory_scope_work_group);
}

inline uint atomic_xor_local(local uint* p, uint n)
{
    return atomic_fetch_xor_explicit((volatile local atomic_uint*)p, n,
                                     memory_order_relaxed, memory_scope_work_group);
}

inline uint atomic_or_local(local uint* p, uint n)
{
    return atomic_fetch_or_explicit((volatile local atomic_uint*)p, n,
                                    memory_order_relaxed, memory_scope_work_group);
}

inline uint atomic_min_local(local uint* p, uint n)
{
    return atomic_fetch_min_explicit((volatile local atomic_uint*)p, n,
                                     memory_order_relaxed, memory_scope_work_group);
}

inline uint atomic_max_local(local uint* p, uint n)
{
    return atomic_fetch_max_explicit((volatile local atomic_uint*)p, n,
                                     memory_order_relaxed, memory_scope_work_group);
}
486 
487 
488 
489 
// ---------------------------------------------------------------------------
// Relaxed, device-scope atomics on global memory. Each returns the previous
// value (compare-exchange returns success/failure).
// ---------------------------------------------------------------------------

inline uint atomic_inc_global( global uint* p )
{
    return atomic_fetch_add_explicit((volatile global atomic_uint*)p, 1u,
                                     memory_order_relaxed, memory_scope_device);
}

inline uint atomic_dec_global(global uint* p)
{
    return atomic_fetch_sub_explicit((volatile global atomic_uint*)p, 1u,
                                     memory_order_relaxed, memory_scope_device);
}

// Strong CAS: on failure, *expected is updated with the observed value.
inline bool atomic_compare_exchange_global(global uint* p, uint* expected, uint desired)
{
    return atomic_compare_exchange_strong_explicit((volatile global atomic_uint*)p,
                                                   expected, desired,
                                                   memory_order_relaxed, memory_order_relaxed,
                                                   memory_scope_device);
}

inline uint atomic_add_global( global uint* p, uint n )
{
    return atomic_fetch_add_explicit((volatile global atomic_uint*)p, n,
                                     memory_order_relaxed, memory_scope_device);
}

inline uint atomic_sub_global(global uint* p, uint n)
{
    return atomic_fetch_sub_explicit((volatile global atomic_uint*)p, n,
                                     memory_order_relaxed, memory_scope_device);
}

inline uint atomic_or_global(global uint* p, uint n)
{
    return atomic_fetch_or_explicit((volatile global atomic_uint*)p, n,
                                    memory_order_relaxed, memory_scope_device);
}
519 
520 
// Device-scope increment with acquire ordering (pairs with the release
// variants below for producer/consumer style counters).
inline uint atomic_inc_global_acquire(global uint* p)
{
    return atomic_fetch_add_explicit((volatile global atomic_uint*)p, 1u,
                                     memory_order_acquire, memory_scope_device);
}


// Device-scope increment with release ordering.
inline uint atomic_inc_global_release(global uint* p)
{
    return atomic_fetch_add_explicit((volatile global atomic_uint*)p, 1u,
                                     memory_order_release, memory_scope_device);
}
// Device-scope decrement with release ordering.
inline uint atomic_dec_global_release(global uint* p)
{
    return atomic_fetch_sub_explicit((volatile global atomic_uint*)p, 1u,
                                     memory_order_release, memory_scope_device);
}
535 
// Atomic add through a generic-address-space pointer: dispatches to the
// global or local wrapper; silently returns 0 when 'p' is in neither space.
inline uint generic_atomic_add(uint* p, uint val)
{
    global uint* gp = to_global(p);
    if (gp != NULL)
        return atomic_add_global(gp, val);
    local uint* lp = to_local(p);
    if (lp != NULL)
        return atomic_add_local(lp, val);
    return 0;
}
544 
// Max-reduction over the first lanes of the subgroup using a shuffle-down
// tree over 8 slots; the result is broadcast from lane 0 to every lane.
inline __attribute__((overloadable)) uint sub_group_reduce_max_N6( uint n )
{
    uint m = n;
    m = max(m, intel_sub_group_shuffle_down(m, m, 4));
    m = max(m, intel_sub_group_shuffle_down(m, m, 2));
    m = max(m, intel_sub_group_shuffle_down(m, m, 1));
    return sub_group_broadcast(m, 0);
}

// float overload of sub_group_reduce_max_N6.
inline __attribute__((overloadable)) float sub_group_reduce_max_N6( float n )
{
    float m = n;
    m = max(m, intel_sub_group_shuffle_down(m, m, 4));
    m = max(m, intel_sub_group_shuffle_down(m, m, 2));
    m = max(m, intel_sub_group_shuffle_down(m, m, 1));
    return sub_group_broadcast(m, 0);
}

// Same max-reduction performed independently in each SIMD8 half of a SIMD16
// subgroup; each half reads the result from its own first lane (0 or 8).
inline __attribute__((overloadable)) float sub_group_reduce_max_N6_2xSIMD8_in_SIMD16(float n)
{
    float m = n;
    m = max(m, intel_sub_group_shuffle_down(m, m, 4));
    m = max(m, intel_sub_group_shuffle_down(m, m, 2));
    m = max(m, intel_sub_group_shuffle_down(m, m, 1));
    return intel_sub_group_shuffle(m, (get_sub_group_local_id() / 8) * 8);
}
568 
// Atomic increment through a generic-address-space pointer: dispatches to
// the global or local address space; silently returns 0 when 'p' is in
// neither space.
inline uint generic_atomic_inc(uint* p)
{
    if (to_global(p) != NULL)
        return atomic_inc_global(to_global(p));
    if (to_local(p) != NULL)
        // Consistency fix: use this file's explicit local wrapper (like
        // generic_atomic_add does) instead of the legacy atomic_inc builtin.
        return atomic_inc_local(to_local(p));
    return 0;
}
577 
578 
579 // Built-in GRL function which, if called in a kernel body, will force the kernel
580 //  to be compiled to the minimum SIMD width supported by the platform
581 void GRL_UseMinimumSIMDWidth();