1 //
2 // Copyright (C) 2009-2021 Intel Corporation
3 //
4 // SPDX-License-Identifier: MIT
5 //
6 //
7
8 #pragma once
9
10 // TODO: AABB_work_group_reduce is super slow, remove !!!
11
12 #pragma cl_intel_subgroups : enable
13 #pragma cl_khr_fp16 : enable
14 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
15
16
// Returns a bitmask with one bit per subgroup lane, set where 'valid' is true.
uint intel_sub_group_ballot(bool valid);

// Floating-point atomic min/max/cmpxchg overloads for global and local memory.
// Implementations are supplied by the OpenCL compiler/runtime; declared here so
// this header is self-contained. Each returns the value previously at *p.
// atom_min
float __attribute__((overloadable)) atom_min(volatile __global float *p, float val);
float __attribute__((overloadable)) atom_min(volatile __local float *p, float val);
float __attribute__((overloadable)) atomic_min(volatile __global float *p, float val);
float __attribute__((overloadable)) atomic_min(volatile __local float *p, float val);
// atom_max
float __attribute__((overloadable)) atom_max(volatile __global float *p, float val);
float __attribute__((overloadable)) atom_max(volatile __local float *p, float val);
float __attribute__((overloadable)) atomic_max(volatile __global float *p, float val);
float __attribute__((overloadable)) atomic_max(volatile __local float *p, float val);
// atom_cmpxchg
float __attribute__((overloadable)) atom_cmpxchg(volatile __global float *p, float cmp, float val);
float __attribute__((overloadable)) atom_cmpxchg(volatile __local float *p, float cmp, float val);
float __attribute__((overloadable)) atomic_cmpxchg(volatile __global float *p, float cmp, float val);
float __attribute__((overloadable)) atomic_cmpxchg(volatile __local float *p, float cmp, float val);
34
35
36
// Issue one atomic_add for the whole subgroup: lane 0 performs the atomic,
// then the fetched (pre-add) value is broadcast to every lane.
inline uint subgroup_single_atomic_add(global uint *p, uint val)
{
    const uint lane = get_sub_group_local_id();
    int fetched = 0;
    if (lane == 0)
        fetched = atomic_add(p, val);
    return sub_group_broadcast(fetched, 0);
}
43
// Half the surface area of a box with extents d: d.x*(d.y + d.z) + d.y*d.z.
inline float halfarea(const float3 d)
{
    const float yz = d.y * d.z;
    return fma(d.x, d.y + d.z, yz);
}
48
// Full surface area of a box with extents d (twice the half-area).
inline float area(const float3 d)
{
    return 2.0f * halfarea(d);
}
53
// Index (0, 1 or 2) of the component of 'a' with the largest absolute value.
// Ties between |x| and |y| resolve to y; a tie with |z| resolves away from z.
inline uint maxDim(const float3 a)
{
    const float3 m = fabs(a);
    uint idx;
    float best;
    if (m.x > m.y) { idx = 0; best = m.x; }
    else           { idx = 1; best = m.y; }
    if (m.z > best)
        idx = 2;
    return idx;
}
63
// Build a right-handed-style axis permutation (kx, ky, kz) where kz is the
// dominant axis of 'a'; the other two axes are swapped when a[kz] >= 0 so the
// orientation stays consistent with the sign of the dominant component.
inline uint3 sortByMaxDim(const float3 a)
{
    const uint kz = maxDim(a);
    uint kx = (kz + 1) % 3;
    uint ky = (kx + 1) % 3;
    if (a[kz] >= 0.0f)
    {
        const uint tmp = kx;
        kx = ky;
        ky = tmp;
    }
    return (uint3)(kx, ky, kz);
}
74
// Sort the four lanes of 'dist' into ascending order with a 5-comparator
// sorting network (three stages of min/max exchanges).
inline uint4 sort4_ascending(const uint4 dist)
{
    // Stage 1: compare elements two apart (0<->2, 1<->3).
    const uint lo02 = min(dist.s0, dist.s2);
    const uint lo13 = min(dist.s1, dist.s3);
    const uint hi02 = max(dist.s0, dist.s2);
    const uint hi13 = max(dist.s1, dist.s3);
    // Stage 2: smallest of the lows, largest of the highs are final.
    const uint r0 = min(lo02, lo13);
    const uint mid_a = max(lo02, lo13);
    const uint mid_b = min(hi02, hi13);
    const uint r3 = max(hi02, hi13);
    // Stage 3: order the two middle elements.
    return (uint4)(r0, min(mid_a, mid_b), max(mid_a, mid_b), r3);
}
95
// Per-lane shuffle partners and min/max selection masks for the subgroup
// bitonic sorting networks below, indexed by subgroup slot 0..7.
// shuffleX[slot] is the lane to exchange with in that stage; selXX[slot]
// chooses whether the lane keeps the min (1) or the max (0) of the pair
// (reversed for the ascending variants).
__constant const uint shuffleA[8] = {1, 0, 3, 2, 5, 4, 7, 6};
__constant const uint shuffleB[8] = {2, 3, 0, 1, 7, 6, 5, 4};
__constant const uint shuffleC[8] = {1, 0, 3, 2, 5, 4, 7, 6};
__constant const uint shuffleD[8] = {7, 6, 5, 4, 3, 2, 1, 0};
__constant const uint shuffleE[8] = {2, 3, 0, 1, 6, 7, 4, 5};
__constant const uint shuffleF[8] = {1, 0, 3, 2, 5, 4, 7, 6};
__constant const uint shuffleG[8] = {0, 2, 1, 3, 5, 4, 7, 6};

__constant const uint selAA[8] = {0, 1, 0, 1, 0, 1, 0, 1};
__constant const uint selCC[8] = {0, 0, 1, 1, 0, 0, 1, 1};
__constant const uint selF0[8] = {0, 0, 0, 0, 1, 1, 1, 1};

// Selection mask used only by the final stage of sort4_descending.
__constant const uint selGG[8] = {0, 0, 1, 0, 1, 1, 1, 1};
109
// One compare-exchange step of a bitonic network: exchange with the lane
// named by shuffleMask; keep the min when selectMask is non-zero, else the max.
inline uint compare_exchange_descending(const uint a0, const uint shuffleMask, const uint selectMask)
{
    const uint partner = intel_sub_group_shuffle(a0, shuffleMask);
    return selectMask ? min(a0, partner) : max(a0, partner);
}
117
// One compare-exchange step of a bitonic network: exchange with the lane
// named by shuffleMask; keep the max when selectMask is non-zero, else the min.
inline uint compare_exchange_ascending(const uint a0, const uint shuffleMask, const uint selectMask)
{
    const uint partner = intel_sub_group_shuffle(a0, shuffleMask);
    return selectMask ? max(a0, partner) : min(a0, partner);
}
125
// Bitonic sort of 8 values held one-per-lane (slots 0..7 of the subgroup),
// descending order: six compare-exchange stages driven by the tables above.
inline uint sort8_descending(const uint aa)
{
    const uint slot = get_sub_group_local_id() % 8;
    uint v = aa;
    v = compare_exchange_descending(v, shuffleA[slot], selAA[slot]);
    v = compare_exchange_descending(v, shuffleB[slot], selCC[slot]);
    v = compare_exchange_descending(v, shuffleC[slot], selAA[slot]);
    v = compare_exchange_descending(v, shuffleD[slot], selF0[slot]);
    v = compare_exchange_descending(v, shuffleE[slot], selCC[slot]);
    v = compare_exchange_descending(v, shuffleF[slot], selAA[slot]);
    return v;
}
137
// Bitonic sort of 8 values held one-per-lane (slots 0..7 of the subgroup),
// ascending order: six compare-exchange stages driven by the tables above.
inline uint sort8_ascending(const uint aa)
{
    const uint slot = get_sub_group_local_id() % 8;
    uint v = aa;
    v = compare_exchange_ascending(v, shuffleA[slot], selAA[slot]);
    v = compare_exchange_ascending(v, shuffleB[slot], selCC[slot]);
    v = compare_exchange_ascending(v, shuffleC[slot], selAA[slot]);
    v = compare_exchange_ascending(v, shuffleD[slot], selF0[slot]);
    v = compare_exchange_ascending(v, shuffleE[slot], selCC[slot]);
    v = compare_exchange_ascending(v, shuffleF[slot], selAA[slot]);
    return v;
}
149
// Bitonic sort of two independent groups of 4 lane-held values, descending:
// three compare-exchange stages (the last uses the G tables).
inline uint sort4_descending(const uint aa)
{
    const uint slot = get_sub_group_local_id() % 8;
    uint v = aa;
    v = compare_exchange_descending(v, shuffleA[slot], selAA[slot]);
    v = compare_exchange_descending(v, shuffleB[slot], selCC[slot]);
    v = compare_exchange_descending(v, shuffleG[slot], selGG[slot]);
    return v;
}
158
// 64-bit variant of compare_exchange_descending: keep the min when
// selectMask is non-zero, else the max of the exchanged pair.
inline ulong compare_exchange_descending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
{
    const ulong partner = intel_sub_group_shuffle(a0, shuffleMask);
    return selectMask ? min(a0, partner) : max(a0, partner);
}
166
// 64-bit variant of compare_exchange_ascending: keep the max when
// selectMask is non-zero, else the min of the exchanged pair.
inline ulong compare_exchange_ascending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask)
{
    const ulong partner = intel_sub_group_shuffle(a0, shuffleMask);
    return selectMask ? max(a0, partner) : min(a0, partner);
}
174
// Bitonic sort of 8 ulong values held one-per-lane (slots 0..7), ascending.
inline ulong sort8_ascending_ulong(const ulong aa)
{
    const uint slot = get_sub_group_local_id() % 8;
    ulong v = aa;
    v = compare_exchange_ascending_ulong(v, shuffleA[slot], selAA[slot]);
    v = compare_exchange_ascending_ulong(v, shuffleB[slot], selCC[slot]);
    v = compare_exchange_ascending_ulong(v, shuffleC[slot], selAA[slot]);
    v = compare_exchange_ascending_ulong(v, shuffleD[slot], selF0[slot]);
    v = compare_exchange_ascending_ulong(v, shuffleE[slot], selCC[slot]);
    v = compare_exchange_ascending_ulong(v, shuffleF[slot], selAA[slot]);
    return v;
}
186
// Interleave the low 10 bits of in.x/in.y/in.z into a 30-bit 3D Morton code
// (x in bits 0,3,6,..., y shifted up by 1, z by 2). Same spread sequence as
// the scalar original, applied to all three components at once.
inline uint bitInterleave3D(const uint4 in)
{
    uint3 v = (uint3)(in.x, in.y, in.z);
    v = (v | (v << 16)) & 0x030000FF;
    v = (v | (v << 8))  & 0x0300F00F;
    v = (v | (v << 4))  & 0x030C30C3;
    v = (v | (v << 2))  & 0x09249249;
    return v.x | (v.y << 1) | (v.z << 2);
}
207
// Interleave the low 8 bits of all four components into a 32-bit 4D Morton
// code (x in bits 0,4,8,..., y/z/w shifted up by 1/2/3). The spread masks are
// identical to the scalar original; the work is done on the whole vector.
inline uint bitInterleave4D(const uint4 in)
{
    uint4 v = in & 0x000000ff;
    v = (v ^ (v << 16)) & 0x00c0003f;
    v = (v ^ (v << 8))  & 0x00c03807;
    v = (v ^ (v << 4))  & 0x08530853;
    v = (v ^ (v << 2))  & 0x09090909;
    v = (v ^ (v << 1))  & 0x11111111;
    return v.x | (v.y << 1) | (v.z << 2) | (v.w << 3);
}
242
// 64-bit 4D Morton code: interleave the low 16 bits of each component
// (x in bits 0,4,8,..., y/z/w shifted up by 1/2/3). Same spread sequence as
// the scalar original, vectorized over all four components.
inline ulong ulong_bitInterleave4D(const uint4 in)
{
    ulong4 v = (ulong4)(in.x, in.y, in.z, in.w) & (ulong)0x0000ffff;
    v = (v ^ (v << 32)) & (ulong)0x0000f800000007ff;
    v = (v ^ (v << 16)) & (ulong)0x0000f80007c0003f;
    v = (v ^ (v << 8))  & (ulong)0x00c0380700c03807;
    v = (v ^ (v << 4))  & (ulong)0x0843084308430843;
    v = (v ^ (v << 2))  & (ulong)0x0909090909090909;
    v = (v ^ (v << 1))  & (ulong)0x1111111111111111;
    return v.x | (v.y << 1) | (v.z << 2) | (v.w << 3);
}
281
// Inverse of the 3D bit spread: gather every third bit of x back into
// 10 contiguous low bits.
inline uint bitCompact(uint x)
{
    uint v = x & 0x09249249;
    v = (v ^ (v >> 2))  & 0x030c30c3;
    v = (v ^ (v >> 4))  & 0x0300f00f;
    v = (v ^ (v >> 8))  & 0xff0000ff;
    v = (v ^ (v >> 16)) & 0x000003ff;
    return v;
}
291
// Decode a 30-bit 3D Morton code 'in' into its three 10-bit coordinates.
// BUGFIX: the previous version passed the uninitialized locals x/y/z to
// bitCompact() instead of the input 'in' (undefined behavior, garbage output).
inline uint3 bitCompact3D(const uint in)
{
    const uint x = bitCompact(in >> 0);
    const uint y = bitCompact(in >> 1);
    const uint z = bitCompact(in >> 2);
    return (uint3)(x, y, z);
}
299
// Invert a lane permutation over a SIMD8 subgroup: given each lane's target
// slot 'ID', return the lane index whose ID equals this lane's slot.
// Every lane must execute the ballot in each iteration, so only the
// assignment is predicated.
inline uint convertToPushIndices8(uint ID)
{
    const uint slot = get_sub_group_local_id();
    uint index = 0;
    for (uint candidate = 0; candidate < 8; candidate++)
    {
        const uint match = intel_sub_group_ballot(ID == candidate);
        if (slot == candidate)
            index = ctz(match);
    }
    return index;
}
312
// Invert a lane permutation over a SIMD16 subgroup: given each lane's target
// slot 'ID', return the lane index whose ID equals this lane's slot.
// Every lane must execute the ballot in each iteration, so only the
// assignment is predicated.
inline uint convertToPushIndices16(uint ID)
{
    const uint slot = get_sub_group_local_id();
    uint index = 0;
    for (uint candidate = 0; candidate < 16; candidate++)
    {
        const uint match = intel_sub_group_ballot(ID == candidate);
        if (slot == candidate)
            index = ctz(match);
    }
    return index;
}
325
// IEEE-754 single-precision bit-field constants used by frexp_vec3() below.
#define FLOAT_EXPONENT_MASK (0x7F800000) // used to be EXPONENT_MASK
#define FLOAT_MANTISSA_MASK (0x007FFFFF) // used to be MANTISSA_MASK
#define FLOAT_NEG_ONE_EXP_MASK (0x3F000000) // bit pattern of 0.5f (exponent field for 2^-1)
#define FLOAT_BIAS (127)
#define FLOAT_MANTISSA_BITS (23)
331
// Component-wise frexp() for float3: splits each component of 'len' into a
// mantissa with magnitude in [0.5, 1.0) and an integer exponent such that
// len = mant * 2^exp, working directly on the IEEE-754 bit patterns.
// NOTE(review): no special-casing of zero/denormal/inf/NaN inputs —
// presumably callers pass finite normalized values; verify at call sites.
inline float3 frexp_vec3(float3 len, int3* exp)
{
    // Keep the mantissa bits and substitute the exponent field of 0.5f.
    float3 mant = as_float3((int3)((as_int3(len) & (int3)FLOAT_MANTISSA_MASK) + (int3)FLOAT_NEG_ONE_EXP_MASK));
    // Fold any component that decoded to exactly 1.0 back to 0.5.
    mant = select(mant, (float3)(0.5f), (int3)(mant == (float3)(1.0f)));
    // Reapply the sign of the input.
    mant = copysign(mant, len);
    // Unbiased exponent, offset by +1 so that mant is in [0.5, 1.0) not [1, 2).
    *exp = ((as_int3(len) & (int3)FLOAT_EXPONENT_MASK) >> (int3)FLOAT_MANTISSA_BITS) - ((int3)FLOAT_BIAS - (int3)(1));
    return mant;
}
340
341
// Documentation-only qualifiers marking whether a value is subgroup-uniform
// or varies per lane; they expand to nothing.
#ifndef uniform
#define uniform
#endif

#ifndef varying
#define varying
#endif
349
// Globally unique subgroup index across the dispatch (dimension 0 only).
uint get_sub_group_global_id()
{
    return get_num_sub_groups() * get_group_id(0) + get_sub_group_id();
}
354
355 // each lane contains the number of 1 bits below the corresponding position in 'mask'
// each lane contains the number of 1 bits below the corresponding position in 'mask'
uint subgroup_bit_prefix_exclusive(uniform uint mask)
{
    const varying uint below = (1u << get_sub_group_local_id()) - 1;
    return popcount(mask & below);
}
363
// Number of 1 bits in 'mask' strictly below bit position 'lane_idx'.
uint bit_prefix_exclusive(uniform uint mask, varying uint lane_idx )
{
    const varying uint below = (1u << lane_idx) - 1;
    return popcount(mask & below);
}
370
371
// Broadcast each component of a uint3 from lane 'idx' to all lanes.
uint3 sub_group_broadcast_uint3(uint3 v, uniform ushort idx)
{
    uint3 r;
    r.x = sub_group_broadcast(v.x, idx);
    r.y = sub_group_broadcast(v.y, idx);
    r.z = sub_group_broadcast(v.z, idx);
    return r;
}
378
// Broadcast each component of a float3 from lane 'idx' to all lanes.
float3 sub_group_broadcast_float3(float3 v, uniform ushort idx)
{
    float3 r;
    r.x = sub_group_broadcast(v.x, idx);
    r.y = sub_group_broadcast(v.y, idx);
    r.z = sub_group_broadcast(v.z, idx);
    return r;
}
385
// Component-wise subgroup minimum of a float3.
float3 sub_group_reduce_min_float3(float3 v)
{
    float3 r;
    r.x = sub_group_reduce_min(v.x);
    r.y = sub_group_reduce_min(v.y);
    r.z = sub_group_reduce_min(v.z);
    return r;
}
// Component-wise subgroup maximum of a float3.
float3 sub_group_reduce_max_float3(float3 v)
{
    float3 r;
    r.x = sub_group_reduce_max(v.x);
    r.y = sub_group_reduce_max(v.y);
    r.z = sub_group_reduce_max(v.z);
    return r;
}
398
// Read each component of a float3 from lane 'idx'.
float3 sub_group_shuffle_float3(float3 v, uniform ushort idx)
{
    float3 r;
    r.x = intel_sub_group_shuffle(v.x, idx);
    r.y = intel_sub_group_shuffle(v.y, idx);
    r.z = intel_sub_group_shuffle(v.z, idx);
    return r;
}
// Read each component of a uint3 from lane 'idx'.
uint3 sub_group_shuffle_uint3(uint3 v, uniform ushort idx)
{
    uint3 r;
    r.x = intel_sub_group_shuffle(v.x, idx);
    r.y = intel_sub_group_shuffle(v.y, idx);
    r.z = intel_sub_group_shuffle(v.z, idx);
    return r;
}
411
412
// Tree OR-reduction over the low lanes (shuffle-down strides 4, 2, 1);
// the result is taken from lane 0 and broadcast to every lane.
inline uchar sub_group_reduce_or_N6(uchar val)
{
    for (uint stride = 4; stride >= 1; stride >>= 1)
        val |= intel_sub_group_shuffle_down(val, val, stride);
    return sub_group_broadcast(val, 0);
}
420
// Same OR-reduction as sub_group_reduce_or_N6, but treating a SIMD16
// subgroup as two independent SIMD8 halves: each half reads its result
// from its own base lane (0 or 8).
inline uchar sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(uchar val)
{
    const uint half_base = (get_sub_group_local_id() / 8) * 8;
    for (uint stride = 4; stride >= 1; stride >>= 1)
        val |= intel_sub_group_shuffle_down(val, val, stride);
    return intel_sub_group_shuffle(val, half_base);
}
430
431
// Relaxed work-group-scope increment; returns the previous value of *p.
inline __attribute__((overloadable)) uint atomic_inc_local( local uint* p )
{
    volatile local atomic_uint* obj = (volatile local atomic_uint*)p;
    return atomic_fetch_add_explicit(obj, 1u, memory_order_relaxed, memory_scope_work_group);
}
436
// Relaxed work-group-scope increment; returns the previous value of *p.
inline __attribute__((overloadable)) int atomic_inc_local(local int* p)
{
    volatile local atomic_int* obj = (volatile local atomic_int*)p;
    return atomic_fetch_add_explicit(obj, 1, memory_order_relaxed, memory_scope_work_group);
}
441
// Relaxed work-group-scope decrement; returns the previous value of *p.
inline __attribute__((overloadable)) uint atomic_dec_local(local uint* p)
{
    volatile local atomic_uint* obj = (volatile local atomic_uint*)p;
    return atomic_fetch_sub_explicit(obj, 1u, memory_order_relaxed, memory_scope_work_group);
}
446
// Relaxed work-group-scope decrement; returns the previous value of *p.
inline __attribute__((overloadable)) int atomic_dec_local(local int* p)
{
    volatile local atomic_int* obj = (volatile local atomic_int*)p;
    return atomic_fetch_sub_explicit(obj, 1, memory_order_relaxed, memory_scope_work_group);
}
451
// Relaxed work-group-scope subtraction of n; returns the previous value of *p.
inline __attribute__((overloadable)) uint atomic_sub_local(local uint* p, uint n)
{
    volatile local atomic_uint* obj = (volatile local atomic_uint*)p;
    return atomic_fetch_sub_explicit(obj, n, memory_order_relaxed, memory_scope_work_group);
}
456
// Relaxed work-group-scope subtraction of n; returns the previous value of *p.
inline __attribute__((overloadable)) int atomic_sub_local(local int* p, int n )
{
    volatile local atomic_int* obj = (volatile local atomic_int*)p;
    return atomic_fetch_sub_explicit(obj, n, memory_order_relaxed, memory_scope_work_group);
}
461
// Relaxed work-group-scope addition of n; returns the previous value of *p.
inline uint atomic_add_local( local uint* p, uint n )
{
    volatile local atomic_uint* obj = (volatile local atomic_uint*)p;
    return atomic_fetch_add_explicit(obj, n, memory_order_relaxed, memory_scope_work_group);
}
466
// Relaxed work-group-scope XOR of n; returns the previous value of *p.
inline uint atomic_xor_local(local uint* p, uint n)
{
    volatile local atomic_uint* obj = (volatile local atomic_uint*)p;
    return atomic_fetch_xor_explicit(obj, n, memory_order_relaxed, memory_scope_work_group);
}
471
// Relaxed work-group-scope OR of n; returns the previous value of *p.
inline uint atomic_or_local(local uint* p, uint n)
{
    volatile local atomic_uint* obj = (volatile local atomic_uint*)p;
    return atomic_fetch_or_explicit(obj, n, memory_order_relaxed, memory_scope_work_group);
}
476
// Relaxed work-group-scope minimum with n; returns the previous value of *p.
inline uint atomic_min_local(local uint* p, uint n)
{
    volatile local atomic_uint* obj = (volatile local atomic_uint*)p;
    return atomic_fetch_min_explicit(obj, n, memory_order_relaxed, memory_scope_work_group);
}
481
// Relaxed work-group-scope maximum with n; returns the previous value of *p.
inline uint atomic_max_local(local uint* p, uint n)
{
    volatile local atomic_uint* obj = (volatile local atomic_uint*)p;
    return atomic_fetch_max_explicit(obj, n, memory_order_relaxed, memory_scope_work_group);
}
486
487
488
489
// Relaxed device-scope increment; returns the previous value of *p.
inline uint atomic_inc_global( global uint* p )
{
    volatile global atomic_uint* obj = (volatile global atomic_uint*)p;
    return atomic_fetch_add_explicit(obj, 1u, memory_order_relaxed, memory_scope_device);
}
494
// Relaxed device-scope decrement; returns the previous value of *p.
inline uint atomic_dec_global(global uint* p)
{
    volatile global atomic_uint* obj = (volatile global atomic_uint*)p;
    return atomic_fetch_sub_explicit(obj, 1u, memory_order_relaxed, memory_scope_device);
}
499
// Relaxed device-scope strong CAS. Returns true on success; on failure
// *expected is updated with the value observed at *p.
inline bool atomic_compare_exchange_global(global uint* p, uint* expected, uint desired)
{
    volatile global atomic_uint* obj = (volatile global atomic_uint*)p;
    return atomic_compare_exchange_strong_explicit(obj, expected, desired, memory_order_relaxed, memory_order_relaxed, memory_scope_device);
}
504
// Relaxed device-scope addition of n; returns the previous value of *p.
inline uint atomic_add_global( global uint* p, uint n )
{
    volatile global atomic_uint* obj = (volatile global atomic_uint*)p;
    return atomic_fetch_add_explicit(obj, n, memory_order_relaxed, memory_scope_device);
}
509
// Relaxed device-scope subtraction of n; returns the previous value of *p.
inline uint atomic_sub_global(global uint* p, uint n)
{
    volatile global atomic_uint* obj = (volatile global atomic_uint*)p;
    return atomic_fetch_sub_explicit(obj, n, memory_order_relaxed, memory_scope_device);
}
514
// Relaxed device-scope OR of n; returns the previous value of *p.
inline uint atomic_or_global(global uint* p, uint n)
{
    volatile global atomic_uint* obj = (volatile global atomic_uint*)p;
    return atomic_fetch_or_explicit(obj, n, memory_order_relaxed, memory_scope_device);
}
519
520
// Device-scope increment with acquire ordering; returns the previous value.
inline uint atomic_inc_global_acquire(global uint* p)
{
    volatile global atomic_uint* obj = (volatile global atomic_uint*)p;
    return atomic_fetch_add_explicit(obj, 1u, memory_order_acquire, memory_scope_device);
}
525
526
// Device-scope increment with release ordering; returns the previous value.
inline uint atomic_inc_global_release(global uint* p)
{
    volatile global atomic_uint* obj = (volatile global atomic_uint*)p;
    return atomic_fetch_add_explicit(obj, 1u, memory_order_release, memory_scope_device);
}
// Device-scope decrement with release ordering; returns the previous value.
inline uint atomic_dec_global_release(global uint* p)
{
    volatile global atomic_uint* obj = (volatile global atomic_uint*)p;
    return atomic_fetch_sub_explicit(obj, 1u, memory_order_release, memory_scope_device);
}
535
// Atomic add through a generic pointer: dispatch to the matching address
// space. Returns 0 when p resolves to neither global nor local memory.
inline uint generic_atomic_add(uint* p, uint val)
{
    global uint* gp = to_global(p);
    if (gp != NULL)
        return atomic_add_global(gp, val);

    local uint* lp = to_local(p);
    if (lp != NULL)
        return atomic_add_local(lp, val);

    return 0;
}
544
// Tree max-reduction over the low lanes (shuffle-down strides 4, 2, 1);
// lane 0 holds the result, which is broadcast to every lane.
inline __attribute__((overloadable)) uint sub_group_reduce_max_N6( uint n )
{
    for (uint stride = 4; stride >= 1; stride >>= 1)
        n = max(n, intel_sub_group_shuffle_down(n, n, stride));
    return sub_group_broadcast(n, 0);
}
552
// Tree max-reduction over the low lanes (shuffle-down strides 4, 2, 1);
// lane 0 holds the result, which is broadcast to every lane.
inline __attribute__((overloadable)) float sub_group_reduce_max_N6( float n )
{
    for (uint stride = 4; stride >= 1; stride >>= 1)
        n = max(n, intel_sub_group_shuffle_down(n, n, stride));
    return sub_group_broadcast(n, 0);
}
560
// Same max-reduction, but treating a SIMD16 subgroup as two independent
// SIMD8 halves: each half reads the result from its own base lane (0 or 8).
inline __attribute__((overloadable)) float sub_group_reduce_max_N6_2xSIMD8_in_SIMD16(float n)
{
    const uint half_base = (get_sub_group_local_id() / 8) * 8;
    for (uint stride = 4; stride >= 1; stride >>= 1)
        n = max(n, intel_sub_group_shuffle_down(n, n, stride));
    return intel_sub_group_shuffle(n, half_base);
}
568
// Atomic increment through a generic pointer: dispatch to the matching
// address space. Returns 0 when p resolves to neither global nor local memory.
// CONSISTENCY FIX: the local branch previously called the legacy atomic_inc()
// built-in; it now uses this file's atomic_inc_local() helper (relaxed,
// work-group scope), matching how generic_atomic_add() dispatches.
inline uint generic_atomic_inc(uint* p)
{
    global uint* gp = to_global(p);
    if (gp != NULL)
        return atomic_inc_global(gp);

    local uint* lp = to_local(p);
    if (lp != NULL)
        return atomic_inc_local(lp);

    return 0;
}
577
578
579 // Built-in GRL function which, if called in a kernel body, will force the kernel
580 // to be compiled to the minimum SIMD width supported by the platform
581 void GRL_UseMinimumSIMDWidth();