1MSTRINGIFY( 2 3\n#undef FETCH_2 4\n#undef FETCH_8 5\n 6\n#define FETCH_2(sum, id, A, jumpBy) sum += A[id]; id += jumpBy; sum += A[id]; id += jumpBy; 7\n#define FETCH_4(sum, id, A, jumpBy) FETCH_2(sum, id, A, jumpBy); FETCH_2(sum, id, A, jumpBy); 8\n#define FETCH_8(sum, id, A, jumpBy) FETCH_4(sum, id, A, jumpBy); FETCH_4(sum, id, A, jumpBy); 9\n 10\n 11\n#define FETCH_PER_WI 16 12\n 13 14// Kernels fetching by local_size offset 15__kernel void global_bandwidth_v1_local_offset(__global float *A, __global float *B) 16{ 17 int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0); 18 float sum = 0; 19 20 for(int i=0; i<4; i++) 21 { 22 FETCH_4(sum, id, A, get_local_size(0)); 23 } 24 25 B[get_global_id(0)] = sum; 26} 27 28 29__kernel void global_bandwidth_v2_local_offset(__global float2 *A, __global float *B) 30{ 31 int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0); 32 float2 sum = 0; 33 34 for(int i=0; i<4; i++) 35 { 36 FETCH_4(sum, id, A, get_local_size(0)); 37 } 38 39 B[get_global_id(0)] = (sum.S0) + (sum.S1); 40} 41 42 43__kernel void global_bandwidth_v4_local_offset(__global float4 *A, __global float *B) 44{ 45 int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0); 46 float4 sum = 0; 47 48 for(int i=0; i<4; i++) 49 { 50 FETCH_4(sum, id, A, get_local_size(0)); 51 } 52 53 B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); 54} 55 56 57__kernel void global_bandwidth_v8_local_offset(__global float8 *A, __global float *B) 58{ 59 int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0); 60 float8 sum = 0; 61 62 for(int i=0; i<4; i++) 63 { 64 FETCH_4(sum, id, A, get_local_size(0)); 65 } 66 67 B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); 68} 69 70__kernel void global_bandwidth_v16_local_offset(__global float16 *A, __global float *B) 71{ 72 int id = (get_group_id(0) * get_local_size(0) * FETCH_PER_WI) + get_local_id(0); 73 float16 sum = 0; 74 75 for(int i=0; i<4; i++) 76 { 77 FETCH_4(sum, id, A, get_local_size(0)); 78 } 79 80 float t = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); 81 t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); 82 B[get_global_id(0)] = t; 83} 84 85 86// Kernels fetching by global_size offset 87__kernel void global_bandwidth_v1_global_offset(__global float *A, __global float *B) 88{ 89 int id = get_global_id(0); 90 float sum = 0; 91 92 for(int i=0; i<4; i++) 93 { 94 FETCH_4(sum, id, A, get_global_size(0)); 95 } 96 97 B[get_global_id(0)] = sum; 98} 99 100 101__kernel void global_bandwidth_v2_global_offset(__global float2 *A, __global float *B) 102{ 103 int id = get_global_id(0); 104 float2 sum = 0; 105 106 for(int i=0; i<4; i++) 107 { 108 FETCH_4(sum, id, A, get_global_size(0)); 109 } 110 111 B[get_global_id(0)] = (sum.S0) + (sum.S1); 112} 113 114 115__kernel void global_bandwidth_v4_global_offset(__global float4 *A, __global float *B) 116{ 117 int id = get_global_id(0); 118 float4 sum = 0; 119 120 for(int i=0; i<4; i++) 121 { 122 FETCH_4(sum, id, A, get_global_size(0)); 123 } 124 125 B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3); 126} 127 128 129__kernel void global_bandwidth_v8_global_offset(__global float8 *A, __global float *B) 130{ 131 int id = get_global_id(0); 132 float8 sum = 0; 133 134 for(int i=0; i<4; i++) 135 { 136 FETCH_4(sum, id, A, get_global_size(0)); 137 } 138 139 B[get_global_id(0)] = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); 140} 141 142__kernel void global_bandwidth_v16_global_offset(__global float16 *A, __global float *B) 143{ 144 int id = get_global_id(0); 145 float16 sum = 0; 146 147 for(int i=0; i<4; i++) 148 { 149 FETCH_4(sum, id, A, get_global_size(0)); 150 } 151 152 float t = (sum.S0) + (sum.S1) + (sum.S2) + (sum.S3) + (sum.S4) + (sum.S5) + (sum.S6) + (sum.S7); 153 t += (sum.S8) + (sum.S9) + (sum.SA) + (sum.SB) + (sum.SC) + (sum.SD) + (sum.SE) + (sum.SF); 154 B[get_global_id(0)] = t; 155} 156 157 158) 159