1*1cd03ba3SJeremy KempMSTRINGIFY( 2*1cd03ba3SJeremy Kemp 3*1cd03ba3SJeremy Kemp// Stringifying requires a new line after hash defines 4*1cd03ba3SJeremy Kemp 5*1cd03ba3SJeremy Kemp\n#if defined(cl_khr_fp16) 6*1cd03ba3SJeremy Kemp\n #pragma OPENCL EXTENSION cl_khr_fp16 : enable 7*1cd03ba3SJeremy Kemp\n #define HALF_AVAILABLE 8*1cd03ba3SJeremy Kemp\n#endif 9*1cd03ba3SJeremy Kemp 10*1cd03ba3SJeremy Kemp\n#undef MAD_4 11*1cd03ba3SJeremy Kemp\n#undef MAD_16 12*1cd03ba3SJeremy Kemp\n#undef MAD_64 13*1cd03ba3SJeremy Kemp\n 14*1cd03ba3SJeremy Kemp\n#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x); 15*1cd03ba3SJeremy Kemp\n#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); 16*1cd03ba3SJeremy Kemp\n#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); 17*1cd03ba3SJeremy Kemp\n 18*1cd03ba3SJeremy Kemp 19*1cd03ba3SJeremy Kemp\n 20*1cd03ba3SJeremy Kemp\n#ifdef HALF_AVAILABLE 21*1cd03ba3SJeremy Kemp\n 22*1cd03ba3SJeremy Kemp 23*1cd03ba3SJeremy Kemp 24*1cd03ba3SJeremy Kemp__kernel void compute_hp_v1(__global half *ptr, float _B) 25*1cd03ba3SJeremy Kemp{ 26*1cd03ba3SJeremy Kemp half _A = (half)_B; 27*1cd03ba3SJeremy Kemp half x = _A; 28*1cd03ba3SJeremy Kemp half y = (half)get_local_id(0); 29*1cd03ba3SJeremy Kemp 30*1cd03ba3SJeremy Kemp for(int i=0; i<128; i++) 31*1cd03ba3SJeremy Kemp { 32*1cd03ba3SJeremy Kemp MAD_16(x, y); 33*1cd03ba3SJeremy Kemp } 34*1cd03ba3SJeremy Kemp 35*1cd03ba3SJeremy Kemp ptr[get_global_id(0)] = y; 36*1cd03ba3SJeremy Kemp} 37*1cd03ba3SJeremy Kemp 38*1cd03ba3SJeremy Kemp 39*1cd03ba3SJeremy Kemp__kernel void compute_hp_v2(__global half *ptr, float _B) 40*1cd03ba3SJeremy Kemp{ 41*1cd03ba3SJeremy Kemp half _A = (half)_B; 42*1cd03ba3SJeremy Kemp half2 x = (half2)(_A, (_A+1)); 43*1cd03ba3SJeremy Kemp half2 y = (half2)get_local_id(0); 44*1cd03ba3SJeremy Kemp 45*1cd03ba3SJeremy Kemp for(int i=0; i<64; i++) 46*1cd03ba3SJeremy Kemp { 47*1cd03ba3SJeremy Kemp MAD_16(x, y); 48*1cd03ba3SJeremy Kemp } 49*1cd03ba3SJeremy Kemp 50*1cd03ba3SJeremy Kemp ptr[get_global_id(0)] = (y.S0) + (y.S1); 51*1cd03ba3SJeremy Kemp} 52*1cd03ba3SJeremy Kemp 53*1cd03ba3SJeremy Kemp__kernel void compute_hp_v4(__global half *ptr, float _B) 54*1cd03ba3SJeremy Kemp{ 55*1cd03ba3SJeremy Kemp half _A = (half)_B; 56*1cd03ba3SJeremy Kemp half4 x = (half4)(_A, (_A+1), (_A+2), (_A+3)); 57*1cd03ba3SJeremy Kemp half4 y = (half4)get_local_id(0); 58*1cd03ba3SJeremy Kemp 59*1cd03ba3SJeremy Kemp for(int i=0; i<32; i++) 60*1cd03ba3SJeremy Kemp { 61*1cd03ba3SJeremy Kemp MAD_16(x, y); 62*1cd03ba3SJeremy Kemp } 63*1cd03ba3SJeremy Kemp 64*1cd03ba3SJeremy Kemp ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3); 65*1cd03ba3SJeremy Kemp} 66*1cd03ba3SJeremy Kemp 67*1cd03ba3SJeremy Kemp 68*1cd03ba3SJeremy Kemp__kernel void compute_hp_v8(__global half *ptr, float _B) 69*1cd03ba3SJeremy Kemp{ 70*1cd03ba3SJeremy Kemp half _A = (half)_B; 71*1cd03ba3SJeremy Kemp half8 x = (half8)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7)); 72*1cd03ba3SJeremy Kemp half8 y = (half8)get_local_id(0); 73*1cd03ba3SJeremy Kemp 74*1cd03ba3SJeremy Kemp for(int i=0; i<16; i++) 75*1cd03ba3SJeremy Kemp { 76*1cd03ba3SJeremy Kemp MAD_16(x, y); 77*1cd03ba3SJeremy Kemp } 78*1cd03ba3SJeremy Kemp 79*1cd03ba3SJeremy Kemp ptr[get_global_id(0)] = (y.S0) + (y.S1) + (y.S2) + (y.S3) + (y.S4) + (y.S5) + (y.S6) + (y.S7); 80*1cd03ba3SJeremy Kemp} 81*1cd03ba3SJeremy Kemp 82*1cd03ba3SJeremy Kemp__kernel void compute_hp_v16(__global half *ptr, float _B) 83*1cd03ba3SJeremy Kemp{ 84*1cd03ba3SJeremy Kemp half _A = (half)_B; 85*1cd03ba3SJeremy Kemp half16 x = (half16)(_A, (_A+1), (_A+2), (_A+3), (_A+4), (_A+5), (_A+6), (_A+7), 86*1cd03ba3SJeremy Kemp (_A+8), (_A+9), (_A+10), (_A+11), (_A+12), (_A+13), (_A+14), (_A+15)); 87*1cd03ba3SJeremy Kemp half16 y = (half16)get_local_id(0); 88*1cd03ba3SJeremy Kemp 89*1cd03ba3SJeremy Kemp for(int i=0; i<8; i++) 90*1cd03ba3SJeremy Kemp { 91*1cd03ba3SJeremy Kemp MAD_16(x, y); 92*1cd03ba3SJeremy Kemp } 93*1cd03ba3SJeremy Kemp 94*1cd03ba3SJeremy Kemp half2 t = (y.S01) + (y.S23) + (y.S45) + (y.S67) + (y.S89) + (y.SAB) + (y.SCD) + (y.SEF); 95*1cd03ba3SJeremy Kemp ptr[get_global_id(0)] = t.S0 + t.S1; 96*1cd03ba3SJeremy Kemp} 97*1cd03ba3SJeremy Kemp 98*1cd03ba3SJeremy Kemp\n 99*1cd03ba3SJeremy Kemp\n#endif // half_AVAILABLE 100*1cd03ba3SJeremy Kemp\n 101*1cd03ba3SJeremy Kemp 102*1cd03ba3SJeremy Kemp) 103