MSTRINGIFY(

// Every multiply-add step below consumes the result of the previous step,
// and each kernel is written for one fixed vector width, in order to avoid
// auto-vectorization of the chain by the compiler.
//
// NOTE(review): MSTRINGIFY is defined outside this file. The backslash-n
// tokens presumably turn into line breaks in the generated string so that
// every preprocessor directive starts on its own line - confirm against the
// macro definition.
//
// MAD_4 is four dependent multiply-add statements that alternately update
// the first and the second operand. MAD_16 repeats MAD_4 four times and
// MAD_64 repeats MAD_16 four times. MAD_64 is not used by the kernels here.

\n#undef MAD_4
\n#undef MAD_16
\n#undef MAD_64
\n
\n#define MAD_4(a, b)     a = (b*a) + b;      b = (a*b) + a;      a = (b*a) + b;      b = (a*b) + a;
\n#define MAD_16(a, b)    MAD_4(a, b);        MAD_4(a, b);        MAD_4(a, b);        MAD_4(a, b);
\n#define MAD_64(a, b)    MAD_16(a, b);       MAD_16(a, b);       MAD_16(a, b);       MAD_16(a, b);
\n

// The loop count halves each time the vector width doubles
// (64, 32, 16, 8, 4 iterations of MAD_16 for widths 1, 2, 4, 8, 16).

// Scalar variant.
//   ptr : output buffer, written at index get_global_id(0)
//   _A  : initial value of the first operand
__kernel void compute_char_v1(__global char *ptr, char _A)
{
    char acc = _A;
    char mix = (char)get_local_id(0);

    for (int pass = 0; pass < 64; ++pass)
    {
        MAD_16(acc, mix);
    }

    const size_t gid = get_global_id(0);
    ptr[gid] = mix;
}


// 2-wide variant. The first operand lanes start at _A and _A + 1, the second
// operand is the local id replicated to both lanes. The stored value is the
// sum of the two lanes of the second operand.
__kernel void compute_char_v2(__global char *ptr, char _A)
{
    char2 acc = (char2)(_A, _A + 1);
    char2 mix = (char2)get_local_id(0);

    for (int pass = 0; pass < 32; ++pass)
    {
        MAD_16(acc, mix);
    }

    const size_t gid = get_global_id(0);
    ptr[gid] = mix.s0 + mix.s1;
}

// 4-wide variant. The first operand lanes start at _A .. _A + 3. The stored
// value is the sum of the four lanes of the second operand.
__kernel void compute_char_v4(__global char *ptr, char _A)
{
    char4 acc = (char4)(_A, _A + 1, _A + 2, _A + 3);
    char4 mix = (char4)get_local_id(0);

    for (int pass = 0; pass < 16; ++pass)
    {
        MAD_16(acc, mix);
    }

    const size_t gid = get_global_id(0);
    ptr[gid] = mix.s0 + mix.s1 + mix.s2 + mix.s3;
}


// 8-wide variant. The first operand lanes start at _A .. _A + 7. The stored
// value is the sum of the eight lanes of the second operand.
__kernel void compute_char_v8(__global char *ptr, char _A)
{
    char8 acc = (char8)(_A,     _A + 1, _A + 2, _A + 3,
                        _A + 4, _A + 5, _A + 6, _A + 7);
    char8 mix = (char8)get_local_id(0);

    for (int pass = 0; pass < 8; ++pass)
    {
        MAD_16(acc, mix);
    }

    const size_t gid = get_global_id(0);
    ptr[gid] = mix.s0 + mix.s1 + mix.s2 + mix.s3
             + mix.s4 + mix.s5 + mix.s6 + mix.s7;
}

// 16-wide variant. The first operand lanes start at _A .. _A + 15. The second
// operand is first folded pairwise into a char2, whose two lanes are then
// added to produce the stored value.
__kernel void compute_char_v16(__global char *ptr, char _A)
{
    char16 acc = (char16)(_A,      _A + 1,  _A + 2,  _A + 3,
                          _A + 4,  _A + 5,  _A + 6,  _A + 7,
                          _A + 8,  _A + 9,  _A + 10, _A + 11,
                          _A + 12, _A + 13, _A + 14, _A + 15);
    char16 mix = (char16)get_local_id(0);

    for (int pass = 0; pass < 4; ++pass)
    {
        MAD_16(acc, mix);
    }

    char2 pairs = mix.s01 + mix.s23 + mix.s45 + mix.s67
                + mix.s89 + mix.sab + mix.scd + mix.sef;

    const size_t gid = get_global_id(0);
    ptr[gid] = pairs.s0 + pairs.s1;
}


)