1*c217d954SCole FaustR"( 2*c217d954SCole Faust 3*c217d954SCole Faust 4*c217d954SCole Faust 5*c217d954SCole Faust 6*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H 7*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H 8*c217d954SCole Faust 9*c217d954SCole Faust 10*c217d954SCole Faust 11*c217d954SCole Faust 12*c217d954SCole Faust#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 13*c217d954SCole Faust VSTORE(N0) \ 14*c217d954SCole Faust (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 15*c217d954SCole Faust 16*c217d954SCole Faust#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 17*c217d954SCole Faust STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 18*c217d954SCole Faust VSTORE(N0) \ 19*c217d954SCole Faust (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1)); 20*c217d954SCole Faust 21*c217d954SCole Faust#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 22*c217d954SCole Faust STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 23*c217d954SCole Faust VSTORE(N0) \ 24*c217d954SCole Faust (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 25*c217d954SCole Faust 26*c217d954SCole Faust#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 27*c217d954SCole Faust STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 28*c217d954SCole Faust VSTORE(N0) \ 29*c217d954SCole Faust (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 30*c217d954SCole Faust 31*c217d954SCole Faust#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 32*c217d954SCole Faust STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 33*c217d954SCole Faust VSTORE(N0) \ 34*c217d954SCole Faust (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 35*c217d954SCole Faust 36*c217d954SCole Faust#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 37*c217d954SCole Faust STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 38*c217d954SCole 
Faust VSTORE(N0) \ 39*c217d954SCole Faust (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 40*c217d954SCole Faust 41*c217d954SCole Faust#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 42*c217d954SCole Faust STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 43*c217d954SCole Faust VSTORE(N0) \ 44*c217d954SCole Faust (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 45*c217d954SCole Faust 46*c217d954SCole Faust#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 47*c217d954SCole Faust STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 48*c217d954SCole Faust VSTORE(N0) \ 49*c217d954SCole Faust (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7)); 50*c217d954SCole Faust 51*c217d954SCole Faust#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 52*c217d954SCole Faust STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 53*c217d954SCole Faust VSTORE(N0) \ 54*c217d954SCole Faust (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8)); 55*c217d954SCole Faust 56*c217d954SCole Faust#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 57*c217d954SCole Faust STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 58*c217d954SCole Faust VSTORE(N0) \ 59*c217d954SCole Faust (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9)); 60*c217d954SCole Faust 61*c217d954SCole Faust#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 62*c217d954SCole Faust STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 63*c217d954SCole Faust VSTORE(N0) \ 64*c217d954SCole Faust (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 65*c217d954SCole Faust 66*c217d954SCole Faust#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 67*c217d954SCole Faust STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 68*c217d954SCole Faust VSTORE(N0) \ 69*c217d954SCole Faust (BASENAME##B, 0, (__global DATA_TYPE *)(PTR 
+ 11 * STRIDE_Y + Z##B)); 70*c217d954SCole Faust 71*c217d954SCole Faust#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 72*c217d954SCole Faust STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 73*c217d954SCole Faust VSTORE(N0) \ 74*c217d954SCole Faust (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 75*c217d954SCole Faust 76*c217d954SCole Faust#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 77*c217d954SCole Faust STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 78*c217d954SCole Faust VSTORE(N0) \ 79*c217d954SCole Faust (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 80*c217d954SCole Faust 81*c217d954SCole Faust#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 82*c217d954SCole Faust STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 83*c217d954SCole Faust VSTORE(N0) \ 84*c217d954SCole Faust (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 85*c217d954SCole Faust 86*c217d954SCole Faust#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 87*c217d954SCole Faust STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 88*c217d954SCole Faust VSTORE(N0) \ 89*c217d954SCole Faust (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 90*c217d954SCole Faust 91*c217d954SCole Faust 92*c217d954SCole Faust 93*c217d954SCole Faust#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 94*c217d954SCole Faust VSTORE(N0) \ 95*c217d954SCole Faust (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0)); 96*c217d954SCole Faust 97*c217d954SCole Faust#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 98*c217d954SCole Faust CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 99*c217d954SCole Faust VSTORE(N0) \ 100*c217d954SCole Faust (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE 
*)(PTR + 1 * STRIDE_Y + Z##1)); 101*c217d954SCole Faust 102*c217d954SCole Faust#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 103*c217d954SCole Faust CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 104*c217d954SCole Faust VSTORE(N0) \ 105*c217d954SCole Faust (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2)); 106*c217d954SCole Faust 107*c217d954SCole Faust#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 108*c217d954SCole Faust CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 109*c217d954SCole Faust VSTORE(N0) \ 110*c217d954SCole Faust (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3)); 111*c217d954SCole Faust 112*c217d954SCole Faust#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 113*c217d954SCole Faust CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 114*c217d954SCole Faust VSTORE(N0) \ 115*c217d954SCole Faust (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4)); 116*c217d954SCole Faust 117*c217d954SCole Faust#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 118*c217d954SCole Faust CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 119*c217d954SCole Faust VSTORE(N0) \ 120*c217d954SCole Faust (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5)); 121*c217d954SCole Faust 122*c217d954SCole Faust#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 123*c217d954SCole Faust CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 124*c217d954SCole Faust VSTORE(N0) \ 125*c217d954SCole Faust (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6)); 126*c217d954SCole Faust 127*c217d954SCole 
/*
 * CONVERT_STORE_ROW_<n>(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z), rows 8..16.
 *
 * Each macro first expands CONVERT_STORE_ROW_<n-1> and then appends one
 * statement that stores CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0))
 * through VSTORE(N0) at (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i),
 * where i = n-1 (row suffix 0-9 then A-F).
 */
#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

/* The second parameter must be spelled DATA_TYPE: the body refers to it by
 * that name, and a differently named parameter would leave DATA_TYPE unbound
 * here and in every lower row this macro expands. */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0)(CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/*
 * STORE_BLOCK(M0, ...) dispatches to STORE_ROW_<M0>(...). The extra _STR
 * level lets M0 be macro-expanded before it is pasted onto the STORE_ROW_
 * prefix.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/*
 * CONVERT_STORE_BLOCK(M0, ...) dispatches to CONVERT_STORE_ROW_<M0>(...),
 * using the same two-level expansion as STORE_BLOCK.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/*
 * STORE_ROW_PARTIAL_<n>(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Expands to <n> consecutive statements; statement i (0-based, row suffix
 * 0-9 then A-F) is
 *
 *   VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##i, 0, (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i));
 *
 * Each macro first expands STORE_ROW_PARTIAL_<n-1>. VSTORE_PARTIAL is not
 * defined in this part of the file.
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

/*
 * STORE_ROW_PARTIAL_<n>, rows 11..16: each macro expands
 * STORE_ROW_PARTIAL_<n-1> and appends one VSTORE_PARTIAL(N0, STORE_N0)(...)
 * statement for row n-1 (row suffixes A..F, byte offsets 10..15 * STRIDE_Y).
 */
#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/*
 * STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, ...) dispatches to
 * STORE_ROW_PARTIAL_<STORE_M0>(N0, STORE_N0, ...). The _STR level lets
 * STORE_M0 be macro-expanded before token pasting.
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/*
 * Run-time selection between four STORE_BLOCK_PARTIAL variants:
 *   neither condition set  -> M0 rows,               N0 elements
 *   only PARTIAL_COND_Y    -> PARTIAL_STORE_M0 rows, N0 elements
 *   only PARTIAL_COND_X    -> M0 rows,               PARTIAL_STORE_N0 elements
 *   both conditions set    -> PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 elements
 * The expansion is a bare if / else-if / else chain (no enclosing block).
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/* Like STORE_BLOCK_PARTIAL_IN_X_AND_Y but only the element count can be
 * reduced: PARTIAL_STORE_N0 elements when PARTIAL_COND_X holds, N0 otherwise. */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/* Like STORE_BLOCK_PARTIAL_IN_X_AND_Y but only the row count can be
 * reduced: PARTIAL_STORE_M0 rows when PARTIAL_COND_Y holds, M0 otherwise. */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/*
 * STORE_BLOCK_BOUNDARY_AWARE is only defined when both PARTIAL_STORE_M0 and
 * PARTIAL_STORE_N0 are defined at preprocessing time. Their values pick the
 * cheapest variant:
 *   M0 == 0 && N0 == 0 -> plain STORE_BLOCK (no run-time check)
 *   M0 >  0 && N0 == 0 -> STORE_BLOCK_PARTIAL_IN_Y
 *   M0 == 0 && N0 >  0 -> STORE_BLOCK_PARTIAL_IN_X
 *   otherwise          -> STORE_BLOCK_PARTIAL_IN_X_AND_Y
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif

/*
 * COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)
 *
 * With PARTIAL_STORE_M0 defined at preprocessing time:
 *   (uint)max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))
 * otherwise:
 *   (uint)(y * M0)
 * Every argument is parenthesized so that expression arguments such as
 * `a + b` keep their grouping inside the multiplication and modulo.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif

/* Single-row store of `basename`: forwards to STORE_BLOCK_PARTIAL_IN_X with
 * M0 = 1, STRIDE_Y = 0 and Z = 0, storing `leftover` elements when `cond`
 * holds and `vec_size` elements otherwise. */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)

/* Enable optional OpenCL extensions, each only when both the build flag and
 * the extension macro are defined. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

/* GPU architecture identifiers. */
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300

/* Paste two tokens together (arguments are not expanded first). */
#define CONCAT(a, b) a##b

/* Identity macro: forces one extra round of macro expansion on x. */
#define EXPAND(x) x

/* min(max(x, min_val), max_val) */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

/* REV<s>(x): swizzle that reverses the component order of an s-component vector. */
#define REV1(x) ((x))
#define REV2(x) ((x).s10)
#define REV3(x) ((x).s210)
#define REV4(x) ((x).s3210)
#define REV8(x) ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

/* REVERSE(x, s) dispatches to REV<s>(x); s is macro-expanded before pasting. */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s) REVERSE_STR(x, s)

/* ROT<s>_<n>(x): swizzle in which result component i is source component
 * (i - n) mod s; n == 0 and n == s both yield x unchanged. */
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x) ((x))
#define ROT16_1(x) ((x).sF0123456789ABCDE)
#define ROT16_2(x) ((x).sEF0123456789ABCD)
#define ROT16_3(x) ((x).sDEF0123456789ABC)
#define ROT16_4(x) ((x).sCDEF0123456789AB)
#define ROT16_5(x) ((x).sBCDEF0123456789A)
#define ROT16_6(x) ((x).sABCDEF0123456789)
#define ROT16_7(x) ((x).s9ABCDEF012345678)
#define ROT16_8(x) ((x).s89ABCDEF01234567)
#define ROT16_9(x) ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
/* Remaining 16-component rotations (n = 13..16); result component i is
 * source component (i - n) mod 16, so n == 16 yields x unchanged. */
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

/* ROTATE(x, s, n) dispatches to ROT<s>_<n>(x); s and n are macro-expanded
 * before they are pasted into the name. */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n) ROTATE_STR(x, s, n)

/* V_OFFS<s>(dt): vector literal of type dt<s> holding 0, 1, ..., s-1. */
#define V_OFFS1(dt) (dt##1)(0)
#define V_OFFS2(dt) (dt##2)(0, 1)
#define V_OFFS3(dt) (dt##3)(0, 1, 2)
#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/* VEC_OFFS(dt, s) dispatches to V_OFFS<s>(dt) after expanding s. */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)

/* VLOAD(size) expands to the identifier vload<size>. */
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)

/* VLOAD_PARTIAL(size, load_size) expands to the identifier
 * vload_partial_<size>_<load_size>, which the table below maps to either a
 * vload_partial_<n> macro or NO_LOAD. */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)

/* Placeholder for unsupported (size, load_size) pairs: expands to an empty block. */
#define NO_LOAD(data, offs, ptr) \
    { \
    }

/* Size-1 table: only load_size == 1 maps to a load. */
#define vload_partial_1_0 NO_LOAD
#define vload_partial_1_1 vload1
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
/*
 * Dispatch table vload_partial_<size>_<load_size>.
 * For vector sizes 2, 3, 4, 8 and 16 an entry with 1 <= load_size <= size
 * maps to vload_partial_<load_size>; load_size == 0 and load_size > size map
 * to NO_LOAD. This span continues the size-1 rows, where every load_size
 * from 4 to 16 is NO_LOAD.
 */
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

/*
 * vload_partial_<n>(DATA, OFFSET, PTR): assign the first n lanes of DATA from PTR.
 *
 * Sizes 1, 2, 3, 4 and 8 are a single vload<n> assigned to the matching swizzle of DATA.
 * Sizes 5-7 load lanes 0-3 first and then the remainder from PTR + 4; sizes 9 and 10 load
 * lanes 0-7 first and then the remainder from PTR + 8.
 *
 * Each expansion already ends in a semicolon and the composite sizes expand to two
 * statements, so these macros are only safe where a statement sequence is allowed
 * (not as the unbraced body of an if/else).
 */
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

/* 5 = 4 + 1 */
#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

/* 6 = 4 + 2 */
#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

/* 7 = 4 + 3 */
#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

/* 9 = 8 + 1 */
#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

/* 10 = 8 + 2 */
#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

/*
 * vload_partial_11 .. vload_partial_16: lanes 0-7 are loaded with vload_partial_8, the
 * remaining lanes come from PTR + 8. Sizes 13-15 hand the whole upper half (s89ABCDEF) to the
 * 5/6/7-lane helper, which then picks its own sub-lanes. Size 16 is a plain vload16.
 * Expansions end in a semicolon and may be two statements.
 */
#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

/* Number of 4-channel pixels covered by a vector of 4, 8 or 16 elements. */
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/* Two-level form so that vec_size is macro-expanded before it is pasted onto PIXEL_UNIT. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

/*
 * read_image2d_<type>x<n>: read n horizontally adjacent pixels starting at (x_coord, y_coord)
 * and pack them into one vector of 4 * n elements.
 * The expansions keep their historical trailing semicolon, so they cannot be nested inside a
 * larger expression.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

/*
 * write_image2d_<type>x<n>: write a vector of 4 * n elements as n horizontally adjacent pixels
 * starting at (x_coord, y_coord). Expansions keep their trailing semicolon.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

/* Dispatch to read_image2d_<data_type>x<n0>; the outer macro expands its arguments first. */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/* Dispatch to write_image2d_<data_type>x<n0>; the outer macro expands its arguments first. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

/* VSTORE(n) names vstore<n> after expanding n. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

/* <type>1 aliases so that <type>##<size> pasting also works for size 1. */
#define float1 float
#define half1 half
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

/*
 * Scalar counterparts of vload<n> / vstore<n>: element OFFSET of PTR.
 * Arguments and the whole expansion are parenthesised so that operator expressions such as
 * an OFFSET of (i & 1) or a use inside a larger expression keep their intended grouping.
 */
#define vload1(OFFSET, PTR) (*((PTR) + (OFFSET)))
#define vstore1(DATA, OFFSET, PTR) (*((PTR) + (OFFSET)) = (DATA))

/* VSTORE_PARTIAL(size, store_size) names the vstore_partial_<size>_<store_size> table entry. */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/* Table entry for combinations that store nothing: expands to an empty block. */
#define NO_STORE(data, offs, ptr) \
    {                             \
    }

/*
 * vstore_partial_<size>_<store_size> lookup entries for size 1 and the start of the size-2 row.
 * store_size 0 and store_size > size map to NO_STORE. The 1_1 entry maps straight to vstore1.
 */
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

#define vstore_partial_2_0 NO_STORE
#define vstore_partial_2_1 vstore_partial_1
#define vstore_partial_2_2 vstore_partial_2
#define vstore_partial_2_3 NO_STORE
#define vstore_partial_2_4 NO_STORE
/*
 * vstore_partial_<size>_<store_size> lookup entries: rest of the size-2 row, the size-3 row and
 * the start of the size-4 row. store_size 0 and store_size > size map to NO_STORE;
 * 1 <= store_size <= size maps to vstore_partial_<store_size>.
 */
#define vstore_partial_2_5 NO_STORE
#define vstore_partial_2_6 NO_STORE
#define vstore_partial_2_7 NO_STORE
#define vstore_partial_2_8 NO_STORE
#define vstore_partial_2_9 NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

/* Size 3 */
#define vstore_partial_3_0 NO_STORE
#define vstore_partial_3_1 vstore_partial_1
#define vstore_partial_3_2 vstore_partial_2
#define vstore_partial_3_3 vstore_partial_3
#define vstore_partial_3_4 NO_STORE
#define vstore_partial_3_5 NO_STORE
#define vstore_partial_3_6 NO_STORE
#define vstore_partial_3_7 NO_STORE
#define vstore_partial_3_8 NO_STORE
#define vstore_partial_3_9 NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

/* Size 4 (continues below) */
#define vstore_partial_4_0 NO_STORE
#define vstore_partial_4_1 vstore_partial_1
#define vstore_partial_4_2 vstore_partial_2
/*
 * vstore_partial_<size>_<store_size> lookup entries: rest of the size-4 row and the complete
 * size-8 and size-16 rows. store_size 0 and store_size > size map to NO_STORE;
 * 1 <= store_size <= size maps to vstore_partial_<store_size>.
 */
#define vstore_partial_4_3 vstore_partial_3
#define vstore_partial_4_4 vstore_partial_4
#define vstore_partial_4_5 NO_STORE
#define vstore_partial_4_6 NO_STORE
#define vstore_partial_4_7 NO_STORE
#define vstore_partial_4_8 NO_STORE
#define vstore_partial_4_9 NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

/* Size 8 */
#define vstore_partial_8_0 NO_STORE
#define vstore_partial_8_1 vstore_partial_1
#define vstore_partial_8_2 vstore_partial_2
#define vstore_partial_8_3 vstore_partial_3
#define vstore_partial_8_4 vstore_partial_4
#define vstore_partial_8_5 vstore_partial_5
#define vstore_partial_8_6 vstore_partial_6
#define vstore_partial_8_7 vstore_partial_7
#define vstore_partial_8_8 vstore_partial_8
#define vstore_partial_8_9 NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

/* Size 16 */
#define vstore_partial_16_0 NO_STORE
#define vstore_partial_16_1 vstore_partial_1
#define vstore_partial_16_2 vstore_partial_2
#define vstore_partial_16_3 vstore_partial_3
#define vstore_partial_16_4 vstore_partial_4
#define vstore_partial_16_5 vstore_partial_5
#define vstore_partial_16_6 vstore_partial_6
#define vstore_partial_16_7 vstore_partial_7
#define vstore_partial_16_8 vstore_partial_8
#define vstore_partial_16_9 vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

/*
 * vstore_partial_<n>(DATA, OFFSET, PTR): store the first n lanes of DATA to PTR.
 *
 * Sizes 1-4 are a single vstore<n> of the matching swizzle. Size 5 stores lanes 0-3 and then
 * lane 4 at PTR + 4. Each expansion already ends in a semicolon, and size 5 expands to two
 * statements, so use these only where a statement sequence is allowed.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

/* 5 = 4 + 1 */
#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

/*
 * vstore_partial_6 .. vstore_partial_16: composite partial stores.
 *
 * Sizes 6 and 7 store lanes 0-3 and then the remainder at PTR + 4. Sizes 9-15 store lanes 0-7
 * and then the remainder at PTR + 8; sizes 13-15 hand the whole upper half (s89abcdef) to the
 * 5/6/7-lane helper, which picks its own sub-lanes. Sizes 8 and 16 are a single vstore8 /
 * vstore16. Expansions end in a semicolon and may be two statements.
 */
#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

/*
 * convert_<float type>_sat aliases: the floating-point families map the _sat spelling onto the
 * plain conversion, so the pasted name convert_<type>_sat resolves for float and half as well.
 */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
/* Previously mapped to convert_float, which produced a float instead of a half and disagreed
 * with every other entry of the half family. */
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

/* convert_<type>1 aliases so that convert_<type><size> pasting also works for size 1. */
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

/*
 * convert_<type>1_sat aliases for size-1 pasting. The uchar<n>_sat entries are defined as
 * themselves, which leaves the name untouched when expanded.
 */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

/* VEC_DATA_TYPE(type, size) pastes <type><size> after expanding both arguments. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* CONVERT(x, type) expands to (convert_<type>((x))). */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

/* CONVERT_SAT(x, type) expands to (convert_<type>_sat((x))). */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

/* CONVERT_SAT_ROUND(x, type, round) expands to (convert_<type>_sat_<round>((x))). */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

/*
 * select_vec_dt_<type>(size): integer vector type paired with <type>. Integer types map to
 * themselves; half maps to short and float maps to int.
 * NOTE(review): presumably the mask type for the select builtin - confirm.
 */
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

/*
 * signed_int_vec_dt_<type>(size): signed integer vector type paired with <type>
 * (char for 8-bit types, short for 16-bit types and half, int for 32-bit types and float,
 * long for 64-bit types).
 */
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

/*
 * sum_reduce_<n>(x): sum of the n lanes of x, written as an unparenthesised chain
 * lane0 + lane1 + ... The chain form is kept as is so the left-to-right evaluation order
 * (and therefore floating-point results) does not change; use SUM_REDUCE from expressions.
 */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* The result is wrapped in parentheses so that SUM_REDUCE(x, n) is a single operand, e.g. in
 * k * SUM_REDUCE(x, n). */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

/*
 * prod_reduce_<n>(x): product of the n lanes of x, as an unparenthesised chain
 * (kept for the same evaluation-order reason as sum_reduce_<n>).
 */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* Wrapped in parentheses so that PROD_REDUCE(x, n) is a single operand, e.g. as a divisor. */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

/* max_reduce_<n>(x): maximum of the n lanes of x, built from pairwise max calls. */
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
989*c217d954SCole Faust#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size) 990*c217d954SCole Faust 991*c217d954SCole Faust#define VECTOR_DECLARATION(name) \ 992*c217d954SCole Faust __global uchar *name##_ptr, \ 993*c217d954SCole Faust uint name##_stride_x, \ 994*c217d954SCole Faust uint name##_step_x, \ 995*c217d954SCole Faust uint name##_offset_first_element_in_bytes 996*c217d954SCole Faust 997*c217d954SCole Faust#define IMAGE_DECLARATION(name) \ 998*c217d954SCole Faust __global uchar *name##_ptr, \ 999*c217d954SCole Faust uint name##_stride_x, \ 1000*c217d954SCole Faust uint name##_step_x, \ 1001*c217d954SCole Faust uint name##_stride_y, \ 1002*c217d954SCole Faust uint name##_step_y, \ 1003*c217d954SCole Faust uint name##_offset_first_element_in_bytes 1004*c217d954SCole Faust 1005*c217d954SCole Faust#define TENSOR3D_DECLARATION(name) \ 1006*c217d954SCole Faust __global uchar *name##_ptr, \ 1007*c217d954SCole Faust uint name##_stride_x, \ 1008*c217d954SCole Faust uint name##_step_x, \ 1009*c217d954SCole Faust uint name##_stride_y, \ 1010*c217d954SCole Faust uint name##_step_y, \ 1011*c217d954SCole Faust uint name##_stride_z, \ 1012*c217d954SCole Faust uint name##_step_z, \ 1013*c217d954SCole Faust uint name##_offset_first_element_in_bytes 1014*c217d954SCole Faust 1015*c217d954SCole Faust#define TENSOR4D_DECLARATION(name) \ 1016*c217d954SCole Faust __global uchar *name##_ptr, \ 1017*c217d954SCole Faust uint name##_stride_x, \ 1018*c217d954SCole Faust uint name##_step_x, \ 1019*c217d954SCole Faust uint name##_stride_y, \ 1020*c217d954SCole Faust uint name##_step_y, \ 1021*c217d954SCole Faust uint name##_stride_z, \ 1022*c217d954SCole Faust uint name##_step_z, \ 1023*c217d954SCole Faust uint name##_stride_w, \ 1024*c217d954SCole Faust uint name##_step_w, \ 1025*c217d954SCole Faust uint name##_offset_first_element_in_bytes 1026*c217d954SCole Faust 1027*c217d954SCole Faust#define TENSOR5D_DECLARATION(name) \ 1028*c217d954SCole Faust __global uchar *name##_ptr, \ 
1029*c217d954SCole Faust uint name##_stride_x, \ 1030*c217d954SCole Faust uint name##_step_x, \ 1031*c217d954SCole Faust uint name##_stride_y, \ 1032*c217d954SCole Faust uint name##_step_y, \ 1033*c217d954SCole Faust uint name##_stride_z, \ 1034*c217d954SCole Faust uint name##_step_z, \ 1035*c217d954SCole Faust uint name##_stride_w, \ 1036*c217d954SCole Faust uint name##_step_w, \ 1037*c217d954SCole Faust uint name##_stride_v, \ 1038*c217d954SCole Faust uint name##_step_v, \ 1039*c217d954SCole Faust uint name##_offset_first_element_in_bytes 1040*c217d954SCole Faust 1041*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \ 1042*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 1043*c217d954SCole Faust 1044*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 1045*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 1046*c217d954SCole Faust 1047*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \ 1048*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 1049*c217d954SCole Faust 1050*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 1051*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 1052*c217d954SCole Faust 1053*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1054*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1055*c217d954SCole Faust 1056*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 1057*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, 
name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 1058*c217d954SCole Faust 1059*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 1060*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 1061*c217d954SCole Faust 1062*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 1063*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1064*c217d954SCole Faust name##_stride_z, name##_step_z) 1065*c217d954SCole Faust 1066*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 1067*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 1068*c217d954SCole Faust 1069*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 1070*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1071*c217d954SCole Faust name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 1072*c217d954SCole Faust 1073*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 1074*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 1075*c217d954SCole Faust 1076*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 1077*c217d954SCole Faust tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 1078*c217d954SCole Faust name##_stride_z, 
name##_step_z) 1079*c217d954SCole Faust 1080*c217d954SCole Faust 1081*c217d954SCole Fausttypedef struct Vector 1082*c217d954SCole Faust{ 1083*c217d954SCole Faust __global uchar *ptr; 1084*c217d954SCole Faust int offset_first_element_in_bytes; 1085*c217d954SCole Faust int stride_x; 1086*c217d954SCole Faust} Vector; 1087*c217d954SCole Faust 1088*c217d954SCole Faust 1089*c217d954SCole Fausttypedef struct Image 1090*c217d954SCole Faust{ 1091*c217d954SCole Faust __global uchar *ptr; 1092*c217d954SCole Faust int offset_first_element_in_bytes; 1093*c217d954SCole Faust int stride_x; 1094*c217d954SCole Faust int stride_y; 1095*c217d954SCole Faust} Image; 1096*c217d954SCole Faust 1097*c217d954SCole Faust 1098*c217d954SCole Fausttypedef struct Tensor3D 1099*c217d954SCole Faust{ 1100*c217d954SCole Faust __global uchar *ptr; 1101*c217d954SCole Faust int offset_first_element_in_bytes; 1102*c217d954SCole Faust int stride_x; 1103*c217d954SCole Faust int stride_y; 1104*c217d954SCole Faust int stride_z; 1105*c217d954SCole Faust} Tensor3D; 1106*c217d954SCole Faust 1107*c217d954SCole Faust 1108*c217d954SCole Fausttypedef struct Tensor4D 1109*c217d954SCole Faust{ 1110*c217d954SCole Faust __global uchar *ptr; 1111*c217d954SCole Faust int offset_first_element_in_bytes; 1112*c217d954SCole Faust int stride_x; 1113*c217d954SCole Faust int stride_y; 1114*c217d954SCole Faust int stride_z; 1115*c217d954SCole Faust int stride_w; 1116*c217d954SCole Faust} Tensor4D; 1117*c217d954SCole Faust 1118*c217d954SCole Faust 1119*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 1120*c217d954SCole Faust{ 1121*c217d954SCole Faust Vector vector = 1122*c217d954SCole Faust { 1123*c217d954SCole Faust .ptr = ptr, 1124*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1125*c217d954SCole Faust .stride_x = stride_x, 1126*c217d954SCole Faust }; 1127*c217d954SCole Faust vector.ptr += 
vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 1128*c217d954SCole Faust return vector; 1129*c217d954SCole Faust} 1130*c217d954SCole Faust 1131*c217d954SCole Faust 1132*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 1133*c217d954SCole Faust{ 1134*c217d954SCole Faust Image img = 1135*c217d954SCole Faust { 1136*c217d954SCole Faust .ptr = ptr, 1137*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1138*c217d954SCole Faust .stride_x = stride_x, 1139*c217d954SCole Faust .stride_y = stride_y 1140*c217d954SCole Faust }; 1141*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 1142*c217d954SCole Faust return img; 1143*c217d954SCole Faust} 1144*c217d954SCole Faust 1145*c217d954SCole Faust 1146*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1147*c217d954SCole Faust{ 1148*c217d954SCole Faust Image img = 1149*c217d954SCole Faust { 1150*c217d954SCole Faust .ptr = ptr, 1151*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1152*c217d954SCole Faust .stride_x = stride_x, 1153*c217d954SCole Faust .stride_y = stride_y 1154*c217d954SCole Faust }; 1155*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1156*c217d954SCole Faust return img; 1157*c217d954SCole Faust} 1158*c217d954SCole Faust 1159*c217d954SCole Faust 1160*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1161*c217d954SCole Faust{ 
1162*c217d954SCole Faust Tensor3D tensor = 1163*c217d954SCole Faust { 1164*c217d954SCole Faust .ptr = ptr, 1165*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1166*c217d954SCole Faust .stride_x = stride_x, 1167*c217d954SCole Faust .stride_y = stride_y, 1168*c217d954SCole Faust .stride_z = stride_z 1169*c217d954SCole Faust }; 1170*c217d954SCole Faust tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 1171*c217d954SCole Faust return tensor; 1172*c217d954SCole Faust} 1173*c217d954SCole Faust 1174*c217d954SCole Faust 1175*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 1176*c217d954SCole Faust{ 1177*c217d954SCole Faust Tensor3D tensor = 1178*c217d954SCole Faust { 1179*c217d954SCole Faust .ptr = ptr, 1180*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1181*c217d954SCole Faust .stride_x = stride_x, 1182*c217d954SCole Faust .stride_y = stride_y, 1183*c217d954SCole Faust .stride_z = stride_z 1184*c217d954SCole Faust }; 1185*c217d954SCole Faust return tensor; 1186*c217d954SCole Faust} 1187*c217d954SCole Faust 1188*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 1189*c217d954SCole Faust uint step_w, 1190*c217d954SCole Faust uint mod_size) 1191*c217d954SCole Faust{ 1192*c217d954SCole Faust Tensor4D tensor = 1193*c217d954SCole Faust { 1194*c217d954SCole Faust .ptr = ptr, 1195*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 1196*c217d954SCole Faust .stride_x = stride_x, 1197*c217d954SCole Faust .stride_y = stride_y, 1198*c217d954SCole Faust .stride_z = stride_z, 
1199*c217d954SCole Faust .stride_w = stride_w 1200*c217d954SCole Faust }; 1201*c217d954SCole Faust 1202*c217d954SCole Faust tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 1203*c217d954SCole Faust return tensor; 1204*c217d954SCole Faust} 1205*c217d954SCole Faust 1206*c217d954SCole Faust 1207*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x) 1208*c217d954SCole Faust{ 1209*c217d954SCole Faust return vec->ptr + x * vec->stride_x; 1210*c217d954SCole Faust} 1211*c217d954SCole Faust 1212*c217d954SCole Faust 1213*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y) 1214*c217d954SCole Faust{ 1215*c217d954SCole Faust return img->ptr + x * img->stride_x + y * img->stride_y; 1216*c217d954SCole Faust} 1217*c217d954SCole Faust 1218*c217d954SCole Faust 1219*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 1220*c217d954SCole Faust{ 1221*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 1222*c217d954SCole Faust} 1223*c217d954SCole Faust 1224*c217d954SCole Faust 1225*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 1226*c217d954SCole Faust{ 1227*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 1228*c217d954SCole Faust} 1229*c217d954SCole Faust 1230*c217d954SCole Faust 1231*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 1232*c217d954SCole Faust{ 1233*c217d954SCole Faust uint num_elements = width * height; 1234*c217d954SCole Faust 1235*c217d954SCole Faust const uint z = index / num_elements; 1236*c217d954SCole Faust 
1237*c217d954SCole Faust index %= num_elements; 1238*c217d954SCole Faust 1239*c217d954SCole Faust const uint y = index / width; 1240*c217d954SCole Faust 1241*c217d954SCole Faust index %= width; 1242*c217d954SCole Faust 1243*c217d954SCole Faust const uint x = index; 1244*c217d954SCole Faust 1245*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 1246*c217d954SCole Faust} 1247*c217d954SCole Faust 1248*c217d954SCole Faust#endif 1249*c217d954SCole Faust 1250*c217d954SCole Faust#if GPU_ARCH == GPU_ARCH_BIFROST 1251*c217d954SCole Faust#define MLA(a, b, c) (fma(c, b, a)) 1252*c217d954SCole Faust#else 1253*c217d954SCole Faust#define MLA(a, b, c) ((b) * (c) + (a)) 1254*c217d954SCole Faust#endif 1255*c217d954SCole Faust 1256*c217d954SCole Faust 1257*c217d954SCole Faust#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667)) 1258*c217d954SCole Faust 1259*c217d954SCole Faust 1260*c217d954SCole Faust#define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x))) 1261*c217d954SCole Faust 1262*c217d954SCole Faust 1263*c217d954SCole Faust#define tanh_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x)) 1264*c217d954SCole Faust 1265*c217d954SCole Faust 1266*c217d954SCole Faust#define relu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x)) 1267*c217d954SCole Faust 1268*c217d954SCole Faust 1269*c217d954SCole Faust#define brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x))) 1270*c217d954SCole Faust 1271*c217d954SCole Faust 1272*c217d954SCole Faust#define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL)) 1273*c217d954SCole Faust 1274*c217d954SCole Faust 1275*c217d954SCole Faust#define lrelu_op(DATA_TYPE, 
VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0)) 1276*c217d954SCole Faust 1277*c217d954SCole Faust 1278*c217d954SCole Faust#define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x))) 1279*c217d954SCole Faust 1280*c217d954SCole Faust 1281*c217d954SCole Faust#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0))) 1282*c217d954SCole Faust 1283*c217d954SCole Faust 1284*c217d954SCole Faust#define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x)) 1285*c217d954SCole Faust 1286*c217d954SCole Faust 1287*c217d954SCole Faust#define square_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * x) 1288*c217d954SCole Faust 1289*c217d954SCole Faust 1290*c217d954SCole Faust#define sqrt_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (sqrt(x)) 1291*c217d954SCole Faust 1292*c217d954SCole Faust 1293*c217d954SCole Faust#define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x)) 1294*c217d954SCole Faust 1295*c217d954SCole Faust 1296*c217d954SCole Faust#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237))) 1297*c217d954SCole Faust 1298*c217d954SCole Faust 1299*c217d954SCole Faust#define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x) 1300*c217d954SCole Faust 1301*c217d954SCole Faust#define ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 1302*c217d954SCole Faust 1303*c217d954SCole Faust#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) 1304*c217d954SCole Faust 1305*c217d954SCole Faust#ifndef ARM_COMPUTE_HELPER_H 1306*c217d954SCole Faust#define ARM_COMPUTE_HELPER_H 1307*c217d954SCole Faust 1308*c217d954SCole Faust 1309*c217d954SCole Faust 1310*c217d954SCole Faust 
/** Store rows 0..(M-1) of a block, one vector store per row.
 *
 * STORE_ROW_M expands to M consecutive statements. Statement i passes the
 * variable BASENAME##i, an offset of 0, and the address
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i) to whatever VSTORE(N0)
 * expands to. The row suffix i is a single hexadecimal digit: 0-9, then A-F
 * for rows 10-15. VSTORE is defined outside this section.
 *
 * @param[in] N0        Argument forwarded to VSTORE (the vector size)
 * @param[in] DATA_TYPE Element type used for the destination pointer cast
 * @param[in] BASENAME  Prefix of the per-row source variables (BASENAME##0, ...)
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Amount added to PTR per row
 * @param[in] Z         Prefix of the per-row extra offsets (Z##0, ...)
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** Convert and store rows 0..(M-1) of a block, one vector store per row.
 *
 * Same addressing as STORE_ROW_M, but each row value is first passed through
 * CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)). CONVERT_SAT and
 * VEC_DATA_TYPE are defined outside this section (presumably a saturating
 * conversion to the N0-wide vector of DATA_TYPE - confirm against their
 * definitions).
 *
 * Every macro in the chain must name its second parameter DATA_TYPE: the
 * bodies refer to DATA_TYPE, so any other parameter name would leave that
 * token unsubstituted and silently drop the caller's type argument.
 *
 * @param[in] N0        Argument forwarded to VSTORE and VEC_DATA_TYPE
 * @param[in] DATA_TYPE Element type of the destination
 * @param[in] BASENAME  Prefix of the per-row source variables
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Amount added to PTR per row
 * @param[in] Z         Prefix of the per-row extra offsets
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

/* The second parameter is DATA_TYPE (it was previously spelled DATA, which
 * left the DATA_TYPE tokens in the body unbound to the caller's argument). */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** Store a block of M0 rows by dispatching to STORE_ROW_##M0.
 *
 * STORE_BLOCK forwards to STORE_BLOCK_STR so that M0 is macro-expanded before
 * it is pasted onto STORE_ROW_; M0 must therefore expand to a literal 1..16.
 * The remaining parameters are forwarded unchanged to STORE_ROW_M0.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Convert and store a block of M0 rows by dispatching to CONVERT_STORE_ROW_##M0.
 *
 * Uses the same two-level expansion as STORE_BLOCK / STORE_BLOCK_STR; M0 must
 * expand to a literal 1..16.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Partially store rows 0..(M-1) of a block, one store per row.
 *
 * Same addressing as STORE_ROW_M, but each row is written through whatever
 * VSTORE_PARTIAL(N0, STORE_N0) expands to. VSTORE_PARTIAL is defined outside
 * this section (presumably it writes only the first STORE_N0 of the N0
 * elements - confirm against its definition).
 *
 * @param[in] N0        Size of the source vectors
 * @param[in] STORE_N0  Second argument forwarded to VSTORE_PARTIAL
 * @param[in] DATA_TYPE Element type used for the destination pointer cast
 * @param[in] BASENAME  Prefix of the per-row source variables
 * @param[in] PTR       Base address of the destination
 * @param[in] STRIDE_Y  Amount added to PTR per row
 * @param[in] Z         Prefix of the per-row extra offsets
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
Faust 1533*c217d954SCole Faust#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1534*c217d954SCole Faust STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1535*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1536*c217d954SCole Faust (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A)); 1537*c217d954SCole Faust 1538*c217d954SCole Faust#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1539*c217d954SCole Faust STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1540*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1541*c217d954SCole Faust (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B)); 1542*c217d954SCole Faust 1543*c217d954SCole Faust#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1544*c217d954SCole Faust STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1545*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1546*c217d954SCole Faust (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C)); 1547*c217d954SCole Faust 1548*c217d954SCole Faust#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1549*c217d954SCole Faust STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1550*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1551*c217d954SCole Faust (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D)); 1552*c217d954SCole Faust 1553*c217d954SCole Faust#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1554*c217d954SCole Faust STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1555*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1556*c217d954SCole Faust (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E)); 1557*c217d954SCole Faust 1558*c217d954SCole Faust#define STORE_ROW_PARTIAL_16(N0, STORE_N0, 
DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1559*c217d954SCole Faust STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \ 1560*c217d954SCole Faust VSTORE_PARTIAL(N0, STORE_N0) \ 1561*c217d954SCole Faust (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F)); 1562*c217d954SCole Faust 1563*c217d954SCole Faust 1564*c217d954SCole Faust 1565*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1566*c217d954SCole Faust#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1567*c217d954SCole Faust 1568*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1569*c217d954SCole Faust if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \ 1570*c217d954SCole Faust { \ 1571*c217d954SCole Faust STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1572*c217d954SCole Faust } \ 1573*c217d954SCole Faust else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \ 1574*c217d954SCole Faust { \ 1575*c217d954SCole Faust STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1576*c217d954SCole Faust } \ 1577*c217d954SCole Faust else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \ 1578*c217d954SCole Faust { \ 1579*c217d954SCole Faust STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1580*c217d954SCole Faust } \ 1581*c217d954SCole Faust else \ 1582*c217d954SCole Faust { \ 1583*c217d954SCole Faust STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1584*c217d954SCole Faust } 1585*c217d954SCole Faust 1586*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, 
DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \ 1587*c217d954SCole Faust if(!(PARTIAL_COND_X)) \ 1588*c217d954SCole Faust { \ 1589*c217d954SCole Faust STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1590*c217d954SCole Faust } \ 1591*c217d954SCole Faust else \ 1592*c217d954SCole Faust { \ 1593*c217d954SCole Faust STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1594*c217d954SCole Faust } 1595*c217d954SCole Faust 1596*c217d954SCole Faust#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \ 1597*c217d954SCole Faust if(!(PARTIAL_COND_Y)) \ 1598*c217d954SCole Faust { \ 1599*c217d954SCole Faust STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1600*c217d954SCole Faust } \ 1601*c217d954SCole Faust else \ 1602*c217d954SCole Faust { \ 1603*c217d954SCole Faust STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \ 1604*c217d954SCole Faust } 1605*c217d954SCole Faust 1606*c217d954SCole Faust 1607*c217d954SCole Faust#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 1608*c217d954SCole Faust 1609*c217d954SCole Faust 1610*c217d954SCole Faust#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0 1611*c217d954SCole Faust 1612*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1613*c217d954SCole Faust STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 1614*c217d954SCole Faust 1615*c217d954SCole Faust#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0 1616*c217d954SCole Faust 1617*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1618*c217d954SCole Faust STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, 
PARTIAL_STORE_M0, PARTIAL_COND_Y) 1619*c217d954SCole Faust 1620*c217d954SCole Faust#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0 1621*c217d954SCole Faust 1622*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1623*c217d954SCole Faust STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) 1624*c217d954SCole Faust 1625*c217d954SCole Faust#else 1626*c217d954SCole Faust 1627*c217d954SCole Faust#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \ 1628*c217d954SCole Faust STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) 1629*c217d954SCole Faust 1630*c217d954SCole Faust#endif 1631*c217d954SCole Faust 1632*c217d954SCole Faust#endif 1633*c217d954SCole Faust 1634*c217d954SCole Faust 1635*c217d954SCole Faust#if defined(PARTIAL_STORE_M0) 1636*c217d954SCole Faust 1637*c217d954SCole Faust#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1638*c217d954SCole Faust ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0)))) 1639*c217d954SCole Faust#else 1640*c217d954SCole Faust#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \ 1641*c217d954SCole Faust ((uint)(y * M0)) 1642*c217d954SCole Faust#endif 1643*c217d954SCole Faust 1644*c217d954SCole Faust 1645*c217d954SCole Faust 1646*c217d954SCole Faust#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \ 1647*c217d954SCole Faust STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond) 1648*c217d954SCole Faust 1649*c217d954SCole Faust 1650*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) 1651*c217d954SCole Faust#pragma OPENCL EXTENSION cl_khr_fp16 : enable 
1652*c217d954SCole Faust#endif 1653*c217d954SCole Faust 1654*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 1655*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable 1656*c217d954SCole Faust#endif 1657*c217d954SCole Faust 1658*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 1659*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable 1660*c217d954SCole Faust#endif 1661*c217d954SCole Faust 1662*c217d954SCole Faust#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) 1663*c217d954SCole Faust#pragma OPENCL EXTENSION cl_arm_printf : enable 1664*c217d954SCole Faust#endif 1665*c217d954SCole Faust 1666*c217d954SCole Faust#define GPU_ARCH_MIDGARD 0x100 1667*c217d954SCole Faust#define GPU_ARCH_BIFROST 0x200 1668*c217d954SCole Faust#define GPU_ARCH_VALHALL 0x300 1669*c217d954SCole Faust 1670*c217d954SCole Faust 1671*c217d954SCole Faust#define CONCAT(a, b) a##b 1672*c217d954SCole Faust 1673*c217d954SCole Faust 1674*c217d954SCole Faust#define EXPAND(x) x 1675*c217d954SCole Faust 1676*c217d954SCole Faust 1677*c217d954SCole Faust#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) 1678*c217d954SCole Faust 1679*c217d954SCole Faust 1680*c217d954SCole Faust#define REV1(x) ((x)) 1681*c217d954SCole Faust#define REV2(x) ((x).s10) 1682*c217d954SCole Faust#define REV3(x) ((x).s210) 1683*c217d954SCole Faust#define REV4(x) ((x).s3210) 1684*c217d954SCole Faust#define REV8(x) ((x).s76543210) 1685*c217d954SCole Faust#define REV16(x) ((x).sFEDCBA9876543210) 1686*c217d954SCole Faust 1687*c217d954SCole Faust 1688*c217d954SCole Faust 1689*c217d954SCole Faust#define REVERSE_STR(x, s) REV##s((x)) 1690*c217d954SCole Faust#define REVERSE(x, s) REVERSE_STR(x, s) 1691*c217d954SCole Faust 1692*c217d954SCole Faust 1693*c217d954SCole Faust 1694*c217d954SCole Faust#define 
ROT1_0(x) ((x)) 1695*c217d954SCole Faust#define ROT1_1(x) ((x)) 1696*c217d954SCole Faust 1697*c217d954SCole Faust#define ROT2_0(x) ((x)) 1698*c217d954SCole Faust#define ROT2_1(x) ((x).s10) 1699*c217d954SCole Faust#define ROT2_2(x) ((x)) 1700*c217d954SCole Faust 1701*c217d954SCole Faust#define ROT3_0(x) ((x)) 1702*c217d954SCole Faust#define ROT3_1(x) ((x).s201) 1703*c217d954SCole Faust#define ROT3_2(x) ((x).s120) 1704*c217d954SCole Faust#define ROT3_3(x) ((x)) 1705*c217d954SCole Faust 1706*c217d954SCole Faust#define ROT4_0(x) ((x)) 1707*c217d954SCole Faust#define ROT4_1(x) ((x).s3012) 1708*c217d954SCole Faust#define ROT4_2(x) ((x).s2301) 1709*c217d954SCole Faust#define ROT4_3(x) ((x).s1230) 1710*c217d954SCole Faust#define ROT4_4(x) ((x)) 1711*c217d954SCole Faust 1712*c217d954SCole Faust#define ROT8_0(x) ((x)) 1713*c217d954SCole Faust#define ROT8_1(x) ((x).s70123456) 1714*c217d954SCole Faust#define ROT8_2(x) ((x).s67012345) 1715*c217d954SCole Faust#define ROT8_3(x) ((x).s56701234) 1716*c217d954SCole Faust#define ROT8_4(x) ((x).s45670123) 1717*c217d954SCole Faust#define ROT8_5(x) ((x).s34567012) 1718*c217d954SCole Faust#define ROT8_6(x) ((x).s23456701) 1719*c217d954SCole Faust#define ROT8_7(x) ((x).s12345670) 1720*c217d954SCole Faust#define ROT8_8(x) ((x)) 1721*c217d954SCole Faust 1722*c217d954SCole Faust#define ROT16_0(x) ((x)) 1723*c217d954SCole Faust#define ROT16_1(x) ((x).sF0123456789ABCDE) 1724*c217d954SCole Faust#define ROT16_2(x) ((x).sEF0123456789ABCD) 1725*c217d954SCole Faust#define ROT16_3(x) ((x).sDEF0123456789ABC) 1726*c217d954SCole Faust#define ROT16_4(x) ((x).sCDEF0123456789AB) 1727*c217d954SCole Faust#define ROT16_5(x) ((x).sBCDEF0123456789A) 1728*c217d954SCole Faust#define ROT16_6(x) ((x).sABCDEF0123456789) 1729*c217d954SCole Faust#define ROT16_7(x) ((x).s9ABCDEF012345678) 1730*c217d954SCole Faust#define ROT16_8(x) ((x).s89ABCDEF01234567) 1731*c217d954SCole Faust#define ROT16_9(x) ((x).s789ABCDEF0123456) 1732*c217d954SCole Faust#define ROT16_10(x) 
((x).s6789ABCDEF012345) 1733*c217d954SCole Faust#define ROT16_11(x) ((x).s56789ABCDEF01234) 1734*c217d954SCole Faust#define ROT16_12(x) ((x).s456789ABCDEF0123) 1735*c217d954SCole Faust#define ROT16_13(x) ((x).s3456789ABCDEF012) 1736*c217d954SCole Faust#define ROT16_14(x) ((x).s23456789ABCDEF01) 1737*c217d954SCole Faust#define ROT16_15(x) ((x).s123456789ABCDEF0) 1738*c217d954SCole Faust#define ROT16_16(x) ((x)) 1739*c217d954SCole Faust 1740*c217d954SCole Faust 1741*c217d954SCole Faust 1742*c217d954SCole Faust#define ROTATE_STR(x, s, n) ROT##s##_##n(x) 1743*c217d954SCole Faust#define ROTATE(x, s, n) ROTATE_STR(x, s, n) 1744*c217d954SCole Faust 1745*c217d954SCole Faust 1746*c217d954SCole Faust 1747*c217d954SCole Faust#define V_OFFS1(dt) (dt##1)(0) 1748*c217d954SCole Faust#define V_OFFS2(dt) (dt##2)(0, 1) 1749*c217d954SCole Faust#define V_OFFS3(dt) (dt##3)(0, 1, 2) 1750*c217d954SCole Faust#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3) 1751*c217d954SCole Faust#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7) 1752*c217d954SCole Faust#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) 1753*c217d954SCole Faust 1754*c217d954SCole Faust 1755*c217d954SCole Faust 1756*c217d954SCole Faust#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt) 1757*c217d954SCole Faust#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s) 1758*c217d954SCole Faust 1759*c217d954SCole Faust 1760*c217d954SCole Faust#define VLOAD_STR(size) vload##size 1761*c217d954SCole Faust#define VLOAD(size) VLOAD_STR(size) 1762*c217d954SCole Faust 1763*c217d954SCole Faust 1764*c217d954SCole Faust#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size 1765*c217d954SCole Faust#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size) 1766*c217d954SCole Faust 1767*c217d954SCole Faust#define NO_LOAD(data, offs, ptr) \ 1768*c217d954SCole Faust { \ 1769*c217d954SCole Faust } 1770*c217d954SCole Faust 1771*c217d954SCole Faust 1772*c217d954SCole Faust#define vload_partial_1_0 NO_LOAD 
/* Partial-load dispatch tables, vector sizes 1 and 2.
 *
 * vload_partial_<size>_<load_size> is what VLOAD_PARTIAL(size, load_size)
 * expands to, and is invoked as (DATA, OFFSET, PTR). Combinations with
 * load_size == 0 or load_size > size map to NO_LOAD.
 */

/* Size 1, remaining entries.
 * Loading the single element of a size-1 value assigns DATA directly. This
 * entry takes the same (DATA, OFFSET, PTR) arguments as every other entry in
 * the tables; aliasing it to the two-parameter vload1 would make the
 * three-argument invocation impossible to expand.
 */
#define vload_partial_1_1(DATA, OFFSET, PTR) \
    DATA = vload1(OFFSET, PTR);
#define vload_partial_1_2 NO_LOAD
#define vload_partial_1_3 NO_LOAD
#define vload_partial_1_4 NO_LOAD
#define vload_partial_1_5 NO_LOAD
#define vload_partial_1_6 NO_LOAD
#define vload_partial_1_7 NO_LOAD
#define vload_partial_1_8 NO_LOAD
#define vload_partial_1_9 NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

/* Size 2. */
#define vload_partial_2_0 NO_LOAD
#define vload_partial_2_1 vload_partial_1
#define vload_partial_2_2 vload_partial_2
#define vload_partial_2_3 NO_LOAD
#define vload_partial_2_4 NO_LOAD
#define vload_partial_2_5 NO_LOAD
#define vload_partial_2_6 NO_LOAD
#define vload_partial_2_7 NO_LOAD
#define vload_partial_2_8 NO_LOAD
#define vload_partial_2_9 NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

/* Partial-load dispatch tables, vector sizes 3, 4, 8 and 16.
 *
 * vload_partial_<size>_<load_size> maps to vload_partial_<load_size> when
 * 1 <= load_size <= size, and to NO_LOAD otherwise.
 */

/* Size 3. */
#define vload_partial_3_0 NO_LOAD
#define vload_partial_3_1 vload_partial_1
#define vload_partial_3_2 vload_partial_2
#define vload_partial_3_3 vload_partial_3
#define vload_partial_3_4 NO_LOAD
#define vload_partial_3_5 NO_LOAD
#define vload_partial_3_6 NO_LOAD
#define vload_partial_3_7 NO_LOAD
#define vload_partial_3_8 NO_LOAD
#define vload_partial_3_9 NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

/* Size 4. */
#define vload_partial_4_0 NO_LOAD
#define vload_partial_4_1 vload_partial_1
#define vload_partial_4_2 vload_partial_2
#define vload_partial_4_3 vload_partial_3
#define vload_partial_4_4 vload_partial_4
#define vload_partial_4_5 NO_LOAD
#define vload_partial_4_6 NO_LOAD
#define vload_partial_4_7 NO_LOAD
#define vload_partial_4_8 NO_LOAD
#define vload_partial_4_9 NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

/* Size 8. */
#define vload_partial_8_0 NO_LOAD
#define vload_partial_8_1 vload_partial_1
#define vload_partial_8_2 vload_partial_2
#define vload_partial_8_3 vload_partial_3
#define vload_partial_8_4 vload_partial_4
#define vload_partial_8_5 vload_partial_5
#define vload_partial_8_6 vload_partial_6
#define vload_partial_8_7 vload_partial_7
#define vload_partial_8_8 vload_partial_8
#define vload_partial_8_9 NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

/* Size 16. */
#define vload_partial_16_0 NO_LOAD
#define vload_partial_16_1 vload_partial_1
#define vload_partial_16_2 vload_partial_2
#define vload_partial_16_3 vload_partial_3
#define vload_partial_16_4 vload_partial_4
#define vload_partial_16_5 vload_partial_5
#define vload_partial_16_6 vload_partial_6
#define vload_partial_16_7 vload_partial_7
#define vload_partial_16_8 vload_partial_8
#define vload_partial_16_9 vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

/* vload_partial_<n>(DATA, OFFSET, PTR): fill the leading n components of DATA.
 *
 * Counts 1, 2, 3, 4, 8 and 16 map to a single vload. The other counts are
 * composed from a 4- or 8-wide load of the low part followed by a smaller
 * load of the remainder at PTR + 4 or PTR + 8. PTR is parenthesized in those
 * sums so that an expression argument keeps its grouping.
 * Each expansion is one or two statements that already end in a semicolon.
 */
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, (PTR) + 4);

#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, (PTR) + 4);

#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, (PTR) + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, (PTR) + 8);

#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, (PTR) + 8);

#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, (PTR) + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, (PTR) + 8);

/* For 13..15 the whole upper half is handed to the 5/6/7 helper, which then
 * selects the leading components of that half.
 */
#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, (PTR) + 8);

#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, (PTR) + 8);

#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, (PTR) + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

/* Pixel units per vector size: one unit per four vector elements. */
#define PIXEL_UNIT4 1
#define PIXEL_UNIT8 2
#define PIXEL_UNIT16 4

/* Pastes vec_size onto PIXEL_UNIT without expanding vec_size first. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
/* PIXEL_UNIT<vec_size>, with vec_size macro-expanded before the paste. */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

/* read_image2d_<type>x<n>(img, x_coord, y_coord)
 *
 * Reads n consecutive pixels starting at (x_coord, y_coord) and packs the n
 * four-component results into one 4*n-wide vector. x_coord is parenthesized
 * where an offset is added so that an expression argument keeps its grouping.
 * The expansion already ends in a semicolon.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) \
    (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord)      \
    (float8)(read_imagef(img, (int2)(x_coord, y_coord)), \
             read_imagef(img, (int2)((x_coord) + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord)             \
    (float16)(read_imagef(img, (int2)(x_coord, y_coord)),       \
              read_imagef(img, (int2)((x_coord) + 1, y_coord)), \
              read_imagef(img, (int2)((x_coord) + 2, y_coord)), \
              read_imagef(img, (int2)((x_coord) + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) \
    (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord)      \
    (half8)(read_imageh(img, (int2)(x_coord, y_coord)), \
            read_imageh(img, (int2)((x_coord) + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord)             \
    (half16)(read_imageh(img, (int2)(x_coord, y_coord)),       \
             read_imageh(img, (int2)((x_coord) + 1, y_coord)), \
             read_imageh(img, (int2)((x_coord) + 2, y_coord)), \
             read_imageh(img, (int2)((x_coord) + 3, y_coord)));
#endif

/* write_image2d_<type>x<n>(img, x_coord, y_coord, values)
 *
 * Writes n consecutive pixels starting at (x_coord, y_coord); pixel k receives
 * components 4k .. 4k+3 of values. x_coord and values are parenthesized where
 * an operator is applied to them. The expansion already ends in a semicolon.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values)      \
    (write_imagef(img, (int2)(x_coord, y_coord), (values).s0123), \
     write_imagef(img, (int2)((x_coord) + 1, y_coord), (values).s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values)            \
    (write_imagef(img, (int2)(x_coord, y_coord), (values).s0123),       \
     write_imagef(img, (int2)((x_coord) + 1, y_coord), (values).s4567), \
     write_imagef(img, (int2)((x_coord) + 2, y_coord), (values).s89AB), \
     write_imagef(img, (int2)((x_coord) + 3, y_coord), (values).sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values)       \
    (write_imageh(img, (int2)(x_coord, y_coord), (values).s0123), \
     write_imageh(img, (int2)((x_coord) + 1, y_coord), (values).s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values)             \
    (write_imageh(img, (int2)(x_coord, y_coord), (values).s0123),       \
     write_imageh(img, (int2)((x_coord) + 1, y_coord), (values).s4567), \
     write_imageh(img, (int2)((x_coord) + 2, y_coord), (values).s89AB), \
     write_imageh(img, (int2)((x_coord) + 3, y_coord), (values).sCDEF));
#endif

/* READ_IMAGE2D(data_type, n0, ...): read_image2d_<data_type>x<n0>, with
 * data_type and n0 expanded before pasting.
 */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/* WRITE_IMAGE2D(data_type, n0, ...): write_image2d_<data_type>x<n0>, with
 * data_type and n0 expanded before pasting.
 */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

/* VSTORE(size): name of the vstore<size> routine. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)

/* Size-1 type aliases so that a type name pasted with 1 is the scalar type. */
#define float1 float
#define half1 half
/* Size-1 type aliases, continued: a type name pasted with 1 is the scalar
 * type itself.
 */
#define char1 char
#define uchar1 uchar
#define short1 short
#define ushort1 ushort
#define int1 int
#define uint1 uint
#define long1 long
#define ulong1 ulong
#define double1 double

/* Scalar counterparts of vloadN / vstoreN.
 *
 * vload1 yields the element at PTR + OFFSET and vstore1 assigns DATA to it.
 * The arguments and the whole expansion are parenthesized so that expression
 * arguments (for example a conditional pointer) and surrounding operators
 * (for example a member access on the result) bind as the caller wrote them.
 */
#define vload1(OFFSET, PTR) (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))

/* VSTORE_PARTIAL(size, store_size): name of the table entry
 * vstore_partial_<size>_<store_size>, to be invoked as (DATA, OFFSET, PTR).
 */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)

/* Table entry for combinations that store nothing: expands to an empty block. */
#define NO_STORE(data, offs, ptr) \
    {                             \
    }

/* Partial-store dispatch table, vector size 1 (the _1_16 entry follows). */
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
#define vstore_partial_1_4 NO_STORE
#define vstore_partial_1_5 NO_STORE
#define vstore_partial_1_6 NO_STORE
#define vstore_partial_1_7 NO_STORE
#define vstore_partial_1_8 NO_STORE
#define vstore_partial_1_9 NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
/* Partial-store dispatch tables.
 *
 * vstore_partial_<size>_<store_size> is what VSTORE_PARTIAL(size, store_size)
 * expands to. It maps to vstore_partial_<store_size> when
 * 1 <= store_size <= size, and to NO_STORE otherwise.
 */

/* Size 1, last entry. */
#define vstore_partial_1_16  NO_STORE

/* Size 2. */
#define vstore_partial_2_0   NO_STORE
#define vstore_partial_2_1   vstore_partial_1
#define vstore_partial_2_2   vstore_partial_2
#define vstore_partial_2_3   NO_STORE
#define vstore_partial_2_4   NO_STORE
#define vstore_partial_2_5   NO_STORE
#define vstore_partial_2_6   NO_STORE
#define vstore_partial_2_7   NO_STORE
#define vstore_partial_2_8   NO_STORE
#define vstore_partial_2_9   NO_STORE
#define vstore_partial_2_10  NO_STORE
#define vstore_partial_2_11  NO_STORE
#define vstore_partial_2_12  NO_STORE
#define vstore_partial_2_13  NO_STORE
#define vstore_partial_2_14  NO_STORE
#define vstore_partial_2_15  NO_STORE
#define vstore_partial_2_16  NO_STORE

/* Size 3. */
#define vstore_partial_3_0   NO_STORE
#define vstore_partial_3_1   vstore_partial_1
#define vstore_partial_3_2   vstore_partial_2
#define vstore_partial_3_3   vstore_partial_3
#define vstore_partial_3_4   NO_STORE
#define vstore_partial_3_5   NO_STORE
#define vstore_partial_3_6   NO_STORE
#define vstore_partial_3_7   NO_STORE
#define vstore_partial_3_8   NO_STORE
#define vstore_partial_3_9   NO_STORE
#define vstore_partial_3_10  NO_STORE
#define vstore_partial_3_11  NO_STORE
#define vstore_partial_3_12  NO_STORE
#define vstore_partial_3_13  NO_STORE
#define vstore_partial_3_14  NO_STORE
#define vstore_partial_3_15  NO_STORE
#define vstore_partial_3_16  NO_STORE

/* Size 4. */
#define vstore_partial_4_0   NO_STORE
#define vstore_partial_4_1   vstore_partial_1
#define vstore_partial_4_2   vstore_partial_2
#define vstore_partial_4_3   vstore_partial_3
#define vstore_partial_4_4   vstore_partial_4
#define vstore_partial_4_5   NO_STORE
#define vstore_partial_4_6   NO_STORE
#define vstore_partial_4_7   NO_STORE
#define vstore_partial_4_8   NO_STORE
#define vstore_partial_4_9   NO_STORE
#define vstore_partial_4_10  NO_STORE
#define vstore_partial_4_11  NO_STORE
#define vstore_partial_4_12  NO_STORE
#define vstore_partial_4_13  NO_STORE
#define vstore_partial_4_14  NO_STORE
#define vstore_partial_4_15  NO_STORE
#define vstore_partial_4_16  NO_STORE

/* Size 8, entries 0 to 9 (the remaining entries follow). */
#define vstore_partial_8_0   NO_STORE
#define vstore_partial_8_1   vstore_partial_1
#define vstore_partial_8_2   vstore_partial_2
#define vstore_partial_8_3   vstore_partial_3
#define vstore_partial_8_4   vstore_partial_4
#define vstore_partial_8_5   vstore_partial_5
#define vstore_partial_8_6   vstore_partial_6
#define vstore_partial_8_7   vstore_partial_7
#define vstore_partial_8_8   vstore_partial_8
#define vstore_partial_8_9   NO_STORE
/* Partial-store dispatch table, size 8, remaining entries: a store count
 * larger than the vector size stores nothing.
 */
#define vstore_partial_8_10   NO_STORE
#define vstore_partial_8_11   NO_STORE
#define vstore_partial_8_12   NO_STORE
#define vstore_partial_8_13   NO_STORE
#define vstore_partial_8_14   NO_STORE
#define vstore_partial_8_15   NO_STORE
#define vstore_partial_8_16   NO_STORE

/* Size 16: every count from 1 to 16 has a matching helper. */
#define vstore_partial_16_0   NO_STORE
#define vstore_partial_16_1   vstore_partial_1
#define vstore_partial_16_2   vstore_partial_2
#define vstore_partial_16_3   vstore_partial_3
#define vstore_partial_16_4   vstore_partial_4
#define vstore_partial_16_5   vstore_partial_5
#define vstore_partial_16_6   vstore_partial_6
#define vstore_partial_16_7   vstore_partial_7
#define vstore_partial_16_8   vstore_partial_8
#define vstore_partial_16_9   vstore_partial_9
#define vstore_partial_16_10  vstore_partial_10
#define vstore_partial_16_11  vstore_partial_11
#define vstore_partial_16_12  vstore_partial_12
#define vstore_partial_16_13  vstore_partial_13
#define vstore_partial_16_14  vstore_partial_14
#define vstore_partial_16_15  vstore_partial_15
#define vstore_partial_16_16  vstore_partial_16

/* vstore_partial_<n>(DATA, OFFSET, PTR): store the leading n components of
 * DATA. Each expansion already ends in a semicolon.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

/* vstore_partial_<n>(DATA, OFFSET, PTR), n = 3 .. 16: store the leading n
 * components of DATA.
 *
 * Counts 3, 4, 8 and 16 map to a single vstore. The other counts are composed
 * from a 4- or 8-wide store of the low part followed by a smaller store of the
 * remainder at PTR + 4 or PTR + 8. PTR is parenthesized in those sums so that
 * an expression argument keeps its grouping.
 * Each expansion is one or two statements that already end in a semicolon.
 */
#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, (PTR) + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, (PTR) + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, (PTR) + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, (PTR) + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, (PTR) + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, (PTR) + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, (PTR) + 8);

/* For 13..15 the whole upper half is handed to the 5/6/7 helper, which then
 * selects the leading components of that half.
 */
#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, (PTR) + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, (PTR) + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, (PTR) + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

/* Saturating conversions to floating-point types.
 *
 * OpenCL C has no _sat conversions to floating-point types, so these names
 * map to the plain conversion of the same destination type. This lets a
 * type-generic convert_<type>_sat paste work for float and half as well.
 * convert_half_sat maps to convert_half, in line with the sized half entries.
 */
#define convert_float_sat convert_float
#define convert_float1_sat convert_float
#define convert_float2_sat convert_float2
#define convert_float3_sat convert_float3
#define convert_float4_sat convert_float4
#define convert_float8_sat convert_float8
#define convert_float16_sat convert_float16
#define convert_half_sat convert_half
#define convert_half1_sat convert_half
#define convert_half2_sat convert_half2
#define convert_half3_sat convert_half3
#define convert_half4_sat convert_half4
#define convert_half8_sat convert_half8
#define convert_half16_sat convert_half16

/* Size-1 conversions: convert_<type>1 is the scalar conversion. */
#define convert_float1 convert_float
#define convert_half1 convert_half
#define convert_char1 convert_char
#define convert_uchar1 convert_uchar
#define convert_short1 convert_short
#define convert_ushort1 convert_ushort
#define convert_int1 convert_int
#define convert_uint1 convert_uint
#define convert_long1 convert_long
#define convert_ulong1 convert_ulong
#define convert_double1 convert_double

/* Size-1 saturating conversions. The convert_ucharN_sat entries expand to
 * themselves and are kept so the names stay defined as macros. double is a
 * floating-point type, so its entry maps to the plain conversion like float
 * and half above.
 */
#define convert_char1_sat convert_char_sat
#define convert_uchar1_sat convert_uchar_sat
#define convert_uchar2_sat convert_uchar2_sat
#define convert_uchar3_sat convert_uchar3_sat
#define convert_uchar4_sat convert_uchar4_sat
#define convert_uchar8_sat convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double

/* VEC_DATA_TYPE(type, size): the type name <type><size>, with both arguments
 * expanded before pasting.
 */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* CONVERT_STR(x, type): convert_<type>(x); type is pasted without expansion. */
#define CONVERT_STR(x, type) (convert_##type((x)))
/* CONVERT(x, type): expands to convert_<type>(x) via CONVERT_STR, after macro
 * arguments have been expanded. */
#define CONVERT(x, type) CONVERT_STR(x, type)

/* CONVERT_SAT(x, type): expands to convert_<type>_sat(x). */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

/* CONVERT_SAT_ROUND(x, type, round): expands to convert_<type>_sat_<round>(x). */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

/* SELECT_VEC_DATA_TYPE(type, size): per-type lookup of a vector type name.
 * Integer types map to themselves, half maps to short and float maps to int.
 * NOTE(review): presumably the mask type expected by select() - confirm. */
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

/* SIGNED_INT_VEC_DATA_TYPE(type, size): per-type lookup of a signed integer
 * vector type name (uchar/char -> char, ushort/short/half -> short,
 * uint/int/float -> int, ulong/long -> long). */
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

/* Horizontal sum of the lanes of x.
 * The sum_reduce_N helpers expand to a flat, unparenthesised chain of '+'
 * terms and are left that way on purpose: wrapping the nested helpers would
 * regroup the additions and could change floating-point results. */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* SUM_REDUCE(x, size): the whole chain is parenthesised here so the result is
 * a single operand when used inside a larger expression, for example
 * k * SUM_REDUCE(v, 2) or a - SUM_REDUCE(v, 2). */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

/* Horizontal product of the lanes of x; same layout as the sum helpers. */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* PROD_REDUCE(x, size): parenthesised as a whole so that, for example,
 * q / PROD_REDUCE(v, 2) divides by the full product. */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

/* Horizontal maximum of the lanes of x; every level is a max() call, so the
 * expansion is already a single operand. */
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)

/* Parameter-list fragments: each expands to the pointer, the stride/step pair
 * of every dimension and the offset of the first element, all prefixed with
 * the given name. */
#define VECTOR_DECLARATION(name)     \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_offset_first_element_in_bytes

#define IMAGE_DECLARATION(name)      \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR3D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR4D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_offset_first_element_in_bytes

#define TENSOR5D_DECLARATION(name)   \
    __global uchar *name##_ptr,      \
    uint        name##_stride_x, \
    uint        name##_step_x,   \
    uint        name##_stride_y, \
    uint        name##_step_y,   \
    uint        name##_stride_z, \
    uint        name##_step_z,   \
    uint        name##_stride_w, \
    uint        name##_step_w,   \
    uint        name##_stride_v, \
    uint        name##_step_v,   \
    uint        name##_offset_first_element_in_bytes

/* Struct builders: forward the name-prefixed kernel arguments to the matching
 * helper function. The _NO_STEP variants pass 0 for the steps they suppress. */
#define CONVERT_TO_VECTOR_STRUCT(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)

#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
    update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)

#define CONVERT_TO_IMAGE_STRUCT(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)

#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)

/* Defined once; the file previously carried a second, identical definition. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)

/* Only the x and y steps are replaced by 0 here; the z step is still passed. */
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)

#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)

#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)

#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
                           name##_stride_z, name##_step_z)

/* Pointer plus x stride of a 1D buffer. */
typedef struct Vector
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
} Vector;

/* Pointer plus x/y strides of a 2D buffer. */
typedef struct Image
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
} Image;

/* Pointer plus x/y/z strides of a 3D buffer. */
typedef struct Tensor3D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
} Tensor3D;

/* Pointer plus x/y/z/w strides of a 4D buffer. */
typedef struct Tensor4D
{
    __global uchar *ptr;
    int             offset_first_element_in_bytes;
    int             stride_x;
    int             stride_y;
    int             stride_z;
    int             stride_w;
} Tensor4D;

/* Builds a Vector and advances its pointer by
 * offset_first_element_in_bytes + get_global_id(0) * step_x.
 * Returns the struct by value. */
inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
{
    Vector vector =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
    };
    vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
    return vector;
}

/* Builds an Image and advances its pointer by the first-element offset plus
 * get_global_id(0) * step_x + get_global_id(1) * step_y. */
inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
    return img;
}

/* Builds an Image from 3D arguments: the pointer is advanced along all three
 * global ids (step_x, step_y, step_z), but only the x and y strides are kept;
 * stride_z is accepted and not stored. */
inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Image img =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y
    };
    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return img;
}

/* Builds a Tensor3D and advances its pointer by the first-element offset plus
 * get_global_id(d) * step_d for d = 0, 1, 2. */
inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
    return tensor;
}

/* Builds a Tensor3D without moving the pointer. The step_x/step_y/step_z
 * parameters are accepted to mirror update_tensor3D_workitem_ptr and are not
 * used. */
inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
{
    Tensor3D tensor =
    {
        .ptr                           = ptr,
        .offset_first_element_in_bytes = offset_first_element_in_bytes,
        .stride_x                      = stride_x,
        .stride_y                      = stride_y,
        .stride_z                      = stride_z
    };
    return tensor;
}

/* Builds a Tensor4D and moves its pointer to the calling work-item.
 * Global id 2 is split in two: (id % mod_size) advances along z with step_z
 * and (id / mod_size) advances along w with step_w. Returned by value. */
inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                             uint step_w,
                                             uint mod_size)
{
    Tensor4D tensor;
    tensor.ptr                           = ptr;
    tensor.offset_first_element_in_bytes = offset_first_element_in_bytes;
    tensor.stride_x                      = stride_x;
    tensor.stride_y                      = stride_y;
    tensor.stride_z                      = stride_z;
    tensor.stride_w                      = stride_w;

    /* The offset is read back from the struct field, exactly as stored. */
    const size_t gid_zw = get_global_id(2);
    tensor.ptr += tensor.offset_first_element_in_bytes
                  + get_global_id(0) * step_x
                  + get_global_id(1) * step_y
                  + (gid_zw % mod_size) * step_z
                  + (gid_zw / mod_size) * step_w;
    return tensor;
}

/* Address of element x of a Vector: ptr + x * stride_x. */
inline __global const uchar *vector_offset(const Vector *vec, int x)
{
    __global const uchar *addr = vec->ptr;
    addr += x * vec->stride_x;
    return addr;
}

/* Address of element (x, y) of an Image: ptr + x * stride_x + y * stride_y. */
inline __global uchar *offset(const Image *img, int x, int y)
{
    __global uchar *addr = img->ptr;
    addr += x * img->stride_x;
    addr += y * img->stride_y;
    return addr;
}

/* Address of element (x, y, z) of a Tensor3D, one stride per coordinate. */
inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
{
    __global const uchar *addr = tensor->ptr;
    addr += x * tensor->stride_x;
    addr += y * tensor->stride_y;
    addr += z * tensor->stride_z;
    return addr;
}

/* Address of element (x, y, z, w) of a Tensor4D, one stride per coordinate. */
inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
{
    __global const uchar *addr = tensor->ptr;
    addr += x * tensor->stride_x;
    addr += y * tensor->stride_y;
    addr += z * tensor->stride_z;
    addr += w * tensor->stride_w;
    return addr;
}

/* Converts a linear element index into an address inside a Tensor3D.
 * index is decomposed into z = index / (width * height), then y and x from the
 * remainder; the result adds the three strided terms and
 * offset_first_element_in_bytes to ptr. The depth parameter is not used. */
inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
{
    const uint plane_size = width * height;

    const uint z        = index / plane_size;
    const uint in_plane = index % plane_size;
    const uint y        = in_plane / width;
    const uint x        = in_plane % width;

    __global const uchar *addr = tensor->ptr;
    addr += x * tensor->stride_x;
    addr += y * tensor->stride_y;
    addr += z * tensor->stride_z;
    addr += tensor->offset_first_element_in_bytes;
    return addr;
}

/* NOTE(review): presumably closes the ARM_COMPUTE_HELPER_H guard opened at the
 * top of the file - confirm against the full source. */
#endif

/* SCALAR_ACCESS(offset, n0, x): selects n0 consecutive lanes of x starting at
 * lane offset, by pasting both numbers into a scalar_access_<offset>_<n0> name. */
#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
#define SCALAR_ACCESS(offset, n0, x)     SCALAR_ACCESS_STR(offset, n0, x)

/* Starting at lane 0 */
#define scalar_access_0_1(x)  ((x).s0)
#define scalar_access_0_2(x)  ((x).s01)
#define scalar_access_0_3(x)  ((x).s012)
#define scalar_access_0_4(x)  ((x).s0123)
#define scalar_access_0_8(x)  ((x).s01234567)
#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)

/* Starting at lane 1 */
#define scalar_access_1_1(x) ((x).s1)
#define scalar_access_1_2(x) ((x).s12)
#define scalar_access_1_3(x) ((x).s123)
#define scalar_access_1_4(x) ((x).s1234)
#define scalar_access_1_8(x) ((x).s12345678)

/* Starting at lane 2 */
#define scalar_access_2_1(x) ((x).s2)
#define scalar_access_2_2(x) ((x).s23)
#define scalar_access_2_3(x) ((x).s234)
#define scalar_access_2_4(x) ((x).s2345)
#define scalar_access_2_8(x) ((x).s23456789)

/* Starting at lane 3 */
#define scalar_access_3_1(x) ((x).s3)
#define scalar_access_3_2(x) ((x).s34)
#define scalar_access_3_3(x) ((x).s345)
#define scalar_access_3_4(x) ((x).s3456)
#define scalar_access_3_8(x) ((x).s3456789A)

/* Starting at lane 4 */
#define scalar_access_4_1(x) ((x).s4)
#define scalar_access_4_2(x) ((x).s45)
#define scalar_access_4_3(x) ((x).s456)
#define scalar_access_4_4(x) ((x).s4567)
#define scalar_access_4_8(x) ((x).s456789AB)

/* Starting at lane 8 */
#define scalar_access_8_1(x) ((x).s8)
#define scalar_access_8_2(x) ((x).s89)
#define scalar_access_8_3(x) ((x).s89A)
#define scalar_access_8_4(x) ((x).s89AB)
#define scalar_access_8_8(x) ((x).s89ABCDEF)

/* Starting at lane 12 */
#define scalar_access_12_1(x) ((x).sC)
#define scalar_access_12_2(x) ((x).sCD)
#define scalar_access_12_3(x) ((x).sCDE)
#define scalar_access_12_4(x) ((x).sCDEF)

/* NOTE(review): named for offset 16 but selects lane F - confirm intent. */
#define scalar_access_16_1(x) ((x).sF)

/* LOAD_TENSOR_ROW_n: loads n rows. Row i reads N0 elements with VLOAD(N0)
 * from PTR + i * STRIDE_Y + Z##i and assigns them to N0 lanes of BASENAME##i
 * starting at lane COL_OFFSET. Each macro expands the previous one first, so
 * rows are emitted in ascending order. Rows 10 and above use the hex digits
 * A..F as the pasted suffix. */
#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    ({})

#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)      \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

/* LOAD_TENSOR_ROW_11..16: continuation of the row loaders. Each one expands
 * the previous row count and then loads one more row: N0 elements read with
 * VLOAD(N0) from PTR + i * STRIDE_Y + Z##i are assigned to N0 lanes of
 * BASENAME##i starting at lane COL_OFFSET. Row indices 10..15 are pasted as
 * the hex digits A..F. */
#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/* LOAD_TENSOR(M0, N0, ...): dispatches to LOAD_TENSOR_ROW_<M0>. The two-level
 * form lets M0 be a macro that expands before the paste. */
#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)

/* LOAD_TENSOR_M0X<n>: loads M0 rows of n columns.
 * Widths 1, 2, 3, 4, 8 and 16 are a single LOAD_TENSOR call with N0 columns at
 * column offset 0. Every other width is split into chunks of 8, 4 and a
 * remainder of 1..3 columns; each later chunk advances input_ptr by
 * <columns already loaded> * sizeof(DATA_TYPE) and passes the same count as
 * the column offset. */
#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    ({})

#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);

#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

/* The first call previously read "input_ptr 0" without the comma, which gave
 * LOAD_TENSOR seven arguments instead of eight. */
#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);

#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

/* Same missing-comma repair as LOAD_TENSOR_M0X9 in the first call. */
#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);

#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);

2744*c217d954SCole Faust 2745*c217d954SCole Faust#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2746*c217d954SCole Faust#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) 2747*c217d954SCole Faust 2748*c217d954SCole Faust 2749*c217d954SCole Faust#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2750*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2751*c217d954SCole Faust BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0)); 2752*c217d954SCole Faust 2753*c217d954SCole Faust#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2754*c217d954SCole Faust LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2755*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2756*c217d954SCole Faust BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1)); 2757*c217d954SCole Faust 2758*c217d954SCole Faust#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2759*c217d954SCole Faust LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2760*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2761*c217d954SCole Faust BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2)); 2762*c217d954SCole Faust 2763*c217d954SCole Faust#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2764*c217d954SCole Faust LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2765*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2766*c217d954SCole Faust BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3)); 2767*c217d954SCole Faust 2768*c217d954SCole Faust#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2769*c217d954SCole Faust LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, 
OFFSET, STRIDE_Y, Z) \ 2770*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2771*c217d954SCole Faust BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4)); 2772*c217d954SCole Faust 2773*c217d954SCole Faust#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2774*c217d954SCole Faust LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2775*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2776*c217d954SCole Faust BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5)); 2777*c217d954SCole Faust 2778*c217d954SCole Faust#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2779*c217d954SCole Faust LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2780*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2781*c217d954SCole Faust BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6)); 2782*c217d954SCole Faust 2783*c217d954SCole Faust#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2784*c217d954SCole Faust LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2785*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2786*c217d954SCole Faust BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7)); 2787*c217d954SCole Faust 2788*c217d954SCole Faust#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2789*c217d954SCole Faust LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2790*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2791*c217d954SCole Faust BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8)); 2792*c217d954SCole Faust 2793*c217d954SCole Faust#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2794*c217d954SCole Faust LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2795*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2796*c217d954SCole Faust 
BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9)); 2797*c217d954SCole Faust 2798*c217d954SCole Faust#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2799*c217d954SCole Faust LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2800*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2801*c217d954SCole Faust BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A)); 2802*c217d954SCole Faust 2803*c217d954SCole Faust#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2804*c217d954SCole Faust LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2805*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2806*c217d954SCole Faust BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B)); 2807*c217d954SCole Faust 2808*c217d954SCole Faust#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2809*c217d954SCole Faust LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2810*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2811*c217d954SCole Faust BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C)); 2812*c217d954SCole Faust 2813*c217d954SCole Faust#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2814*c217d954SCole Faust LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2815*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2816*c217d954SCole Faust BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D)); 2817*c217d954SCole Faust 2818*c217d954SCole Faust#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2819*c217d954SCole Faust LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \ 2820*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 2821*c217d954SCole Faust BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E)); 

/*
 * Sixteenth and last row of the LOAD_ROW_<n> chain: expands LOAD_ROW_15 and
 * then declares BASENAMEF, loaded from PTR + OFFSET + 15 * STRIDE_Y + ZF.
 */
#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));

/*
 * LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
 *
 * Forwards to LOAD_ROW_<M0>. The _STR level lets M0 be macro-expanded
 * before it is pasted onto the LOAD_ROW_ prefix.
 */
#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)

/*
 * LOAD_ROW_PARTIAL_<n>(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
 *
 * For each row i < n, invokes VLOAD_PARTIAL(N0, LOAD_N0) with the destination
 * BASENAME<i>, offset 0 and the address PTR + OFFSET + i * STRIDE_Y + Z<i>.
 * Unlike LOAD_ROW_<n>, no variable is declared here, so BASENAME<i> must
 * already exist at the point of use.
 * NOTE(review): VLOAD_PARTIAL is defined elsewhere; presumably it loads only
 * LOAD_N0 of the N0 elements - confirm against its definition.
 */
#define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));

#define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));

#define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));

/*
 * LOAD_ROW_PARTIAL_4 .. LOAD_ROW_PARTIAL_13: each expands the macro for one
 * row fewer and then issues VLOAD_PARTIAL(N0, LOAD_N0) for its own row i into
 * BASENAME<i> from PTR + OFFSET + i * STRIDE_Y + Z<i> (suffixes 0-9, then
 * A, B, C for rows 10-12). No variables are declared by these macros.
 */
#define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));

#define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));

#define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));

#define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));

#define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));

#define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));

#define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));

#define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));

#define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));

#define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));

/*
 * Rows 13-15 (suffixes D, E, F) of the LOAD_ROW_PARTIAL_<n> chain. Each
 * expands the macro for one row fewer and then issues
 * VLOAD_PARTIAL(N0, LOAD_N0) for its own row.
 */
#define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));

#define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));

#define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
    VLOAD_PARTIAL(N0, LOAD_N0)(BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));

/*
 * LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
 *
 * Forwards to LOAD_ROW_PARTIAL_<LOAD_M0>. Note the argument order changes:
 * the row macro takes N0 before LOAD_N0. The _STR level lets LOAD_M0 be
 * macro-expanded before the paste.
 */
#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)

/*
 * Picks one of four partial loads from the two runtime conditions:
 *   neither condition  -> M0 rows,               N0 columns
 *   only PARTIAL_COND_Y -> PARTIAL_STORE_M0 rows, N0 columns
 *   only PARTIAL_COND_X -> M0 rows,               PARTIAL_STORE_N0 columns
 *   both conditions    -> PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 columns
 * The expansion is an if/else statement without a trailing semicolon.
 */
#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        if(!(PARTIAL_COND_Y)) \
        { \
            LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
        } \
        else \
        { \
            LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
        } \
    } \
    else \
    { \
        if(!(PARTIAL_COND_Y)) \
        { \
            LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
        } \
        else \
        { \
            LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
        } \
    }

/*
 * Column-only variant: loads M0 rows, using PARTIAL_STORE_N0 columns when
 * PARTIAL_COND_X holds and N0 columns otherwise.
 */
#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    } \
    else \
    { \
        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    }

/*
 * Row-only variant: loads N0 columns, using PARTIAL_STORE_M0 rows when
 * PARTIAL_COND_Y holds and M0 rows otherwise.
 */
#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    } \
    else \
    { \
        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
    }

/*
 * LOAD_BLOCK_BOUNDARY_AWARE is defined once, chosen at preprocessing time
 * from the build-time values of PARTIAL_STORE_M0 and PARTIAL_STORE_N0:
 *   both 0        -> plain LOAD_BLOCK, the partial arguments are ignored
 *   only M0 > 0   -> REPEAT_VAR_INIT_TO_CONST then LOAD_BLOCK_PARTIAL_IN_Y
 *   only N0 > 0   -> REPEAT_VAR_INIT_TO_CONST then LOAD_BLOCK_PARTIAL_IN_X
 *   otherwise     -> REPEAT_VAR_INIT_TO_CONST then LOAD_BLOCK_PARTIAL_IN_X_AND_Y
 * NOTE(review): REPEAT_VAR_INIT_TO_CONST is defined elsewhere; presumably it
 * declares the M0 BASENAME<i> variables and sets them to 0 - confirm.
 */
#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
    LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
    LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

/*
 * LOAD_TEXTURE2D_ROW_<n>(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
 *
 * Assigns BASENAME<i> = READ_IMAGE2D(DATA_TYPE, N0, IMG, x, y) for each row
 * i < n, with x = X_COORD + i * X_STEP_ROW and y = Y_COORD + i * Y_STEP_ROW.
 * The variables are assigned, not declared. These macros emit no separator
 * between consecutive rows.
 * NOTE(review): the statement terminator presumably comes from the expansion
 * of READ_IMAGE2D, which is defined elsewhere - confirm.
 */
#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))

#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))

/*
 * LOAD_TEXTURE2D(M0, ...) forwards to LOAD_TEXTURE2D_ROW_<M0>; the _STR
 * level lets M0 be macro-expanded before the paste.
 */
#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)

/*
 * LOAD_ROW_INDIRECT_<n>(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
 *
 * Declares BASENAME<i> of type VEC_DATA_TYPE(DATA_TYPE, N0) for each row
 * i < n. When Y_MASK<i> is non-zero the row is loaded with VLOAD(N0) from
 * PTR + OFFSET + Y<i> * STRIDE_Y, so the row position comes from the
 * variable Y<i> rather than from i; otherwise the row is set to 0.
 */
#define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##0; \
    if(Y_MASK##0 != 0) \
    { \
        BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##0 = 0; \
    }

#define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##1; \
    if(Y_MASK##1 != 0) \
    { \
        BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##1 = 0; \
    }

#define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##2; \
    if(Y_MASK##2 != 0) \
    { \
        BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##2 = 0; \
    }

#define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##3; \
    if(Y_MASK##3 != 0) \
    { \
        BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##3 = 0; \
    }

#define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##4; \
    if(Y_MASK##4 != 0) \
    { \
        BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##4 = 0; \
    }

#define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##5; \
    if(Y_MASK##5 != 0) \
    { \
        BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##5 = 0; \
    }

#define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##6; \
    if(Y_MASK##6 != 0) \
    { \
        BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##6 = 0; \
    }

#define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##7; \
    if(Y_MASK##7 != 0) \
    { \
        BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##7 = 0; \
    }

#define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##8; \
    if(Y_MASK##8 != 0) \
    { \
        BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##8 = 0; \
    }

#define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##9; \
    if(Y_MASK##9 != 0) \
    { \
        BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##9 = 0; \
    }

#define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##A; \
    if(Y_MASK##A != 0) \
    { \
        BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##A = 0; \
    }

#define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##B; \
    if(Y_MASK##B != 0) \
    { \
        BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##B = 0; \
    }

#define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##C; \
    if(Y_MASK##C != 0) \
    { \
        BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##C = 0; \
    }

#define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##D; \
    if(Y_MASK##D != 0) \
    { \
        BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##D = 0; \
    }

#define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##E; \
    if(Y_MASK##E != 0) \
    { \
        BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##E = 0; \
    }

#define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##F; \
    if(Y_MASK##F != 0) \
    { \
        BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
    } \
    else \
    { \
        BASENAME##F = 0; \
    }

/*
 * LOAD_BLOCK_INDIRECT(M0, ...) forwards to LOAD_ROW_INDIRECT_<M0>; the _STR
 * level lets M0 be macro-expanded before the paste.
 */
#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)

/*
 * LOAD_ELEMENT_<n>(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
 *
 * Declares BASENAME<i> of type VEC_DATA_TYPE(DATA_TYPE, N0) for each row
 * i < n and initialises it from the single DATA_TYPE value read at
 * PTR + OFFSET + i * STRIDE_Y.
 */
#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));

#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));

#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));

#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
    VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##3 = \
*((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y)); 3221*c217d954SCole Faust 3222*c217d954SCole Faust#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3223*c217d954SCole Faust LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3224*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3225*c217d954SCole Faust BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y)); 3226*c217d954SCole Faust 3227*c217d954SCole Faust#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3228*c217d954SCole Faust LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3229*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3230*c217d954SCole Faust BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y)); 3231*c217d954SCole Faust 3232*c217d954SCole Faust#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3233*c217d954SCole Faust LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3234*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3235*c217d954SCole Faust BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y)); 3236*c217d954SCole Faust 3237*c217d954SCole Faust#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3238*c217d954SCole Faust LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3239*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3240*c217d954SCole Faust BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y)); 3241*c217d954SCole Faust 3242*c217d954SCole Faust#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3243*c217d954SCole Faust LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3244*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3245*c217d954SCole Faust BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y)); 3246*c217d954SCole Faust 3247*c217d954SCole Faust#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 
3248*c217d954SCole Faust LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3249*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3250*c217d954SCole Faust BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y)); 3251*c217d954SCole Faust 3252*c217d954SCole Faust#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3253*c217d954SCole Faust LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3254*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3255*c217d954SCole Faust BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y)); 3256*c217d954SCole Faust 3257*c217d954SCole Faust#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3258*c217d954SCole Faust LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3259*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3260*c217d954SCole Faust BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y)); 3261*c217d954SCole Faust 3262*c217d954SCole Faust#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3263*c217d954SCole Faust LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3264*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3265*c217d954SCole Faust BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y)); 3266*c217d954SCole Faust 3267*c217d954SCole Faust#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3268*c217d954SCole Faust LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3269*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3270*c217d954SCole Faust BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y)); 3271*c217d954SCole Faust 3272*c217d954SCole Faust#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3273*c217d954SCole Faust LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3274*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3275*c217d954SCole Faust 
BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y)); 3276*c217d954SCole Faust 3277*c217d954SCole Faust#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3278*c217d954SCole Faust LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \ 3279*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, N0) \ 3280*c217d954SCole Faust BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y)); 3281*c217d954SCole Faust 3282*c217d954SCole Faust 3283*c217d954SCole Faust 3284*c217d954SCole Faust 3285*c217d954SCole Faust#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 3286*c217d954SCole Faust#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) 3287*c217d954SCole Faust 3288*c217d954SCole Faust 3289*c217d954SCole Faust 3290*c217d954SCole Faust#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3291*c217d954SCole Faust Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3292*c217d954SCole Faust Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \ 3293*c217d954SCole Faust Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y); 3294*c217d954SCole Faust 3295*c217d954SCole Faust#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3296*c217d954SCole Faust CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3297*c217d954SCole Faust Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3298*c217d954SCole Faust Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \ 3299*c217d954SCole Faust Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y); 3300*c217d954SCole Faust 3301*c217d954SCole Faust#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 
3302*c217d954SCole Faust CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3303*c217d954SCole Faust Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3304*c217d954SCole Faust Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \ 3305*c217d954SCole Faust Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y); 3306*c217d954SCole Faust 3307*c217d954SCole Faust#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3308*c217d954SCole Faust CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3309*c217d954SCole Faust Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3310*c217d954SCole Faust Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \ 3311*c217d954SCole Faust Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y); 3312*c217d954SCole Faust 3313*c217d954SCole Faust#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3314*c217d954SCole Faust CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3315*c217d954SCole Faust Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3316*c217d954SCole Faust Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \ 3317*c217d954SCole Faust Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y); 3318*c217d954SCole Faust 3319*c217d954SCole Faust#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3320*c217d954SCole Faust CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3321*c217d954SCole Faust Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3322*c217d954SCole Faust Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \ 3323*c217d954SCole Faust Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y); 3324*c217d954SCole Faust 3325*c217d954SCole Faust#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, 
CROSS_PLANE_PAD, STRIDE_Y) \ 3326*c217d954SCole Faust CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3327*c217d954SCole Faust Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3328*c217d954SCole Faust Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \ 3329*c217d954SCole Faust Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y); 3330*c217d954SCole Faust 3331*c217d954SCole Faust#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3332*c217d954SCole Faust CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \ 3333*c217d954SCole Faust Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)HEIGHT_GEMM3D; \ 3334*c217d954SCole Faust Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \ 3335*c217d954SCole Faust Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y); 3336*c217d954SCole Faust 3337*c217d954SCole Faust 3338*c217d954SCole Faust 3339*c217d954SCole Faust 3340*c217d954SCole Faust#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 3341*c217d954SCole Faust#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) 3342*c217d954SCole Faust 3343*c217d954SCole Faust 3344*c217d954SCole Faust 3345*c217d954SCole Faust#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 3346*c217d954SCole Faust BASENAME##0 *= (DATA_TYPE)SCALE; 3347*c217d954SCole Faust 3348*c217d954SCole Faust#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 3349*c217d954SCole Faust SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \ 3350*c217d954SCole Faust BASENAME##1 *= (DATA_TYPE)SCALE; 3351*c217d954SCole Faust 3352*c217d954SCole Faust#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ 3353*c217d954SCole Faust 
SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \ 3354*c217d954SCole Faust BASENAME##2 *= (DATA_TYPE)SCALE; 3355*c217d954SCole Faust 3356*c217d954SCole Faust#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 3357*c217d954SCole Faust SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \ 3358*c217d954SCole Faust BASENAME##3 *= (DATA_TYPE)SCALE; 3359*c217d954SCole Faust 3360*c217d954SCole Faust#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 3361*c217d954SCole Faust SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \ 3362*c217d954SCole Faust BASENAME##4 *= (DATA_TYPE)SCALE; 3363*c217d954SCole Faust 3364*c217d954SCole Faust#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 3365*c217d954SCole Faust SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \ 3366*c217d954SCole Faust BASENAME##5 *= (DATA_TYPE)SCALE; 3367*c217d954SCole Faust 3368*c217d954SCole Faust#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 3369*c217d954SCole Faust SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \ 3370*c217d954SCole Faust BASENAME##6 *= (DATA_TYPE)SCALE; 3371*c217d954SCole Faust 3372*c217d954SCole Faust#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 3373*c217d954SCole Faust SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \ 3374*c217d954SCole Faust BASENAME##7 *= (DATA_TYPE)SCALE; 3375*c217d954SCole Faust 3376*c217d954SCole Faust#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 3377*c217d954SCole Faust SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \ 3378*c217d954SCole Faust BASENAME##8 *= (DATA_TYPE)SCALE; 3379*c217d954SCole Faust 3380*c217d954SCole Faust#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 3381*c217d954SCole Faust SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \ 3382*c217d954SCole Faust BASENAME##9 *= (DATA_TYPE)SCALE; 3383*c217d954SCole Faust 3384*c217d954SCole Faust#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 3385*c217d954SCole Faust SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \ 3386*c217d954SCole Faust BASENAME##A *= (DATA_TYPE)SCALE; 3387*c217d954SCole Faust 3388*c217d954SCole Faust#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 
3389*c217d954SCole Faust SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \ 3390*c217d954SCole Faust BASENAME##B *= (DATA_TYPE)SCALE; 3391*c217d954SCole Faust 3392*c217d954SCole Faust#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 3393*c217d954SCole Faust SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \ 3394*c217d954SCole Faust BASENAME##C *= (DATA_TYPE)SCALE; 3395*c217d954SCole Faust 3396*c217d954SCole Faust#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 3397*c217d954SCole Faust SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \ 3398*c217d954SCole Faust BASENAME##D *= (DATA_TYPE)SCALE; 3399*c217d954SCole Faust 3400*c217d954SCole Faust#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 3401*c217d954SCole Faust SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \ 3402*c217d954SCole Faust BASENAME##E *= (DATA_TYPE)SCALE; 3403*c217d954SCole Faust 3404*c217d954SCole Faust#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \ 3405*c217d954SCole Faust SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \ 3406*c217d954SCole Faust BASENAME##F *= (DATA_TYPE)SCALE; 3407*c217d954SCole Faust 3408*c217d954SCole Faust 3409*c217d954SCole Faust 3410*c217d954SCole Faust#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE) 3411*c217d954SCole Faust#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) 3412*c217d954SCole Faust 3413*c217d954SCole Faust 3414*c217d954SCole Faust 3415*c217d954SCole Faust#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \ 3416*c217d954SCole Faust TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL); 3417*c217d954SCole Faust#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \ 3418*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 2) \ 3419*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL); 3420*c217d954SCole Faust#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \ 3421*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 3) \ 3422*c217d954SCole Faust BASENAME##IDX_COL = 
(VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL); 3423*c217d954SCole Faust#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \ 3424*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 4) \ 3425*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL); 3426*c217d954SCole Faust#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \ 3427*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 8) \ 3428*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL); 3429*c217d954SCole Faust#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \ 3430*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 16) \ 3431*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL); 3432*c217d954SCole Faust 3433*c217d954SCole Faust 3434*c217d954SCole Faust 3435*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \ 3436*c217d954SCole Faust TYPE BASENAME##IDX_COL = (TYPE)((X##0)); 3437*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \ 3438*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 2) \ 3439*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1)); 3440*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \ 3441*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 3) \ 3442*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2)); 3443*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \ 3444*c217d954SCole Faust 
VEC_DATA_TYPE(TYPE, 4) \ 3445*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3)); 3446*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \ 3447*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 8) \ 3448*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7)); 3449*c217d954SCole Faust#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \ 3450*c217d954SCole Faust VEC_DATA_TYPE(TYPE, 16) \ 3451*c217d954SCole Faust BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F)); 3452*c217d954SCole Faust 3453*c217d954SCole Faust 3454*c217d954SCole Faust 3455*c217d954SCole Faust#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \ 3456*c217d954SCole Faust COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE); 3457*c217d954SCole Faust#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \ 3458*c217d954SCole Faust COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \ 3459*c217d954SCole Faust COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE); 3460*c217d954SCole Faust#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \ 3461*c217d954SCole Faust TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE); \ 3462*c217d954SCole Faust COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE); 3463*c217d954SCole Faust#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \ 3464*c217d954SCole Faust TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE); \ 3465*c217d954SCole Faust COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE); 3466*c217d954SCole Faust#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \ 3467*c217d954SCole Faust TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE); \ 3468*c217d954SCole Faust COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE); \ 3469*c217d954SCole Faust COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE); \ 3470*c217d954SCole Faust COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE); \ 3471*c217d954SCole Faust COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE); 
/** Create 16 column vectors (columns 0 to F) out of the rows BS0, BS1, ...
 *
 * Expands TRANSPOSE_K0X8 for columns 0-7 and then one COLUMN_VECTOR per column 8-F.
 * Each column is declared as a new variable named BASENAME followed by the hex digit of the column.
 *
 * @param[in] K0       Number of rows gathered into every column vector (selects COLUMN_VECTOR##K0)
 * @param[in] BASENAME Prefix of the column variables that get declared
 * @param[in] BS       Prefix of the row variables that are read
 * @param[in] TYPE     Element type of the column vectors
 */
#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE);     \
    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE);   \
    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);

/** Declare column IDX_COL by forwarding to COLUMN_VECTOR##K0 (K0 = 1, 2, 3, 4, 8 or 16).
 *
 * @note CONCAT is defined outside this section; it is expected to paste its two arguments into one token.
 * @note The expansion already ends with ';', so a ';' written after the invocation is an empty statement.
 *
 * @param[in] K0       Number of rows gathered into the column vector
 * @param[in] IDX_COL  Hex digit of the component that is read from every row (.s##IDX_COL)
 * @param[in] BASENAME Prefix of the declared variable (BASENAME##IDX_COL)
 * @param[in] BS       Prefix of the row variables that are read
 * @param[in] TYPE     Element type of the column vector
 */
#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR, K0)                          \
    (IDX_COL, BASENAME, BS, TYPE);

/** Same dispatch as COLUMN_VECTOR, but forwards to COLUMN_VECTOR_SCALAR##K0, which reads the rows
 *  as whole values instead of selecting a vector component.
 *
 * @param[in] K0       Number of rows gathered into the column vector
 * @param[in] IDX_COL  Suffix of the declared variable (BASENAME##IDX_COL)
 * @param[in] BASENAME Prefix of the declared variable
 * @param[in] BS       Prefix of the row variables that are read
 * @param[in] TYPE     Element type of the column vector
 */
#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) \
    CONCAT(COLUMN_VECTOR_SCALAR, K0)                          \
    (IDX_COL, BASENAME, BS, TYPE);

/** Transpose a K0xN0 block by forwarding to TRANSPOSE_K0X##N0 (N0 = 1, 2, 3, 4, 8 or 16).
 *
 * @param[in] K0       Number of rows of the source block
 * @param[in] N0       Number of columns of the source block, i.e. number of column vectors declared
 * @param[in] BASENAME Prefix of the declared column variables
 * @param[in] BS       Prefix of the row variables that are read
 * @param[in] TYPE     Element type of the column vectors
 */
#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) \
    CONCAT(TRANSPOSE_K0X, N0)                       \
    (K0, BASENAME, BS, TYPE);

/** Row-wise in-place addition: BASENAME##i += BIAS##i for i = 0 .. n-1 (ADD_ROW_n).
 *
 * Each ADD_ROW_n first expands ADD_ROW_(n-1) and then adds its own row, so the rows are
 * updated in ascending order. Rows are named with a hex digit suffix (0-9, A-F).
 *
 * @param[in] BASENAME Prefix of the accumulator variables that are updated
 * @param[in] BIAS     Prefix of the variables that are added
 */
#define ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS##0;

#define ADD_ROW_2(BASENAME, BIAS) \
    ADD_ROW_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS##1;

#define ADD_ROW_3(BASENAME, BIAS) \
    ADD_ROW_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS##2;
/** Row-wise in-place addition, continued: BASENAME##i += BIAS##i for i = 0 .. n-1 (ADD_ROW_n).
 *
 * Each ADD_ROW_n first expands ADD_ROW_(n-1) (ADD_ROW_4 relies on ADD_ROW_3 defined just before
 * this group) and then adds its own row. Rows 10-15 use the hex digit suffixes A-F.
 *
 * @param[in] BASENAME Prefix of the accumulator variables that are updated
 * @param[in] BIAS     Prefix of the variables that are added
 */
#define ADD_ROW_4(BASENAME, BIAS) \
    ADD_ROW_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS##3;

#define ADD_ROW_5(BASENAME, BIAS) \
    ADD_ROW_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS##4;

#define ADD_ROW_6(BASENAME, BIAS) \
    ADD_ROW_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS##5;

#define ADD_ROW_7(BASENAME, BIAS) \
    ADD_ROW_6(BASENAME, BIAS)     \
    BASENAME##6 += BIAS##6;

#define ADD_ROW_8(BASENAME, BIAS) \
    ADD_ROW_7(BASENAME, BIAS)     \
    BASENAME##7 += BIAS##7;

#define ADD_ROW_9(BASENAME, BIAS) \
    ADD_ROW_8(BASENAME, BIAS)     \
    BASENAME##8 += BIAS##8;

#define ADD_ROW_10(BASENAME, BIAS) \
    ADD_ROW_9(BASENAME, BIAS)      \
    BASENAME##9 += BIAS##9;

#define ADD_ROW_11(BASENAME, BIAS) \
    ADD_ROW_10(BASENAME, BIAS)     \
    BASENAME##A += BIAS##A;

#define ADD_ROW_12(BASENAME, BIAS) \
    ADD_ROW_11(BASENAME, BIAS)     \
    BASENAME##B += BIAS##B;

#define ADD_ROW_13(BASENAME, BIAS) \
    ADD_ROW_12(BASENAME, BIAS)     \
    BASENAME##C += BIAS##C;

#define ADD_ROW_14(BASENAME, BIAS) \
    ADD_ROW_13(BASENAME, BIAS)     \
    BASENAME##D += BIAS##D;

#define ADD_ROW_15(BASENAME, BIAS) \
    ADD_ROW_14(BASENAME, BIAS)     \
    BASENAME##E += BIAS##E;

#define ADD_ROW_16(BASENAME, BIAS) \
    ADD_ROW_15(BASENAME, BIAS)     \
    BASENAME##F += BIAS##F;

/** Add the N-row block BIAS to the N-row block BASENAME in place.
 *
 * ADD_BLOCK goes through ADD_BLOCK_STR so that N is macro-expanded before it is pasted onto
 * ADD_ROW_; N may therefore itself be a macro that expands to a number from 1 to 16.
 *
 * @param[in] N        Number of rows to add
 * @param[in] BASENAME Prefix of the accumulator variables that are updated
 * @param[in] BIAS     Prefix of the variables that are added
 */
#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)

/** Broadcast addition: BASENAME##i += BIAS for i = 0 .. n-1 (ADD_ROW_BROADCAST_n).
 *
 * The same BIAS expression is added to every row. Each ADD_ROW_BROADCAST_n first expands
 * ADD_ROW_BROADCAST_(n-1) and then adds its own row.
 *
 * @param[in] BASENAME Prefix of the accumulator variables that are updated
 * @param[in] BIAS     Value added to every row
 */
#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS;

#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
    BASENAME##1 += BIAS;

#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_2(BASENAME, BIAS)     \
    BASENAME##2 += BIAS;

#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_3(BASENAME, BIAS)     \
    BASENAME##3 += BIAS;

#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_4(BASENAME, BIAS)     \
    BASENAME##4 += BIAS;

#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_5(BASENAME, BIAS)     \
    BASENAME##5 += BIAS;
Faust#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 3596*c217d954SCole Faust ADD_ROW_BROADCAST_6(BASENAME, BIAS) \ 3597*c217d954SCole Faust BASENAME##6 += BIAS; 3598*c217d954SCole Faust 3599*c217d954SCole Faust#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 3600*c217d954SCole Faust ADD_ROW_BROADCAST_7(BASENAME, BIAS) \ 3601*c217d954SCole Faust BASENAME##7 += BIAS; 3602*c217d954SCole Faust 3603*c217d954SCole Faust#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 3604*c217d954SCole Faust ADD_ROW_BROADCAST_8(BASENAME, BIAS) \ 3605*c217d954SCole Faust BASENAME##8 += BIAS; 3606*c217d954SCole Faust 3607*c217d954SCole Faust#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 3608*c217d954SCole Faust ADD_ROW_BROADCAST_9(BASENAME, BIAS) \ 3609*c217d954SCole Faust BASENAME##9 += BIAS; 3610*c217d954SCole Faust 3611*c217d954SCole Faust#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 3612*c217d954SCole Faust ADD_ROW_BROADCAST_10(BASENAME, BIAS) \ 3613*c217d954SCole Faust BASENAME##A += BIAS; 3614*c217d954SCole Faust 3615*c217d954SCole Faust#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 3616*c217d954SCole Faust ADD_ROW_BROADCAST_11(BASENAME, BIAS) \ 3617*c217d954SCole Faust BASENAME##B += BIAS; 3618*c217d954SCole Faust 3619*c217d954SCole Faust#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 3620*c217d954SCole Faust ADD_ROW_BROADCAST_12(BASENAME, BIAS) \ 3621*c217d954SCole Faust BASENAME##C += BIAS; 3622*c217d954SCole Faust 3623*c217d954SCole Faust#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 3624*c217d954SCole Faust ADD_ROW_BROADCAST_13(BASENAME, BIAS) \ 3625*c217d954SCole Faust BASENAME##D += BIAS; 3626*c217d954SCole Faust 3627*c217d954SCole Faust#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 3628*c217d954SCole Faust ADD_ROW_BROADCAST_14(BASENAME, BIAS) \ 3629*c217d954SCole Faust BASENAME##E += BIAS; 3630*c217d954SCole Faust 3631*c217d954SCole Faust#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \ 3632*c217d954SCole Faust ADD_ROW_BROADCAST_15(BASENAME, BIAS) \ 3633*c217d954SCole Faust BASENAME##F 
+= BIAS;

/** Dispatch to ADD_ROW_BROADCAST_<N>(BASENAME, BIAS).
 *
 * ADD_BLOCK_BROADCAST expands its arguments first, so N may itself be a macro;
 * ADD_BLOCK_BROADCAST_STR then pastes the expanded value onto the ADD_ROW_BROADCAST_ prefix.
 *
 * @param[in] N        Row count, used to select the ADD_ROW_BROADCAST_<N> macro
 * @param[in] BASENAME Prefix of the row variables
 * @param[in] BIAS     Value forwarded unchanged to ADD_ROW_BROADCAST_<N>
 */
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)

/** ACTIVATION_ROW_<n>: apply ACTIVATION() in place to the first n row variables.
 *
 * Each macro expands the previous one and then reassigns one more row, so
 * ACTIVATION_ROW_<n> touches BASENAME##0 up to BASENAME##(n-1). Rows 10 to 15 use
 * the hexadecimal suffixes A to F.
 *
 * @param[in] ACTIVATION_TYPE Forwarded to ACTIVATION()
 * @param[in] DATA_TYPE       Forwarded to ACTIVATION()
 * @param[in] VEC_SIZE        Forwarded to ACTIVATION()
 * @param[in] BASENAME        Prefix of the row variables that are read and overwritten
 * @param[in] A_VAL           Forwarded to ACTIVATION()
 * @param[in] B_VAL           Forwarded to ACTIVATION()
 */
#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);

#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);

#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);

#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);

#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);

#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);

#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);

#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);

#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);

#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);

#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);

#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);

#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);

#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);

#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);

#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);

/** Apply ACTIVATION() to N rows by dispatching to ACTIVATION_ROW_<N>.
 *
 * The two-level definition expands N before it is pasted onto ACTIVATION_ROW_.
 * The remaining parameters are forwarded unchanged to ACTIVATION_ROW_<N>.
 */
#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)

/** CONVERT_ROW_<m>: declare m destination rows holding converted copies of the source rows.
 *
 * For every row i below m the expansion declares BASENAME_DST##i with type
 * VEC_DATA_TYPE(DATA_TYPE, N) and initialises it with
 * CONVERT(BASENAME_SRC##i, VEC_DATA_TYPE(DATA_TYPE, N)). Because the expansion contains
 * declarations, the destination names must not already exist in the enclosing scope.
 * Rows 10 to 15 use the hexadecimal suffixes A to F.
 *
 * @param[in] N            Vector size passed to VEC_DATA_TYPE
 * @param[in] DATA_TYPE    Element type passed to VEC_DATA_TYPE
 * @param[in] BASENAME_SRC Prefix of the source row variables
 * @param[in] BASENAME_DST Prefix of the destination row variables being declared
 */
#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));

/** CONVERT_ROW_<m>, rows 11 to 16: continue the chain started by the lower-numbered macros.
 *
 * Each macro expands the previous CONVERT_ROW_ macro and then declares one more
 * destination row, BASENAME_DST##i, of type VEC_DATA_TYPE(DATA_TYPE, N), initialised with
 * CONVERT(BASENAME_SRC##i, VEC_DATA_TYPE(DATA_TYPE, N)). Rows 10 to 15 use the
 * hexadecimal suffixes A to F.
 *
 * @param[in] N            Vector size passed to VEC_DATA_TYPE
 * @param[in] DATA_TYPE    Element type passed to VEC_DATA_TYPE
 * @param[in] BASENAME_SRC Prefix of the source row variables
 * @param[in] BASENAME_DST Prefix of the destination row variables being declared
 */
#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));

/** Convert M rows by dispatching to CONVERT_ROW_<M>.
 *
 * The two-level definition expands M before it is pasted onto CONVERT_ROW_.
 */
#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)

/* Include guards. Both are opened here; their #endif lines are not part of this section. */
#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
#define ARM_COMPUTE_HELPERS_ASYMM_H

#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H

/** STORE_ROW_<m>: store the first m row variables with VSTORE(N0).
 *
 * Row i (BASENAME##i) is written at offset 0 of the pointer
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i). Each macro expands the previous one
 * and adds one store, so the rows are written in increasing order. Rows 10 to 15 use the
 * hexadecimal suffixes A to F for both BASENAME and Z.
 *
 * @param[in] N0        Vector size selecting the VSTORE variant
 * @param[in] DATA_TYPE Element type of the destination pointer
 * @param[in] BASENAME  Prefix of the row variables to store
 * @param[in] PTR       Base address; i * STRIDE_Y and Z##i are added to it directly
 * @param[in] STRIDE_Y  Distance between consecutive rows, in the units of PTR arithmetic
 * @param[in] Z         Prefix of the per-row extra offsets (Z##0, Z##1, ...)
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** CONVERT_STORE_ROW_<m>, rows 1 to 5: convert each row with CONVERT_SAT, then store it.
 *
 * Same addressing as STORE_ROW_<m>, but the stored value is
 * CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)) instead of the row itself.
 * The parameters have the same meaning as for STORE_ROW_<m>; DATA_TYPE is used both
 * as the conversion target and as the destination pointer element type.
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

/** CONVERT_STORE_ROW_<m>, rows 6 to 16: convert each row with CONVERT_SAT, then store it.
 *
 * Each macro expands the previous CONVERT_STORE_ROW_ macro and then stores
 * CONVERT_SAT((BASENAME##i), VEC_DATA_TYPE(DATA_TYPE, N0)) with VSTORE(N0) at offset 0 of
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i). Rows 10 to 15 use the hexadecimal
 * suffixes A to F.
 *
 * @param[in] N0        Vector size selecting the VSTORE variant and the converted vector type
 * @param[in] DATA_TYPE Conversion target and destination pointer element type
 * @param[in] BASENAME  Prefix of the row variables to convert and store
 * @param[in] PTR       Base address; i * STRIDE_Y and Z##i are added to it directly
 * @param[in] STRIDE_Y  Distance between consecutive rows, in the units of PTR arithmetic
 * @param[in] Z         Prefix of the per-row extra offsets
 */
#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

/* The second parameter must be spelled DATA_TYPE: the body refers to DATA_TYPE, and with any
 * other parameter name the caller's argument would be ignored in favour of whatever DATA_TYPE
 * means at the point of expansion. */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** Store M0 rows by dispatching to STORE_ROW_<M0>.
 *
 * The two-level definition expands M0 before it is pasted onto STORE_ROW_.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Convert and store M0 rows by dispatching to CONVERT_STORE_ROW_<M0>.
 *
 * The two-level definition expands M0 before it is pasted onto CONVERT_STORE_ROW_.
 */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** STORE_ROW_PARTIAL_<m>: store the first m row variables with VSTORE_PARTIAL(N0, STORE_N0).
 *
 * Addressing is the same as for STORE_ROW_<m>: row i goes to offset 0 of
 * (__global DATA_TYPE *)(PTR + i * STRIDE_Y + Z##i). The store itself is delegated to
 * VSTORE_PARTIAL(N0, STORE_N0). Rows 10 to 15 use the hexadecimal suffixes A to F.
 *
 * @param[in] N0        First argument of VSTORE_PARTIAL
 * @param[in] STORE_N0  Second argument of VSTORE_PARTIAL
 * @param[in] DATA_TYPE Element type of the destination pointer
 * @param[in] BASENAME  Prefix of the row variables to store
 * @param[in] PTR       Base address; i * STRIDE_Y and Z##i are added to it directly
 * @param[in] STRIDE_Y  Distance between consecutive rows, in the units of PTR arithmetic
 * @param[in] Z         Prefix of the per-row extra offsets
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));

/** Store STORE_M0 rows by dispatching to STORE_ROW_PARTIAL_<STORE_M0>.
 *
 * Note the argument order: this macro takes (STORE_M0, STORE_N0, N0, ...) while the row
 * macros take (N0, STORE_N0, ...). The two-level definition expands STORE_M0 before it is
 * pasted onto STORE_ROW_PARTIAL_.
 */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/** Store a block, choosing the row and column counts at run time from two conditions.
 *
 * Expands to an if / else-if chain (not wrapped in a do-while), so it must be used where a
 * statement is allowed:
 *  - neither condition true:   M0 rows,               N0 columns
 *  - only PARTIAL_COND_Y true: PARTIAL_STORE_M0 rows, N0 columns
 *  - only PARTIAL_COND_X true: M0 rows,               PARTIAL_STORE_N0 columns
 *  - both true:                PARTIAL_STORE_M0 rows, PARTIAL_STORE_N0 columns
 * Both conditions may be evaluated more than once.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/** Store M0 rows; the column count is N0 unless PARTIAL_COND_X is true, then PARTIAL_STORE_N0.
 *
 * Expands to an if / else statement pair.
 */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/** Store N0 columns; the row count is M0 unless PARTIAL_COND_Y is true, then PARTIAL_STORE_M0.
 *
 * Expands to an if / else statement pair.
 */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/** STORE_BLOCK_BOUNDARY_AWARE: store a block, with the partial-store handling picked at
 * preprocessing time.
 *
 * Only defined when both PARTIAL_STORE_M0 and PARTIAL_STORE_N0 are defined as build-time
 * macros. Their values select which helper the macro forwards to:
 *  - both 0:                      STORE_BLOCK (the partial arguments are ignored)
 *  - PARTIAL_STORE_M0 > 0 only:   STORE_BLOCK_PARTIAL_IN_Y
 *  - PARTIAL_STORE_N0 > 0 only:   STORE_BLOCK_PARTIAL_IN_X
 *  - otherwise:                   STORE_BLOCK_PARTIAL_IN_X_AND_Y
 * All four variants share one parameter list so call sites do not depend on the selection.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)

#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif

/** Compute the first row handled for block index y.
 *
 * When PARTIAL_STORE_M0 is defined at build time the result is
 * max(0, y * M0 - ((M0 - PARTIAL_STORE_M0) % M0)) converted to uint; otherwise it is y * M0.
 * Every argument is parenthesised in the expansion so compound expressions such as a + b
 * are evaluated as a unit.
 *
 * @param[in] y                Block index
 * @param[in] M0               Rows per block
 * @param[in] PARTIAL_STORE_M0 Row count used in the shift term (unused in the second variant)
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)((y) * (M0)) - (int)(((M0) - (PARTIAL_STORE_M0)) % (M0)))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)((y) * (M0)))
#endif

/** Store a single vector through STORE_BLOCK_PARTIAL_IN_X with one row, zero stride and a
 * zero Z prefix.
 *
 * @param[in] basename  Prefix of the vector variable (row suffix 0 is appended by the callee)
 * @param[in] data_type Element type of the destination pointer
 * @param[in] ptr       Destination base address
 * @param[in] vec_size  Full vector size
 * @param[in] leftover  Column count used when cond is true
 * @param[in] cond      Condition selecting the leftover store
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)

/* Enable optional OpenCL extensions, each only when both the build flag and the
 * compiler-provided extension macro are defined. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

/* Numeric identifiers for GPU architecture families. */
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300

/** Paste two tokens together. Arguments are NOT macro-expanded first. */
#define CONCAT(a, b) a##b

/** Identity macro: expands to its argument. */
#define EXPAND(x) x

/** Clamp x to [min_val, max_val] using the min/max built-ins. */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)

/* REV<s>(x): x with the order of its s elements reversed. */
#define REV1(x)  ((x))
#define REV2(x)  ((x).s10)
#define REV3(x)  ((x).s210)
#define REV4(x)  ((x).s3210)
#define REV8(x)  ((x).s76543210)
#define REV16(x) ((x).sFEDCBA9876543210)

/** REVERSE(x, s): reverse the s-element vector x.
 *  The extra level lets s be a macro that is expanded before pasting.
 */
#define REVERSE_STR(x, s) REV##s((x))
#define REVERSE(x, s)     REVERSE_STR(x, s)

/* ROT<s>_<n>(x): the s-element vector x rotated by n positions, so that the
 * last n elements wrap around to the front. n == 0 and n == s return x.
 */
#define ROT1_0(x) ((x))
#define ROT1_1(x) ((x))

#define ROT2_0(x) ((x))
#define ROT2_1(x) ((x).s10)
#define ROT2_2(x) ((x))

#define ROT3_0(x) ((x))
#define ROT3_1(x) ((x).s201)
#define ROT3_2(x) ((x).s120)
#define ROT3_3(x) ((x))

#define ROT4_0(x) ((x))
#define ROT4_1(x) ((x).s3012)
#define ROT4_2(x) ((x).s2301)
#define ROT4_3(x) ((x).s1230)
#define ROT4_4(x) ((x))

#define ROT8_0(x) ((x))
#define ROT8_1(x) ((x).s70123456)
#define ROT8_2(x) ((x).s67012345)
#define ROT8_3(x) ((x).s56701234)
#define ROT8_4(x) ((x).s45670123)
#define ROT8_5(x) ((x).s34567012)
#define ROT8_6(x) ((x).s23456701)
#define ROT8_7(x) ((x).s12345670)
#define ROT8_8(x) ((x))

#define ROT16_0(x)  ((x))
#define ROT16_1(x)  ((x).sF0123456789ABCDE)
#define ROT16_2(x)  ((x).sEF0123456789ABCD)
#define ROT16_3(x)  ((x).sDEF0123456789ABC)
#define ROT16_4(x)  ((x).sCDEF0123456789AB)
#define ROT16_5(x)  ((x).sBCDEF0123456789A)
#define ROT16_6(x)  ((x).sABCDEF0123456789)
#define ROT16_7(x)  ((x).s9ABCDEF012345678)
#define ROT16_8(x)  ((x).s89ABCDEF01234567)
#define ROT16_9(x)  ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

/** ROTATE(x, s, n): rotate the s-element vector x by n positions.
 *  s and n may themselves be macros; they are expanded before pasting.
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n)     ROTATE_STR(x, s, n)

/* V_OFFS<s>(dt): a dt<s> vector literal holding 0, 1, ..., s-1. */
#define V_OFFS1(dt)  (dt##1)(0)
#define V_OFFS2(dt)  (dt##2)(0, 1)
#define V_OFFS3(dt)  (dt##3)(0, 1, 2)
#define V_OFFS4(dt)  (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt)  (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/** VEC_OFFS(dt, s): the 0..s-1 offset vector of type dt<s>. */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
#define VEC_OFFS(dt, s)     VEC_OFFS_STR(dt, s)

/** VLOAD(size): expands to the identifier vload<size>. */
#define VLOAD_STR(size) vload##size
#define VLOAD(size)     VLOAD_STR(size)

/** VLOAD_PARTIAL(size, load_size): expands to the identifier
 *  vload_partial_<size>_<load_size>, which is looked up in the alias tables.
 */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size)     VLOAD_PARTIAL_STR(size, load_size)

/** Placeholder for size/load_size pairs that load nothing: expands to an
 *  empty brace block and discards all three arguments.
 */
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }

/* vload_partial_<size>_<load_size> alias table, size == 1.
 * Only load_size == 1 performs a load; every other entry is NO_LOAD.
 */
#define vload_partial_1_0  NO_LOAD
#define vload_partial_1_1  vload1
#define vload_partial_1_2  NO_LOAD
#define vload_partial_1_3  NO_LOAD
#define vload_partial_1_4  NO_LOAD
#define vload_partial_1_5  NO_LOAD
#define vload_partial_1_6  NO_LOAD
#define vload_partial_1_7  NO_LOAD
#define vload_partial_1_8  NO_LOAD
#define vload_partial_1_9  NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

/* size == 2, first entry. */
#define vload_partial_2_0 NO_LOAD
/* vload_partial_<size>_<load_size> alias table, size == 2 (entries 1..16).
 * load_size 1..2 forwards to vload_partial_<load_size>; larger values are NO_LOAD.
 */
#define vload_partial_2_1  vload_partial_1
#define vload_partial_2_2  vload_partial_2
#define vload_partial_2_3  NO_LOAD
#define vload_partial_2_4  NO_LOAD
#define vload_partial_2_5  NO_LOAD
#define vload_partial_2_6  NO_LOAD
#define vload_partial_2_7  NO_LOAD
#define vload_partial_2_8  NO_LOAD
#define vload_partial_2_9  NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

/* size == 3: load_size 1..3 forwards to vload_partial_<load_size>;
 * 0 and anything above 3 are NO_LOAD.
 */
#define vload_partial_3_0  NO_LOAD
#define vload_partial_3_1  vload_partial_1
#define vload_partial_3_2  vload_partial_2
#define vload_partial_3_3  vload_partial_3
#define vload_partial_3_4  NO_LOAD
#define vload_partial_3_5  NO_LOAD
#define vload_partial_3_6  NO_LOAD
#define vload_partial_3_7  NO_LOAD
#define vload_partial_3_8  NO_LOAD
#define vload_partial_3_9  NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

/* vload_partial_<size>_<load_size> alias table, size == 4.
 * load_size 1..4 forwards to vload_partial_<load_size>; 0 and anything
 * above 4 are NO_LOAD.
 */
#define vload_partial_4_0  NO_LOAD
#define vload_partial_4_1  vload_partial_1
#define vload_partial_4_2  vload_partial_2
#define vload_partial_4_3  vload_partial_3
#define vload_partial_4_4  vload_partial_4
#define vload_partial_4_5  NO_LOAD
#define vload_partial_4_6  NO_LOAD
#define vload_partial_4_7  NO_LOAD
#define vload_partial_4_8  NO_LOAD
#define vload_partial_4_9  NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

/* size == 8: load_size 1..8 forwards, the rest are NO_LOAD. */
#define vload_partial_8_0  NO_LOAD
#define vload_partial_8_1  vload_partial_1
#define vload_partial_8_2  vload_partial_2
#define vload_partial_8_3  vload_partial_3
#define vload_partial_8_4  vload_partial_4
#define vload_partial_8_5  vload_partial_5
#define vload_partial_8_6  vload_partial_6
#define vload_partial_8_7  vload_partial_7
#define vload_partial_8_8  vload_partial_8
#define vload_partial_8_9  NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

/* size == 16: load_size 1..16 forwards, only 0 is NO_LOAD. */
#define vload_partial_16_0  NO_LOAD
#define vload_partial_16_1  vload_partial_1
#define vload_partial_16_2  vload_partial_2
#define vload_partial_16_3  vload_partial_3
#define vload_partial_16_4  vload_partial_4
#define vload_partial_16_5  vload_partial_5
#define vload_partial_16_6  vload_partial_6
#define vload_partial_16_7  vload_partial_7
#define vload_partial_16_8  vload_partial_8
#define vload_partial_16_9  vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

/* vload_partial_<n>(DATA, OFFSET, PTR): fill the first n elements of DATA.
 *
 * n = 1, 2, 3, 4, 8 are a single vload<n> assigned to the matching swizzle.
 * Other sizes are composed from a 4- or 8-wide load followed by a load of
 * the remainder from PTR + 4 or PTR + 8, with OFFSET passed through as-is.
 * Each expansion is one or more complete statements and already ends in a
 * semicolon.
 */
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

// The whole upper half is handed over as an 8-wide swizzle; the nested
// 5-element macro only writes its .s0123 and .s4 components.
#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

/* vload_partial_14 / _15: an 8-wide load into the lower half, then a 6- or
 * 7-element partial load into the upper half from PTR + 8.
 * vload_partial_16: a plain vload16 assigned to DATA.
 */
#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

/* PIXEL_UNIT<vec_size>: vec_size / 4 for vec_size in {4, 8, 16}. */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/** CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size): look up PIXEL_UNIT<vec_size>.
 *  vec_size may be a macro; it is expanded before pasting.
 */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size)     CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

/* read_image2d_<type>x<n>(img, x_coord, y_coord)
 *
 * Reads n texels at (x_coord + 0 .. n-1, y_coord) with read_imagef or
 * read_imageh and packs them into one vector of 4 * n elements.
 * NOTE: the expansion already ends in a semicolon.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

/* write_image2d_<type>x<n>(img, x_coord, y_coord, values)
 *
 * Splits values into n groups of four elements and writes group k to
 * (x_coord + k, y_coord) with write_imagef or write_imageh. The writes are
 * chained with the comma operator.
 * NOTE: the expansion already ends in a semicolon.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

/** READ_IMAGE2D(data_type, n0, img, x_coord, y_coord)
 *  Dispatches to read_image2d_<data_type>x<n0>. data_type and n0 may be
 *  macros; they are expanded before pasting.
 */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord)     READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/** WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values)
 *  Dispatches to write_image2d_<data_type>x<n0>.
 */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values)     WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

/** VSTORE(size): expands to the identifier vstore<size>. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size)     VSTORE_STR(size)

/* One-element "vector" type names map to the scalar type, so that pasted
 * names such as float##1 are valid types.
 */
#define float1  float
#define half1   half
#define char1   char
#define uchar1  uchar
#define short1  short
#define ushort1 ushort
#define int1    int
#define uint1   uint
#define long1   long
#define ulong1  ulong
#define double1 double

/** Scalar counterparts of vloadN / vstoreN.
 *
 *  vload1(OFFSET, PTR)        evaluates to PTR[OFFSET].
 *  vstore1(DATA, OFFSET, PTR) assigns DATA to PTR[OFFSET].
 *
 *  Each argument and the whole expansion are parenthesised so the macros
 *  behave like ordinary expressions whatever operators the arguments contain.
 */
#define vload1(OFFSET, PTR)        (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))

/** VSTORE_PARTIAL(size, store_size): expands to the identifier
 *  vstore_partial_<size>_<store_size>, which is looked up in the alias tables.
 */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size)     VSTORE_PARTIAL_STR(size, store_size)

/** Placeholder for size/store_size pairs that store nothing: expands to an
 *  empty brace block and discards all three arguments.
 */
#define NO_STORE(data, offs, ptr) \
    {                             \
    }

/* vstore_partial_<size>_<store_size> alias tables.
 *
 * For each vector size, store_size values from 1 up to size forward to
 * vstore_partial_<store_size> (vstore1 for size 1); store_size 0 and values
 * larger than size map to NO_STORE.
 */

/* size == 1 */
#define vstore_partial_1_0  NO_STORE
#define vstore_partial_1_1  vstore1
#define vstore_partial_1_2  NO_STORE
#define vstore_partial_1_3  NO_STORE
#define vstore_partial_1_4  NO_STORE
#define vstore_partial_1_5  NO_STORE
#define vstore_partial_1_6  NO_STORE
#define vstore_partial_1_7  NO_STORE
#define vstore_partial_1_8  NO_STORE
#define vstore_partial_1_9  NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

/* size == 2 */
#define vstore_partial_2_0  NO_STORE
#define vstore_partial_2_1  vstore_partial_1
#define vstore_partial_2_2  vstore_partial_2
#define vstore_partial_2_3  NO_STORE
#define vstore_partial_2_4  NO_STORE
#define vstore_partial_2_5  NO_STORE
#define vstore_partial_2_6  NO_STORE
#define vstore_partial_2_7  NO_STORE
#define vstore_partial_2_8  NO_STORE
#define vstore_partial_2_9  NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

/* size == 3 */
#define vstore_partial_3_0  NO_STORE
#define vstore_partial_3_1  vstore_partial_1
#define vstore_partial_3_2  vstore_partial_2
#define vstore_partial_3_3  vstore_partial_3
#define vstore_partial_3_4  NO_STORE
#define vstore_partial_3_5  NO_STORE
#define vstore_partial_3_6  NO_STORE
#define vstore_partial_3_7  NO_STORE
#define vstore_partial_3_8  NO_STORE
#define vstore_partial_3_9  NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

/* size == 4 */
#define vstore_partial_4_0  NO_STORE
#define vstore_partial_4_1  vstore_partial_1
#define vstore_partial_4_2  vstore_partial_2
#define vstore_partial_4_3  vstore_partial_3
#define vstore_partial_4_4  vstore_partial_4
#define vstore_partial_4_5  NO_STORE
#define vstore_partial_4_6  NO_STORE
#define vstore_partial_4_7  NO_STORE
#define vstore_partial_4_8  NO_STORE
#define vstore_partial_4_9  NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

/* size == 8 */
#define vstore_partial_8_0  NO_STORE
#define vstore_partial_8_1  vstore_partial_1
#define vstore_partial_8_2  vstore_partial_2
#define vstore_partial_8_3  vstore_partial_3
#define vstore_partial_8_4  vstore_partial_4
#define vstore_partial_8_5  vstore_partial_5
#define vstore_partial_8_6  vstore_partial_6
#define vstore_partial_8_7  vstore_partial_7
#define vstore_partial_8_8  vstore_partial_8
#define vstore_partial_8_9  NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

/* size == 16 */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

/* vstore_partial_<n>(DATA, OFFSET, PTR): store the first n elements of DATA.
 *
 * n = 1, 2, 3, 4, 8, 16 are a single vstore<n> of the matching swizzle.
 * Other sizes are composed from a 4- or 8-wide store followed by a store of
 * the remaining elements at PTR + 4 or PTR + 8, with OFFSET passed through
 * as-is. Each expansion is one or more complete statements and already ends
 * in a semicolon.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);

// For 13..15 the whole upper half is handed over as an 8-wide swizzle; the
// nested 5/6/7-element macro only reads the leading components it needs.
#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);


/* Saturating conversions to floating-point types.
 *
 * OpenCL C does not provide convert_<float type>_sat, so the _sat names for
 * float, half and double are mapped to the plain conversion of the SAME
 * destination type. This keeps CONVERT_SAT usable with any data type.
 */
#define convert_float_sat   convert_float
#define convert_float1_sat  convert_float
#define convert_float2_sat  convert_float2
#define convert_float3_sat  convert_float3
#define convert_float4_sat  convert_float4
#define convert_float8_sat  convert_float8
#define convert_float16_sat convert_float16
// Scalar half converts to half, matching convert_half1_sat below.
#define convert_half_sat    convert_half
#define convert_half1_sat   convert_half
#define convert_half2_sat   convert_half2
#define convert_half3_sat   convert_half3
#define convert_half4_sat   convert_half4
#define convert_half8_sat   convert_half8
#define convert_half16_sat  convert_half16

/* One-element "vector" conversions map to the scalar conversion. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double

/* One-element saturating conversions map to the scalar _sat conversion.
 * The convert_uchar<N>_sat entries expand to themselves.
 */
#define convert_char1_sat   convert_char_sat
#define convert_uchar1_sat  convert_uchar_sat
#define convert_uchar2_sat  convert_uchar2_sat
#define convert_uchar3_sat  convert_uchar3_sat
#define convert_uchar4_sat  convert_uchar4_sat
#define convert_uchar8_sat  convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat  convert_short_sat
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat    convert_int_sat
#define convert_uint1_sat   convert_uint_sat
#define convert_long1_sat   convert_long_sat
#define convert_ulong1_sat  convert_ulong_sat
// double is a floating-point destination: use the plain conversion.
#define convert_double_sat  convert_double
#define convert_double1_sat convert_double

/** VEC_DATA_TYPE(type, size): the type name <type><size>.
 *  Both arguments may be macros; they are expanded before pasting.
 */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size)     VEC_DATA_TYPE_STR(type, size)

/** CONVERT(x, type): (convert_<type>((x))). */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type)     CONVERT_STR(x, type)

/** CONVERT_SAT(x, type): (convert_<type>_sat((x))). */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type)     CONVERT_SAT_STR(x, type)

/** CONVERT_SAT_ROUND(x, type, round): (convert_<type>_sat_<round>((x))). */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round)     CONVERT_SAT_ROUND_STR(x, type, round)

/* select_vec_dt_<type>(size): integer types map to themselves, half maps to
 * short and float maps to int.
 * NOTE(review): presumably the mask type expected by select() - confirm.
 */
#define select_vec_dt_uchar(size)  uchar##size
#define select_vec_dt_char(size)   char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size)  short##size
#define select_vec_dt_half(size)   short##size
#define select_vec_dt_uint(size)   uint##size
#define select_vec_dt_int(size)    int##size
#define select_vec_dt_float(size)  int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

/** Resolve a data type and width to the matching select_vec_dt_<type>(size)
 * entry. SELECT_DATA_TYPE is the width-1 form.
 */
#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

/** Per-type table mapping a data type to the signed integer vector type of the
 * same element width (unsigned types lose the u prefix, half -> short,
 * float -> int).
 */
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

/** Resolve a data type and width to the matching signed_int_vec_dt_<type>(size)
 * entry. SIGNED_INT_DATA_TYPE is the width-1 form.
 */
#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

/** Horizontal sum of the components of a vector of the given width.
 *
 * Every expansion is wrapped in parentheses so the result can be used inside a
 * larger expression (e.g. k * SUM_REDUCE(v, 2)) without the surrounding
 * operators re-associating with the individual terms. The additions are
 * grouped pairwise, following the macro nesting.
 * The argument is expanded once per component, so it must be free of side
 * effects.
 */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) (((x).s0) + ((x).s1))
#define sum_reduce_3(x) (sum_reduce_2((x).s01) + ((x).s2))
#define sum_reduce_4(x) (sum_reduce_2((x).s01) + sum_reduce_2((x).s23))
#define sum_reduce_8(x) (sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567))
#define sum_reduce_16(x) (sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF))

/** Dispatch to sum_reduce_<size>; the outer macro expands @p size first. */
#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

/** Horizontal product of the components of a vector of the given width.
 *
 * Parenthesised and grouped pairwise for the same reasons as the sum
 * reduction; the argument is likewise expanded once per component.
 */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) (((x).s0) * ((x).s1))
#define prod_reduce_3(x) (prod_reduce_2((x).s01) * ((x).s2))
#define prod_reduce_4(x) (prod_reduce_2((x).s01) * prod_reduce_2((x).s23))
#define prod_reduce_8(x) (prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567))
#define prod_reduce_16(x) (prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF))

/** Dispatch to prod_reduce_<size>; the outer macro expands @p size first. */
#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

/** Horizontal maximum of the components of a vector of the given width,
 * built from nested max() calls.
 */
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

/** Dispatch to max_reduce_<size>; the outer macro expands @p size first. */
#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)

/** Kernel parameter list for a 1D tensor called @p name: data pointer, X
 * stride, X step and the byte offset of the first element.
 */
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_offset_first_element_in_bytes
4790*c217d954SCole Faust 4791*c217d954SCole Faust#define IMAGE_DECLARATION(name) \ 4792*c217d954SCole Faust __global uchar *name##_ptr, \ 4793*c217d954SCole Faust uint name##_stride_x, \ 4794*c217d954SCole Faust uint name##_step_x, \ 4795*c217d954SCole Faust uint name##_stride_y, \ 4796*c217d954SCole Faust uint name##_step_y, \ 4797*c217d954SCole Faust uint name##_offset_first_element_in_bytes 4798*c217d954SCole Faust 4799*c217d954SCole Faust#define TENSOR3D_DECLARATION(name) \ 4800*c217d954SCole Faust __global uchar *name##_ptr, \ 4801*c217d954SCole Faust uint name##_stride_x, \ 4802*c217d954SCole Faust uint name##_step_x, \ 4803*c217d954SCole Faust uint name##_stride_y, \ 4804*c217d954SCole Faust uint name##_step_y, \ 4805*c217d954SCole Faust uint name##_stride_z, \ 4806*c217d954SCole Faust uint name##_step_z, \ 4807*c217d954SCole Faust uint name##_offset_first_element_in_bytes 4808*c217d954SCole Faust 4809*c217d954SCole Faust#define TENSOR4D_DECLARATION(name) \ 4810*c217d954SCole Faust __global uchar *name##_ptr, \ 4811*c217d954SCole Faust uint name##_stride_x, \ 4812*c217d954SCole Faust uint name##_step_x, \ 4813*c217d954SCole Faust uint name##_stride_y, \ 4814*c217d954SCole Faust uint name##_step_y, \ 4815*c217d954SCole Faust uint name##_stride_z, \ 4816*c217d954SCole Faust uint name##_step_z, \ 4817*c217d954SCole Faust uint name##_stride_w, \ 4818*c217d954SCole Faust uint name##_step_w, \ 4819*c217d954SCole Faust uint name##_offset_first_element_in_bytes 4820*c217d954SCole Faust 4821*c217d954SCole Faust#define TENSOR5D_DECLARATION(name) \ 4822*c217d954SCole Faust __global uchar *name##_ptr, \ 4823*c217d954SCole Faust uint name##_stride_x, \ 4824*c217d954SCole Faust uint name##_step_x, \ 4825*c217d954SCole Faust uint name##_stride_y, \ 4826*c217d954SCole Faust uint name##_step_y, \ 4827*c217d954SCole Faust uint name##_stride_z, \ 4828*c217d954SCole Faust uint name##_step_z, \ 4829*c217d954SCole Faust uint name##_stride_w, \ 4830*c217d954SCole Faust uint 
name##_step_w, \ 4831*c217d954SCole Faust uint name##_stride_v, \ 4832*c217d954SCole Faust uint name##_step_v, \ 4833*c217d954SCole Faust uint name##_offset_first_element_in_bytes 4834*c217d954SCole Faust 4835*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT(name) \ 4836*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 4837*c217d954SCole Faust 4838*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 4839*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 4840*c217d954SCole Faust 4841*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \ 4842*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 4843*c217d954SCole Faust 4844*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 4845*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 4846*c217d954SCole Faust 4847*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 4848*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 4849*c217d954SCole Faust 4850*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 4851*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 4852*c217d954SCole Faust 4853*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 4854*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, 
name##_stride_z, name##_step_z) 4855*c217d954SCole Faust 4856*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 4857*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4858*c217d954SCole Faust name##_stride_z, name##_step_z) 4859*c217d954SCole Faust 4860*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 4861*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 4862*c217d954SCole Faust 4863*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 4864*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4865*c217d954SCole Faust name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 4866*c217d954SCole Faust 4867*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 4868*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 4869*c217d954SCole Faust 4870*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 4871*c217d954SCole Faust tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 4872*c217d954SCole Faust name##_stride_z, name##_step_z) 4873*c217d954SCole Faust 4874*c217d954SCole Faust 4875*c217d954SCole Fausttypedef struct Vector 4876*c217d954SCole Faust{ 4877*c217d954SCole Faust __global uchar *ptr; 4878*c217d954SCole Faust int offset_first_element_in_bytes; 4879*c217d954SCole Faust int stride_x; 4880*c217d954SCole Faust} Vector; 4881*c217d954SCole Faust 4882*c217d954SCole Faust 4883*c217d954SCole Fausttypedef 
struct Image 4884*c217d954SCole Faust{ 4885*c217d954SCole Faust __global uchar *ptr; 4886*c217d954SCole Faust int offset_first_element_in_bytes; 4887*c217d954SCole Faust int stride_x; 4888*c217d954SCole Faust int stride_y; 4889*c217d954SCole Faust} Image; 4890*c217d954SCole Faust 4891*c217d954SCole Faust 4892*c217d954SCole Fausttypedef struct Tensor3D 4893*c217d954SCole Faust{ 4894*c217d954SCole Faust __global uchar *ptr; 4895*c217d954SCole Faust int offset_first_element_in_bytes; 4896*c217d954SCole Faust int stride_x; 4897*c217d954SCole Faust int stride_y; 4898*c217d954SCole Faust int stride_z; 4899*c217d954SCole Faust} Tensor3D; 4900*c217d954SCole Faust 4901*c217d954SCole Faust 4902*c217d954SCole Fausttypedef struct Tensor4D 4903*c217d954SCole Faust{ 4904*c217d954SCole Faust __global uchar *ptr; 4905*c217d954SCole Faust int offset_first_element_in_bytes; 4906*c217d954SCole Faust int stride_x; 4907*c217d954SCole Faust int stride_y; 4908*c217d954SCole Faust int stride_z; 4909*c217d954SCole Faust int stride_w; 4910*c217d954SCole Faust} Tensor4D; 4911*c217d954SCole Faust 4912*c217d954SCole Faust 4913*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 4914*c217d954SCole Faust{ 4915*c217d954SCole Faust Vector vector = 4916*c217d954SCole Faust { 4917*c217d954SCole Faust .ptr = ptr, 4918*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4919*c217d954SCole Faust .stride_x = stride_x, 4920*c217d954SCole Faust }; 4921*c217d954SCole Faust vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 4922*c217d954SCole Faust return vector; 4923*c217d954SCole Faust} 4924*c217d954SCole Faust 4925*c217d954SCole Faust 4926*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 4927*c217d954SCole Faust{ 4928*c217d954SCole 
Faust Image img = 4929*c217d954SCole Faust { 4930*c217d954SCole Faust .ptr = ptr, 4931*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4932*c217d954SCole Faust .stride_x = stride_x, 4933*c217d954SCole Faust .stride_y = stride_y 4934*c217d954SCole Faust }; 4935*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 4936*c217d954SCole Faust return img; 4937*c217d954SCole Faust} 4938*c217d954SCole Faust 4939*c217d954SCole Faust 4940*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4941*c217d954SCole Faust{ 4942*c217d954SCole Faust Image img = 4943*c217d954SCole Faust { 4944*c217d954SCole Faust .ptr = ptr, 4945*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4946*c217d954SCole Faust .stride_x = stride_x, 4947*c217d954SCole Faust .stride_y = stride_y 4948*c217d954SCole Faust }; 4949*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 4950*c217d954SCole Faust return img; 4951*c217d954SCole Faust} 4952*c217d954SCole Faust 4953*c217d954SCole Faust 4954*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4955*c217d954SCole Faust{ 4956*c217d954SCole Faust Tensor3D tensor = 4957*c217d954SCole Faust { 4958*c217d954SCole Faust .ptr = ptr, 4959*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4960*c217d954SCole Faust .stride_x = stride_x, 4961*c217d954SCole Faust .stride_y = stride_y, 4962*c217d954SCole Faust .stride_z = stride_z 4963*c217d954SCole Faust }; 4964*c217d954SCole Faust tensor.ptr += 
tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 4965*c217d954SCole Faust return tensor; 4966*c217d954SCole Faust} 4967*c217d954SCole Faust 4968*c217d954SCole Faust 4969*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 4970*c217d954SCole Faust{ 4971*c217d954SCole Faust Tensor3D tensor = 4972*c217d954SCole Faust { 4973*c217d954SCole Faust .ptr = ptr, 4974*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4975*c217d954SCole Faust .stride_x = stride_x, 4976*c217d954SCole Faust .stride_y = stride_y, 4977*c217d954SCole Faust .stride_z = stride_z 4978*c217d954SCole Faust }; 4979*c217d954SCole Faust return tensor; 4980*c217d954SCole Faust} 4981*c217d954SCole Faust 4982*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 4983*c217d954SCole Faust uint step_w, 4984*c217d954SCole Faust uint mod_size) 4985*c217d954SCole Faust{ 4986*c217d954SCole Faust Tensor4D tensor = 4987*c217d954SCole Faust { 4988*c217d954SCole Faust .ptr = ptr, 4989*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 4990*c217d954SCole Faust .stride_x = stride_x, 4991*c217d954SCole Faust .stride_y = stride_y, 4992*c217d954SCole Faust .stride_z = stride_z, 4993*c217d954SCole Faust .stride_w = stride_w 4994*c217d954SCole Faust }; 4995*c217d954SCole Faust 4996*c217d954SCole Faust tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 4997*c217d954SCole Faust return tensor; 4998*c217d954SCole Faust} 4999*c217d954SCole Faust 
5000*c217d954SCole Faust 5001*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x) 5002*c217d954SCole Faust{ 5003*c217d954SCole Faust return vec->ptr + x * vec->stride_x; 5004*c217d954SCole Faust} 5005*c217d954SCole Faust 5006*c217d954SCole Faust 5007*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y) 5008*c217d954SCole Faust{ 5009*c217d954SCole Faust return img->ptr + x * img->stride_x + y * img->stride_y; 5010*c217d954SCole Faust} 5011*c217d954SCole Faust 5012*c217d954SCole Faust 5013*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 5014*c217d954SCole Faust{ 5015*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 5016*c217d954SCole Faust} 5017*c217d954SCole Faust 5018*c217d954SCole Faust 5019*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 5020*c217d954SCole Faust{ 5021*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 5022*c217d954SCole Faust} 5023*c217d954SCole Faust 5024*c217d954SCole Faust 5025*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 5026*c217d954SCole Faust{ 5027*c217d954SCole Faust uint num_elements = width * height; 5028*c217d954SCole Faust 5029*c217d954SCole Faust const uint z = index / num_elements; 5030*c217d954SCole Faust 5031*c217d954SCole Faust index %= num_elements; 5032*c217d954SCole Faust 5033*c217d954SCole Faust const uint y = index / width; 5034*c217d954SCole Faust 5035*c217d954SCole Faust index %= width; 5036*c217d954SCole Faust 5037*c217d954SCole Faust const uint x = index; 5038*c217d954SCole Faust 5039*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + 
tensor->offset_first_element_in_bytes; 5040*c217d954SCole Faust} 5041*c217d954SCole Faust 5042*c217d954SCole Faust#endif 5043*c217d954SCole Faust 5044*c217d954SCole Faust 5045*c217d954SCole Faust#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x))) 5046*c217d954SCole Faust#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type) 5047*c217d954SCole Faust 5048*c217d954SCole Faust 5049*c217d954SCole Faustinline uchar quantize_qasymm8(float input, float offset, float scale) 5050*c217d954SCole Faust{ 5051*c217d954SCole Faust float out_f32 = input / scale + offset; 5052*c217d954SCole Faust uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar); 5053*c217d954SCole Faust return res_u8; 5054*c217d954SCole Faust} 5055*c217d954SCole Faust 5056*c217d954SCole Faust 5057*c217d954SCole Faustinline float dequantize_qasymm8(uchar input, float offset, float scale) 5058*c217d954SCole Faust{ 5059*c217d954SCole Faust return ((float)input - offset) * scale; 5060*c217d954SCole Faust} 5061*c217d954SCole Faust 5062*c217d954SCole Faust 5063*c217d954SCole Faustinline float dequantize_qasymm8_signed(char input, float offset, float scale) 5064*c217d954SCole Faust{ 5065*c217d954SCole Faust return ((float)input - offset) * scale; 5066*c217d954SCole Faust} 5067*c217d954SCole Faust 5068*c217d954SCole Faust 5069*c217d954SCole Faust#define QUANTIZE_IMPL(type, size) \ 5070*c217d954SCole Faust inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \ 5071*c217d954SCole Faust { \ 5072*c217d954SCole Faust VEC_DATA_TYPE(float, size) \ 5073*c217d954SCole Faust out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \ 5074*c217d954SCole Faust VEC_DATA_TYPE(type, size) \ 5075*c217d954SCole Faust res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \ 5076*c217d954SCole Faust return res; \ 5077*c217d954SCole Faust } 5078*c217d954SCole 
/** Define dequantize_<type><size>(): (float(input) - offset) * scale. */
#define DEQUANTIZE_IMPL(type, size) \
    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
    { \
        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
    }

/** Define asymm_rounding_divide_by_POW2_<size>(): x >> exponent, plus one where
 * the discarded low bits exceed a threshold of half the mask (one higher for
 * negative x).
 */
#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
    { \
        const VEC_DATA_TYPE(int, size) \
        zero = (VEC_DATA_TYPE(int, size))0; \
        const VEC_DATA_TYPE(int, size) \
        one = (VEC_DATA_TYPE(int, size))1; \
        VEC_DATA_TYPE(int, size) \
        mask = (one << exponent) - one; \
        VEC_DATA_TYPE(int, size) \
        threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0)); \
        return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \
    }

/** Define asymm_mult<size>(): the 64-bit product a * b, nudged by 2^30 (or
 * 1 - 2^30 when the product is negative), divided by 2^31 and narrowed to int.
 * Lanes where a == b == INT_MIN return INT_MAX.
 */
#define ASYMM_MULT_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    { \
        VEC_DATA_TYPE(int, size) \
        overflow = a == b && a == INT_MIN; \
        VEC_DATA_TYPE(long, size) \
        a_64 = convert_long##size(a); \
        VEC_DATA_TYPE(long, size) \
        b_64 = convert_long##size(b); \
        VEC_DATA_TYPE(long, size) \
        ab_64 = a_64 * b_64; \
        \
        VEC_DATA_TYPE(long, size) \
        mask1 = 1 << 30; \
        VEC_DATA_TYPE(long, size) \
        mask2 = 1 - (1 << 30); \
        VEC_DATA_TYPE(long, size) \
        is_positive_or_zero = ab_64 >= 0; \
        VEC_DATA_TYPE(long, size) \
        nudge = select(mask2, mask1, (SELECT_VEC_DATA_TYPE(long, size))(is_positive_or_zero)); \
        VEC_DATA_TYPE(long, size) \
        mask = 1ll << 31; \
        VEC_DATA_TYPE(int, size) \
        ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
        return select(ab_x2_high32, INT_MAX, (SELECT_VEC_DATA_TYPE(int, size))(overflow)); \
    }

/** Define asymm_exp_on_interval_between_negative_one_quarter_and_0_excl<size>():
 * a fourth-order polynomial in x = a + 2^28, evaluated with ASYMM_MULT and
 * rounding power-of-two divisions.
 * NOTE(review): constant_term and constant_1_over_3 are presumably exp(-1/8)
 * and 1/3 in a 31-fractional-bit format - confirm against the reference
 * fixed-point implementation.
 */
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \
        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \
        const int k_fractional_bits = 31; \
        VEC_DATA_TYPE(int, size) \
        x = a + (1 << (k_fractional_bits - 3)); \
        VEC_DATA_TYPE(int, size) \
        x2 = ASYMM_MULT(x, x, size); \
        VEC_DATA_TYPE(int, size) \
        x3 = ASYMM_MULT(x2, x, size); \
        VEC_DATA_TYPE(int, size) \
        x4 = ASYMM_MULT(x2, x2, size); \
        VEC_DATA_TYPE(int, size) \
        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \
        VEC_DATA_TYPE(int, size) \
        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \
        VEC_DATA_TYPE(int, size) \
        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \
        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
    }

/** Define asymm_select_using_mask<size>(): bitwise blend taking then_val where
 * if_mask bits are set and else_val where they are clear.
 */
#define ASYMM_SELECT_USING_MASK_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
    { \
        return (if_mask & then_val) ^ (~if_mask & else_val); \
    }

/** Define asymm_mask_if_zero<size>(): all-ones where a == 0, zero elsewhere. */
#define ASYMM_MASK_IF_ZERO_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) all_zeros = 0; \
        const VEC_DATA_TYPE(int, size) all_ones = ~0; \
        return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a == 0)); \
    }

/** Define asymm_mask_if_non_zero<size>(): all-ones where a != 0, zero elsewhere. */
#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) all_zeros = 0; \
        const VEC_DATA_TYPE(int, size) all_ones = ~0; \
        return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0)); \
    }

/** Define exp_barrel_shifter<size>(): when k_integer_bits > exponent, multiply
 * result by fp_multiplier in the lanes where bit (k_fractional_bits + exponent)
 * of remainder is set; otherwise return result unchanged.
 */
#define EXP_BARREL_SHIFTER_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
    { \
        if(k_integer_bits > exponent) \
        { \
            /* The guard above already ensures k_integer_bits > exponent. */ \
            const int k_shift_amount = k_fractional_bits + exponent; \
            return ASYMM_SELECT_USING_MASK( \
                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \
                    ASYMM_MULT(result, fp_multiplier, size), result, size); \
        } \
        \
        return result; \
    }

/** Define asymm_exp_on_negative_values<size>().
 *
 * The input is reduced modulo one quarter and shifted into the interval
 * handled by the polynomial above; the remainder then drives one barrel
 * shifter step per exponent from -2 to +4. For k_integer_bits > 5, lanes below
 * -(1 << (k_fractional_bits + 5)) are forced to 0. Lanes where a == 0 return
 * INT_MAX.
 * NOTE(review): the seven multipliers are presumably exp(-2^exponent) in a
 * 31-fractional-bit format - confirm against the reference implementation.
 */
#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
    { \
        const int k_fractional_bits = 31 - k_integer_bits; \
        VEC_DATA_TYPE(int, size) \
        k_one_quarter = 1 << (k_fractional_bits - 2); \
        VEC_DATA_TYPE(int, size) \
        mask = k_one_quarter - 1; \
        VEC_DATA_TYPE(int, size) \
        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \
        VEC_DATA_TYPE(int, size) \
        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \
        VEC_DATA_TYPE(int, size) \
        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
        VEC_DATA_TYPE(int, size) \
        remainder = a_mod_quarter_minus_one_quarter - a; \
        \
        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size); \
        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size); \
        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size); \
        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size); \
        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size); \
        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size); \
        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \
        \
        if(k_integer_bits > 5) \
        { \
            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \
            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \
        } \
        \
        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \
    }

/** Define asymm_saturating_rounding_mult_by_pow2<size>().
 *
 * A negative exponent becomes a rounding division by 2^(-exponent). Otherwise
 * x is shifted left by exponent, with lanes above the threshold replaced by
 * INT_MAX and lanes below its negation replaced by INT_MIN.
 * NOTE(review): for exponent == 0 the threshold is (1 << 31) - 1, which relies
 * on the shift and subtraction wrapping - confirm this is well defined on the
 * targeted compilers.
 */
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
    { \
        if(exponent < 0) \
        { \
            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \
        } \
        \
        const VEC_DATA_TYPE(int, size) min = INT_MIN; \
        const VEC_DATA_TYPE(int, size) max = INT_MAX; \
        int threshold = ((1 << (31 - exponent)) - 1); \
        VEC_DATA_TYPE(int, size) \
        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \
        VEC_DATA_TYPE(int, size) \
        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \
        VEC_DATA_TYPE(int, size) \
        result = x << exponent; \
        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \
        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \
        return result; \
    }

/** Define asymm_rounding_half_sum<size>(): (a + b + sign) / 2 computed in
 * 64-bit, where sign is +1 for a non-negative sum and -1 otherwise.
 */
#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
    { \
        VEC_DATA_TYPE(long, size) \
        a64 = convert_long##size(a); \
        VEC_DATA_TYPE(long, size) \
        b64 = convert_long##size(b); \
        VEC_DATA_TYPE(long, size) \
        sum = a64 + b64; \
        const VEC_DATA_TYPE(long, size) one = 1; \
        const VEC_DATA_TYPE(long, size) minus_one = -1; \
        VEC_DATA_TYPE(long, size) \
        sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0)); \
        return convert_int##size((sum + sign) / 2); \
    }

/** Define asymm_one_over_one_plus_x_for_x_in_0_1<size>().
 *
 * Starts from the linear estimate Q2_48_over_17 + half_denominator *
 * Q2_neg_32_over_17 and refines it with three iterations of
 * x += 4 * x * (Q2_one - half_denominator * x), where half_denominator is the
 * rounding half sum of a and INT_MAX. The result is returned multiplied by 2.
 */
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \
    { \
        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \
        const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \
        VEC_DATA_TYPE(int, size) \
        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \
        const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \
        const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \
        VEC_DATA_TYPE(int, size) \
        x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \
        for(int i = 0; i < 3; i++) \
        { \
            VEC_DATA_TYPE(int, size) \
            half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \
            VEC_DATA_TYPE(int, size) \
            one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \
            VEC_DATA_TYPE(int, size) \
            tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \
            x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \
        } \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \
    }
/*
 * Generates asymm_rescale<size>(value, src_integer_bits, dst_integer_bits).
 *
 * Re-expresses a fixed-point value with a different number of integer bits
 * by multiplying it by 2^(src_integer_bits - dst_integer_bits) through
 * ASYMM_SATURATING_ROUNDING_MULT_BY_POW2.
 */
#define ASYMM_RESCALE_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
    { \
        int exponent = src_integer_bits - dst_integer_bits; \
        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
    }

/*
 * Dispatch macros: each one pastes `type` and/or `size` onto a function
 * name produced by the matching *_IMPL generator. Where a *_STR twin exists,
 * the outer macro forwards to it so that its arguments are macro-expanded
 * before the ## paste happens.
 */
#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
#define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)

#define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
/*
 * Multiplies x by 2^(-left_shift) and then by quantized_multiplier.
 * `x` and `left_shift` are parenthesized so that expression arguments
 * (for example `a + b`) keep their intended grouping inside the expansion.
 * NOTE(review): the shift amount is the negation of left_shift, so callers
 * are presumably expected to pass a non-positive value -- confirm.
 */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
    ASYMM_MULT((x) * ((VEC_DATA_TYPE(int, size))(1) << (-(left_shift))), quantized_multiplier, size)
/* Multiplies x by quantized_multiplier, then applies a rounding divide by 2^right_shift. */
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
/*
 * NOTE(review): the macros below without a *_STR twin paste `size` directly,
 * so `size` must already be a literal token at the call site.
 */
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
#define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)

/*
 * Generates multiply_by_quantized_multiplier<size>(input, qmul, shift).
 *
 * A positive `shift` is applied as a left shift (multiplication by
 * 2^shift) before the fixed-point multiply by qmul; a zero or negative
 * `shift` is applied afterwards as a rounding divide by 2^(-shift).
 * Exactly one of the two shifts is non-zero for any given `shift`.
 */
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
    inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
    { \
        const int left_shift  = shift > 0 ? shift : 0; \
        const int right_shift = shift > 0 ? 0 : -shift; \
        return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \
    }
#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)

/*
 * quantize_<type><size> instantiations.
 * NOTE(review): the set of element types differs per width (width 4 has
 * ushort/short but no char/uint; widths 1, 2, 3 and 8 have no ushort/short).
 * Left exactly as found -- confirm it matches what the kernels request.
 */
QUANTIZE_IMPL(uchar, 1)
QUANTIZE_IMPL(char, 1)
QUANTIZE_IMPL(uint, 1)
QUANTIZE_IMPL(int, 1)
QUANTIZE_IMPL(uchar, 2)
QUANTIZE_IMPL(char, 2)
QUANTIZE_IMPL(uint, 2)
QUANTIZE_IMPL(int, 2)
QUANTIZE_IMPL(uchar, 3)
QUANTIZE_IMPL(char, 3)
QUANTIZE_IMPL(uint, 3)
QUANTIZE_IMPL(int, 3)
QUANTIZE_IMPL(uchar, 4)
QUANTIZE_IMPL(ushort, 4)
QUANTIZE_IMPL(short, 4)
QUANTIZE_IMPL(int, 4)
QUANTIZE_IMPL(uchar, 8)
QUANTIZE_IMPL(char, 8)
QUANTIZE_IMPL(uint, 8)
QUANTIZE_IMPL(int, 8)
QUANTIZE_IMPL(uchar, 16)
QUANTIZE_IMPL(char, 16)
QUANTIZE_IMPL(ushort, 16)
QUANTIZE_IMPL(short, 16)
QUANTIZE_IMPL(uint, 16)
QUANTIZE_IMPL(int, 16)

/* dequantize_<type><size> instantiations (the list continues below). */
DEQUANTIZE_IMPL(uchar, 1)
DEQUANTIZE_IMPL(char, 1)
/* dequantize_<type><size> instantiations, continued. */
DEQUANTIZE_IMPL(uint, 1)
DEQUANTIZE_IMPL(int, 1)
DEQUANTIZE_IMPL(uchar, 2)
DEQUANTIZE_IMPL(char, 2)
DEQUANTIZE_IMPL(uint, 2)
DEQUANTIZE_IMPL(int, 2)
DEQUANTIZE_IMPL(uchar, 3)
DEQUANTIZE_IMPL(char, 3)
DEQUANTIZE_IMPL(uint, 3)
DEQUANTIZE_IMPL(int, 3)
DEQUANTIZE_IMPL(uchar, 4)
DEQUANTIZE_IMPL(ushort, 4)
DEQUANTIZE_IMPL(short, 4)
DEQUANTIZE_IMPL(int, 4)
DEQUANTIZE_IMPL(uchar, 8)
DEQUANTIZE_IMPL(char, 8)
DEQUANTIZE_IMPL(uint, 8)
DEQUANTIZE_IMPL(int, 8)
DEQUANTIZE_IMPL(uchar, 16)
DEQUANTIZE_IMPL(char, 16)
DEQUANTIZE_IMPL(ushort, 16)
DEQUANTIZE_IMPL(short, 16)
DEQUANTIZE_IMPL(uint, 16)
DEQUANTIZE_IMPL(int, 16)

/*
 * Each generator below is instantiated once per supported vector width:
 * 1, 2, 3, 4, 8 and 16 lanes.
 */
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(3)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)

ASYMM_MULT_IMPL(1)
ASYMM_MULT_IMPL(2)
ASYMM_MULT_IMPL(3)
ASYMM_MULT_IMPL(4)
ASYMM_MULT_IMPL(8)
ASYMM_MULT_IMPL(16)

ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(1)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(3)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)

ASYMM_SELECT_USING_MASK_IMPL(1)
ASYMM_SELECT_USING_MASK_IMPL(2)
ASYMM_SELECT_USING_MASK_IMPL(3)
ASYMM_SELECT_USING_MASK_IMPL(4)
ASYMM_SELECT_USING_MASK_IMPL(8)
ASYMM_SELECT_USING_MASK_IMPL(16)

ASYMM_MASK_IF_ZERO_IMPL(1)
ASYMM_MASK_IF_ZERO_IMPL(2)
ASYMM_MASK_IF_ZERO_IMPL(3)
ASYMM_MASK_IF_ZERO_IMPL(4)
ASYMM_MASK_IF_ZERO_IMPL(8)
ASYMM_MASK_IF_ZERO_IMPL(16)

ASYMM_MASK_IF_NON_ZERO_IMPL(1)
ASYMM_MASK_IF_NON_ZERO_IMPL(2)
ASYMM_MASK_IF_NON_ZERO_IMPL(3)
ASYMM_MASK_IF_NON_ZERO_IMPL(4)
ASYMM_MASK_IF_NON_ZERO_IMPL(8)
ASYMM_MASK_IF_NON_ZERO_IMPL(16)

EXP_BARREL_SHIFTER_IMPL(1)
EXP_BARREL_SHIFTER_IMPL(2)
EXP_BARREL_SHIFTER_IMPL(3)
EXP_BARREL_SHIFTER_IMPL(4)
EXP_BARREL_SHIFTER_IMPL(8)
EXP_BARREL_SHIFTER_IMPL(16)

ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(1)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(3)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)

ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(3)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16)

ASYMM_ROUNDING_HALF_SUM_IMPL(1)
ASYMM_ROUNDING_HALF_SUM_IMPL(2)
ASYMM_ROUNDING_HALF_SUM_IMPL(3)
ASYMM_ROUNDING_HALF_SUM_IMPL(4)
ASYMM_ROUNDING_HALF_SUM_IMPL(8)
ASYMM_ROUNDING_HALF_SUM_IMPL(16)

ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(1)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(3)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)

ASYMM_RESCALE_IMPL(1)
ASYMM_RESCALE_IMPL(2)
ASYMM_RESCALE_IMPL(3)
ASYMM_RESCALE_IMPL(4)
ASYMM_RESCALE_IMPL(8)
ASYMM_RESCALE_IMPL(16)

MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(3)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)

#endif

#ifndef ARM_COMPUTE_REPEAT_H
#define ARM_COMPUTE_REPEAT_H


/*
 * NOTE(review): ARM_COMPUTE_HELPER_H is also defined near the top of this
 * file. If that definition is still in effect here, the preprocessor skips
 * everything inside this guard -- confirm which copy is the live one.
 */
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H



/*
 * STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Stores rows 0 .. n-1 of a block. Row k is the variable BASENAME##k, written
 * with VSTORE(N0) at offset 0 from the address
 * (__global DATA_TYPE *)(PTR + k * STRIDE_Y + Z##k).
 * Row suffixes are single hexadecimal digits (0-9, A-F). Each macro expands
 * the previous one and then appends the store for its own last row.
 */
#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));



/*
 * CONVERT_STORE_ROW_n(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Same layout as STORE_ROW_n, but each row is first passed through
 * CONVERT_SAT(..., VEC_DATA_TYPE(DATA_TYPE, N0)) -- presumably a saturating
 * conversion to the destination vector type -- before being stored.
 */
#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

/*
 * The second parameter must be named DATA_TYPE, like every sibling macro:
 * the body refers to DATA_TYPE, so any other parameter name would drop the
 * caller's type argument and pick up an unrelated DATA_TYPE definition.
 */
#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE(N0) \
    (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));



/*
 * Store an M0 x N0 block by selecting STORE_ROW_<M0>. The *_STR indirection
 * lets M0 be macro-expanded before it is pasted onto the macro name.
 */
#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)



/* Converting variant of STORE_BLOCK: selects CONVERT_STORE_ROW_<M0>. */
#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)



/*
 * STORE_ROW_PARTIAL_n(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 *
 * Same row layout as STORE_ROW_n, but each row is written through
 * VSTORE_PARTIAL(N0, STORE_N0) -- presumably storing only STORE_N0 of the
 * N0 elements of the row vector; confirm against VSTORE_PARTIAL.
 */
#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));

#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));

#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));

#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));

#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));

#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));

#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));

#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));

#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));

#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));

#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));

#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));

#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));

#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));

#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));

#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
    VSTORE_PARTIAL(N0, STORE_N0) \
    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));



/* Store STORE_M0 rows through STORE_ROW_PARTIAL_<STORE_M0>, forwarding STORE_N0. */
#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

/*
 * Chooses at run time between a full store and a store reduced in rows
 * (PARTIAL_STORE_M0), in columns (PARTIAL_STORE_N0), or in both, depending
 * on PARTIAL_COND_Y and PARTIAL_COND_X.
 * The expansion is a bare if / else-if chain, not a single statement, so it
 * should be used as a statement of its own and not as the unbraced body of
 * another if.
 */
#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/* Column-only variant: full store unless PARTIAL_COND_X holds, then only PARTIAL_STORE_N0 columns. */
#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
    if(!(PARTIAL_COND_X)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }

/* Row-only variant: full store unless PARTIAL_COND_Y holds, then only PARTIAL_STORE_M0 rows. */
#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
    if(!(PARTIAL_COND_Y)) \
    { \
        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    } \
    else \
    { \
        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
    }


/*
 * STORE_BLOCK_BOUNDARY_AWARE is defined only when both PARTIAL_STORE_M0 and
 * PARTIAL_STORE_N0 are defined at build time. Their values select, at
 * preprocessing time, the cheapest store macro that still covers the
 * partial dimensions that can actually occur.
 */
#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)


#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)

#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)

#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)

#else

#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)

#endif

#endif


/*
 * First row handled by work-item row index y for M0-row blocks.
 * With PARTIAL_STORE_M0 defined, the start is moved back by
 * (M0 - PARTIAL_STORE_M0) % M0 rows and clamped at 0; otherwise it is y * M0.
 * NOTE(review): y and M0 are used unparenthesized inside the expansion, so
 * callers should pass simple tokens, not compound expressions.
 */
#if defined(PARTIAL_STORE_M0)

#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
#else
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
    ((uint)(y * M0))
#endif



/*
 * Stores one vector, basename##0, of vec_size elements, or only `leftover`
 * elements when `cond` holds. STRIDE_Y and Z are passed as 0, so the row
 * offset term is 0 and Z##0 becomes the literal 00.
 */
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond) \
    STORE_BLOCK_PARTIAL_IN_X(1, vec_size, data_type, basename, ptr, 0, 0, leftover, cond)


/* Enable optional OpenCL extensions only when both the build flag and the extension macro are defined. */
#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
#endif

#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
#endif

#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif

/* Identifiers for GPU architecture families. */
#define GPU_ARCH_MIDGARD 0x100
#define GPU_ARCH_BIFROST 0x200
#define GPU_ARCH_VALHALL 0x300


/* Pastes two tokens together without expanding them first. */
#define CONCAT(a, b) a##b


/* Expands to its argument; forces one extra round of macro expansion. */
#define EXPAND(x) x


#define CLAMP(x, min_val, max_val)
min(max(x, min_val), max_val) 5861*c217d954SCole Faust 5862*c217d954SCole Faust 5863*c217d954SCole Faust#define REV1(x) ((x)) 5864*c217d954SCole Faust#define REV2(x) ((x).s10) 5865*c217d954SCole Faust#define REV3(x) ((x).s210) 5866*c217d954SCole Faust#define REV4(x) ((x).s3210) 5867*c217d954SCole Faust#define REV8(x) ((x).s76543210) 5868*c217d954SCole Faust#define REV16(x) ((x).sFEDCBA9876543210) 5869*c217d954SCole Faust 5870*c217d954SCole Faust 5871*c217d954SCole Faust 5872*c217d954SCole Faust#define REVERSE_STR(x, s) REV##s((x)) 5873*c217d954SCole Faust#define REVERSE(x, s) REVERSE_STR(x, s) 5874*c217d954SCole Faust 5875*c217d954SCole Faust 5876*c217d954SCole Faust 5877*c217d954SCole Faust#define ROT1_0(x) ((x)) 5878*c217d954SCole Faust#define ROT1_1(x) ((x)) 5879*c217d954SCole Faust 5880*c217d954SCole Faust#define ROT2_0(x) ((x)) 5881*c217d954SCole Faust#define ROT2_1(x) ((x).s10) 5882*c217d954SCole Faust#define ROT2_2(x) ((x)) 5883*c217d954SCole Faust 5884*c217d954SCole Faust#define ROT3_0(x) ((x)) 5885*c217d954SCole Faust#define ROT3_1(x) ((x).s201) 5886*c217d954SCole Faust#define ROT3_2(x) ((x).s120) 5887*c217d954SCole Faust#define ROT3_3(x) ((x)) 5888*c217d954SCole Faust 5889*c217d954SCole Faust#define ROT4_0(x) ((x)) 5890*c217d954SCole Faust#define ROT4_1(x) ((x).s3012) 5891*c217d954SCole Faust#define ROT4_2(x) ((x).s2301) 5892*c217d954SCole Faust#define ROT4_3(x) ((x).s1230) 5893*c217d954SCole Faust#define ROT4_4(x) ((x)) 5894*c217d954SCole Faust 5895*c217d954SCole Faust#define ROT8_0(x) ((x)) 5896*c217d954SCole Faust#define ROT8_1(x) ((x).s70123456) 5897*c217d954SCole Faust#define ROT8_2(x) ((x).s67012345) 5898*c217d954SCole Faust#define ROT8_3(x) ((x).s56701234) 5899*c217d954SCole Faust#define ROT8_4(x) ((x).s45670123) 5900*c217d954SCole Faust#define ROT8_5(x) ((x).s34567012) 5901*c217d954SCole Faust#define ROT8_6(x) ((x).s23456701) 5902*c217d954SCole Faust#define ROT8_7(x) ((x).s12345670) 5903*c217d954SCole Faust#define ROT8_8(x) ((x)) 
/**
 * Lane rotations for 16-element vectors.
 *
 * ROT16_<n>(x) is a swizzle of @p x in which output lane i is taken from
 * input lane (i - n) mod 16. ROT16_0 and ROT16_16 expand to @p x unchanged.
 */
#define ROT16_0(x)  ((x))
#define ROT16_1(x)  ((x).sF0123456789ABCDE)
#define ROT16_2(x)  ((x).sEF0123456789ABCD)
#define ROT16_3(x)  ((x).sDEF0123456789ABC)
#define ROT16_4(x)  ((x).sCDEF0123456789AB)
#define ROT16_5(x)  ((x).sBCDEF0123456789A)
#define ROT16_6(x)  ((x).sABCDEF0123456789)
#define ROT16_7(x)  ((x).s9ABCDEF012345678)
#define ROT16_8(x)  ((x).s89ABCDEF01234567)
#define ROT16_9(x)  ((x).s789ABCDEF0123456)
#define ROT16_10(x) ((x).s6789ABCDEF012345)
#define ROT16_11(x) ((x).s56789ABCDEF01234)
#define ROT16_12(x) ((x).s456789ABCDEF0123)
#define ROT16_13(x) ((x).s3456789ABCDEF012)
#define ROT16_14(x) ((x).s23456789ABCDEF01)
#define ROT16_15(x) ((x).s123456789ABCDEF0)
#define ROT16_16(x) ((x))

/**
 * Rotate vector @p x of size @p s by @p n lanes.
 *
 * ROTATE_STR pastes its arguments into the name ROT<s>_<n> and applies that
 * macro to @p x. ROTATE is the entry point: the extra level of expansion lets
 * @p s and @p n be macros themselves, because they are fully expanded before
 * ROTATE_STR pastes them.
 *
 * @param x Vector expression to rotate
 * @param s Vector size; a ROT<s>_<n> macro must exist for it
 * @param n Rotation amount, 0..s
 */
#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
#define ROTATE(x, s, n)     ROTATE_STR(x, s, n)

/**
 * Lane-index vectors.
 *
 * V_OFFS<s>(dt) expands to the type name dt<s> in parentheses followed by the
 * parenthesised list 0, 1, ..., s-1.
 */
#define V_OFFS1(dt)  (dt##1)(0)
#define V_OFFS2(dt)  (dt##2)(0, 1)
#define V_OFFS3(dt)  (dt##3)(0, 1, 2)
#define V_OFFS4(dt)  (dt##4)(0, 1, 2, 3)
#define V_OFFS8(dt)  (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

/** Select V_OFFS<s> by pasting @p s; @p s must already be a literal size here. */
#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
/** Entry point for VEC_OFFS_STR: @p dt and @p s are macro-expanded before being pasted. */
#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)

/**
 * Build the name vload<size>.
 *
 * VLOAD is the entry point; the extra expansion level lets @p size be a macro.
 */
#define VLOAD_STR(size) vload##size
#define VLOAD(size)     VLOAD_STR(size)

/**
 * Build the name vload_partial_<size>_<load_size>, i.e. the loader that fills
 * the first @p load_size lanes of a @p size -lane vector.
 *
 * The resulting macro is invoked as (DATA, OFFSET, PTR). Combinations listed
 * in the table below as NO_LOAD (load_size of 0, or load_size > size) expand
 * to an empty block.
 */
#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
#define VLOAD_PARTIAL(size, load_size)     VLOAD_PARTIAL_STR(size, load_size)

/** Placeholder loader: accepts the usual three arguments and does nothing. */
#define NO_LOAD(data, offs, ptr) \
    {                            \
    }

/* ---- size 1 (scalar) ---------------------------------------------------- */
#define vload_partial_1_0 NO_LOAD
/*
 * A scalar has no .s0 component, so this entry cannot reuse vload_partial_1.
 * It was previously a bare alias of vload1, which takes only (OFFSET, PTR);
 * the three-argument call produced by VLOAD_PARTIAL(1, 1)(DATA, OFFSET, PTR)
 * therefore failed to preprocess. It now has the same (DATA, OFFSET, PTR)
 * statement form as every other table entry.
 */
#define vload_partial_1_1(DATA, OFFSET, PTR) \
    DATA = vload1(OFFSET, PTR);
#define vload_partial_1_2  NO_LOAD
#define vload_partial_1_3  NO_LOAD
#define vload_partial_1_4  NO_LOAD
#define vload_partial_1_5  NO_LOAD
#define vload_partial_1_6  NO_LOAD
#define vload_partial_1_7  NO_LOAD
#define vload_partial_1_8  NO_LOAD
#define vload_partial_1_9  NO_LOAD
#define vload_partial_1_10 NO_LOAD
#define vload_partial_1_11 NO_LOAD
#define vload_partial_1_12 NO_LOAD
#define vload_partial_1_13 NO_LOAD
#define vload_partial_1_14 NO_LOAD
#define vload_partial_1_15 NO_LOAD
#define vload_partial_1_16 NO_LOAD

/* ---- size 2 ------------------------------------------------------------- */
#define vload_partial_2_0  NO_LOAD
#define vload_partial_2_1  vload_partial_1
#define vload_partial_2_2  vload_partial_2
#define vload_partial_2_3  NO_LOAD
#define vload_partial_2_4  NO_LOAD
#define vload_partial_2_5  NO_LOAD
#define vload_partial_2_6  NO_LOAD
#define vload_partial_2_7  NO_LOAD
#define vload_partial_2_8  NO_LOAD
#define vload_partial_2_9  NO_LOAD
#define vload_partial_2_10 NO_LOAD
#define vload_partial_2_11 NO_LOAD
#define vload_partial_2_12 NO_LOAD
#define vload_partial_2_13 NO_LOAD
#define vload_partial_2_14 NO_LOAD
#define vload_partial_2_15 NO_LOAD
#define vload_partial_2_16 NO_LOAD

/* ---- size 3 ------------------------------------------------------------- */
#define vload_partial_3_0  NO_LOAD
#define vload_partial_3_1  vload_partial_1
#define vload_partial_3_2  vload_partial_2
#define vload_partial_3_3  vload_partial_3
#define vload_partial_3_4  NO_LOAD
#define vload_partial_3_5  NO_LOAD
#define vload_partial_3_6  NO_LOAD
#define vload_partial_3_7  NO_LOAD
#define vload_partial_3_8  NO_LOAD
#define vload_partial_3_9  NO_LOAD
#define vload_partial_3_10 NO_LOAD
#define vload_partial_3_11 NO_LOAD
#define vload_partial_3_12 NO_LOAD
#define vload_partial_3_13 NO_LOAD
#define vload_partial_3_14 NO_LOAD
#define vload_partial_3_15 NO_LOAD
#define vload_partial_3_16 NO_LOAD

/* ---- size 4 ------------------------------------------------------------- */
#define vload_partial_4_0  NO_LOAD
#define vload_partial_4_1  vload_partial_1
#define vload_partial_4_2  vload_partial_2
#define vload_partial_4_3  vload_partial_3
#define vload_partial_4_4  vload_partial_4
#define vload_partial_4_5  NO_LOAD
#define vload_partial_4_6  NO_LOAD
#define vload_partial_4_7  NO_LOAD
#define vload_partial_4_8  NO_LOAD
#define vload_partial_4_9  NO_LOAD
#define vload_partial_4_10 NO_LOAD
#define vload_partial_4_11 NO_LOAD
#define vload_partial_4_12 NO_LOAD
#define vload_partial_4_13 NO_LOAD
#define vload_partial_4_14 NO_LOAD
#define vload_partial_4_15 NO_LOAD
#define vload_partial_4_16 NO_LOAD

/* ---- size 8 ------------------------------------------------------------- */
#define vload_partial_8_0  NO_LOAD
#define vload_partial_8_1  vload_partial_1
#define vload_partial_8_2  vload_partial_2
#define vload_partial_8_3  vload_partial_3
#define vload_partial_8_4  vload_partial_4
#define vload_partial_8_5  vload_partial_5
#define vload_partial_8_6  vload_partial_6
#define vload_partial_8_7  vload_partial_7
#define vload_partial_8_8  vload_partial_8
#define vload_partial_8_9  NO_LOAD
#define vload_partial_8_10 NO_LOAD
#define vload_partial_8_11 NO_LOAD
#define vload_partial_8_12 NO_LOAD
#define vload_partial_8_13 NO_LOAD
#define vload_partial_8_14 NO_LOAD
#define vload_partial_8_15 NO_LOAD
#define vload_partial_8_16 NO_LOAD

/* ---- size 16 ------------------------------------------------------------ */
#define vload_partial_16_0  NO_LOAD
#define vload_partial_16_1  vload_partial_1
#define vload_partial_16_2  vload_partial_2
#define vload_partial_16_3  vload_partial_3
#define vload_partial_16_4  vload_partial_4
#define vload_partial_16_5  vload_partial_5
#define vload_partial_16_6  vload_partial_6
#define vload_partial_16_7  vload_partial_7
#define vload_partial_16_8  vload_partial_8
#define vload_partial_16_9  vload_partial_9
#define vload_partial_16_10 vload_partial_10
#define vload_partial_16_11 vload_partial_11
#define vload_partial_16_12 vload_partial_12
#define vload_partial_16_13 vload_partial_13
#define vload_partial_16_14 vload_partial_14
#define vload_partial_16_15 vload_partial_15
#define vload_partial_16_16 vload_partial_16

/**
 * vload_partial_<n>(DATA, OFFSET, PTR): fill the first n lanes of DATA.
 *
 * n = 1, 2, 3, 4, 8 are a single vload<n> assigned to the matching low
 * swizzle of DATA. The other sizes are composed from a 4- or 8-lane load
 * followed by a smaller load into the next lanes, with PTR advanced by 4 or 8.
 * Each expansion is one or more complete statements (trailing ';' included),
 * so multi-part sizes must not be used as the body of an unbraced if/else.
 *
 * NOTE(review): the composed forms pass the same OFFSET to both parts while
 * also advancing PTR; presumably they are only meant for OFFSET == 0 - confirm
 * before using a non-zero OFFSET.
 */
#define vload_partial_1(DATA, OFFSET, PTR) \
    DATA.s0 = vload1(OFFSET, PTR);

#define vload_partial_2(DATA, OFFSET, PTR) \
    DATA.s01 = vload2(OFFSET, PTR);

#define vload_partial_3(DATA, OFFSET, PTR) \
    DATA.s012 = vload3(OFFSET, PTR);

#define vload_partial_4(DATA, OFFSET, PTR) \
    DATA.s0123 = vload4(OFFSET, PTR);

#define vload_partial_5(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    DATA.s4 = vload1(OFFSET, PTR + 4);

#define vload_partial_6(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vload_partial_7(DATA, OFFSET, PTR)    \
    vload_partial_4(DATA.s0123, OFFSET, PTR); \
    vload_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vload_partial_8(DATA, OFFSET, PTR) \
    DATA.s01234567 = vload8(OFFSET, PTR);

#define vload_partial_9(DATA, OFFSET, PTR)        \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    DATA.s8 = vload1(OFFSET, PTR + 8);

#define vload_partial_10(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vload_partial_11(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vload_partial_12(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_4(DATA.s89AB, OFFSET, PTR + 8);

/* 13..15 hand the whole upper half to a 5/6/7-lane loader, which fills only its low lanes. */
#define vload_partial_13(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vload_partial_14(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);
/* 15 lanes: low 8 lanes, then a 7-lane load into the low lanes of the upper half. */
#define vload_partial_15(DATA, OFFSET, PTR)       \
    vload_partial_8(DATA.s01234567, OFFSET, PTR); \
    vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

/* 16 lanes: a single full-width load. */
#define vload_partial_16(DATA, OFFSET, PTR) \
    DATA = vload16(OFFSET, PTR);

/** Number of 4-lane image reads/writes that make up a vector of 4, 8 or 16 lanes. */
#define PIXEL_UNIT4  1
#define PIXEL_UNIT8  2
#define PIXEL_UNIT16 4

/**
 * Map a vector size (4, 8 or 16) to its PIXEL_UNIT<vec_size> value.
 *
 * The non-_STR form is the entry point: it lets @p vec_size be a macro.
 */
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size)     CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)

/**
 * read_image2d_<type>x<n>(img, x_coord, y_coord)
 *
 * Builds a 4*n-lane vector from n read_imagef / read_imageh calls at
 * (x_coord + k, y_coord) for k = 0..n-1.
 *
 * NOTE(review): each expansion ends with its own ';'. Callers outside this
 * span may depend on that, so it is kept as is - confirm before removing.
 */
#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
#define read_image2d_floatx2(img, x_coord, y_coord) \
    (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_floatx4(img, x_coord, y_coord) \
    (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
#define read_image2d_halfx2(img, x_coord, y_coord) \
    (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
#define read_image2d_halfx4(img, x_coord, y_coord) \
    (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
#endif

/**
 * write_image2d_<type>x<n>(img, x_coord, y_coord, values)
 *
 * Writes @p values as n consecutive 4-lane pixels at (x_coord + k, y_coord),
 * k = 0..n-1, taking lanes s0123, s4567, s89AB and sCDEF in that order. Each
 * expansion is a parenthesised comma expression followed by its own ';'.
 */
#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
#define write_image2d_floatx2(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_floatx4(img, x_coord, y_coord, values) \
    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
#define write_image2d_halfx2(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
#define write_image2d_halfx4(img, x_coord, y_coord, values) \
    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
#endif

/**
 * Dispatch to read_image2d_<data_type>x<n0>.
 *
 * READ_IMAGE2D is the entry point; it expands @p data_type and @p n0 before
 * READ_IMAGE2D_STR pastes them into the macro name.
 */
#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord)     READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)

/** Dispatch to write_image2d_<data_type>x<n0>; same two-level scheme as READ_IMAGE2D. */
#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values)     WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)

/** Build the name vstore<size>; VSTORE is the entry point and accepts a macro as @p size. */
#define VSTORE_STR(size) vstore##size
#define VSTORE(size)     VSTORE_STR(size)

/* One-lane spellings of the scalar types, so that <type>##1 pastes to a usable name. */
#define float1  float
#define half1   half
#define char1   char
#define uchar1  uchar
#define short1  short
#define ushort1 ushort
#define int1    int
#define uint1   uint
#define long1   long
#define ulong1  ulong
#define double1 double

/**
 * Scalar counterparts of vload<n> / vstore<n>: read or write the element at
 * PTR + OFFSET.
 *
 * Every argument and the whole expansion are parenthesised so that arguments
 * containing low-precedence operators (for example a ?: expression passed as
 * PTR) and uses inside larger expressions group as written.
 */
#define vload1(OFFSET, PTR)        (*((OFFSET) + (PTR)))
#define vstore1(DATA, OFFSET, PTR) (*((OFFSET) + (PTR)) = (DATA))

/**
 * Build the name vstore_partial_<size>_<store_size>, i.e. the macro that
 * stores the first @p store_size lanes of a @p size -lane vector. It is
 * invoked as (DATA, OFFSET, PTR).
 */
#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
#define VSTORE_PARTIAL(size, store_size)     VSTORE_PARTIAL_STR(size, store_size)

/** Placeholder store: accepts the usual three arguments and does nothing. */
#define NO_STORE(data, offs, ptr) \
    {                             \
    }

/* ---- size 1 (scalar), first entries ------------------------------------- */
#define vstore_partial_1_0 NO_STORE
#define vstore_partial_1_1 vstore1
#define vstore_partial_1_2 NO_STORE
#define vstore_partial_1_3 NO_STORE
/*
 * vstore_partial_<size>_<n> dispatch table (continued).
 *
 * Each entry names the macro that stores the first n lanes of a <size>-lane
 * vector. Entries with n == 0 or n > size map to NO_STORE.
 */

/* ---- size 1 (scalar), remaining entries --------------------------------- */
#define vstore_partial_1_4  NO_STORE
#define vstore_partial_1_5  NO_STORE
#define vstore_partial_1_6  NO_STORE
#define vstore_partial_1_7  NO_STORE
#define vstore_partial_1_8  NO_STORE
#define vstore_partial_1_9  NO_STORE
#define vstore_partial_1_10 NO_STORE
#define vstore_partial_1_11 NO_STORE
#define vstore_partial_1_12 NO_STORE
#define vstore_partial_1_13 NO_STORE
#define vstore_partial_1_14 NO_STORE
#define vstore_partial_1_15 NO_STORE
#define vstore_partial_1_16 NO_STORE

/* ---- size 2 ------------------------------------------------------------- */
#define vstore_partial_2_0  NO_STORE
#define vstore_partial_2_1  vstore_partial_1
#define vstore_partial_2_2  vstore_partial_2
#define vstore_partial_2_3  NO_STORE
#define vstore_partial_2_4  NO_STORE
#define vstore_partial_2_5  NO_STORE
#define vstore_partial_2_6  NO_STORE
#define vstore_partial_2_7  NO_STORE
#define vstore_partial_2_8  NO_STORE
#define vstore_partial_2_9  NO_STORE
#define vstore_partial_2_10 NO_STORE
#define vstore_partial_2_11 NO_STORE
#define vstore_partial_2_12 NO_STORE
#define vstore_partial_2_13 NO_STORE
#define vstore_partial_2_14 NO_STORE
#define vstore_partial_2_15 NO_STORE
#define vstore_partial_2_16 NO_STORE

/* ---- size 3 ------------------------------------------------------------- */
#define vstore_partial_3_0  NO_STORE
#define vstore_partial_3_1  vstore_partial_1
#define vstore_partial_3_2  vstore_partial_2
#define vstore_partial_3_3  vstore_partial_3
#define vstore_partial_3_4  NO_STORE
#define vstore_partial_3_5  NO_STORE
#define vstore_partial_3_6  NO_STORE
#define vstore_partial_3_7  NO_STORE
#define vstore_partial_3_8  NO_STORE
#define vstore_partial_3_9  NO_STORE
#define vstore_partial_3_10 NO_STORE
#define vstore_partial_3_11 NO_STORE
#define vstore_partial_3_12 NO_STORE
#define vstore_partial_3_13 NO_STORE
#define vstore_partial_3_14 NO_STORE
#define vstore_partial_3_15 NO_STORE
#define vstore_partial_3_16 NO_STORE

/* ---- size 4 ------------------------------------------------------------- */
#define vstore_partial_4_0  NO_STORE
#define vstore_partial_4_1  vstore_partial_1
#define vstore_partial_4_2  vstore_partial_2
#define vstore_partial_4_3  vstore_partial_3
#define vstore_partial_4_4  vstore_partial_4
#define vstore_partial_4_5  NO_STORE
#define vstore_partial_4_6  NO_STORE
#define vstore_partial_4_7  NO_STORE
#define vstore_partial_4_8  NO_STORE
#define vstore_partial_4_9  NO_STORE
#define vstore_partial_4_10 NO_STORE
#define vstore_partial_4_11 NO_STORE
#define vstore_partial_4_12 NO_STORE
#define vstore_partial_4_13 NO_STORE
#define vstore_partial_4_14 NO_STORE
#define vstore_partial_4_15 NO_STORE
#define vstore_partial_4_16 NO_STORE

/* ---- size 8 ------------------------------------------------------------- */
#define vstore_partial_8_0  NO_STORE
#define vstore_partial_8_1  vstore_partial_1
#define vstore_partial_8_2  vstore_partial_2
#define vstore_partial_8_3  vstore_partial_3
#define vstore_partial_8_4  vstore_partial_4
#define vstore_partial_8_5  vstore_partial_5
#define vstore_partial_8_6  vstore_partial_6
#define vstore_partial_8_7  vstore_partial_7
#define vstore_partial_8_8  vstore_partial_8
#define vstore_partial_8_9  NO_STORE
#define vstore_partial_8_10 NO_STORE
#define vstore_partial_8_11 NO_STORE
#define vstore_partial_8_12 NO_STORE
#define vstore_partial_8_13 NO_STORE
#define vstore_partial_8_14 NO_STORE
#define vstore_partial_8_15 NO_STORE
#define vstore_partial_8_16 NO_STORE

/* ---- size 16, entries 0..11 --------------------------------------------- */
#define vstore_partial_16_0  NO_STORE
#define vstore_partial_16_1  vstore_partial_1
#define vstore_partial_16_2  vstore_partial_2
#define vstore_partial_16_3  vstore_partial_3
#define vstore_partial_16_4  vstore_partial_4
#define vstore_partial_16_5  vstore_partial_5
#define vstore_partial_16_6  vstore_partial_6
#define vstore_partial_16_7  vstore_partial_7
#define vstore_partial_16_8  vstore_partial_8
#define vstore_partial_16_9  vstore_partial_9
#define vstore_partial_16_10 vstore_partial_10
#define vstore_partial_16_11 vstore_partial_11
/* ---- size 16, entries 12..16 (end of the vstore_partial dispatch table) -- */
#define vstore_partial_16_12 vstore_partial_12
#define vstore_partial_16_13 vstore_partial_13
#define vstore_partial_16_14 vstore_partial_14
#define vstore_partial_16_15 vstore_partial_15
#define vstore_partial_16_16 vstore_partial_16

/**
 * vstore_partial_<n>(DATA, OFFSET, PTR): store the first n lanes of DATA.
 *
 * n = 1, 2, 3, 4, 8, 16 are a single vstore<n> of the matching low swizzle.
 * The other sizes are composed from a 4- or 8-lane store followed by a smaller
 * store of the next lanes, with PTR advanced by 4 or 8. Each expansion is one
 * or more complete statements (trailing ';' included), so multi-part sizes
 * must not be used as the body of an unbraced if/else.
 *
 * Hex swizzle digits are written in upper case, matching the vload_partial_*
 * and write_image2d_* macros of this file.
 */
#define vstore_partial_1(DATA, OFFSET, PTR) \
    vstore1(DATA.s0, OFFSET, PTR);

#define vstore_partial_2(DATA, OFFSET, PTR) \
    vstore2(DATA.s01, OFFSET, PTR);

#define vstore_partial_3(DATA, OFFSET, PTR) \
    vstore3(DATA.s012, OFFSET, PTR);

#define vstore_partial_4(DATA, OFFSET, PTR) \
    vstore4(DATA.s0123, OFFSET, PTR);

#define vstore_partial_5(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore1(DATA.s4, OFFSET, PTR + 4);

#define vstore_partial_6(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);

#define vstore_partial_7(DATA, OFFSET, PTR)    \
    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);

#define vstore_partial_8(DATA, OFFSET, PTR) \
    vstore8(DATA.s01234567, OFFSET, PTR);

#define vstore_partial_9(DATA, OFFSET, PTR)        \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore1(DATA.s8, OFFSET, PTR + 8);

#define vstore_partial_10(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);

#define vstore_partial_11(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_3(DATA.s89A, OFFSET, PTR + 8);

#define vstore_partial_12(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_4(DATA.s89AB, OFFSET, PTR + 8);

/* 13..15 hand the whole upper half to a 5/6/7-lane store, which writes only its low lanes. */
#define vstore_partial_13(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_5(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_14(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_6(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_15(DATA, OFFSET, PTR)       \
    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
    vstore_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);

#define vstore_partial_16(DATA, OFFSET, PTR) \
    vstore16(DATA, OFFSET, PTR);

/*
 * convert_<float|half>[n]_sat: the _sat spellings for floating-point targets
 * are aliases of the plain conversions.
 */
#define convert_float_sat   convert_float
#define convert_float1_sat  convert_float
#define convert_float2_sat  convert_float2
#define convert_float3_sat  convert_float3
#define convert_float4_sat  convert_float4
#define convert_float8_sat  convert_float8
#define convert_float16_sat convert_float16
/*
 * NOTE(review): convert_half_sat maps to convert_float, whereas
 * convert_half1_sat and the vector forms map to convert_half*. This looks
 * like a copy-paste slip, but switching it to convert_half would change the
 * result type seen by callers, so it is left untouched - confirm the intent.
 */
#define convert_half_sat    convert_float
#define convert_half1_sat   convert_half
#define convert_half2_sat   convert_half2
#define convert_half3_sat   convert_half3
#define convert_half4_sat   convert_half4
#define convert_half8_sat   convert_half8
#define convert_half16_sat  convert_half16

/* One-lane spellings: convert_<type>1 is the scalar convert_<type>. */
#define convert_float1  convert_float
#define convert_half1   convert_half
#define convert_char1   convert_char
#define convert_uchar1  convert_uchar
#define convert_short1  convert_short
#define convert_ushort1 convert_ushort
#define convert_int1    convert_int
#define convert_uint1   convert_uint
#define convert_long1   convert_long
#define convert_ulong1  convert_ulong
#define convert_double1 convert_double

/*
 * One-lane saturating spellings. The convert_uchar<n>_sat entries for
 * n = 2, 3, 4, 8, 16 are defined as themselves, so they expand to the same
 * name unchanged.
 */
#define convert_char1_sat   convert_char_sat
#define convert_uchar1_sat  convert_uchar_sat
#define convert_uchar2_sat  convert_uchar2_sat
#define convert_uchar3_sat  convert_uchar3_sat
#define convert_uchar4_sat  convert_uchar4_sat
#define convert_uchar8_sat  convert_uchar8_sat
#define convert_uchar16_sat convert_uchar16_sat
#define convert_short1_sat  convert_short_sat
/** Saturating size-1 aliases: convert_<type>1_sat forwards to the scalar
 *  convert_<type>_sat name. */
#define convert_ushort1_sat convert_ushort_sat
#define convert_int1_sat convert_int_sat
#define convert_uint1_sat convert_uint_sat
#define convert_long1_sat convert_long_sat
#define convert_ulong1_sat convert_ulong_sat
#define convert_double1_sat convert_double_sat

/** Paste a type name and a size into one identifier (for example float + 4
 *  gives float4). The two-level form lets macro arguments expand before
 *  they are pasted. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/** Call convert_<type> on x. */
#define CONVERT_STR(x, type) (convert_##type((x)))
#define CONVERT(x, type) CONVERT_STR(x, type)

/** Call convert_<type>_sat on x. */
#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)

/** Call convert_<type>_sat_<round> on x. */
#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)

/** Per-type lookup used by SELECT_VEC_DATA_TYPE. Integer types map onto
 *  themselves; half maps onto short and float onto int. */
#define select_vec_dt_uchar(size) uchar##size
#define select_vec_dt_char(size) char##size
#define select_vec_dt_ushort(size) ushort##size
#define select_vec_dt_short(size) short##size
#define select_vec_dt_half(size) short##size
#define select_vec_dt_uint(size) uint##size
#define select_vec_dt_int(size) int##size
#define select_vec_dt_float(size) int##size
#define select_vec_dt_ulong(size) ulong##size
#define select_vec_dt_long(size) long##size

#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)

/** Per-type lookup used by SIGNED_INT_VEC_DATA_TYPE. Every entry yields a
 *  signed integer type name: char, short, int or long plus the size. */
#define signed_int_vec_dt_uchar(size) char##size
#define signed_int_vec_dt_char(size) char##size
#define signed_int_vec_dt_ushort(size) short##size
#define signed_int_vec_dt_short(size) short##size
#define signed_int_vec_dt_half(size) short##size
#define signed_int_vec_dt_uint(size) int##size
#define signed_int_vec_dt_int(size) int##size
#define signed_int_vec_dt_float(size) int##size
#define signed_int_vec_dt_ulong(size) long##size
#define signed_int_vec_dt_long(size) long##size

#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)

/** Sum of all lanes of x, for sizes 1, 2, 3, 4, 8 and 16.
 *
 *  The sum_reduce_<N> helpers expand to a flat, unparenthesised chain of
 *  '+' terms, so they must not be used directly inside a larger expression.
 *  Use SUM_REDUCE, which wraps the chain in parentheses.
 */
#define sum_reduce_1(x) (x)
#define sum_reduce_2(x) ((x).s0) + ((x).s1)
#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)

/* The outer parentheses make the result a single operand, so that
 * k * SUM_REDUCE(v, n) multiplies the whole sum and not just its first term.
 * The helpers above are untouched, which keeps the order of the additions. */
#define SUM_REDUCE_STR(x, size) (sum_reduce_##size(x))
#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)

/** Product of all lanes of x, for sizes 1, 2, 3, 4, 8 and 16.
 *
 *  As with the sum helpers, prod_reduce_<N> expands to a flat chain of '*'
 *  terms; use PROD_REDUCE, which wraps the chain in parentheses.
 */
#define prod_reduce_1(x) (x)
#define prod_reduce_2(x) ((x).s0) * ((x).s1)
#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)

/* Parenthesised for the same reason as SUM_REDUCE_STR: without it,
 * a / PROD_REDUCE(v, n) would divide by the first lane only. */
#define PROD_REDUCE_STR(x, size) (prod_reduce_##size(x))
#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)

/** Maximum over all lanes of x, built from nested max() calls. */
#define max_reduce_1(x) (x)
#define max_reduce_2(x) max(((x).s0), ((x).s1))
#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))

#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)

/** Parameter list for a 1D buffer called name: the data pointer, stride and
 *  step along x, and the byte offset of the first element. */
#define VECTOR_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_offset_first_element_in_bytes

/** Parameter list for a 2D buffer called name (stride/step for x and y). */
#define IMAGE_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_offset_first_element_in_bytes

/** Parameter list for a 3D buffer called name (stride/step for x, y, z). */
#define TENSOR3D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_offset_first_element_in_bytes

/** Parameter list for a 4D buffer called name (stride/step for x, y, z, w). */
#define TENSOR4D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

/** Parameter list for a 5D buffer called name (stride/step for x, y, z, w, v). */
#define TENSOR5D_DECLARATION(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_stride_v, \
    uint name##_step_v, \
    uint name##_offset_first_element_in_bytes

Faust#define CONVERT_TO_VECTOR_STRUCT(name) \ 6524*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) 6525*c217d954SCole Faust 6526*c217d954SCole Faust#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ 6527*c217d954SCole Faust update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) 6528*c217d954SCole Faust 6529*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT(name) \ 6530*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) 6531*c217d954SCole Faust 6532*c217d954SCole Faust#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ 6533*c217d954SCole Faust update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) 6534*c217d954SCole Faust 6535*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 6536*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 6537*c217d954SCole Faust 6538*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ 6539*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) 6540*c217d954SCole Faust 6541*c217d954SCole Faust#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ 6542*c217d954SCole Faust update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) 6543*c217d954SCole Faust 6544*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT(name) \ 6545*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, 
name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 6546*c217d954SCole Faust name##_stride_z, name##_step_z) 6547*c217d954SCole Faust 6548*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ 6549*c217d954SCole Faust update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0) 6550*c217d954SCole Faust 6551*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ 6552*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 6553*c217d954SCole Faust name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size) 6554*c217d954SCole Faust 6555*c217d954SCole Faust#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ 6556*c217d954SCole Faust update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size) 6557*c217d954SCole Faust 6558*c217d954SCole Faust#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \ 6559*c217d954SCole Faust tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ 6560*c217d954SCole Faust name##_stride_z, name##_step_z) 6561*c217d954SCole Faust 6562*c217d954SCole Faust 6563*c217d954SCole Fausttypedef struct Vector 6564*c217d954SCole Faust{ 6565*c217d954SCole Faust __global uchar *ptr; 6566*c217d954SCole Faust int offset_first_element_in_bytes; 6567*c217d954SCole Faust int stride_x; 6568*c217d954SCole Faust} Vector; 6569*c217d954SCole Faust 6570*c217d954SCole Faust 6571*c217d954SCole Fausttypedef struct Image 6572*c217d954SCole Faust{ 6573*c217d954SCole Faust __global uchar *ptr; 6574*c217d954SCole Faust int offset_first_element_in_bytes; 6575*c217d954SCole Faust int stride_x; 6576*c217d954SCole Faust int stride_y; 
6577*c217d954SCole Faust} Image; 6578*c217d954SCole Faust 6579*c217d954SCole Faust 6580*c217d954SCole Fausttypedef struct Tensor3D 6581*c217d954SCole Faust{ 6582*c217d954SCole Faust __global uchar *ptr; 6583*c217d954SCole Faust int offset_first_element_in_bytes; 6584*c217d954SCole Faust int stride_x; 6585*c217d954SCole Faust int stride_y; 6586*c217d954SCole Faust int stride_z; 6587*c217d954SCole Faust} Tensor3D; 6588*c217d954SCole Faust 6589*c217d954SCole Faust 6590*c217d954SCole Fausttypedef struct Tensor4D 6591*c217d954SCole Faust{ 6592*c217d954SCole Faust __global uchar *ptr; 6593*c217d954SCole Faust int offset_first_element_in_bytes; 6594*c217d954SCole Faust int stride_x; 6595*c217d954SCole Faust int stride_y; 6596*c217d954SCole Faust int stride_z; 6597*c217d954SCole Faust int stride_w; 6598*c217d954SCole Faust} Tensor4D; 6599*c217d954SCole Faust 6600*c217d954SCole Faust 6601*c217d954SCole Faustinline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x) 6602*c217d954SCole Faust{ 6603*c217d954SCole Faust Vector vector = 6604*c217d954SCole Faust { 6605*c217d954SCole Faust .ptr = ptr, 6606*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 6607*c217d954SCole Faust .stride_x = stride_x, 6608*c217d954SCole Faust }; 6609*c217d954SCole Faust vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; 6610*c217d954SCole Faust return vector; 6611*c217d954SCole Faust} 6612*c217d954SCole Faust 6613*c217d954SCole Faust 6614*c217d954SCole Faustinline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) 6615*c217d954SCole Faust{ 6616*c217d954SCole Faust Image img = 6617*c217d954SCole Faust { 6618*c217d954SCole Faust .ptr = ptr, 6619*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 6620*c217d954SCole Faust .stride_x = stride_x, 
6621*c217d954SCole Faust .stride_y = stride_y 6622*c217d954SCole Faust }; 6623*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; 6624*c217d954SCole Faust return img; 6625*c217d954SCole Faust} 6626*c217d954SCole Faust 6627*c217d954SCole Faust 6628*c217d954SCole Faustinline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 6629*c217d954SCole Faust{ 6630*c217d954SCole Faust Image img = 6631*c217d954SCole Faust { 6632*c217d954SCole Faust .ptr = ptr, 6633*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 6634*c217d954SCole Faust .stride_x = stride_x, 6635*c217d954SCole Faust .stride_y = stride_y 6636*c217d954SCole Faust }; 6637*c217d954SCole Faust img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 6638*c217d954SCole Faust return img; 6639*c217d954SCole Faust} 6640*c217d954SCole Faust 6641*c217d954SCole Faust 6642*c217d954SCole Faustinline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 6643*c217d954SCole Faust{ 6644*c217d954SCole Faust Tensor3D tensor = 6645*c217d954SCole Faust { 6646*c217d954SCole Faust .ptr = ptr, 6647*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 6648*c217d954SCole Faust .stride_x = stride_x, 6649*c217d954SCole Faust .stride_y = stride_y, 6650*c217d954SCole Faust .stride_z = stride_z 6651*c217d954SCole Faust }; 6652*c217d954SCole Faust tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; 6653*c217d954SCole Faust return tensor; 6654*c217d954SCole Faust} 6655*c217d954SCole Faust 
6656*c217d954SCole Faust 6657*c217d954SCole Faustinline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) 6658*c217d954SCole Faust{ 6659*c217d954SCole Faust Tensor3D tensor = 6660*c217d954SCole Faust { 6661*c217d954SCole Faust .ptr = ptr, 6662*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 6663*c217d954SCole Faust .stride_x = stride_x, 6664*c217d954SCole Faust .stride_y = stride_y, 6665*c217d954SCole Faust .stride_z = stride_z 6666*c217d954SCole Faust }; 6667*c217d954SCole Faust return tensor; 6668*c217d954SCole Faust} 6669*c217d954SCole Faust 6670*c217d954SCole Faustinline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w, 6671*c217d954SCole Faust uint step_w, 6672*c217d954SCole Faust uint mod_size) 6673*c217d954SCole Faust{ 6674*c217d954SCole Faust Tensor4D tensor = 6675*c217d954SCole Faust { 6676*c217d954SCole Faust .ptr = ptr, 6677*c217d954SCole Faust .offset_first_element_in_bytes = offset_first_element_in_bytes, 6678*c217d954SCole Faust .stride_x = stride_x, 6679*c217d954SCole Faust .stride_y = stride_y, 6680*c217d954SCole Faust .stride_z = stride_z, 6681*c217d954SCole Faust .stride_w = stride_w 6682*c217d954SCole Faust }; 6683*c217d954SCole Faust 6684*c217d954SCole Faust tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w; 6685*c217d954SCole Faust return tensor; 6686*c217d954SCole Faust} 6687*c217d954SCole Faust 6688*c217d954SCole Faust 6689*c217d954SCole Faustinline __global const uchar *vector_offset(const Vector *vec, int x) 6690*c217d954SCole Faust{ 6691*c217d954SCole Faust return vec->ptr + x * vec->stride_x; 
6692*c217d954SCole Faust} 6693*c217d954SCole Faust 6694*c217d954SCole Faust 6695*c217d954SCole Faustinline __global uchar *offset(const Image *img, int x, int y) 6696*c217d954SCole Faust{ 6697*c217d954SCole Faust return img->ptr + x * img->stride_x + y * img->stride_y; 6698*c217d954SCole Faust} 6699*c217d954SCole Faust 6700*c217d954SCole Faust 6701*c217d954SCole Faustinline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) 6702*c217d954SCole Faust{ 6703*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; 6704*c217d954SCole Faust} 6705*c217d954SCole Faust 6706*c217d954SCole Faust 6707*c217d954SCole Faustinline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) 6708*c217d954SCole Faust{ 6709*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w; 6710*c217d954SCole Faust} 6711*c217d954SCole Faust 6712*c217d954SCole Faust 6713*c217d954SCole Faustinline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index) 6714*c217d954SCole Faust{ 6715*c217d954SCole Faust uint num_elements = width * height; 6716*c217d954SCole Faust 6717*c217d954SCole Faust const uint z = index / num_elements; 6718*c217d954SCole Faust 6719*c217d954SCole Faust index %= num_elements; 6720*c217d954SCole Faust 6721*c217d954SCole Faust const uint y = index / width; 6722*c217d954SCole Faust 6723*c217d954SCole Faust index %= width; 6724*c217d954SCole Faust 6725*c217d954SCole Faust const uint x = index; 6726*c217d954SCole Faust 6727*c217d954SCole Faust return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes; 6728*c217d954SCole Faust} 6729*c217d954SCole Faust 6730*c217d954SCole Faust#endif 6731*c217d954SCole Faust 6732*c217d954SCole Faust 6733*c217d954SCole Faust 
6734*c217d954SCole Faust#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C) 6735*c217d954SCole Faust#define REPEAT_3_2(P_X, P_A, P_B, P_C) \ 6736*c217d954SCole Faust P_X##_DEF(1, P_A, P_B, P_C); \ 6737*c217d954SCole Faust REPEAT_3_1(P_X, P_A, P_B, P_C) 6738*c217d954SCole Faust#define REPEAT_3_3(P_X, P_A, P_B, P_C) \ 6739*c217d954SCole Faust P_X##_DEF(2, P_A, P_B, P_C); \ 6740*c217d954SCole Faust REPEAT_3_2(P_X, P_A, P_B, P_C) 6741*c217d954SCole Faust#define REPEAT_3_4(P_X, P_A, P_B, P_C) \ 6742*c217d954SCole Faust P_X##_DEF(3, P_A, P_B, P_C); \ 6743*c217d954SCole Faust REPEAT_3_3(P_X, P_A, P_B, P_C) 6744*c217d954SCole Faust#define REPEAT_3_5(P_X, P_A, P_B, P_C) \ 6745*c217d954SCole Faust P_X##_DEF(4, P_A, P_B, P_C); \ 6746*c217d954SCole Faust REPEAT_3_4(P_X, P_A, P_B, P_C) 6747*c217d954SCole Faust#define REPEAT_3_6(P_X, P_A, P_B, P_C) \ 6748*c217d954SCole Faust P_X##_DEF(5, P_A, P_B, P_C); \ 6749*c217d954SCole Faust REPEAT_3_5(P_X, P_A, P_B, P_C) 6750*c217d954SCole Faust#define REPEAT_3_7(P_X, P_A, P_B, P_C) \ 6751*c217d954SCole Faust P_X##_DEF(6, P_A, P_B, P_C); \ 6752*c217d954SCole Faust REPEAT_3_6(P_X, P_A, P_B, P_C) 6753*c217d954SCole Faust#define REPEAT_3_8(P_X, P_A, P_B, P_C) \ 6754*c217d954SCole Faust P_X##_DEF(7, P_A, P_B, P_C); \ 6755*c217d954SCole Faust REPEAT_3_7(P_X, P_A, P_B, P_C) 6756*c217d954SCole Faust#define REPEAT_3_9(P_X, P_A, P_B, P_C) \ 6757*c217d954SCole Faust P_X##_DEF(8, P_A, P_B, P_C); \ 6758*c217d954SCole Faust REPEAT_3_8(P_X, P_A, P_B, P_C) 6759*c217d954SCole Faust#define REPEAT_3_10(P_X, P_A, P_B, P_C) \ 6760*c217d954SCole Faust P_X##_DEF(9, P_A, P_B, P_C); \ 6761*c217d954SCole Faust REPEAT_3_9(P_X, P_A, P_B, P_C) 6762*c217d954SCole Faust#define REPEAT_3_11(P_X, P_A, P_B, P_C) \ 6763*c217d954SCole Faust P_X##_DEF(A, P_A, P_B, P_C); \ 6764*c217d954SCole Faust REPEAT_3_10(P_X, P_A, P_B, P_C) 6765*c217d954SCole Faust#define REPEAT_3_12(P_X, P_A, P_B, P_C) \ 6766*c217d954SCole Faust P_X##_DEF(B, P_A, P_B, P_C); \ 
6767*c217d954SCole Faust REPEAT_3_11(P_X, P_A, P_B, P_C) 6768*c217d954SCole Faust#define REPEAT_3_13(P_X, P_A, P_B, P_C) \ 6769*c217d954SCole Faust P_X##_DEF(C, P_A, P_B, P_C); \ 6770*c217d954SCole Faust REPEAT_3_12(P_X, P_A, P_B, P_C) 6771*c217d954SCole Faust#define REPEAT_3_14(P_X, P_A, P_B, P_C) \ 6772*c217d954SCole Faust P_X##_DEF(D, P_A, P_B, P_C); \ 6773*c217d954SCole Faust REPEAT_3_13(P_X, P_A, P_B, P_C) 6774*c217d954SCole Faust#define REPEAT_3_15(P_X, P_A, P_B, P_C) \ 6775*c217d954SCole Faust P_X##_DEF(E, P_A, P_B, P_C); \ 6776*c217d954SCole Faust REPEAT_3_14(P_X, P_A, P_B, P_C) 6777*c217d954SCole Faust#define REPEAT_3_16(P_X, P_A, P_B, P_C) \ 6778*c217d954SCole Faust P_X##_DEF(F, P_A, P_B, P_C); \ 6779*c217d954SCole Faust REPEAT_3_15(P_X, P_A, P_B, P_C) 6780*c217d954SCole Faust 6781*c217d954SCole Faust#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) 6782*c217d954SCole Faust#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) 6783*c217d954SCole Faust 6784*c217d954SCole Faust 6785*c217d954SCole Faust#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D) 6786*c217d954SCole Faust#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \ 6787*c217d954SCole Faust P_X##_DEF(1, P_A, P_B, P_C, P_D); \ 6788*c217d954SCole Faust REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) 6789*c217d954SCole Faust#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \ 6790*c217d954SCole Faust P_X##_DEF(2, P_A, P_B, P_C, P_D); \ 6791*c217d954SCole Faust REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) 6792*c217d954SCole Faust#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \ 6793*c217d954SCole Faust P_X##_DEF(3, P_A, P_B, P_C, P_D); \ 6794*c217d954SCole Faust REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) 6795*c217d954SCole Faust#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \ 6796*c217d954SCole Faust P_X##_DEF(4, P_A, P_B, P_C, P_D); \ 6797*c217d954SCole Faust REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) 6798*c217d954SCole Faust#define REPEAT_4_6(P_X, P_A, 
P_B, P_C, P_D) \ 6799*c217d954SCole Faust P_X##_DEF(5, P_A, P_B, P_C, P_D); \ 6800*c217d954SCole Faust REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) 6801*c217d954SCole Faust#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \ 6802*c217d954SCole Faust P_X##_DEF(6, P_A, P_B, P_C, P_D); \ 6803*c217d954SCole Faust REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) 6804*c217d954SCole Faust#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \ 6805*c217d954SCole Faust P_X##_DEF(7, P_A, P_B, P_C, P_D); \ 6806*c217d954SCole Faust REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) 6807*c217d954SCole Faust#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \ 6808*c217d954SCole Faust P_X##_DEF(8, P_A, P_B, P_C, P_D); \ 6809*c217d954SCole Faust REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) 6810*c217d954SCole Faust#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \ 6811*c217d954SCole Faust P_X##_DEF(9, P_A, P_B, P_C, P_D); \ 6812*c217d954SCole Faust REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) 6813*c217d954SCole Faust#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \ 6814*c217d954SCole Faust P_X##_DEF(A, P_A, P_B, P_C, P_D); \ 6815*c217d954SCole Faust REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) 6816*c217d954SCole Faust#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \ 6817*c217d954SCole Faust P_X##_DEF(B, P_A, P_B, P_C, P_D); \ 6818*c217d954SCole Faust REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) 6819*c217d954SCole Faust#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \ 6820*c217d954SCole Faust P_X##_DEF(C, P_A, P_B, P_C, P_D); \ 6821*c217d954SCole Faust REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) 6822*c217d954SCole Faust#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \ 6823*c217d954SCole Faust P_X##_DEF(D, P_A, P_B, P_C, P_D); \ 6824*c217d954SCole Faust REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) 6825*c217d954SCole Faust#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \ 6826*c217d954SCole Faust P_X##_DEF(E, P_A, P_B, P_C, P_D); \ 6827*c217d954SCole Faust REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) 6828*c217d954SCole Faust#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \ 6829*c217d954SCole Faust P_X##_DEF(F, P_A, P_B, 
P_C, P_D); \ 6830*c217d954SCole Faust REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) 6831*c217d954SCole Faust 6832*c217d954SCole Faust#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) 6833*c217d954SCole Faust#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) 6834*c217d954SCole Faust 6835*c217d954SCole Faust 6836*c217d954SCole Faust#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL 6837*c217d954SCole Faust#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL) 6838*c217d954SCole Faust 6839*c217d954SCole Faust 6840*c217d954SCole Faust#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT) 6841*c217d954SCole Faust#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT) 6842*c217d954SCole Faust 6843*c217d954SCole Faust 6844*c217d954SCole Faust#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT) 6845*c217d954SCole Faust#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT) 6846*c217d954SCole Faust 6847*c217d954SCole Faust 6848*c217d954SCole Faust#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL 6849*c217d954SCole Faust#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL) 6850*c217d954SCole Faust 6851*c217d954SCole Faust 6852*c217d954SCole Faust#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL 6853*c217d954SCole Faust#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL) 6854*c217d954SCole Faust 6855*c217d954SCole Faust 6856*c217d954SCole Faust#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += 
VEC 6857*c217d954SCole Faust#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC) 6858*c217d954SCole Faust 6859*c217d954SCole Faust 6860*c217d954SCole Faust#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID 6861*c217d954SCole Faust#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B) 6862*c217d954SCole Faust 6863*c217d954SCole Faust 6864*c217d954SCole Faust#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL) 6865*c217d954SCole Faust#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL) 6866*c217d954SCole Faust 6867*c217d954SCole Faust 6868*c217d954SCole Faust#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL) 6869*c217d954SCole Faust#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL) 6870*c217d954SCole Faust 6871*c217d954SCole Faust 6872*c217d954SCole Faust#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 6873*c217d954SCole Faust#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) 6874*c217d954SCole Faust 6875*c217d954SCole Faust 6876*c217d954SCole Faust#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE) 6877*c217d954SCole Faust#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT) 6878*c217d954SCole Faust 6879*c217d954SCole Faust 6880*c217d954SCole Faust#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, 
SIZE, VAR, RES_MUL, RES_SHIFT) \ 6881*c217d954SCole Faust ({ \ 6882*c217d954SCole Faust VEC_DATA_TYPE(int, N0) \ 6883*c217d954SCole Faust VAR##ID_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 6884*c217d954SCole Faust VEC_DATA_TYPE(int, N0) \ 6885*c217d954SCole Faust VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \ 6886*c217d954SCole Faust VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \ 6887*c217d954SCole Faust }) 6888*c217d954SCole Faust#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT) 6889*c217d954SCole Faust 6890*c217d954SCole Faust#endif 6891*c217d954SCole Faust 6892*c217d954SCole Faust#ifndef SRC_CORE_CL_CL_KERNELS_TILE_HELPERS 6893*c217d954SCole Faust#define SRC_CORE_CL_CL_KERNELS_TILE_HELPERS 6894*c217d954SCole Faust 6895*c217d954SCole Faust 6896*c217d954SCole Faust 6897*c217d954SCole Faust 6898*c217d954SCole Faust#define TILE_VECTOR_SIZE1 1 6899*c217d954SCole Faust#define TILE_VECTOR_SIZE2 2 6900*c217d954SCole Faust#define TILE_VECTOR_SIZE3 3 6901*c217d954SCole Faust#define TILE_VECTOR_SIZE4 4 6902*c217d954SCole Faust#define TILE_VECTOR_SIZE5 8 6903*c217d954SCole Faust#define TILE_VECTOR_SIZE6 8 6904*c217d954SCole Faust#define TILE_VECTOR_SIZE7 8 6905*c217d954SCole Faust#define TILE_VECTOR_SIZE8 8 6906*c217d954SCole Faust#define TILE_VECTOR_SIZE9 16 6907*c217d954SCole Faust#define TILE_VECTOR_SIZE10 16 6908*c217d954SCole Faust#define TILE_VECTOR_SIZE11 16 6909*c217d954SCole Faust#define TILE_VECTOR_SIZE12 16 6910*c217d954SCole Faust#define TILE_VECTOR_SIZE13 16 6911*c217d954SCole Faust#define TILE_VECTOR_SIZE14 16 6912*c217d954SCole Faust#define TILE_VECTOR_SIZE15 16 6913*c217d954SCole Faust#define TILE_VECTOR_SIZE16 16 6914*c217d954SCole Faust 6915*c217d954SCole Faust#define TILE_VECTOR_TYPE1(DATA_TYPE) 
DATA_TYPE##1 6916*c217d954SCole Faust#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2 6917*c217d954SCole Faust#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3 6918*c217d954SCole Faust#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4 6919*c217d954SCole Faust#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8 6920*c217d954SCole Faust#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8 6921*c217d954SCole Faust#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8 6922*c217d954SCole Faust#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8 6923*c217d954SCole Faust#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16 6924*c217d954SCole Faust#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16 6925*c217d954SCole Faust#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16 6926*c217d954SCole Faust#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16 6927*c217d954SCole Faust#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16 6928*c217d954SCole Faust#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16 6929*c217d954SCole Faust#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16 6930*c217d954SCole Faust#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16 6931*c217d954SCole Faust 6932*c217d954SCole Faust 6933*c217d954SCole Faust#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME) 6934*c217d954SCole Faust#define TILE_STR(DATA_TYPE, H, W, BASENAME) \ 6935*c217d954SCole Faust union { \ 6936*c217d954SCole Faust DATA_TYPE s[TILE_VECTOR_SIZE##W]; \ 6937*c217d954SCole Faust TILE_VECTOR_TYPE##W(DATA_TYPE) v; \ 6938*c217d954SCole Faust } BASENAME[H] 6939*c217d954SCole Faust 6940*c217d954SCole Faust#define TENSOR4D_IMAGE(name) \ 6941*c217d954SCole Faust __read_only image2d_t name##_img, \ 6942*c217d954SCole Faust __global uchar *name##_ptr, \ 6943*c217d954SCole Faust uint name##_stride_x, \ 6944*c217d954SCole Faust uint name##_step_x, \ 6945*c217d954SCole Faust uint name##_stride_y, \ 6946*c217d954SCole Faust uint name##_step_y, \ 6947*c217d954SCole Faust uint name##_stride_z, \ 
6948*c217d954SCole Faust uint name##_step_z, \ 6949*c217d954SCole Faust uint name##_stride_w, \ 6950*c217d954SCole Faust uint name##_step_w, \ 6951*c217d954SCole Faust uint name##_offset_first_element_in_bytes 6952*c217d954SCole Faust 6953*c217d954SCole Faust#define TENSOR4D_BUFFER(name) \ 6954*c217d954SCole Faust __global uchar *name##_ptr, \ 6955*c217d954SCole Faust uint name##_stride_x, \ 6956*c217d954SCole Faust uint name##_step_x, \ 6957*c217d954SCole Faust uint name##_stride_y, \ 6958*c217d954SCole Faust uint name##_step_y, \ 6959*c217d954SCole Faust uint name##_stride_z, \ 6960*c217d954SCole Faust uint name##_step_z, \ 6961*c217d954SCole Faust uint name##_stride_w, \ 6962*c217d954SCole Faust uint name##_step_w, \ 6963*c217d954SCole Faust uint name##_offset_first_element_in_bytes 6964*c217d954SCole Faust 6965*c217d954SCole Faust#define TENSOR4D_STR(name, type) TENSOR4D_##type(name) 6966*c217d954SCole Faust#define TENSOR4D(name, type) TENSOR4D_STR(name, type) 6967*c217d954SCole Faust 6968*c217d954SCole Faust#define TENSOR4D_T_IMAGE(name) \ 6969*c217d954SCole Faust __read_only image2d_t name##_img, \ 6970*c217d954SCole Faust __global uchar *name##_ptr, \ 6971*c217d954SCole Faust uint name##_stride_y, \ 6972*c217d954SCole Faust uint name##_stride_z, \ 6973*c217d954SCole Faust uint name##_stride_w, \ 6974*c217d954SCole Faust uint name##_c, \ 6975*c217d954SCole Faust uint name##_w, \ 6976*c217d954SCole Faust uint name##_h, \ 6977*c217d954SCole Faust uint name##_n, \ 6978*c217d954SCole Faust uint name##_offset_first_element_in_bytes 6979*c217d954SCole Faust 6980*c217d954SCole Faust#define TENSOR4D_T_BUFFER(name) \ 6981*c217d954SCole Faust __global uchar *name##_ptr, \ 6982*c217d954SCole Faust uint name##_stride_y, \ 6983*c217d954SCole Faust uint name##_stride_z, \ 6984*c217d954SCole Faust uint name##_stride_w, \ 6985*c217d954SCole Faust uint name##_c, \ 6986*c217d954SCole Faust uint name##_w, \ 6987*c217d954SCole Faust uint name##_h, \ 6988*c217d954SCole Faust uint 
name##_n, \ 6989*c217d954SCole Faust uint name##_offset_first_element_in_bytes 6990*c217d954SCole Faust 6991*c217d954SCole Faust#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name) 6992*c217d954SCole Faust 6993*c217d954SCole Faust 6994*c217d954SCole Faust#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type) 6995*c217d954SCole Faust 6996*c217d954SCole Faust#define TENSOR4D_RO_T_IMAGE(name) \ 6997*c217d954SCole Faust __read_only image2d_t name##_img, \ 6998*c217d954SCole Faust TENSOR4D_T_BUFFER(name) 6999*c217d954SCole Faust 7000*c217d954SCole Faust#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) 7001*c217d954SCole Faust 7002*c217d954SCole Faust#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name) 7003*c217d954SCole Faust 7004*c217d954SCole Faust 7005*c217d954SCole Faust#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type) 7006*c217d954SCole Faust 7007*c217d954SCole Faust#define TENSOR4D_WO_T_IMAGE(name) \ 7008*c217d954SCole Faust __write_only image2d_t name##_img, \ 7009*c217d954SCole Faust TENSOR4D_T_BUFFER(name) 7010*c217d954SCole Faust 7011*c217d954SCole Faust#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) 7012*c217d954SCole Faust 7013*c217d954SCole Faust#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name) 7014*c217d954SCole Faust 7015*c217d954SCole Faust 7016*c217d954SCole Faust#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type) 7017*c217d954SCole Faust 7018*c217d954SCole Faust#define TENSOR3D_T_IMAGE(name) \ 7019*c217d954SCole Faust __read_only image2d_t name##_img, \ 7020*c217d954SCole Faust __global uchar *name##_ptr, \ 7021*c217d954SCole Faust uint name##_stride_y, \ 7022*c217d954SCole Faust uint name##_stride_z, \ 7023*c217d954SCole Faust uint name##_w, \ 7024*c217d954SCole Faust uint name##_h, \ 7025*c217d954SCole Faust uint name##_n, \ 7026*c217d954SCole Faust uint name##_offset_first_element_in_bytes 7027*c217d954SCole Faust 7028*c217d954SCole Faust#define TENSOR3D_T_BUFFER(name) 
\ 7029*c217d954SCole Faust __global uchar *name##_ptr, \ 7030*c217d954SCole Faust uint name##_stride_y, \ 7031*c217d954SCole Faust uint name##_stride_z, \ 7032*c217d954SCole Faust uint name##_w, \ 7033*c217d954SCole Faust uint name##_h, \ 7034*c217d954SCole Faust uint name##_n, \ 7035*c217d954SCole Faust uint name##_offset_first_element_in_bytes 7036*c217d954SCole Faust 7037*c217d954SCole Faust#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name) 7038*c217d954SCole Faust#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type) 7039*c217d954SCole Faust 7040*c217d954SCole Faust#if !defined(UNROLL_WITH_PRAGMA) 7041*c217d954SCole Faust#define UNROLL_INCR(idx, step, macro) idx += (step); (macro) 7042*c217d954SCole Faust 7043*c217d954SCole Faust#define LOOP_UNROLLING_1(idx, step, macro) (macro) 7044*c217d954SCole Faust#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro) 7045*c217d954SCole Faust#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro) 7046*c217d954SCole Faust#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro) 7047*c217d954SCole Faust#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro) 7048*c217d954SCole Faust#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro) 7049*c217d954SCole Faust#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro) 7050*c217d954SCole Faust#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro) 7051*c217d954SCole Faust#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro) 7052*c217d954SCole Faust#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, 
macro) 7053*c217d954SCole Faust#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro) 7054*c217d954SCole Faust#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro) 7055*c217d954SCole Faust#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro) 7056*c217d954SCole Faust#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro) 7057*c217d954SCole Faust#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro) 7058*c217d954SCole Faust#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro) 7059*c217d954SCole Faust#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro) 7060*c217d954SCole Faust#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro) 7061*c217d954SCole Faust#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro) 7062*c217d954SCole Faust#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro) 7063*c217d954SCole Faust#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro) 7064*c217d954SCole Faust#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro) 7065*c217d954SCole Faust#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro) 7066*c217d954SCole Faust#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro) 7067*c217d954SCole Faust#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, 
macro); UNROLL_INCR(idx, step, macro) 7068*c217d954SCole Faust#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro) 7069*c217d954SCole Faust#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro) 7070*c217d954SCole Faust#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro) 7071*c217d954SCole Faust#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro) 7072*c217d954SCole Faust#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro) 7073*c217d954SCole Faust#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro) 7074*c217d954SCole Faust#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro) 7075*c217d954SCole Faust#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro) 7076*c217d954SCole Faust#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro) 7077*c217d954SCole Faust#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro) 7078*c217d954SCole Faust#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro) 7079*c217d954SCole Faust#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro) 7080*c217d954SCole Faust#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro) 7081*c217d954SCole Faust#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro) 7082*c217d954SCole Faust#define LOOP_UNROLLING_40(idx, step, macro) 
LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro) 7083*c217d954SCole Faust#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro) 7084*c217d954SCole Faust#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro) 7085*c217d954SCole Faust#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro) 7086*c217d954SCole Faust#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro) 7087*c217d954SCole Faust#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro) 7088*c217d954SCole Faust#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro) 7089*c217d954SCole Faust#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro) 7090*c217d954SCole Faust#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro) 7091*c217d954SCole Faust#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro) 7092*c217d954SCole Faust#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro) 7093*c217d954SCole Faust#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro) 7094*c217d954SCole Faust#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro) 7095*c217d954SCole Faust#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro) 7096*c217d954SCole Faust#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro) 7097*c217d954SCole Faust#define 
LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro) 7098*c217d954SCole Faust#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro) 7099*c217d954SCole Faust#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro) 7100*c217d954SCole Faust#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro) 7101*c217d954SCole Faust#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro) 7102*c217d954SCole Faust#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro) 7103*c217d954SCole Faust#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro) 7104*c217d954SCole Faust#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro) 7105*c217d954SCole Faust#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro) 7106*c217d954SCole Faust#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro) 7107*c217d954SCole Faust#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro) 7108*c217d954SCole Faust#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro) 7109*c217d954SCole Faust#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro) 7110*c217d954SCole Faust#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro) 7111*c217d954SCole Faust#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro) 
/* Manual-unrolling ladder, rungs 70-113.
 * Each LOOP_UNROLLING_<n> expands to the previous rung followed by a single
 * UNROLL_INCR(idx, step, macro), so rung n relies on rung n-1 being defined. */
#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
/* Manual-unrolling ladder, rungs 114-128 (the highest supported trip count).
 * Each rung expands to the previous rung followed by one UNROLL_INCR step. */
#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)

/* Manual variant: declares `type idx = start` in a fresh scope and pastes `num`
 * onto LOOP_UNROLLING_ to pick the rung, so the body is emitted exactly `num`
 * times with idx = start, start + step, ... */
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    { \
        type idx = start; \
        LOOP_UNROLLING_##num(idx, step, macro); \
    }
#else
/* Pragma variant: a real for-loop marked with the unroll pragma.
 * The upper bound is start + num * step so that, like the manual variant, the
 * body runs `num` times even when `start` is not zero (the previous bound of
 * num * step silently dropped iterations for a non-zero start).
 * The `<` comparison means a positive `step` is expected. */
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    { \
        _Pragma("unroll") \
        for(type idx = (start); idx < (start) + (num) * (step); idx += (step)) \
        { \
            (macro); \
        } \
    }
#endif

/** Run `macro` `num` times with a loop variable `idx` of type `type` that
 *  starts at `start` and grows by `step` between iterations.
 *
 * The extra _STR level lets `num` be macro-expanded before the manual variant
 * token-pastes it.
 */
#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)

/** Evaluates to max(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0, 0),
 *  with the left operand converted to int before the comparison.
 *
 * Arguments are parenthesised so that expression arguments keep their meaning.
 */
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * (N0) - ((N0) - (PARTIAL_N0)) % (N0)), 0))

/** Accumulate the K0-element dot product of `a` and `b` into `c`.
 *
 * Dispatches to DOT_PRODUCT<K0>_INTEGER8; the _STR level lets K0 be
 * macro-expanded before it is token-pasted.
 */
#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
/* Scalar case: c += a * b with both operands cast to C_DATA_TYPE. */
#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b); \
    })
/* Widths 2-4 have four alternative definitions. The three built-in based ones
 * zero-pad the 2- and 3-element inputs to 4 lanes before calling dot(),
 * arm_dot_acc() or arm_dot(); the fallback multiplies lane by lane. */
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
#else
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
    })
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
    })
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val) \
    ({ \
        val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
        val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
        val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
        val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
    })
#endif
/* Widths 5-16 are composed from the smaller widths: a 4- or 8-wide product on
 * the leading lanes plus a product over the remaining lanes. */
#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c); \
    })
#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c); \
    })
#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c); \
    })
#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })
#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c); \
    })
#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c); \
    })
#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c); \
    })
#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
    })
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c); \
    })
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c); \
    })
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c); \
    })
#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })

/** Accumulate the sum of the K0 elements of `a` into `c`.
 *
 * Implemented as a dot product of `a` with the value 1 cast to
 * TILE_VECTOR_TYPE<K0>(B_DATA_TYPE).
 */
#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)

/** Load a WIDTH-element vector from TENSOR at column X, row Y.
 *
 * TENSOR_TYPE is pasted onto V_LOAD_ to select the variant:
 *  - BUFFER: VLOAD(WIDTH) from TENSOR##_ptr at the first-element offset plus
 *            X * sizeof(DATA_TYPE) + Y * STRIDE_Y bytes.
 *  - IMAGE:  READ_IMAGE2D on TENSOR##_img at coordinates (X / 4, Y);
 *            STRIDE_Y is not used by this variant.
 */
#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    VLOAD(WIDTH) \
    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))

/** Store the WIDTH-element vector VALUES to TENSOR at column X, row Y.
 *
 * Mirrors V_LOAD: the BUFFER variant uses VSTORE(WIDTH) with the same byte
 * address computation, the IMAGE variant uses WRITE_IMAGE2D at (X / 4, Y).
 */
#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES)
#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \
    VSTORE(WIDTH) \
    (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES)

/* T_LOAD: fills dst[0 .. HEIGHT-1].v; row _i is read with V_LOAD from row
 * (Y) + _i * (int)(YI_MULTIPLIER) at element offset X. */
#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
        }) \
    })

/* T_LOAD_INDIRECT: like T_LOAD, but the row index of dst[_i] is taken from indirect_y[_i].v. */
#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
        }) \
    })

/* T_LOAD_INDIRECT_WIDTH_SELECT: indirect row load that visits rows from HEIGHT-1 down to 0.
 * When WIDTH1_CONDITION holds, each row goes through VLOAD_PARTIAL(WIDTH0, WIDTH1) straight
 * from the buffer address (TENSOR_TYPE is not consulted on that path); otherwise a full
 * WIDTH0 V_LOAD is issued.
 * HEIGHT and STRIDE_Y are parenthesised so that compound arguments keep their meaning. */
#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VLOAD_PARTIAL(WIDTH0, WIDTH1) \
                (dst[(HEIGHT) - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[(HEIGHT) - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                dst[(HEIGHT) - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[(HEIGHT) - 1 - _i].v), STRIDE_Y); \
            }) \
        } \
    })

/* T_LOAD_NHWC: loads a TILE_HEIGHT x TILE_WIDTH patch, one TILE_CHANNELS-wide vector per position.
 * The row index is (X + _xk) + (Y + _yk) * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT.
 * Positions outside [0, TENSOR_WIDTH) x [0, TENSOR_HEIGHT) are skipped, so the matching dst
 * entry keeps whatever value it had before the call. */
#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH); \
                _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
                if(_src_valid_y != 0) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
                } \
            }) \
        }) \
    })

/* T_LOAD_NHWC_WITH_DILATION: patch load where tap (_xk, _yk) sits at
 * (X + _xk * DILATION_X, Y + _yk * DILATION_Y) and is addressed with TENSOR_stride_y/z/w.
 * It always uses VLOAD on TENSOR_ptr; TENSOR_TYPE is accepted but not used.
 * With BOUNDARY_CHECK false every tap is loaded; with it true, out-of-range taps are skipped
 * and leave dst untouched. */
#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk * (DILATION_X); \
                int _src_z = ((Y) + _yk * (DILATION_Y)); \
                int _src_w = (B); \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
                if(!(BOUNDARY_CHECK)) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                } \
                else \
                { \
                    if(_src_valid_y) \
                    { \
                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                        (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                    } \
                } \
            }) \
        }) \
    })

/* T_LOAD_NHWC_INDIRECT: TILE_AREA loads whose per-entry offsets come from xi[_i].v and yi[_i].v
 * (added to X and Y). Out-of-range entries are skipped and leave dst[_i] untouched. */
#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })

/* T_LOAD2D_INDIRECT: TILE_AREA loads whose row indices are the components yi[0].s[_i].
 * TENSOR_TYPE is pasted onto the macro name to select the BUFFER or IMAGE variant; the _STR
 * level lets it be macro-expanded first. */
#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)
#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst)

/* Buffer variant: entries with a negative row index are skipped (dst[_i] left untouched). */
#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            if(yi[0].s[_i] >= 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
            } \
        }) \
    })

/* Image variant: no index check, every entry is loaded. */
#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \
        }) \
    })

/* T_LOAD_NDHWC_INDIRECT: 3D counterpart of T_LOAD_NHWC_INDIRECT with an extra zi/Z/TENSOR_DEPTH
 * axis. The plane size is written ((TENSOR_WIDTH) * (TENSOR_HEIGHT)) so that compound
 * width/height arguments are multiplied as whole expressions. */
#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * ((TENSOR_WIDTH) * (TENSOR_HEIGHT)); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT) \
                             && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })

/* T_STORE_INDIRECT_WIDTH_SELECT: store counterpart of T_LOAD_INDIRECT_WIDTH_SELECT.
 * Rows are written from HEIGHT-1 down to 0, each converted to a WIDTH0-wide DATA_TYPE vector;
 * VSTORE_PARTIAL(WIDTH0, WIDTH1) is used when WIDTH1_CONDITION holds, VSTORE(WIDTH0) otherwise.
 * Both paths address TENSOR_ptr directly (TENSOR_TYPE is not consulted). */
#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE_PARTIAL(WIDTH0, WIDTH1) \
                (CONVERT(src[(HEIGHT) - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[(HEIGHT) - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE(WIDTH0) \
                (CONVERT(src[(HEIGHT) - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[(HEIGHT) - 1 - _i].v) * (STRIDE_Y))); \
            }) \
        } \
    })

/* T_OFFSET_CORRECTION: for every (m, n) adds to dst[m].s[n]
 *   WEI_OFFSET * sum_k lhs[m].s[k]  +  SRC_OFFSET * sum_k rhs[n].s[k].
 * The offsets are parenthesised before the cast so that a compound argument such as (a + b)
 * is cast and multiplied as a whole. */
#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            ACC_DATA_TYPE _tm = 0; \
            LOOP_UNROLLING(int, _k0, 0, 1, K0, \
            { \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)(WEI_OFFSET)); \
            }) \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                dst[_m0].s[_n0] += _tm; \
                LOOP_UNROLLING(int, _k0, 0, 1, K0, \
                { \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)(SRC_OFFSET)); \
                }) \
            }) \
        }) \
    })

/* T_QUANTIZE8: dispatches to T_QUANTIZE8_<QUANTIZATION_TYPE> by token pasting (the _STR level
 * lets QUANTIZATION_TYPE be macro-expanded first). Eleven arguments are always forwarded, which
 * matches T_QUANTIZE8_PER_TENSOR and T_QUANTIZE8_PER_CHANNEL; T_QUANTIZE8_ASYMMETRIC takes nine
 * and therefore has to be invoked directly. */
#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)

/* T_QUANTIZE8_PER_TENSOR: requantises every src element with one multiplier/shift pair
 * (dst_multipliers and dst_shifts are accepted but not read). Per element:
 *   1. if DST_SHIFT is negative, pre-multiply by 1 << -DST_SHIFT;
 *   2. form the 64-bit product with DST_MULTIPLIER, add a rounding nudge (1 << 30 for a
 *      non-negative product, 1 - (1 << 30) otherwise) and divide by 1 << 31; the result is
 *      replaced by INT_MAX when both operands equal INT_MIN;
 *   3. if DST_SHIFT >= 0, apply a rounding right shift by DST_SHIFT;
 *   4. add DST_OFFSET and convert with saturation to DST_DATA_TYPE. */
#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (long)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

/* T_QUANTIZE8_PER_CHANNEL: same arithmetic as the per-tensor variant, but the multiplier and
 * shift for column _n0 are read from dst_multipliers[0].s[_n0] and dst_shifts[0].s[_n0]
 * (DST_SHIFT and DST_MULTIPLIER are accepted but not read). Because the shift is a run-time
 * value, the rounding right shift is always computed and then chosen with select() when the
 * shift is non-negative. */
#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _tmp2 = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(_dst_multiplier); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                long mask = ((((int)1) << _dst_shift) - (int)1); \
                long threshold = (mask >> 1) + any(_tmp); \
                _tmp2 = _tmp >> _dst_shift; \
                _tmp2 += select(0, 1, (_tmp & mask) > threshold); \
                _tmp = select(_tmp, _tmp2, _dst_shift >= 0); \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

/* T_QUANTIZE8_ASYMMETRIC: same per-element arithmetic as T_QUANTIZE8_PER_TENSOR, but with a
 * nine-parameter signature (no dst_multipliers / dst_shifts). */
#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

/* T_ROWSET_MASK: for each row _m0, every element a[_m0].s[_n0] is replaced by VALUE_TO_SET when
 * mask[_m0].v compares equal to zero, and kept otherwise. */
#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
            }) \
        }) \
    })

/* T_ACTIVATION: applies ACTIVATION(ACTIVATION_TYPE, ...) to each of the M0 row vectors of src
 * and writes the result to the same row of dst. */
#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
        }) \
    })

/* Quantized ReLU: max(ZERO_VALUE, x). Arguments are parenthesised so compound expressions are safe. */
#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)(ZERO_VALUE), (x)))

/* Quantized bounded ReLU: min(A_VAL, max(ZERO_VALUE, x)). */
#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)(A_VAL), max((DATA_TYPE)(ZERO_VALUE), (x))))

/* Quantized lower/upper bounded ReLU: clamps x to the range B_VAL .. A_VAL.
 * Arguments are parenthesised so compound expressions are safe. */
#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max((x), (DATA_TYPE)(B_VAL)), (DATA_TYPE)(A_VAL)))

/* Quantized hard-swish: x * clamp(x + 3, 0, 6) * 0.166666667. */
#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ((x) * ((min(max((DATA_TYPE)((x) + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))

/* Identity: returns x unchanged. */
#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

/* ACTIVATION_QUANTIZED: selects <op>_op_quantized by token pasting; the extra ACT_OP_QUANTIZED
 * level lets op be macro-expanded before it is pasted. */
#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)

/* Binary operators in macro form, used as the T_ELWISE_OP argument of the T_ELTWISE* macros. */
#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
#define V_SUB(A_VAL, B_VAL) ((A_VAL) - (B_VAL))
#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))
#define V_MUL(A_VAL, B_VAL) ((A_VAL) * (B_VAL))

/* T_ACTIVATION_QUANTIZED: applies ACTIVATION_QUANTIZED to each of the M0 row vectors of src and
 * writes the result to the same row of dst. */
#define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
        }) \
    })

/* T_ADD: dst[m].v = lhs[m].v + rhs[m].v for each of the M0 rows. */
#define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
        }) \
    })

/* T_ADD_CONSTANT: dst[m].v = lhs[m].v + (DATA_TYPE)(rhs_constant).
 * The constant is parenthesised before the cast so that a compound argument is cast as a whole. */
#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v + (DATA_TYPE)(rhs_constant); \
        }) \
    })

/* Broadcast element-wise shorthands. The *_X and RHS_X_* names expand to T_ELTWISE_BROADCAST_X
 * (row 0 of rhs is reused for every row); the LHS_X_* names expand to T_ELTWISE_BROADCAST_LHS_X
 * (row 0 of lhs is reused for every row). */
#define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_LHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

#define T_ELTWISE_BROADCAST_LHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_LHS_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_BROADCAST_RHS_X_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

/* T_SCALE_CONSTANT: dst[m].v = lhs[m].v * (DATA_TYPE)(rhs_constant).
 * The constant is parenthesised before the cast so that a compound argument is cast as a whole. */
#define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = lhs[_m0].v * (DATA_TYPE)(rhs_constant); \
        }) \
    })

/* T_ELTWISE_BROADCAST_X: dst[m].v = OP(lhs[m].v, rhs[0].v); both operands are first converted
 * to an N0-wide DST_DATA_TYPE vector. */
#define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })

/* T_ELTWISE_BROADCAST_LHS_X: dst[m].v = OP(lhs[0].v, rhs[m].v); both operands are first
 * converted to an N0-wide DST_DATA_TYPE vector. */
#define T_ELTWISE_BROADCAST_LHS_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })

/* Row-by-row element-wise shorthands built on T_ELTWISE. */
#define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_SUB(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_SUB, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
#define T_ELTWISE_MUL(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_MUL, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)

/* T_ELTWISE: dst[m].v = OP(lhs[m].v, rhs[m].v); both operands are first converted to an
 * N0-wide DST_DATA_TYPE vector. */
#define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })

/* T_FLOOR: dst[m].v = floor(src[m].v converted to an N0-wide DST_DATA_TYPE vector). */
#define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
        }) \
    })

/* T_MMUL: tile matrix multiply-accumulate. The implementation is selected in two pasting steps:
 * first by layout pair (only T_MMUL_NT_T is defined here), then by the
 * <LHS type>_<RHS type>_<DST type> triple listed below. */
#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
#define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)

/* Floating-point kernel: dst[m].s[n] = fma(lhs[m].s[k], rhs[n].s[k], dst[m].s[n]) over all k,
 * with both factors cast to DST_DATA_TYPE. Unlike its siblings this expands to a plain brace
 * block rather than a statement expression; that form is kept so existing call sites are
 * unaffected. */
#define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    { \
        LOOP_UNROLLING(int, _m, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n, 0, 1, N0, \
            { \
                LOOP_UNROLLING(int, _k, 0, 1, K0, \
                { \
                    dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
                }) \
            }) \
        }) \
    }

/* 8-bit integer kernel: one DOT_PRODUCT_INTEGER8 of width K0 per (m, n) pair, targeting
 * dst[m].s[n]. */
#define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n, 0, 1, N0, \
            { \
                DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
            }) \
        }) \
    })

Faust#endif 7755*c217d954SCole Faust 7756*c217d954SCole Faust#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE) 7757*c217d954SCole Faust 7758*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 7759*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 7760*c217d954SCole Faust#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val)); 7761*c217d954SCole Faust#else 7762*c217d954SCole Faust#define ARM_DOT(x, y, val) val += arm_dot((x), (y)); 7763*c217d954SCole Faust#endif 7764*c217d954SCole Faust#endif 7765*c217d954SCole Faust 7766*c217d954SCole Faust#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 7767*c217d954SCole Faust 7768*c217d954SCole Faust#define ARM_DOT1(a, b, c) \ 7769*c217d954SCole Faust ({ \ 7770*c217d954SCole Faust ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \ 7771*c217d954SCole Faust }) 7772*c217d954SCole Faust#define ARM_DOT2(a, b, c) \ 7773*c217d954SCole Faust ({ \ 7774*c217d954SCole Faust ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \ 7775*c217d954SCole Faust }) 7776*c217d954SCole Faust#define ARM_DOT3(a, b, c) \ 7777*c217d954SCole Faust ({ \ 7778*c217d954SCole Faust ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \ 7779*c217d954SCole Faust }) 7780*c217d954SCole Faust#define ARM_DOT4(a, b, c) \ 7781*c217d954SCole Faust ({ \ 7782*c217d954SCole Faust ARM_DOT(a, b, c); \ 7783*c217d954SCole Faust }) 7784*c217d954SCole Faust#define ARM_DOT8(a, b, c) \ 7785*c217d954SCole Faust ({ \ 7786*c217d954SCole Faust ARM_DOT4((a.lo), (b.lo), c); \ 7787*c217d954SCole Faust ARM_DOT4((a.hi), (b.hi), c); \ 7788*c217d954SCole Faust }) 7789*c217d954SCole 
/* 16-element dot product for the branch in which ARM_COMPUTE_OPENCL_DOT8_ENABLED and
 * cl_arm_integer_dot_product_int8 are both defined: two 8-element dot products on the halves. */
#define ARM_DOT16(a, b, c) \
    ({ \
        ARM_DOT8(((a).lo), ((b).lo), c); \
        ARM_DOT8(((a).hi), ((b).hi), c); \
    })

#else

/* Fallback ARM_DOTn(a, b, c) used when the branch above is not taken: c += dot(a, b) computed
 * lane by lane. The a-side lane is cast to ACC_DATA_TYPE before each multiplication.
 * Every argument is parenthesized before a cast or a component selector is applied, so that a
 * compound expression passed as a or b is treated as one operand. */
#define ARM_DOT1(a, b, c) \
    ({ \
        c += (ACC_DATA_TYPE)(a) * (b); \
    })
#define ARM_DOT2(a, b, c) \
    ({ \
        c += (ACC_DATA_TYPE)((a).s0) * (b).s0; \
        c += (ACC_DATA_TYPE)((a).s1) * (b).s1; \
    })
#define ARM_DOT3(a, b, c) \
    ({ \
        ARM_DOT2(a, b, c); \
        c += (ACC_DATA_TYPE)((a).s2) * (b).s2; \
    })
#define ARM_DOT4(a, b, c) \
    ({ \
        ARM_DOT3(a, b, c); \
        c += (ACC_DATA_TYPE)((a).s3) * (b).s3; \
    })
#define ARM_DOT8(a, b, c) \
    ({ \
        ARM_DOT4(((a).lo), ((b).lo), c); \
        ARM_DOT4(((a).hi), ((b).hi), c); \
    })
#define ARM_DOT16(a, b, c) \
    ({ \
        ARM_DOT8(((a).lo), ((b).lo), c); \
        ARM_DOT8(((a).hi), ((b).hi), c); \
    })
#endif

/* ARM_DOT_K0Xn(k0, a, b, c): one row vector a against n vectors b0 .. b(n-1).
 * b is a base name: the suffixes 0-9 and A-F are token-pasted onto it. The dot product with
 * b<j> is accumulated into lane j of c (for n == 1, c itself is the accumulator). */
#define ARM_DOT_K0X1(k0, a, b, c) \
    ({ \
        ARM_DOT_K0(k0, (a), (b##0), (c)); \
    })
#define ARM_DOT_K0X2(k0, a, b, c) \
    ({ \
        ARM_DOT_K0(k0, (a), (b##0), ((c).s0)); \
        ARM_DOT_K0(k0, (a), (b##1), ((c).s1)); \
    })
#define ARM_DOT_K0X3(k0, a, b, c) \
    ({ \
        ARM_DOT_K0X2(k0, a, b, c); \
        ARM_DOT_K0(k0, (a), (b##2), ((c).s2)); \
    })
#define ARM_DOT_K0X4(k0, a, b, c) \
    ({ \
        ARM_DOT_K0X3(k0, a, b, c); \
        ARM_DOT_K0(k0, (a), (b##3), ((c).s3)); \
    })
#define ARM_DOT_K0X8(k0, a, b, c) \
    ({ \
        ARM_DOT_K0X4(k0, a, b, c); \
        ARM_DOT_K0(k0, (a), (b##4), ((c).s4)); \
        ARM_DOT_K0(k0, (a), (b##5), ((c).s5)); \
        ARM_DOT_K0(k0, (a), (b##6), ((c).s6)); \
        ARM_DOT_K0(k0, (a), (b##7), ((c).s7)); \
    })
#define ARM_DOT_K0X16(k0, a, b, c) \
    ({ \
        ARM_DOT_K0X8(k0, a, b, c); \
        ARM_DOT_K0(k0, (a), (b##8), ((c).s8)); \
        ARM_DOT_K0(k0, (a), (b##9), ((c).s9)); \
        ARM_DOT_K0(k0, (a), (b##A), ((c).sA)); \
        ARM_DOT_K0(k0, (a), (b##B), ((c).sB)); \
        ARM_DOT_K0(k0, (a), (b##C), ((c).sC)); \
        ARM_DOT_K0(k0, (a), (b##D), ((c).sD)); \
        ARM_DOT_K0(k0, (a), (b##E), ((c).sE)); \
        ARM_DOT_K0(k0, (a), (b##F), ((c).sF)); \
    })

/* ARM_MM_K0XN0Xm(n0, k0, a, b, c): m rows. a and c are base names; row i uses a<i> as the
 * input vector and c<i> as the accumulator vector, each processed with ARM_DOT_K0XN0. */
#define ARM_MM_K0XN0X1(n0, k0, a, b, c) \
    ({ \
        ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
    })
#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X1(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
    })
#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X2(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
    })
#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X3(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
    })
#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X4(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
    })
#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X5(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
    })
#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X6(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
    })
#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \
    ({ \
        ARM_MM_K0XN0X7(n0, k0, a, b, c); \
        ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
    })

/* Dispatchers: CONCAT (defined elsewhere) glues the numeric argument onto the macro-name
 * prefix, selecting ARM_DOT<k0>, ARM_DOT_K0X<n0> and ARM_MM_K0XN0X<m0> respectively. */
#define ARM_DOT_K0(k0, a, b, c) \
    ({ \
        CONCAT(ARM_DOT, k0) \
        ((a), (b), (c)); \
    })

#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
    ({ \
        CONCAT(ARM_DOT_K0X, n0) \
        (k0, (a), b, (c)); \
    })

#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
    ({ \
        CONCAT(ARM_MM_K0XN0X, m0) \
        (n0, k0, a, b, c); \
    })

/* ARM_MUL_N0Xk(VECTOR_ACC_TYPE, a, b, c): c += sum over j < k of CONVERT(b<j>, VECTOR_ACC_TYPE) * a.s<j>.
 * b is a base name (b0, b1, ...). For k == 1 the operand a is used as a whole instead of a lane.
 * a is parenthesized before a lane is selected from it. */
#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        c += CONVERT(b##0, VECTOR_ACC_TYPE) * (a); \
    })
#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        c += CONVERT(b##0, VECTOR_ACC_TYPE) * (a).s0; \
        c += CONVERT(b##1, VECTOR_ACC_TYPE) * (a).s1; \
    })
#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \
        c += CONVERT(b##2, VECTOR_ACC_TYPE) * (a).s2; \
    })
#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \
        c += CONVERT(b##3, VECTOR_ACC_TYPE) * (a).s3; \
    })
#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \
        c += CONVERT(b##4, VECTOR_ACC_TYPE) * (a).s4; \
        c += CONVERT(b##5, VECTOR_ACC_TYPE) * (a).s5; \
        c += CONVERT(b##6, VECTOR_ACC_TYPE) * (a).s6; \
        c += CONVERT(b##7, VECTOR_ACC_TYPE) * (a).s7; \
    })
/* 16-lane member of the ARM_MUL_N0Xk family: the first eight terms come from ARM_MUL_N0X8,
 * then c += CONVERT(b<j>, VECTOR_ACC_TYPE) * a.s<j> for j = 8 .. F.
 * a is parenthesized before a lane is selected from it. */
#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \
    ({ \
        ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \
        c += CONVERT(b##8, VECTOR_ACC_TYPE) * (a).s8; \
        c += CONVERT(b##9, VECTOR_ACC_TYPE) * (a).s9; \
        c += CONVERT(b##A, VECTOR_ACC_TYPE) * (a).sA; \
        c += CONVERT(b##B, VECTOR_ACC_TYPE) * (a).sB; \
        c += CONVERT(b##C, VECTOR_ACC_TYPE) * (a).sC; \
        c += CONVERT(b##D, VECTOR_ACC_TYPE) * (a).sD; \
        c += CONVERT(b##E, VECTOR_ACC_TYPE) * (a).sE; \
        c += CONVERT(b##F, VECTOR_ACC_TYPE) * (a).sF; \
    })

/* ARM_MM_NATIVE_N0XK0Xm(VECTOR_ACC_TYPE, k0, a, b, c): m rows. a and c are base names; row i
 * feeds a<i> and c<i> to ARM_MUL_N0XK0 together with the shared base name b. */
#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \
    })
#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
    })
#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
    })
#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
    })
#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
    })
#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
    })
#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
    })
#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \
        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
    })

/* Dispatchers: CONCAT (defined elsewhere) selects ARM_MUL_N0X<k0> and ARM_MM_NATIVE_N0XK0X<m0>. */
#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
    ({ \
        CONCAT(ARM_MUL_N0X, k0) \
        (VECTOR_ACC_TYPE, (a), b, (c)); \
    })
#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
    ({ \
        CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \
        (VECTOR_ACC_TYPE, k0, a, b, c); \
    })

#if defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)

/* Low-precision matrix multiplication; compiled only when GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T is defined.
 *
 * Each work-item accumulates an M0 x N0 block in ACC_DATA_TYPE (variables based on the name c),
 * walking the k dimension in steps of K0, and stores the block as int after a saturating conversion.
 *
 * Compile-time symbols read by the body: DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, V0, H0, M, N,
 * PARTIAL_STORE_M0, PARTIAL_STORE_N0; optionally LHS_INTERLEAVE, RHS_INTERLEAVE, DUMMY_WORK_ITEMS,
 * MATRIX_B_DEPTH, REINTERPRET_OUTPUT_AS_3D (with HEIGHT_GEMM3D and DEPTH_GEMM3D).
 *
 * Parameters:
 *   lhs, rhs, dst       - declared through IMAGE_DECLARATION; the body uses <name>_ptr,
 *                         <name>_stride_y and <name>_offset_first_element_in_bytes.
 *   k                   - upper bound of the accumulation loop.
 *   lhs_stride_z        - multiplied by the z global id to offset the LHS address.
 *   rhs_stride_z        - multiplied by z (or z % MATRIX_B_DEPTH) to offset the RHS address.
 *   dst_stride_z        - multiplied by z (and DEPTH_GEMM3D in the 3D case) to offset the destination.
 *   dst_cross_plane_pad - only with REINTERPRET_OUTPUT_AS_3D; forwarded to CALCULATE_Z_OFFSET.
 *
 * NOTE(review): LOAD_BLOCK, REPEAT_VAR_INIT_*, CALCULATE_Z_OFFSET and STORE_BLOCK_BOUNDARY_AWARE
 * are defined elsewhere; their exact semantics are not visible here. */
__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
                                                IMAGE_DECLARATION(rhs),
                                                IMAGE_DECLARATION(dst),
                                                uint k,
                                                uint lhs_stride_z,
                                                uint rhs_stride_z,
                                                uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                ,
                                                uint dst_cross_plane_pad
#endif
                                               )
{
    // Kernel-local layout constants; all of them are #undef'd again at the end of the kernel.
#define LHS_BLOCK_SIZE ((K0) * (M0))

#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif

#define RHS_BLOCK_SIZE ((K0) * (N0))

#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
#define RHS_STEP_X ((K0) * (H0))
#define RHS_STEP_LOOP (1)
#else
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (K0)
#define RHS_STEP_LOOP (H0)
#endif

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the N x M output do nothing.
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif

    // Start address of this work-item's LHS data
    __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z));

    // Start address of this work-item's RHS data
    __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y);

#if defined(MATRIX_B_DEPTH)
    // The RHS batch index wraps around every MATRIX_B_DEPTH values of z.
    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else
    rhs_addr += z * rhs_stride_z;
#endif

    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);

    // Accumulators, initialised to 0
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0);

    // The counter is unsigned like its bound k, so the comparison involves no implicit
    // signed-to-unsigned conversion.
    for(uint i = 0; i < k; i += K0)
    {
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);

        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);

        // c<m>.s<n> += dot(a<m>, b<n>)
        ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);

        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else
    dst_addr += z * dst_stride_z;
#endif

    // True when this work-item's block reaches or passes the last row / column of the output.
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

    // Saturating conversion of the accumulators to int, then store.
    REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef LHS_STEP_LOOP
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
}
#endif

#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
// The fused output stage is enabled only when all three quantization constants are provided.
#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
#define FUSED_OUTPUT_STAGE_FIXED_POINT
#endif

/* Low-precision matrix multiplication, emitted under one of two kernel names depending on which
 * of the two guard symbols is defined.
 *
 * Each work-item accumulates an M0 x N0 tile c in ACC_DATA_TYPE over K (compile-time constant)
 * in steps of K0, with a one-element-at-a-time tail when K is not a multiple of K0.
 * With FUSED_OUTPUT_STAGE_FIXED_POINT the tile is converted to int, an offset tile (K_OFFSET plus
 * the optional A_OFFSET, B_OFFSET and bias contributions) is added, the result is passed through
 * T_QUANTIZE8 into a DATA_TYPE tile and optionally clamped to MIN_BOUND / MAX_BOUND.
 * Without it the tile is stored as int after a saturating conversion.
 *
 * Parameters:
 *   lhs, rhs, dst                     - declared through IMAGE_DECLARATION.
 *   lhs_stride_z, dst_stride_z        - divided by the matching stride_y to form FULL_LHS_HEIGHT /
 *                                       FULL_DST_HEIGHT.
 *   rhs_stride_z                      - multiplied by z (or z % MATRIX_B_DEPTH) in the RHS offset.
 *   lhs_cross_plane_pad,
 *   dst_cross_plane_pad               - present with REINTERPRET_INPUT_AS_3D / REINTERPRET_OUTPUT_AS_3D;
 *                                       not referenced by the body.
 *   sum_col, sum_row, biases,
 *   result_multipliers, result_shifts - optional inputs of the fused output stage.
 *
 * NOTE(review): TILE, T_LOAD, T_MMUL, T_ELTWISE_BROADCAST_ADD_X, T_QUANTIZE8,
 * T_STORE_INDIRECT_WIDTH_SELECT and GET_SPATIAL_IDX are defined elsewhere; their exact
 * semantics are not visible here. */
#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint
#elif defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
__kernel void gemmlowp_mm_reshaped_only_rhs_t
#endif
(IMAGE_DECLARATION(lhs),
 IMAGE_DECLARATION(rhs),
 IMAGE_DECLARATION(dst),
 uint lhs_stride_z,
 uint rhs_stride_z,
 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
 ,
 uint lhs_cross_plane_pad
#endif
#if defined(REINTERPRET_OUTPUT_AS_3D)
 ,
 uint dst_cross_plane_pad
#endif
#if defined(A_OFFSET)
 ,
 IMAGE_DECLARATION(sum_col)
#endif
#if defined(B_OFFSET)
 ,
 IMAGE_DECLARATION(sum_row)
#endif
#if defined(ADD_BIAS)
 ,
 VECTOR_DECLARATION(biases)
#endif
#if defined(PER_CHANNEL_QUANTIZATION)
 ,
 VECTOR_DECLARATION(result_multipliers),
 VECTOR_DECLARATION(result_shifts)
#endif
)
{
    // Number of rows in one z-slice, derived from the byte strides.
#define FULL_LHS_HEIGHT (lhs_stride_z / lhs_stride_y)
#define FULL_DST_HEIGHT (dst_stride_z / dst_stride_y)

#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (K0)
#define RHS_STEP_X (K0 * H0)
#else
#define RHS_OFFSET_X (K0 * N0)
#define RHS_STEP_X (K0)
#endif
#define RHS_STEP_LOOP (N0 * K0 * H0)

    uint x  = GET_SPATIAL_IDX(0, 1, 1);
    uint y  = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
    uint z  = GET_SPATIAL_IDX(2, 1, 1);
    int  xo = (x * N0);

#if defined(DUMMY_WORK_ITEMS)
    if((xo >= N) || (y >= M))
    {
        return;
    }
#endif

    // Row index into the LHS, including the z-slice offset
    uint lhs_y = y + z * FULL_LHS_HEIGHT;

    // Byte offsets into the RHS
    uint rhs_offset_x = (x % H0) * RHS_OFFSET_X;
    uint rhs_offset_y = (x / H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // The RHS batch index wraps around every MATRIX_B_DEPTH values of z.
    rhs_offset_y += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else
    rhs_offset_y += z * rhs_stride_z;
#endif

    // Accumulator tile, zero-initialised row by row
    TILE(ACC_DATA_TYPE, M0, N0, c);
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c[i].v = 0;
    })

    // Main loop: full K0-wide steps
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);

        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);

        // One K0-wide vector load per RHS row, RHS_STEP_X bytes apart
        LOOP_UNROLLING(int, _i, 0, 1, N0,
        {
            b[_i].v = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X));
        })

        T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);

        rhs_offset_x += RHS_STEP_LOOP;
    }

#if((K % K0) != 0)
    // Tail loop: the remaining K % K0 elements, one at a time
    for(; i < K; ++i)
    {
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);

        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);

        LOOP_UNROLLING(int, _i, 0, 1, N0,
        {
            b[_i].v = *(__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X);
        })

        T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);

        rhs_offset_x += 1;
    }
#endif

#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)

    TILE(int, M0, N0, c_int);
    TILE(int, M0, N0, offset_s32);
    // Every element of the offset tile starts at K_OFFSET
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET;
    })

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_int[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
    })

#if defined(A_OFFSET)
    // Contribution of A_OFFSET: one row loaded from sum_col, scaled by A_OFFSET, added to every row

#if defined(SUM_COL_HAS_BATCHES)
    int sum_col_y = z;
#else
    int sum_col_y = 0;
#endif
    TILE(int, 1, N0, a_offset_s32);

    T_LOAD(int, 1, N0, BUFFER, sum_col, xo, sum_col_y, 1, sum_col_stride_y, a_offset_s32);

    a_offset_s32[0].v *= A_OFFSET;

    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
#endif

#if defined(B_OFFSET)
    // Contribution of B_OFFSET: values loaded from sum_row, scaled by B_OFFSET, added row by row

    TILE(int, M0, N0, b_offset_s32);

    T_LOAD(int, M0, 1, BUFFER, sum_row, y + z * (sum_row_stride_y / sizeof(int)), 0, 1, sum_row_stride_x, b_offset_s32);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        offset_s32[i].v += b_offset_s32[i].v * B_OFFSET;
    })

#endif

#if defined(ADD_BIAS)

    TILE(int, 1, N0, bias);

    T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias);

    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
#endif

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_int[i].v += offset_s32[i].v;
    })

    TILE(DATA_TYPE, M0, N0, c_lp);

#if defined(PER_CHANNEL_QUANTIZATION)
    TILE(int, 1, N0, res_mul);
    TILE(int, 1, N0, res_shift);

    T_LOAD(int, 1, N0, BUFFER, result_multipliers, xo, 0, 0, 0, res_mul);
    T_LOAD(int, 1, N0, BUFFER, result_shifts, xo, 0, 0, 0, res_shift);

    T_QUANTIZE8(int, DATA_TYPE, PER_CHANNEL, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, res_mul, res_shift, c_lp);
#else
    T_QUANTIZE8(int, DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, 0, 0, c_lp);
#endif

#if defined(MIN_BOUND)
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MIN_BOUND);
    })
#endif
#if defined(MAX_BOUND)
    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MAX_BOUND);
    })
#endif

#else
    TILE(int, M0, N0, c_lp);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
        c_lp[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
    })
#endif

    // Destination row index of every tile row; the row coordinate is clamped with min().
    TILE(uint, M0, 1, dst_indirect_y);

    LOOP_UNROLLING(int, i, 0, 1, M0,
    {
#if defined(REINTERPRET_OUTPUT_AS_3D)
        dst_indirect_y[i].v = (uint)min((int)((y + i) % HEIGHT_GEMM3D), (int)HEIGHT_GEMM3D - 1);
        dst_indirect_y[i].v += (uint)min((int)((y + i) / HEIGHT_GEMM3D), (int)DEPTH_GEMM3D - 1) * FULL_DST_HEIGHT;
        dst_indirect_y[i].v += z * FULL_DST_HEIGHT * DEPTH_GEMM3D;
#else
        dst_indirect_y[i].v = (uint)min((int)y + i, (int)M - 1) + z * FULL_DST_HEIGHT;
#endif
    })

    // True only for a block that starts past N - N0 while a partial width is configured
    const bool cond_x = (xo > (N - N0)) & (PARTIAL_STORE_N0 != 0);

#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
#else
    T_STORE_INDIRECT_WIDTH_SELECT(int, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
#endif

#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
}
#endif

#if defined(GEMMLOWP_MM_NATIVE)

/* Low-precision matrix multiplication on operands addressed directly through lhs_ptr / rhs_ptr;
 * compiled only when GEMMLOWP_MM_NATIVE is defined.
 *
 * Each work-item accumulates an M0 x N0 block (variables based on the name c) over K
 * (compile-time constant): first in steps of K0, then one element at a time for the remainder.
 * When GPU_ARCH equals GPU_ARCH_MIDGARD the block b is consumed as loaded through
 * ARM_MM_NATIVE_N0XK0XM0; otherwise it is first passed through TRANSPOSE_K0XN0 and the
 * dot-product path ARM_MM_K0XN0XM0 is used. The result is converted to int and stored.
 *
 * Parameters:
 *   lhs, rhs, dst       - declared through IMAGE_DECLARATION.
 *   lhs_stride_z        - multiplied by z (and DEPTH_GEMM3D with REINTERPRET_INPUT_AS_3D).
 *   rhs_stride_z        - multiplied by z (or z % MATRIX_B_DEPTH).
 *   dst_stride_z        - multiplied by z (and DEPTH_GEMM3D with REINTERPRET_OUTPUT_AS_3D).
 *   lhs_cross_plane_pad,
 *   dst_cross_plane_pad - forwarded to CALCULATE_Z_OFFSET in the respective 3D case.
 *
 * NOTE(review): COMPUTE_M0_START_ROW, LOAD_BLOCK, TRANSPOSE_K0XN0, CALCULATE_Z_OFFSET and
 * STORE_BLOCK_BOUNDARY_AWARE are defined elsewhere; their exact semantics are not visible here. */
__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
                                 IMAGE_DECLARATION(rhs),
                                 IMAGE_DECLARATION(dst),
                                 uint lhs_stride_z,
                                 uint rhs_stride_z,
                                 uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                 ,
                                 uint lhs_cross_plane_pad
#endif
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                 ,
                                 uint dst_cross_plane_pad
#endif
                                )
{
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif

    // Byte offset of the first LHS row handled by this work-item
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Byte offset of the first RHS column handled by this work-item
    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

#if defined(MATRIX_B_DEPTH)
    // The RHS batch index wraps around every MATRIX_B_DEPTH values of z.
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else
    rhs_offset += z * rhs_stride_z;
#endif

    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
#else
    lhs_offset += z * lhs_stride_z;
#endif

    // Accumulators, initialised to 0
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0);

    int i = 0;

    // Main loop: full K0-wide steps
    for(; i <= (K - K0); i += K0)
    {
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
        ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c);
#else
        TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);

        ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
#endif

        lhs_offset += K0;
        rhs_offset += K0 * rhs_stride_y;
    }

    // Tail loop: remaining elements of K, one at a time
    for(; i < K; ++i)
    {
        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
        ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c);
#else
        TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);

        ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
#endif

        lhs_offset += 1;
        rhs_offset += rhs_stride_y;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
#else
    dst_addr += z * dst_stride_z;
#endif
    // NOTE(review): the row condition is tied to the first work-item in y, presumably because
    // COMPUTE_M0_START_ROW assigns the partial block to it - confirm against that macro.
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

    // Convert the accumulators to int (non-saturating variant) and store.
    REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
}
#endif

#if defined(GEMMLOWP_MATRIX_A_REDUCTION)

__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{

    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
    sum_row_32            = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0;
    ACC_DATA_TYPE sum_row = 0;

    __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    int i = 0;


    for(; i <= ((int)COLS_A - 16); i += 16)
    {
        const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i);

        sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.sCDEF,
                      VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
    }


    for(; i < COLS_A; ++i)
    {
        sum_row += (ACC_DATA_TYPE)matrix_a[i];
    }

    sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;

#if
defined(SCALAR) 8577*c217d954SCole Faust sum_row *= (int)SCALAR; 8578*c217d954SCole Faust#endif 8579*c217d954SCole Faust *((__global int *)dst.ptr) = (int)sum_row; 8580*c217d954SCole Faust} 8581*c217d954SCole Faust#endif 8582*c217d954SCole Faust 8583*c217d954SCole Faust#if defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8) 8584*c217d954SCole Faust 8585*c217d954SCole Faust__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), 8586*c217d954SCole Faust IMAGE_DECLARATION(dst)) 8587*c217d954SCole Faust{ 8588*c217d954SCole Faust 8589*c217d954SCole Faust Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); 8590*c217d954SCole Faust Image dst = CONVERT_TO_IMAGE_STRUCT(dst); 8591*c217d954SCole Faust 8592*c217d954SCole Faust ACC_DATA_TYPE sum_row = 0; 8593*c217d954SCole Faust 8594*c217d954SCole Faust __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z); 8595*c217d954SCole Faust 8596*c217d954SCole Faust int i = 0; 8597*c217d954SCole Faust 8598*c217d954SCole Faust 8599*c217d954SCole Faust for(; i <= ((int)COLS_A - 32); i += 32) 8600*c217d954SCole Faust { 8601*c217d954SCole Faust VEC_DATA_TYPE(DATA_TYPE, 16) 8602*c217d954SCole Faust a0 = vload16(0, matrix_a + i); 8603*c217d954SCole Faust 8604*c217d954SCole Faust DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8605*c217d954SCole Faust DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8606*c217d954SCole Faust DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8607*c217d954SCole Faust DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8608*c217d954SCole Faust 8609*c217d954SCole Faust a0 = vload16(1, matrix_a + i); 8610*c217d954SCole Faust 8611*c217d954SCole Faust DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, 
DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8612*c217d954SCole Faust DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8613*c217d954SCole Faust DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8614*c217d954SCole Faust DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row); 8615*c217d954SCole Faust } 8616*c217d954SCole Faust 8617*c217d954SCole Faust 8618*c217d954SCole Faust for(; i < COLS_A; ++i) 8619*c217d954SCole Faust { 8620*c217d954SCole Faust sum_row += (ACC_DATA_TYPE)matrix_a[i]; 8621*c217d954SCole Faust } 8622*c217d954SCole Faust 8623*c217d954SCole Faust#if defined(SCALAR) 8624*c217d954SCole Faust sum_row *= (int)SCALAR; 8625*c217d954SCole Faust#endif 8626*c217d954SCole Faust *((__global int *)dst.ptr) = (int)sum_row; 8627*c217d954SCole Faust} 8628*c217d954SCole Faust#endif 8629*c217d954SCole Faust 8630*c217d954SCole Faust#if defined(GEMMLOWP_MATRIX_B_REDUCTION) 8631*c217d954SCole Faust 8632*c217d954SCole Faust__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), 8633*c217d954SCole Faust IMAGE_DECLARATION(dst)) 8634*c217d954SCole Faust{ 8635*c217d954SCole Faust 8636*c217d954SCole Faust const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 8637*c217d954SCole Faust const uint y = get_global_id(1); 8638*c217d954SCole Faust 8639*c217d954SCole Faust __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z); 8640*c217d954SCole Faust __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + y * dst_stride_y; 8641*c217d954SCole Faust 8642*c217d954SCole Faust VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) 8643*c217d954SCole Faust sum_col_32 = 
(VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))0; 8644*c217d954SCole Faust 8645*c217d954SCole Faust int i = 0; 8646*c217d954SCole Faust 8647*c217d954SCole Faust for(; i <= ((int)ROWS_B - 4); i += 4) 8648*c217d954SCole Faust { 8649*c217d954SCole Faust const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 8650*c217d954SCole Faust b0 = VLOAD(VEC_SIZE)(0, matrix_b + 0 * src_stride_y); 8651*c217d954SCole Faust const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 8652*c217d954SCole Faust b1 = VLOAD(VEC_SIZE)(0, matrix_b + 1 * src_stride_y); 8653*c217d954SCole Faust const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 8654*c217d954SCole Faust b2 = VLOAD(VEC_SIZE)(0, matrix_b + 2 * src_stride_y); 8655*c217d954SCole Faust const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 8656*c217d954SCole Faust b3 = VLOAD(VEC_SIZE)(0, matrix_b + 3 * src_stride_y); 8657*c217d954SCole Faust 8658*c217d954SCole Faust sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b3, 8659*c217d954SCole Faust VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); 8660*c217d954SCole Faust 8661*c217d954SCole Faust matrix_b += 4 * src_stride_y; 8662*c217d954SCole Faust } 8663*c217d954SCole Faust 8664*c217d954SCole Faust 8665*c217d954SCole Faust for(; i < (int)ROWS_B; ++i) 8666*c217d954SCole Faust { 8667*c217d954SCole Faust const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 8668*c217d954SCole Faust b0 = VLOAD(VEC_SIZE)(0, matrix_b); 8669*c217d954SCole Faust 8670*c217d954SCole Faust sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); 8671*c217d954SCole Faust 8672*c217d954SCole Faust matrix_b += src_stride_y; 8673*c217d954SCole Faust } 8674*c217d954SCole Faust 8675*c217d954SCole Faust#if defined(SCALAR) 8676*c217d954SCole Faust sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))SCALAR; 8677*c217d954SCole Faust#endif 8678*c217d954SCole Faust VEC_DATA_TYPE(int, VEC_SIZE) 8679*c217d954SCole Faust res0 = CONVERT(sum_col_32, VEC_DATA_TYPE(int, 
VEC_SIZE)); 8680*c217d954SCole Faust 8681*c217d954SCole Faust STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 8682*c217d954SCole Faust} 8683*c217d954SCole Faust#endif 8684*c217d954SCole Faust 8685*c217d954SCole Faust#endif 8686*c217d954SCole Faust 8687*c217d954SCole Faust#if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) 8688*c217d954SCole Faust 8689*c217d954SCole Faust#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) 8690*c217d954SCole Faust 8691*c217d954SCole Faust 8692*c217d954SCole Faustinline VEC_INT offset_contribution( 8693*c217d954SCole Faust int x, 8694*c217d954SCole Faust int y, 8695*c217d954SCole Faust int z 8696*c217d954SCole Faust#if defined(A_OFFSET) 8697*c217d954SCole Faust , 8698*c217d954SCole Faust IMAGE_DECLARATION(sum_col) 8699*c217d954SCole Faust#endif 8700*c217d954SCole Faust#if defined(B_OFFSET) 8701*c217d954SCole Faust , 8702*c217d954SCole Faust IMAGE_DECLARATION(sum_row) 8703*c217d954SCole Faust#endif 8704*c217d954SCole Faust#if defined(ADD_BIAS) 8705*c217d954SCole Faust , 8706*c217d954SCole Faust VECTOR_DECLARATION(biases) 8707*c217d954SCole Faust#endif 8708*c217d954SCole Faust) 8709*c217d954SCole Faust{ 8710*c217d954SCole Faust VEC_INT a_offset_s32 = (VEC_INT)0; 8711*c217d954SCole Faust VEC_INT b_offset_s32 = (VEC_INT)0; 8712*c217d954SCole Faust 8713*c217d954SCole Faust int batch_id = z; 8714*c217d954SCole Faust#if defined(DEPTH_INPUT3D) 8715*c217d954SCole Faust batch_id /= (int)DEPTH_INPUT3D; 8716*c217d954SCole Faust#endif 8717*c217d954SCole Faust 8718*c217d954SCole Faust#if defined(A_OFFSET) 8719*c217d954SCole Faust 8720*c217d954SCole Faust __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int); 8721*c217d954SCole Faust 8722*c217d954SCole Faust 8723*c217d954SCole Faust#if defined(SUM_COL_HAS_BATCHES) 8724*c217d954SCole Faust a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)(sum_col_addr + batch_id * 
sum_col_stride_y)); 8725*c217d954SCole Faust#else 8726*c217d954SCole Faust a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)sum_col_addr); 8727*c217d954SCole Faust#endif 8728*c217d954SCole Faust 8729*c217d954SCole Faust a_offset_s32 *= (VEC_INT)A_OFFSET; 8730*c217d954SCole Faust#endif 8731*c217d954SCole Faust 8732*c217d954SCole Faust#if defined(B_OFFSET) 8733*c217d954SCole Faust 8734*c217d954SCole Faust __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int); 8735*c217d954SCole Faust 8736*c217d954SCole Faust 8737*c217d954SCole Faust#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) 8738*c217d954SCole Faust b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D); 8739*c217d954SCole Faust#else 8740*c217d954SCole Faust b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y))); 8741*c217d954SCole Faust#endif 8742*c217d954SCole Faust b_offset_s32 *= (VEC_INT)B_OFFSET; 8743*c217d954SCole Faust#endif 8744*c217d954SCole Faust 8745*c217d954SCole Faust#if defined(ADD_BIAS) 8746*c217d954SCole Faust 8747*c217d954SCole Faust __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); 8748*c217d954SCole Faust 8749*c217d954SCole Faust VEC_INT biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); 8750*c217d954SCole Faust b_offset_s32 += (VEC_INT)biases_values; 8751*c217d954SCole Faust#endif 8752*c217d954SCole Faust 8753*c217d954SCole Faust return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32; 8754*c217d954SCole Faust} 8755*c217d954SCole Faust 8756*c217d954SCole Faust#if defined(GEMMLOWP_OFFSET_CONTRIBUTION) 8757*c217d954SCole Faust 8758*c217d954SCole Faust__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) 8759*c217d954SCole Faust#if defined(A_OFFSET) 8760*c217d954SCole Faust , 8761*c217d954SCole Faust IMAGE_DECLARATION(sum_col) 
8762*c217d954SCole Faust#endif 8763*c217d954SCole Faust#if defined(B_OFFSET) 8764*c217d954SCole Faust , 8765*c217d954SCole Faust IMAGE_DECLARATION(sum_row) 8766*c217d954SCole Faust#endif 8767*c217d954SCole Faust#if defined(ADD_BIAS) 8768*c217d954SCole Faust , 8769*c217d954SCole Faust VECTOR_DECLARATION(biases) 8770*c217d954SCole Faust#endif 8771*c217d954SCole Faust ) 8772*c217d954SCole Faust{ 8773*c217d954SCole Faust const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 8774*c217d954SCole Faust const int y = get_global_id(1); 8775*c217d954SCole Faust const int z = get_global_id(2); 8776*c217d954SCole Faust 8777*c217d954SCole Faust 8778*c217d954SCole Faust VEC_INT offset_term_s32 = offset_contribution( 8779*c217d954SCole Faust x, y, z 8780*c217d954SCole Faust#if defined(A_OFFSET) 8781*c217d954SCole Faust , 8782*c217d954SCole Faust sum_col_ptr, 8783*c217d954SCole Faust sum_col_stride_x, 8784*c217d954SCole Faust sum_col_step_x, 8785*c217d954SCole Faust sum_col_stride_y, 8786*c217d954SCole Faust sum_col_step_y, 8787*c217d954SCole Faust sum_col_offset_first_element_in_bytes 8788*c217d954SCole Faust#endif 8789*c217d954SCole Faust#if defined(B_OFFSET) 8790*c217d954SCole Faust , 8791*c217d954SCole Faust sum_row_ptr, 8792*c217d954SCole Faust sum_row_stride_x, 8793*c217d954SCole Faust sum_row_step_x, 8794*c217d954SCole Faust sum_row_stride_y, 8795*c217d954SCole Faust sum_row_step_y, 8796*c217d954SCole Faust sum_row_offset_first_element_in_bytes 8797*c217d954SCole Faust#endif 8798*c217d954SCole Faust#if defined(ADD_BIAS) 8799*c217d954SCole Faust , 8800*c217d954SCole Faust biases_ptr, 8801*c217d954SCole Faust biases_stride_x, 8802*c217d954SCole Faust biases_step_x, 8803*c217d954SCole Faust biases_offset_first_element_in_bytes 8804*c217d954SCole Faust#endif 8805*c217d954SCole Faust ); 8806*c217d954SCole Faust 8807*c217d954SCole Faust __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * 
sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; 8808*c217d954SCole Faust 8809*c217d954SCole Faust VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); 8810*c217d954SCole Faust 8811*c217d954SCole Faust 8812*c217d954SCole Faust in_s32_0 += offset_term_s32; 8813*c217d954SCole Faust 8814*c217d954SCole Faust 8815*c217d954SCole Faust STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 8816*c217d954SCole Faust} 8817*c217d954SCole Faust#endif 8818*c217d954SCole Faust 8819*c217d954SCole Faust#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN) 8820*c217d954SCole Faust 8821*c217d954SCole Faust__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result) 8822*c217d954SCole Faust#if defined(A_OFFSET) 8823*c217d954SCole Faust , 8824*c217d954SCole Faust IMAGE_DECLARATION(sum_col) 8825*c217d954SCole Faust#endif 8826*c217d954SCole Faust#if defined(B_OFFSET) 8827*c217d954SCole Faust , 8828*c217d954SCole Faust IMAGE_DECLARATION(sum_row) 8829*c217d954SCole Faust#endif 8830*c217d954SCole Faust , 8831*c217d954SCole Faust#if defined(ADD_BIAS) 8832*c217d954SCole Faust VECTOR_DECLARATION(biases), 8833*c217d954SCole Faust#endif 8834*c217d954SCole Faust TENSOR3D_DECLARATION(dst) 8835*c217d954SCole Faust#if defined(PER_CHANNEL_QUANTIZATION) 8836*c217d954SCole Faust , 8837*c217d954SCole Faust VECTOR_DECLARATION(result_multipliers), 8838*c217d954SCole Faust VECTOR_DECLARATION(result_shifts) 8839*c217d954SCole Faust#endif 8840*c217d954SCole Faust ) 8841*c217d954SCole Faust{ 8842*c217d954SCole Faust const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 8843*c217d954SCole Faust const int y = get_global_id(1); 8844*c217d954SCole Faust const int z = get_global_id(2); 8845*c217d954SCole Faust 8846*c217d954SCole Faust __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * 
dst_stride_z; 8847*c217d954SCole Faust 8848*c217d954SCole Faust 8849*c217d954SCole Faust VEC_INT offset_term_s32 = offset_contribution( 8850*c217d954SCole Faust x, y, z 8851*c217d954SCole Faust#if defined(A_OFFSET) 8852*c217d954SCole Faust , 8853*c217d954SCole Faust sum_col_ptr, 8854*c217d954SCole Faust sum_col_stride_x, 8855*c217d954SCole Faust sum_col_step_x, 8856*c217d954SCole Faust sum_col_stride_y, 8857*c217d954SCole Faust sum_col_step_y, 8858*c217d954SCole Faust sum_col_offset_first_element_in_bytes 8859*c217d954SCole Faust#endif 8860*c217d954SCole Faust#if defined(B_OFFSET) 8861*c217d954SCole Faust , 8862*c217d954SCole Faust sum_row_ptr, 8863*c217d954SCole Faust sum_row_stride_x, 8864*c217d954SCole Faust sum_row_step_x, 8865*c217d954SCole Faust sum_row_stride_y, 8866*c217d954SCole Faust sum_row_step_y, 8867*c217d954SCole Faust sum_row_offset_first_element_in_bytes 8868*c217d954SCole Faust#endif 8869*c217d954SCole Faust#if defined(ADD_BIAS) 8870*c217d954SCole Faust , 8871*c217d954SCole Faust biases_ptr, 8872*c217d954SCole Faust biases_stride_x, 8873*c217d954SCole Faust biases_step_x, 8874*c217d954SCole Faust biases_offset_first_element_in_bytes 8875*c217d954SCole Faust#endif 8876*c217d954SCole Faust ); 8877*c217d954SCole Faust 8878*c217d954SCole Faust __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; 8879*c217d954SCole Faust 8880*c217d954SCole Faust VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); 8881*c217d954SCole Faust 8882*c217d954SCole Faust 8883*c217d954SCole Faust in_s32 += offset_term_s32; 8884*c217d954SCole Faust 8885*c217d954SCole Faust 8886*c217d954SCole Faust 8887*c217d954SCole Faust 8888*c217d954SCole Faust in_s32 += (VEC_INT)RESULT_OFFSET; 8889*c217d954SCole Faust 8890*c217d954SCole Faust 8891*c217d954SCole Faust#if defined(PER_CHANNEL_QUANTIZATION) 8892*c217d954SCole Faust __global uchar *result_multipliers_addr = 
result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); 8893*c217d954SCole Faust __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); 8894*c217d954SCole Faust VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr); 8895*c217d954SCole Faust VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr); 8896*c217d954SCole Faust 8897*c217d954SCole Faust in_s32 *= result_multipliers_values; 8898*c217d954SCole Faust in_s32 >>= result_shifts_values; 8899*c217d954SCole Faust#else 8900*c217d954SCole Faust in_s32 *= RESULT_MULTIPLIER; 8901*c217d954SCole Faust 8902*c217d954SCole Faust in_s32 >>= RESULT_SHIFT; 8903*c217d954SCole Faust#endif 8904*c217d954SCole Faust 8905*c217d954SCole Faust VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) 8906*c217d954SCole Faust res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); 8907*c217d954SCole Faust 8908*c217d954SCole Faust#if defined(MIN_BOUND) 8909*c217d954SCole Faust res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); 8910*c217d954SCole Faust#endif 8911*c217d954SCole Faust#if defined(MAX_BOUND) 8912*c217d954SCole Faust res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); 8913*c217d954SCole Faust#endif 8914*c217d954SCole Faust 8915*c217d954SCole Faust 8916*c217d954SCole Faust STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 8917*c217d954SCole Faust} 8918*c217d954SCole Faust#endif 8919*c217d954SCole Faust 8920*c217d954SCole Faust#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT) 8921*c217d954SCole Faust 8922*c217d954SCole Faust__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result) 8923*c217d954SCole Faust#if defined(A_OFFSET) 8924*c217d954SCole Faust , 8925*c217d954SCole Faust 
IMAGE_DECLARATION(sum_col) 8926*c217d954SCole Faust#endif 8927*c217d954SCole Faust#if defined(B_OFFSET) 8928*c217d954SCole Faust , 8929*c217d954SCole Faust IMAGE_DECLARATION(sum_row) 8930*c217d954SCole Faust#endif 8931*c217d954SCole Faust , 8932*c217d954SCole Faust#if defined(ADD_BIAS) 8933*c217d954SCole Faust VECTOR_DECLARATION(biases), 8934*c217d954SCole Faust#endif 8935*c217d954SCole Faust TENSOR3D_DECLARATION(dst) 8936*c217d954SCole Faust#if defined(PER_CHANNEL_QUANTIZATION) 8937*c217d954SCole Faust , 8938*c217d954SCole Faust VECTOR_DECLARATION(result_multipliers), 8939*c217d954SCole Faust VECTOR_DECLARATION(result_shifts) 8940*c217d954SCole Faust#endif 8941*c217d954SCole Faust ) 8942*c217d954SCole Faust{ 8943*c217d954SCole Faust const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 8944*c217d954SCole Faust const int y = get_global_id(1); 8945*c217d954SCole Faust const int z = get_global_id(2); 8946*c217d954SCole Faust 8947*c217d954SCole Faust 8948*c217d954SCole Faust VEC_INT offset_term_s32 = offset_contribution( 8949*c217d954SCole Faust x, y, z 8950*c217d954SCole Faust#if defined(A_OFFSET) 8951*c217d954SCole Faust , 8952*c217d954SCole Faust sum_col_ptr, 8953*c217d954SCole Faust sum_col_stride_x, 8954*c217d954SCole Faust sum_col_step_x, 8955*c217d954SCole Faust sum_col_stride_y, 8956*c217d954SCole Faust sum_col_step_y, 8957*c217d954SCole Faust sum_col_offset_first_element_in_bytes 8958*c217d954SCole Faust#endif 8959*c217d954SCole Faust#if defined(B_OFFSET) 8960*c217d954SCole Faust , 8961*c217d954SCole Faust sum_row_ptr, 8962*c217d954SCole Faust sum_row_stride_x, 8963*c217d954SCole Faust sum_row_step_x, 8964*c217d954SCole Faust sum_row_stride_y, 8965*c217d954SCole Faust sum_row_step_y, 8966*c217d954SCole Faust sum_row_offset_first_element_in_bytes 8967*c217d954SCole Faust#endif 8968*c217d954SCole Faust#if defined(ADD_BIAS) 8969*c217d954SCole Faust , 8970*c217d954SCole Faust biases_ptr, 8971*c217d954SCole Faust 
biases_stride_x, 8972*c217d954SCole Faust biases_step_x, 8973*c217d954SCole Faust biases_offset_first_element_in_bytes 8974*c217d954SCole Faust#endif 8975*c217d954SCole Faust ); 8976*c217d954SCole Faust 8977*c217d954SCole Faust __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; 8978*c217d954SCole Faust 8979*c217d954SCole Faust __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; 8980*c217d954SCole Faust 8981*c217d954SCole Faust VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); 8982*c217d954SCole Faust 8983*c217d954SCole Faust 8984*c217d954SCole Faust in_s32 += offset_term_s32; 8985*c217d954SCole Faust 8986*c217d954SCole Faust 8987*c217d954SCole Faust 8988*c217d954SCole Faust 8989*c217d954SCole Faust#if defined(PER_CHANNEL_QUANTIZATION) 8990*c217d954SCole Faust __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); 8991*c217d954SCole Faust __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); 8992*c217d954SCole Faust VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr); 8993*c217d954SCole Faust VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr); 8994*c217d954SCole Faust 8995*c217d954SCole Faust VEC_INT in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE); 8996*c217d954SCole Faust VEC_INT in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE); 8997*c217d954SCole Faust in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0); 8998*c217d954SCole Faust#else 8999*c217d954SCole Faust 
9000*c217d954SCole Faust#if RESULT_SHIFT < 0 9001*c217d954SCole Faust in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); 9002*c217d954SCole Faust#else 9003*c217d954SCole Faust in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); 9004*c217d954SCole Faust#endif 9005*c217d954SCole Faust 9006*c217d954SCole Faust#endif 9007*c217d954SCole Faust 9008*c217d954SCole Faust 9009*c217d954SCole Faust in_s32 += (VEC_INT)RESULT_OFFSET; 9010*c217d954SCole Faust 9011*c217d954SCole Faust VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) 9012*c217d954SCole Faust res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); 9013*c217d954SCole Faust 9014*c217d954SCole Faust#if defined(MIN_BOUND) 9015*c217d954SCole Faust res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); 9016*c217d954SCole Faust#endif 9017*c217d954SCole Faust#if defined(MAX_BOUND) 9018*c217d954SCole Faust res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); 9019*c217d954SCole Faust#endif 9020*c217d954SCole Faust 9021*c217d954SCole Faust 9022*c217d954SCole Faust STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) 9023*c217d954SCole Faust} 9024*c217d954SCole Faust#endif 9025*c217d954SCole Faust 9026*c217d954SCole Faust#undef VEC_INT 9027*c217d954SCole Faust 9028*c217d954SCole Faust#endif 9029*c217d954SCole Faust 9030*c217d954SCole Faust#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN) 9031*c217d954SCole Faust 9032*c217d954SCole Faust__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), 9033*c217d954SCole Faust#if defined(ADD_BIAS) 9034*c217d954SCole Faust VECTOR_DECLARATION(biases), 9035*c217d954SCole Faust#endif 9036*c217d954SCole Faust TENSOR3D_DECLARATION(dst)) 9037*c217d954SCole Faust{ 9038*c217d954SCole Faust 9039*c217d954SCole Faust int x = max((int)(get_global_id(0) 
* VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); 9040*c217d954SCole Faust int y = get_global_id(1); 9041*c217d954SCole Faust int z = get_global_id(2); 9042*c217d954SCole Faust 9043*c217d954SCole Faust __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z; 9044*c217d954SCole Faust 9045*c217d954SCole Faust __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; 9046*c217d954SCole Faust 9047*c217d954SCole Faust VEC_DATA_TYPE(int, VEC_SIZE) 9048*c217d954SCole Faust input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); 9049*c217d954SCole Faust 9050*c217d954SCole Faust#if defined(ADD_BIAS) 9051*c217d954SCole Faust 9052*c217d954SCole Faust __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); 9053*c217d954SCole Faust 9054*c217d954SCole Faust VEC_DATA_TYPE(int, VEC_SIZE) 9055*c217d954SCole Faust biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); 9056*c217d954SCole Faust input_values += biases_values; 9057*c217d954SCole Faust#endif 9058*c217d954SCole Faust 9059*c217d954SCole Faust 9060*c217d954SCole Faust input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET; 9061*c217d954SCole Faust 9062*c217d954SCole Faust 9063*c217d954SCole Faust input_values *= RESULT_MULT_INT; 9064*c217d954SCole Faust 9065*c217d954SCole Faust#if RESULT_SHIFT < 0 9066*c217d954SCole Faust input_values >>= -RESULT_SHIFT; 9067*c217d954SCole Faust#else 9068*c217d954SCole Faust input_values >>= RESULT_SHIFT; 9069*c217d954SCole Faust#endif 9070*c217d954SCole Faust 9071*c217d954SCole Faust VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) 9072*c217d954SCole Faust res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); 9073*c217d954SCole Faust 9074*c217d954SCole Faust#if defined(MIN_BOUND) 9075*c217d954SCole Faust res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); 
#endif
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif


    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif

#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)

/** Output stage that rescales int accumulators with a fixed-point multiplier and
 *  narrows them to OUTPUT_DATA_TYPE.
 *
 * Each work-item:
 *  -# loads VEC_SIZE int values from src,
 *  -# adds VEC_SIZE int bias values when ADD_BIAS is defined,
 *  -# applies ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE when RESULT_SHIFT >= 0,
 *     otherwise ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, both with
 *     RESULT_FIXEDPOINT_MULTIPLIER and RESULT_SHIFT,
 *  -# adds RESULT_OFFSET_AFTER_SHIFT,
 *  -# converts with saturation to OUTPUT_DATA_TYPE,
 *  -# clamps against MIN_BOUND / MAX_BOUND when those are defined,
 *  -# writes the result through STORE_VECTOR_SELECT.
 *
 * @note Compile-time definitions read here: VEC_SIZE, VEC_SIZE_LEFTOVER, OUTPUT_DATA_TYPE,
 *       RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, RESULT_OFFSET_AFTER_SHIFT and,
 *       optionally, ADD_BIAS, MIN_BOUND, MAX_BOUND.
 * @note The destination byte offset along dimension 0 is x itself, i.e. one byte per
 *       element. NOTE(review): this presumes an 8-bit OUTPUT_DATA_TYPE - confirm with the host code.
 *
 * @param[in]  src    Source tensor arguments (expanded by TENSOR3D_DECLARATION); read as int.
 * @param[in]  biases Bias vector arguments (expanded by VECTOR_DECLARATION); read as int. Only present with ADD_BIAS.
 * @param[out] dst    Destination tensor arguments (expanded by TENSOR3D_DECLARATION).
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                             VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                             TENSOR3D_DECLARATION(dst))
{
    // Element index of this work-item along dimension 0, shifted back by
    // (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE and clamped so it never goes below 0.
    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    // Source address: int elements along x, byte strides along y and z.
    __global uchar *in_addr = src_ptr + src_offset_first_element_in_bytes;
    in_addr += x * sizeof(int);
    in_addr += y * src_stride_y;
    in_addr += z * src_stride_z;

    // Destination address: x is used directly as a byte offset.
    __global uchar *out_addr = dst_ptr + dst_offset_first_element_in_bytes;
    out_addr += x;
    out_addr += y * dst_stride_y;
    out_addr += z * dst_stride_z;

    VEC_DATA_TYPE(int, VEC_SIZE)
    acc = VLOAD(VEC_SIZE)(0, (__global int *)in_addr);

#if defined(ADD_BIAS)
    // The bias vector is indexed by x only.
    __global uchar *bias_in = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

    VEC_DATA_TYPE(int, VEC_SIZE)
    bias_vec = VLOAD(VEC_SIZE)(0, (__global int *)bias_in);
    acc += bias_vec;
#endif // defined(ADD_BIAS)

    // Fixed-point rescale; the helper is selected by the sign of RESULT_SHIFT.
#if RESULT_SHIFT >= 0
    acc = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#else  // RESULT_SHIFT < 0
    acc = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#endif // RESULT_SHIFT >= 0

    // Offset applied after the rescale.
    acc += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT;

    // The variable must be called res0: STORE_VECTOR_SELECT below is given the basename "res".
    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(acc, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)

    // NOTE(review): the last argument is presumably the "partial store" condition
    // (first work-item along dimension 0 with a non-zero leftover) - confirm against the macro.
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)

#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)

/** Output stage that rescales int accumulators with a fixed-point multiplier and
 *  narrows them to short.
 *
 * Each work-item loads VEC_SIZE int values from src, adds VEC_SIZE int bias values when
 * ADD_BIAS is defined, applies the ASYMM_MULT_BY_QUANT_MULTIPLIER_* helper chosen by the
 * sign of RESULT_SHIFT, converts with saturation to short, clamps against MIN_BOUND /
 * MAX_BOUND when defined and writes the result through STORE_VECTOR_SELECT.
 * No offset is added after the rescale in this kernel.
 *
 * @note Compile-time definitions read here: VEC_SIZE, VEC_SIZE_LEFTOVER,
 *       RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT and, optionally, ADD_BIAS, MIN_BOUND, MAX_BOUND.
 *
 * @param[in]  src    Source tensor arguments (expanded by TENSOR3D_DECLARATION); read as int.
 * @param[in]  biases Bias vector arguments (expanded by VECTOR_DECLARATION); read as int. Only present with ADD_BIAS.
 * @param[out] dst    Destination tensor arguments (expanded by TENSOR3D_DECLARATION); written as short.
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                                     VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                                     TENSOR3D_DECLARATION(dst))
{
    // Element index of this work-item along dimension 0, shifted back by
    // (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE and clamped so it never goes below 0.
    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    // Source address: int elements along x, byte strides along y and z.
    __global uchar *in_addr = src_ptr + src_offset_first_element_in_bytes;
    in_addr += x * sizeof(int);
    in_addr += y * src_stride_y;
    in_addr += z * src_stride_z;

    // Destination address: short elements along x, byte strides along y and z.
    __global uchar *out_addr = dst_ptr + dst_offset_first_element_in_bytes;
    out_addr += x * sizeof(short);
    out_addr += y * dst_stride_y;
    out_addr += z * dst_stride_z;

    VEC_DATA_TYPE(int, VEC_SIZE)
    acc = VLOAD(VEC_SIZE)(0, (__global int *)in_addr);

#if defined(ADD_BIAS)
    // The bias vector is indexed by x only.
    __global uchar *bias_in = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

    VEC_DATA_TYPE(int, VEC_SIZE)
    bias_vec = VLOAD(VEC_SIZE)(0, (__global int *)bias_in);
    acc += bias_vec;
#endif // defined(ADD_BIAS)

    // Fixed-point rescale; the helper is selected by the sign of RESULT_SHIFT.
#if RESULT_SHIFT >= 0
    acc = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#else  // RESULT_SHIFT < 0
    acc = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
#endif // RESULT_SHIFT >= 0

    // The variable must be called res0: STORE_VECTOR_SELECT below is given the basename "res".
    VEC_DATA_TYPE(short, VEC_SIZE)
    res0 = CONVERT_SAT(acc, VEC_DATA_TYPE(short, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)

    // NOTE(review): the last argument is presumably the "partial store" condition
    // (first work-item along dimension 0 with a non-zero leftover) - confirm against the macro.
    STORE_VECTOR_SELECT(res, short, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)

#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)

/** Output stage that rescales int accumulators in floating point and narrows them to
 *  OUTPUT_DATA_TYPE.
 *
 * Each work-item loads VEC_SIZE int values from src, adds VEC_SIZE int bias values when
 * ADD_BIAS is defined, converts to float, evaluates
 * round(value * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET), converts with saturation
 * to OUTPUT_DATA_TYPE, clamps against MIN_BOUND / MAX_BOUND when defined and writes the
 * result through STORE_VECTOR_SELECT.
 *
 * @note Compile-time definitions read here: VEC_SIZE, VEC_SIZE_LEFTOVER, OUTPUT_DATA_TYPE,
 *       REAL_MULTIPLIER, OUTPUT_OFFSET and, optionally, ADD_BIAS, DST_HEIGHT, MIN_BOUND, MAX_BOUND.
 * @note DST_HEIGHT only switches the dst declaration to TENSOR4D_DECLARATION; the body
 *       addresses dst through dst_ptr, dst_offset_first_element_in_bytes, dst_stride_y
 *       and dst_stride_z in both configurations.
 * @note The destination byte offset along dimension 0 is x itself, i.e. one byte per
 *       element. NOTE(review): this presumes an 8-bit OUTPUT_DATA_TYPE - confirm with the host code.
 *
 * @param[in]  src    Source tensor arguments (expanded by TENSOR3D_DECLARATION); read as int.
 * @param[in]  biases Bias vector arguments (expanded by VECTOR_DECLARATION); read as int. Only present with ADD_BIAS.
 * @param[out] dst    Destination tensor arguments (TENSOR4D_DECLARATION with DST_HEIGHT, TENSOR3D_DECLARATION otherwise).
 */
__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                        VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
#if defined(DST_HEIGHT)
                                                        TENSOR4D_DECLARATION(dst))
#else  // !defined(DST_HEIGHT)
                                                        TENSOR3D_DECLARATION(dst))
#endif // defined(DST_HEIGHT)
{
    // Element index of this work-item along dimension 0, shifted back by
    // (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE and clamped so it never goes below 0.
    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
    const int y = get_global_id(1);
    const int z = get_global_id(2);

    // Source address: int elements along x, byte strides along y and z.
    __global uchar *in_addr = src_ptr + src_offset_first_element_in_bytes;
    in_addr += x * sizeof(int);
    in_addr += y * src_stride_y;
    in_addr += z * src_stride_z;

    // Destination address: x is used directly as a byte offset.
    __global uchar *out_addr = dst_ptr + dst_offset_first_element_in_bytes;
    out_addr += x;
    out_addr += y * dst_stride_y;
    out_addr += z * dst_stride_z;

    VEC_DATA_TYPE(int, VEC_SIZE)
    acc = VLOAD(VEC_SIZE)(0, (__global int *)in_addr);

#if defined(ADD_BIAS)
    // The bias vector is indexed by x only.
    __global uchar *bias_in = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

    VEC_DATA_TYPE(int, VEC_SIZE)
    bias_vec = VLOAD(VEC_SIZE)(0, (__global int *)bias_in);
    acc += (VEC_DATA_TYPE(int, VEC_SIZE))bias_vec;
#endif // defined(ADD_BIAS)

    // Rescale in float. The multiply-add is kept as one expression on purpose so the
    // compiler sees exactly the same floating-point expression as before.
    VEC_DATA_TYPE(float, VEC_SIZE)
    acc_f = CONVERT(acc, VEC_DATA_TYPE(float, VEC_SIZE));
    acc_f = round(acc_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);

    // The variable must be called res0: STORE_VECTOR_SELECT below is given the basename "res".
    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
    res0 = CONVERT_SAT(acc_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));

#if defined(MIN_BOUND)
    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
#endif // defined(MAX_BOUND)

    // NOTE(review): the last argument is presumably the "partial store" condition
    // (first work-item along dimension 0 with a non-zero leftover) - confirm against the macro.
    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)
)"