1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert CHANNEL_TILE % 8 == 0 7$assert CHANNEL_TILE >= 8 8$assert ROW_TILE >= 3 9$assert ROW_SUBTILE >= 3 10$assert ROW_SUBTILE <= ROW_TILE 11$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 12#include <assert.h> 13 14#include <arm_neon.h> 15 16#include <xnnpack/gavgpool.h> 17#include <xnnpack/math.h> 18 19 20void xnn_f16_gavgpool_minmax_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__neonfp16arith_c${CHANNEL_TILE}( 21 size_t rows, 22 size_t channels, 23 const void* input, 24 size_t input_stride, 25 const void* zero, 26 void* buffer, 27 void* output, 28 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS 29{ 30 assert(rows > ${ROW_TILE}); 31 assert(channels != 0); 32 33 const __fp16* i0 = input; 34 $for M in range(1, ROW_TILE): 35 const __fp16* i${M} = (const __fp16*) ((uintptr_t) i${M-1} + input_stride); 36 const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(__fp16); 37 38 __fp16* b = buffer; 39 size_t c = channels; 40 for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 8 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 8 else "c = doz(c, %d)") % CHANNEL_TILE}) { 41 $for M in range(2): 42 $for C in range(0, CHANNEL_TILE, 8): 43 const float16x8_t vi${M}x${ABC[C:C+8]} = vld1q_f16(i${M}); i${M} += 8; 44 45 $for C in range(0, CHANNEL_TILE, 8): 46 const float16x8_t vi2x${ABC[C:C+8]} = vld1q_f16(i2); i2 += 8; 47 float16x8_t vacc${ABC[C:C+8]} = vaddq_f16(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]}); 48 49 $for M in range(2, ROW_TILE): 50 $for C in range(0, CHANNEL_TILE, 8): 51 $if M + 1 != ROW_TILE: 52 const float16x8_t vi${M+1}x${ABC[C:C+8]} = vld1q_f16(i${M+1}); i${M+1} += 8; 53 vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]}); 54 55 $for C in range(0, CHANNEL_TILE, 8): 56 vst1q_f16(b, vacc${ABC[C:C+8]}); b += 8; 57 } 58 $if CHANNEL_TILE > 8: 59 if XNN_UNLIKELY(c != 0) { 60 do { 61 $for M in range(3): 62 const float16x8_t vi${M}x${ABC[0:8]} = vld1q_f16(i${M}); i${M} += 8; 63 float16x8_t vacc${ABC[0:8]} = vaddq_f16(vi0x${ABC[0:8]}, vi1x${ABC[0:8]}); 64 65 $for M in range(2, ROW_TILE): 66 $if M + 1 != ROW_TILE: 67 const float16x8_t vi${M+1}x${ABC[0:8]} = vld1q_f16(i${M+1}); i${M+1} += 8; 68 vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]}); 69 70 vst1q_f16(b, vacc${ABC[0:8]}); b += 8; 71 72 c = doz(c, 8); 73 } while (c != 0); 74 } 75 76 for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) { 77 $for M in range(ROW_SUBTILE): 78 i${M} = (const __fp16*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); 79 80 __fp16* b = buffer; 81 size_t c = channels; 82 for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 8 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 8 else "c = doz(c, %d)") % CHANNEL_TILE}) { 83 float16x8_t vacc${ABC[0:8]} = vld1q_f16(b); 84 $for C in range(8, CHANNEL_TILE, 8): 85 float16x8_t vacc${ABC[C:C+8]} = vld1q_f16(b + ${C}); 86 87 $for C in range(0, CHANNEL_TILE, 8): 88 const float16x8_t vi0x${ABC[C:C+8]} = vld1q_f16(i0); i0 += 8; 89 90 $for M in range(ROW_TILE): 91 $for C in range(0, CHANNEL_TILE, 8): 92 $if M + 1 != ROW_TILE: 93 const float16x8_t vi${M+1}x${ABC[C:C+8]} = vld1q_f16(i${M+1}); i${M+1} += 8; 94 vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]}); 95 96 $for C in range(0, CHANNEL_TILE, 8): 97 vst1q_f16(b, vacc${ABC[C:C+8]}); b += 8; 98 } 99 $if CHANNEL_TILE > 8: 100 if XNN_UNLIKELY(c != 0) { 101 do { 102 float16x8_t vacc${ABC[0:8]} = vld1q_f16(b); 103 const float16x8_t vi0x${ABC[0:8]} = vld1q_f16(i0); i0 += 8; 104 105 $for M in range(ROW_TILE): 106 $if M + 1 != ROW_TILE: 107 const float16x8_t vi${M+1}x${ABC[0:8]} = vld1q_f16(i${M+1}); i${M+1} += 8; 108 vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]}); 109 110 vst1q_f16(b, vacc${ABC[0:8]}); b += 8; 111 112 c = doz(c, 8); 113 } while (c != 0); 114 } 115 } 116 117 i0 = (const __fp16*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment); 118 $for M in range(1, ROW_SUBTILE): 119 i${M} = (const __fp16*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); 120 $if M % 2 == 1: 121 if XNN_UNPREDICTABLE(rows < ${M+1}) { 122 i${M} = (const __fp16*) zero; 123 } 124 $else: 125 if XNN_UNPREDICTABLE(rows <= ${M}) { 126 i${M} = (const __fp16*) zero; 127 } 128 129 const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->neon.scale)); 130 const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->neon.min)); 131 const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(¶ms->neon.max)); 132 for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { 133 $for C in range(0, CHANNEL_TILE, 8): 134 float16x8_t vacc${ABC[C:C+8]} = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8; 135 136 $for C in range(0, CHANNEL_TILE, 8): 137 const float16x8_t vi0x${ABC[C:C+8]} = vld1q_f16(i0); i0 += 8; 138 139 $for M in range(ROW_TILE): 140 $for C in range(0, CHANNEL_TILE, 8): 141 $if M + 1 != ROW_TILE: 142 const float16x8_t vi${M+1}x${ABC[C:C+8]} = vld1q_f16(i${M+1}); i${M+1} += 8; 143 vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]}); 144 145 $for C in range(0, CHANNEL_TILE, 8): 146 vacc${ABC[C:C+8]} = vmulq_f16(vacc${ABC[C:C+8]}, vscale); 147 148 $for C in range(0, CHANNEL_TILE, 8): 149 vacc${ABC[C:C+8]} = vmaxq_f16(vacc${ABC[C:C+8]}, vmin); 150 151 $for C in range(0, CHANNEL_TILE, 8): 152 vacc${ABC[C:C+8]} = vminq_f16(vacc${ABC[C:C+8]}, vmax); 153 154 $for C in range(0, CHANNEL_TILE, 8): 155 vst1q_f16(output, vacc${ABC[C:C+8]}); output = (__fp16*) output + 8; 156 } 157 if XNN_UNLIKELY(channels != 0) { 158 ${"do " if CHANNEL_TILE > 8 else ""}{ 159 float16x8_t vacc${ABC[0:8]} = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8; 160 161 const float16x8_t vi0x${ABC[0:8]} = vld1q_f16(i0); i0 += 8; 162 $for M in range(ROW_TILE): 163 $if M + 1 != ROW_TILE: 164 const float16x8_t vi${M+1}x${ABC[0:8]} = vld1q_f16(i${M+1}); i${M+1} += 8; 165 vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]}); 166 167 vacc${ABC[0:8]} = vmulq_f16(vacc${ABC[0:8]}, vscale); 168 vacc${ABC[0:8]} = vmaxq_f16(vacc${ABC[0:8]}, vmin); 169 vacc${ABC[0:8]} = vminq_f16(vacc${ABC[0:8]}, vmax); 170 171 $if CHANNEL_TILE > 8: 172 if XNN_LIKELY(channels >= 8) { 173 vst1q_f16(output, vacc${ABC[0:8]}); output = (__fp16*) output + 8; 174 channels -= 8; 175 } else { 176 float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]}); 177 if (channels & 4) { 178 vst1_f16(output, vacc${ABC[0:4]}); output = (__fp16*) output + 4; 179 vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]}); 180 } 181 if (channels & 2) { 182 vst1_lane_u32(output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (__fp16*) output + 2; 183 vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2); 184 } 185 if (channels & 1) { 186 vst1_lane_f16(output, vacc${ABC[0:4]}, 0); output = (__fp16*) output + 1; 187 } 188 channels = 0; 189 } 190 $else: 191 float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]}); 192 if (channels & 4) { 193 vst1_f16(output, vacc${ABC[0:4]}); output = (__fp16*) output + 4; 194 vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]}); 195 } 196 if (channels & 2) { 197 vst1_lane_u32(output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (__fp16*) output + 2; 198 vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2); 199 } 200 if (channels & 1) { 201 vst1_lane_f16(output, vacc${ABC[0:4]}, 0); output = (__fp16*) output + 1; 202 } 203 }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} 204 } 205} 206