1// Copyright 2019 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert CHANNEL_TILE >= 1 7$assert KERNEL_TILE >= 2 8$assert ACCUMULATORS >= 1 9$assert ACTIVATION in ["LINEAR", "MINMAX"] 10$assert ACTIVATION != "LINEAR" or not WASM 11$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 12#include <assert.h> 13 14#include <xnnpack/dwconv.h> 15#include <xnnpack/math.h> 16 17 18$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32" 19$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32" 20$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] 21$PARAMS = {"LINEAR": "xnn_f32_default_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION] 22void xnn_f32_dwconv${SUFFIX}_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${"wasm" if WASM else "scalar"}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}( 23 size_t channels, 24 size_t output_width, 25 const float** input, 26 const float* weights, 27 float* output, 28 size_t input_stride, 29 size_t output_increment, 30 size_t input_offset, 31 const float* zero, 32 const union ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) 33{ 34 assert(channels != 0); 35 assert(output_width != 0); 36 37 $if ACTIVATION == "MINMAX": 38 const float vmin = params->scalar.min; 39 const float vmax = params->scalar.max; 40 do { 41 $for K in range(KERNEL_TILE): 42 const float* i${K} = input[${K}]; 43 assert(i${K} != NULL); 44 if XNN_UNPREDICTABLE(i${K} != zero) { 45 i${K} = (const float*) ((uintptr_t) i${K} + input_offset); 46 } 47 input = (const float**) ((uintptr_t) input + input_stride); 48 49 size_t c = channels; 50 const float* w = weights; 51 $if CHANNEL_TILE > 1: 52 for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) { 53 $for C in range(CHANNEL_TILE): 54 float vacc${C}p0 = w[${C}]; 55 56 $for K in range(KERNEL_TILE): 57 58 $for C in range(CHANNEL_TILE): 59 const float vi${K}x${C} = i${K}[${C}]; 60 i${K} += ${CHANNEL_TILE}; 61 62 $for C in range(CHANNEL_TILE): 63 const float vk${K}x${C} = w[${(K + 1) * CHANNEL_TILE + C}]; 64 $if 1 <= K < ACCUMULATORS: 65 float vacc${C}p${K} = vi${K}x${C} * vk${K}x${C}; 66 $else: 67 vacc${C}p${K % ACCUMULATORS} = math_muladd_f32(vi${K}x${C}, vk${K}x${C}, vacc${C}p${K % ACCUMULATORS}); 68 69 w += ${(KERNEL_TILE + 1) * CHANNEL_TILE}; 70 71 $if ACCUMULATORS > 1: 72 // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0 73 $ACC_SLICE = 1 74 $while ACC_SLICE < ACCUMULATORS: 75 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 76 $if A + ACC_SLICE < ACCUMULATORS: 77 $for C in range(CHANNEL_TILE): 78 vacc${C}p${A} = vacc${C}p${A} + vacc${C}p${A + ACC_SLICE}; 79 $ACC_SLICE *= 2 80 81 $if ACTIVATION == "MINMAX": 82 $for C in range(CHANNEL_TILE): 83 float vacc${C} = ${MAX_F32}(vacc${C}p0, vmin); 84 85 $for C in range(CHANNEL_TILE): 86 vacc${C} = ${MIN_F32}(vacc${C}, vmax); 87 88 $for C in range(CHANNEL_TILE): 89 output[${C}] = vacc${C}; 90 $else: 91 $for C in range(CHANNEL_TILE): 92 output[${C}] = vacc${C}p0; 93 output += ${CHANNEL_TILE}; 94 } 95 for (; c >= 1; c -= 1) { 96 float vacc0p0 = *w++; 97 98 $for K in range(KERNEL_TILE): 99 const float vi${K} = *i${K}++; 100 const float vk${K} = w[${(K + 1) * CHANNEL_TILE - 1}]; 101 $if 1 <= K < ACCUMULATORS: 102 float vacc0p${K} = vi${K} * vk${K}; 103 $else: 104 vacc0p${K % ACCUMULATORS} = math_muladd_f32(vi${K}, vk${K}, vacc0p${K % ACCUMULATORS}); 105 106 $if ACCUMULATORS > 1: 107 // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0 108 $ACC_SLICE = 1 109 $while ACC_SLICE < ACCUMULATORS: 110 $for A in range(0, ACCUMULATORS, ACC_SLICE * 2): 111 $if A + ACC_SLICE < ACCUMULATORS: 112 vacc0p${A} = vacc0p${A} + vacc0p${A + ACC_SLICE}; 113 $ACC_SLICE *= 2 114 115 $if ACTIVATION == "MINMAX": 116 float vacc0 = ${MAX_F32}(vacc0p0, vmin); 117 vacc0 = ${MIN_F32}(vacc0, vmax); 118 *output++ = vacc0; 119 $else: 120 *output++ = vacc0p0; 121 } 122 $else: 123 do { 124 float vacc0p0 = w[0]; 125 $for K in range(KERNEL_TILE): 126 127 const float vi${K} = *i${K}++; 128 const float vk${K} = w[${K+1}]; 129 $if 1 <= K < ACCUMULATORS: 130 float vacc0p${K} = vi${K} * vk${K}; 131 $else: 132 vacc0p${K % ACCUMULATORS} = math_muladd_f32(vi${K}, vk${K}, vacc0p${K % ACCUMULATORS}); 133 134 w += ${KERNEL_TILE + 1}; 135 136 $ACC_STEP = 1 137 $while ACC_STEP < ACCUMULATORS: 138 $for A in range(0, ACCUMULATORS, ACC_STEP * 2): 139 $if A + ACC_STEP < ACCUMULATORS: 140 vacc0p${A} += vacc0p${A + ACC_STEP}; 141 $ACC_STEP *= 2 142 143 $if ACTIVATION == "MINMAX": 144 float vacc0 = ${MAX_F32}(vacc0p0, vmin); 145 vacc0 = ${MIN_F32}(vacc0, vmax); 146 *output++ = vacc0; 147 $else: 148 *output++ = vacc0p0; 149 } while (--c != 0); 150 151 output = (float*) ((uintptr_t) output + output_increment); 152 } while (--output_width != 0); 153} 154