// xref: /aosp_15_r20/external/XNNPACK/src/f32-dwconv/up-scalar.c.in (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

6$assert CHANNEL_TILE >= 1
7$assert KERNEL_TILE >= 2
8$assert ACCUMULATORS >= 1
9$assert ACTIVATION in ["LINEAR", "MINMAX"]
10$assert ACTIVATION != "LINEAR" or not WASM
11$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
12#include <assert.h>
13
14#include <xnnpack/dwconv.h>
15#include <xnnpack/math.h>
16
17
18$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
19$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
20$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION]
21$PARAMS = {"LINEAR": "xnn_f32_default_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
22void xnn_f32_dwconv${SUFFIX}_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${"wasm" if WASM else "scalar"}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
23    size_t channels,
24    size_t output_width,
25    const float** input,
26    const float* weights,
27    float* output,
28    size_t input_stride,
29    size_t output_increment,
30    size_t input_offset,
31    const float* zero,
32    const union ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)])
33{
34  assert(channels != 0);
35  assert(output_width != 0);
36
37  $if ACTIVATION == "MINMAX":
38    const float vmin = params->scalar.min;
39    const float vmax = params->scalar.max;
40  do {
41    $for K in range(KERNEL_TILE):
42      const float* i${K} = input[${K}];
43      assert(i${K} != NULL);
44      if XNN_UNPREDICTABLE(i${K} != zero) {
45        i${K} = (const float*) ((uintptr_t) i${K} + input_offset);
46      }
47    input = (const float**) ((uintptr_t) input + input_stride);
48
49    size_t c = channels;
50    const float* w = weights;
51    $if CHANNEL_TILE > 1:
52      for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) {
53        $for C in range(CHANNEL_TILE):
54          float vacc${C}p0 = w[${C}];
55
56        $for K in range(KERNEL_TILE):
57
58          $for C in range(CHANNEL_TILE):
59            const float vi${K}x${C} = i${K}[${C}];
60          i${K} += ${CHANNEL_TILE};
61
62          $for C in range(CHANNEL_TILE):
63            const float vk${K}x${C} = w[${(K + 1) * CHANNEL_TILE + C}];
64            $if 1 <= K < ACCUMULATORS:
65              float vacc${C}p${K} = vi${K}x${C} * vk${K}x${C};
66            $else:
67              vacc${C}p${K % ACCUMULATORS} = math_muladd_f32(vi${K}x${C}, vk${K}x${C}, vacc${C}p${K % ACCUMULATORS});
68
69        w += ${(KERNEL_TILE + 1) * CHANNEL_TILE};
70
71        $if ACCUMULATORS > 1:
72          // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0
73          $ACC_SLICE = 1
74          $while ACC_SLICE < ACCUMULATORS:
75            $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
76              $if A + ACC_SLICE < ACCUMULATORS:
77                $for C in range(CHANNEL_TILE):
78                  vacc${C}p${A} = vacc${C}p${A} + vacc${C}p${A + ACC_SLICE};
79            $ACC_SLICE *= 2
80
81        $if ACTIVATION == "MINMAX":
82          $for C in range(CHANNEL_TILE):
83            float vacc${C} = ${MAX_F32}(vacc${C}p0, vmin);
84
85          $for C in range(CHANNEL_TILE):
86            vacc${C} = ${MIN_F32}(vacc${C}, vmax);
87
88          $for C in range(CHANNEL_TILE):
89            output[${C}] = vacc${C};
90        $else:
91          $for C in range(CHANNEL_TILE):
92            output[${C}] = vacc${C}p0;
93        output += ${CHANNEL_TILE};
94      }
95      for (; c >= 1; c -= 1) {
96        float vacc0p0 = *w++;
97
98        $for K in range(KERNEL_TILE):
99          const float vi${K} = *i${K}++;
100          const float vk${K} = w[${(K + 1) * CHANNEL_TILE - 1}];
101          $if 1 <= K < ACCUMULATORS:
102            float vacc0p${K} = vi${K} * vk${K};
103          $else:
104            vacc0p${K % ACCUMULATORS} = math_muladd_f32(vi${K}, vk${K}, vacc0p${K % ACCUMULATORS});
105
106        $if ACCUMULATORS > 1:
107          // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0
108          $ACC_SLICE = 1
109          $while ACC_SLICE < ACCUMULATORS:
110            $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
111              $if A + ACC_SLICE < ACCUMULATORS:
112                vacc0p${A} = vacc0p${A} + vacc0p${A + ACC_SLICE};
113            $ACC_SLICE *= 2
114
115        $if ACTIVATION == "MINMAX":
116          float vacc0 = ${MAX_F32}(vacc0p0, vmin);
117          vacc0 = ${MIN_F32}(vacc0, vmax);
118          *output++ = vacc0;
119        $else:
120          *output++ = vacc0p0;
121      }
122    $else:
123      do {
124        float vacc0p0 = w[0];
125        $for K in range(KERNEL_TILE):
126
127          const float vi${K} = *i${K}++;
128          const float vk${K} = w[${K+1}];
129          $if 1 <= K < ACCUMULATORS:
130            float vacc0p${K} = vi${K} * vk${K};
131          $else:
132            vacc0p${K % ACCUMULATORS} = math_muladd_f32(vi${K}, vk${K}, vacc0p${K % ACCUMULATORS});
133
134        w += ${KERNEL_TILE + 1};
135
136        $ACC_STEP = 1
137        $while ACC_STEP < ACCUMULATORS:
138          $for A in range(0, ACCUMULATORS, ACC_STEP * 2):
139            $if A + ACC_STEP < ACCUMULATORS:
140              vacc0p${A} += vacc0p${A + ACC_STEP};
141          $ACC_STEP *= 2
142
143        $if ACTIVATION == "MINMAX":
144          float vacc0 = ${MAX_F32}(vacc0p0, vmin);
145          vacc0 = ${MIN_F32}(vacc0, vmax);
146          *output++ = vacc0;
147        $else:
148          *output++ = vacc0p0;
149      } while (--c != 0);
150
151    output = (float*) ((uintptr_t) output + output_increment);
152  } while (--output_width != 0);
153}
154